author     Linus Torvalds <torvalds@linux-foundation.org>  2013-09-03 21:25:03 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2013-09-03 21:25:03 -0400
commit     32dad03d164206ea886885d0740284ba215b0970 (patch)
tree       5fd89fe27295bfbe47dce5f274aa645099741a71
parent     357397a14117f0c2eeafcac06a1f8412a02aa6af (diff)
parent     d1625964da51bda61306ad3ec45307a799c21f08 (diff)
Merge branch 'for-3.12' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo:
 "A lot of activity on the cgroup front.  Most changes aren't visible
  to userland at all at this point and are laying the foundation for
  the planned unified hierarchy.

   - The biggest change is decoupling the lifetime management of css
     (cgroup_subsys_state) from that of cgroup's.  Because controllers
     (cpu, memory, block and so on) will need to be dynamically enabled
     and disabled, css, which is the association point between a cgroup
     and a controller, may come and go dynamically across the lifetime
     of a cgroup.  Till now, css's were created when the associated
     cgroup was created and stayed till the cgroup got destroyed.

     Assumptions around this tight coupling permeated through cgroup
     core and controllers.  These assumptions are gradually removed,
     which constitutes the bulk of the patches, and the css destruction
     path is completely decoupled from the cgroup destruction path.
     Note that decoupling of the creation path is relatively easy on
     top of these changes and the patchset is pending for the next
     window.

   - cgroup has its own event mechanism, cgroup.event_control, which is
     only used by memcg.  It is overly complex, trying to achieve high
     flexibility whose benefits seem dubious at best.  Going forward,
     new events will simply generate a file modified event and the
     existing mechanism is being made specific to memcg.  This pull
     request contains preparatory patches for such change.

   - Various fixes and cleanups"

Fixed up conflict in kernel/cgroup.c as per Tejun.

* 'for-3.12' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (69 commits)
  cgroup: fix cgroup_css() invocation in css_from_id()
  cgroup: make cgroup_write_event_control() use css_from_dir() instead of __d_cgrp()
  cgroup: make cgroup_event hold onto cgroup_subsys_state instead of cgroup
  cgroup: implement CFTYPE_NO_PREFIX
  cgroup: make cgroup_css() take cgroup_subsys * instead and allow NULL subsys
  cgroup: rename cgroup_css_from_dir() to css_from_dir() and update its syntax
  cgroup: fix cgroup_write_event_control()
  cgroup: fix subsystem file accesses on the root cgroup
  cgroup: change cgroup_from_id() to css_from_id()
  cgroup: use css_get() in cgroup_create() to check CSS_ROOT
  cpuset: remove an unncessary forward declaration
  cgroup: RCU protect each cgroup_subsys_state release
  cgroup: move subsys file removal to kill_css()
  cgroup: factor out kill_css()
  cgroup: decouple cgroup_subsys_state destruction from cgroup destruction
  cgroup: replace cgroup->css_kill_cnt with ->nr_css
  cgroup: bounce cgroup_subsys_state ref kill confirmation to a work item
  cgroup: move cgroup->subsys[] assignment to online_css()
  cgroup: reorganize css init / exit paths
  cgroup: add __rcu modifier to cgroup->subsys[]
  ...
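As a hedged illustration of the conversion this series performs (not code from the patch itself): subsystem callbacks and cftype methods that used to receive a struct cgroup * now receive the controller's own cgroup_subsys_state *, and the controller recovers its private state with container_of().  The controller name "foo", struct foo_cgroup and foo_read_limit() below are hypothetical.

struct foo_cgroup {
	struct cgroup_subsys_state css;	/* embedded css, as in struct blkcg */
	u64 limit;
};

static inline struct foo_cgroup *css_to_foo(struct cgroup_subsys_state *css)
{
	/* a NULL css (e.g. the root's parent) maps to NULL controller state */
	return css ? container_of(css, struct foo_cgroup, css) : NULL;
}

/* old: u64 foo_read_limit(struct cgroup *cgrp, struct cftype *cft) */
static u64 foo_read_limit(struct cgroup_subsys_state *css, struct cftype *cft)
{
	return css_to_foo(css)->limit;
}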
-rw-r--r--  block/blk-cgroup.c            |   49
-rw-r--r--  block/blk-cgroup.h            |   38
-rw-r--r--  block/blk-throttle.c          |   43
-rw-r--r--  block/cfq-iosched.c           |   90
-rw-r--r--  fs/bio.c                      |    2
-rw-r--r--  include/linux/cgroup.h        |  303
-rw-r--r--  include/linux/memcontrol.h    |    2
-rw-r--r--  include/linux/vmpressure.h    |    6
-rw-r--r--  include/net/cls_cgroup.h      |    4
-rw-r--r--  include/net/netprio_cgroup.h  |    8
-rw-r--r--  kernel/cgroup.c               | 1643
-rw-r--r--  kernel/cgroup_freezer.c       |  155
-rw-r--r--  kernel/cpuset.c               |  317
-rw-r--r--  kernel/events/core.c          |   27
-rw-r--r--  kernel/sched/core.c           |  113
-rw-r--r--  kernel/sched/cpuacct.c        |   51
-rw-r--r--  kernel/sched/sched.h          |    6
-rw-r--r--  mm/hugetlb_cgroup.c           |   69
-rw-r--r--  mm/memcontrol.c               |  223
-rw-r--r--  mm/vmpressure.c               |   25
-rw-r--r--  net/core/netprio_cgroup.c     |   72
-rw-r--r--  net/ipv4/tcp_memcontrol.c     |   12
-rw-r--r--  net/sched/cls_cgroup.c        |   39
-rw-r--r--  security/device_cgroup.c      |   65

24 files changed, 1751 insertions, 1611 deletions
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 290792a13e3c..e90c7c164c83 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -437,10 +437,10 @@ struct request_list *__blk_queue_next_rl(struct request_list *rl,
 	return &blkg->rl;
 }
 
-static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype,
-			     u64 val)
+static int blkcg_reset_stats(struct cgroup_subsys_state *css,
+			     struct cftype *cftype, u64 val)
 {
-	struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
+	struct blkcg *blkcg = css_to_blkcg(css);
 	struct blkcg_gq *blkg;
 	int i;
 
@@ -614,15 +614,13 @@ u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off)
 {
 	struct blkcg_policy *pol = blkcg_policy[pd->plid];
 	struct blkcg_gq *pos_blkg;
-	struct cgroup *pos_cgrp;
-	u64 sum;
+	struct cgroup_subsys_state *pos_css;
+	u64 sum = 0;
 
 	lockdep_assert_held(pd->blkg->q->queue_lock);
 
-	sum = blkg_stat_read((void *)pd + off);
-
 	rcu_read_lock();
-	blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) {
+	blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) {
 		struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
 		struct blkg_stat *stat = (void *)pos_pd + off;
 
@@ -649,16 +647,14 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
 {
 	struct blkcg_policy *pol = blkcg_policy[pd->plid];
 	struct blkcg_gq *pos_blkg;
-	struct cgroup *pos_cgrp;
-	struct blkg_rwstat sum;
+	struct cgroup_subsys_state *pos_css;
+	struct blkg_rwstat sum = { };
 	int i;
 
 	lockdep_assert_held(pd->blkg->q->queue_lock);
 
-	sum = blkg_rwstat_read((void *)pd + off);
-
 	rcu_read_lock();
-	blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) {
+	blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) {
 		struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
 		struct blkg_rwstat *rwstat = (void *)pos_pd + off;
 		struct blkg_rwstat tmp;
@@ -765,18 +761,18 @@ struct cftype blkcg_files[] = {
 
 /**
  * blkcg_css_offline - cgroup css_offline callback
- * @cgroup: cgroup of interest
+ * @css: css of interest
  *
- * This function is called when @cgroup is about to go away and responsible
- * for shooting down all blkgs associated with @cgroup.  blkgs should be
+ * This function is called when @css is about to go away and responsible
+ * for shooting down all blkgs associated with @css.  blkgs should be
  * removed while holding both q and blkcg locks.  As blkcg lock is nested
  * inside q lock, this function performs reverse double lock dancing.
  *
  * This is the blkcg counterpart of ioc_release_fn().
  */
-static void blkcg_css_offline(struct cgroup *cgroup)
+static void blkcg_css_offline(struct cgroup_subsys_state *css)
 {
-	struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
+	struct blkcg *blkcg = css_to_blkcg(css);
 
 	spin_lock_irq(&blkcg->lock);
 
@@ -798,21 +794,21 @@ static void blkcg_css_offline(struct cgroup *cgroup)
 	spin_unlock_irq(&blkcg->lock);
 }
 
-static void blkcg_css_free(struct cgroup *cgroup)
+static void blkcg_css_free(struct cgroup_subsys_state *css)
 {
-	struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
+	struct blkcg *blkcg = css_to_blkcg(css);
 
 	if (blkcg != &blkcg_root)
 		kfree(blkcg);
 }
 
-static struct cgroup_subsys_state *blkcg_css_alloc(struct cgroup *cgroup)
+static struct cgroup_subsys_state *
+blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 {
 	static atomic64_t id_seq = ATOMIC64_INIT(0);
 	struct blkcg *blkcg;
-	struct cgroup *parent = cgroup->parent;
 
-	if (!parent) {
+	if (!parent_css) {
 		blkcg = &blkcg_root;
 		goto done;
 	}
@@ -883,14 +879,15 @@ void blkcg_exit_queue(struct request_queue *q)
  * of the main cic data structures.  For now we allow a task to change
  * its cgroup only if it's the only owner of its ioc.
  */
-static int blkcg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
+static int blkcg_can_attach(struct cgroup_subsys_state *css,
+			    struct cgroup_taskset *tset)
 {
 	struct task_struct *task;
 	struct io_context *ioc;
 	int ret = 0;
 
 	/* task_lock() is needed to avoid races with exit_io_context() */
-	cgroup_taskset_for_each(task, cgrp, tset) {
+	cgroup_taskset_for_each(task, css, tset) {
 		task_lock(task);
 		ioc = task->io_context;
 		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
@@ -1127,7 +1124,7 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
 
 	/* kill the intf files first */
 	if (pol->cftypes)
-		cgroup_rm_cftypes(&blkio_subsys, pol->cftypes);
+		cgroup_rm_cftypes(pol->cftypes);
 
 	/* unregister and update blkgs */
 	blkcg_policy[pol->plid] = NULL;
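A sketch of the ->css_alloc() pattern the hunk above adopts, reusing the hypothetical "foo" controller from the earlier sketch (foo_root is an assumed statically allocated root instance, mirroring blkcg_root): the "am I the root?" test becomes a NULL check on @parent_css rather than on cgroup->parent.

static struct foo_cgroup foo_root;	/* assumed static root state */

static struct cgroup_subsys_state *
foo_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct foo_cgroup *foo;

	if (!parent_css)		/* root cgroup of the hierarchy */
		return &foo_root.css;

	foo = kzalloc(sizeof(*foo), GFP_KERNEL);
	if (!foo)
		return ERR_PTR(-ENOMEM);
	return &foo->css;
}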
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 8056c03a3382..ae6969a7ffd4 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -179,22 +179,20 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 void blkg_conf_finish(struct blkg_conf_ctx *ctx);
 
 
-static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup)
+static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
 {
-	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
-			    struct blkcg, css);
+	return css ? container_of(css, struct blkcg, css) : NULL;
 }
 
 static inline struct blkcg *task_blkcg(struct task_struct *tsk)
 {
-	return container_of(task_subsys_state(tsk, blkio_subsys_id),
-			    struct blkcg, css);
+	return css_to_blkcg(task_css(tsk, blkio_subsys_id));
 }
 
 static inline struct blkcg *bio_blkcg(struct bio *bio)
 {
 	if (bio && bio->bi_css)
-		return container_of(bio->bi_css, struct blkcg, css);
+		return css_to_blkcg(bio->bi_css);
 	return task_blkcg(current);
 }
 
@@ -206,9 +204,7 @@ static inline struct blkcg *bio_blkcg(struct bio *bio)
  */
 static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
 {
-	struct cgroup *pcg = blkcg->css.cgroup->parent;
-
-	return pcg ? cgroup_to_blkcg(pcg) : NULL;
+	return css_to_blkcg(css_parent(&blkcg->css));
 }
 
 /**
@@ -288,32 +284,33 @@ struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
 /**
  * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
  * @d_blkg: loop cursor pointing to the current descendant
- * @pos_cgrp: used for iteration
+ * @pos_css: used for iteration
  * @p_blkg: target blkg to walk descendants of
  *
  * Walk @c_blkg through the descendants of @p_blkg.  Must be used with RCU
  * read locked.  If called under either blkcg or queue lock, the iteration
  * is guaranteed to include all and only online blkgs.  The caller may
- * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip
- * subtree.
+ * update @pos_css by calling css_rightmost_descendant() to skip subtree.
+ * @p_blkg is included in the iteration and the first node to be visited.
  */
-#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg)		\
-	cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
-		if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
+#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg)		\
+	css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css)	\
+		if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css),	\
 					      (p_blkg)->q, false)))
 
 /**
  * blkg_for_each_descendant_post - post-order walk of a blkg's descendants
  * @d_blkg: loop cursor pointing to the current descendant
- * @pos_cgrp: used for iteration
+ * @pos_css: used for iteration
  * @p_blkg: target blkg to walk descendants of
  *
  * Similar to blkg_for_each_descendant_pre() but performs post-order
- * traversal instead.  Synchronization rules are the same.
+ * traversal instead.  Synchronization rules are the same.  @p_blkg is
+ * included in the iteration and the last node to be visited.
  */
-#define blkg_for_each_descendant_post(d_blkg, pos_cgrp, p_blkg)	\
-	cgroup_for_each_descendant_post((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
-		if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
+#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg)		\
+	css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css)	\
+		if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css),	\
 					      (p_blkg)->q, false)))
 
 /**
@@ -576,7 +573,6 @@ static inline int blkcg_activate_policy(struct request_queue *q,
 static inline void blkcg_deactivate_policy(struct request_queue *q,
 					   const struct blkcg_policy *pol) { }
 
-static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; }
 static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }
 
 static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
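A minimal caller sketch (assumed, not part of the patch) for the updated iterators: the cursor is now a cgroup_subsys_state *, the walk must still run under rcu_read_lock(), and the origin blkg itself is visited first.  count_online_blkgs() is a hypothetical helper.

static u64 count_online_blkgs(struct blkcg_gq *p_blkg)
{
	struct blkcg_gq *pos_blkg;
	struct cgroup_subsys_state *pos_css;
	u64 n = 0;

	lockdep_assert_held(p_blkg->q->queue_lock);

	rcu_read_lock();
	blkg_for_each_descendant_pre(pos_blkg, pos_css, p_blkg)
		n++;	/* @p_blkg itself is the first node visited */
	rcu_read_unlock();
	return n;
}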
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 08a32dfd3844..8331aba9426f 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1293,10 +1293,10 @@ static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
 	return __blkg_prfill_rwstat(sf, pd, &rwstat);
 }
 
-static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
-			       struct seq_file *sf)
+static int tg_print_cpu_rwstat(struct cgroup_subsys_state *css,
+			       struct cftype *cft, struct seq_file *sf)
 {
-	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+	struct blkcg *blkcg = css_to_blkcg(css);
 
 	blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl,
 			  cft->private, true);
@@ -1325,31 +1325,31 @@ static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
 	return __blkg_prfill_u64(sf, pd, v);
 }
 
-static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft,
-			     struct seq_file *sf)
+static int tg_print_conf_u64(struct cgroup_subsys_state *css,
+			     struct cftype *cft, struct seq_file *sf)
 {
-	blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_u64,
+	blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_u64,
 			  &blkcg_policy_throtl, cft->private, false);
 	return 0;
 }
 
-static int tg_print_conf_uint(struct cgroup *cgrp, struct cftype *cft,
-			      struct seq_file *sf)
+static int tg_print_conf_uint(struct cgroup_subsys_state *css,
+			      struct cftype *cft, struct seq_file *sf)
 {
-	blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_uint,
+	blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_uint,
 			  &blkcg_policy_throtl, cft->private, false);
 	return 0;
 }
 
-static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
-		       bool is_u64)
+static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft,
+		       const char *buf, bool is_u64)
 {
-	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+	struct blkcg *blkcg = css_to_blkcg(css);
 	struct blkg_conf_ctx ctx;
 	struct throtl_grp *tg;
 	struct throtl_service_queue *sq;
 	struct blkcg_gq *blkg;
-	struct cgroup *pos_cgrp;
+	struct cgroup_subsys_state *pos_css;
 	int ret;
 
 	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
@@ -1379,8 +1379,7 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
 	 * restrictions in the whole hierarchy and allows them to bypass
 	 * blk-throttle.
 	 */
-	tg_update_has_rules(tg);
-	blkg_for_each_descendant_pre(blkg, pos_cgrp, ctx.blkg)
+	blkg_for_each_descendant_pre(blkg, pos_css, ctx.blkg)
 		tg_update_has_rules(blkg_to_tg(blkg));
 
 	/*
@@ -1403,16 +1402,16 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
 	return 0;
 }
 
-static int tg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft,
+static int tg_set_conf_u64(struct cgroup_subsys_state *css, struct cftype *cft,
 			   const char *buf)
 {
-	return tg_set_conf(cgrp, cft, buf, true);
+	return tg_set_conf(css, cft, buf, true);
 }
 
-static int tg_set_conf_uint(struct cgroup *cgrp, struct cftype *cft,
+static int tg_set_conf_uint(struct cgroup_subsys_state *css, struct cftype *cft,
 			    const char *buf)
 {
-	return tg_set_conf(cgrp, cft, buf, false);
+	return tg_set_conf(css, cft, buf, false);
 }
 
 static struct cftype throtl_files[] = {
@@ -1623,7 +1622,7 @@ void blk_throtl_drain(struct request_queue *q)
 {
 	struct throtl_data *td = q->td;
 	struct blkcg_gq *blkg;
-	struct cgroup *pos_cgrp;
+	struct cgroup_subsys_state *pos_css;
 	struct bio *bio;
 	int rw;
 
@@ -1636,11 +1635,9 @@ void blk_throtl_drain(struct request_queue *q)
 	 * better to walk service_queue tree directly but blkg walk is
 	 * easier.
 	 */
-	blkg_for_each_descendant_post(blkg, pos_cgrp, td->queue->root_blkg)
+	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg)
 		tg_drain_bios(&blkg_to_tg(blkg)->service_queue);
 
-	tg_drain_bios(&td_root_tg(td)->service_queue);
-
 	/* finally, transfer bios from top-level tg's into the td */
 	tg_drain_bios(&td->service_queue);
 
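The two deletions above (the standalone tg_update_has_rules(tg) and tg_drain_bios(&td_root_tg(td)->service_queue) calls) follow from the new iteration semantics: the css-based walks now include the origin node, first in pre-order and last in post-order.  A sketch of the resulting drain shape under that assumption (drain_all_tgs() is hypothetical, and the RCU locking follows the macro's documented rule):

static void drain_all_tgs(struct request_queue *q)
{
	struct blkcg_gq *blkg;
	struct cgroup_subsys_state *pos_css;

	rcu_read_lock();
	/* root_blkg is visited last, so no separate root call is needed */
	blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg)
		tg_drain_bios(&blkg_to_tg(blkg)->service_queue);
	rcu_read_unlock();
}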
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index d5bbdcfd0dab..dabb9d02cf9a 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1607,12 +1607,11 @@ static u64 cfqg_prfill_weight_device(struct seq_file *sf,
 	return __blkg_prfill_u64(sf, pd, cfqg->dev_weight);
 }
 
-static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft,
-				    struct seq_file *sf)
+static int cfqg_print_weight_device(struct cgroup_subsys_state *css,
+				    struct cftype *cft, struct seq_file *sf)
 {
-	blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp),
-			  cfqg_prfill_weight_device, &blkcg_policy_cfq, 0,
-			  false);
+	blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_weight_device,
+			  &blkcg_policy_cfq, 0, false);
 	return 0;
 }
 
@@ -1626,35 +1625,34 @@ static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf,
 	return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight);
 }
 
-static int cfqg_print_leaf_weight_device(struct cgroup *cgrp,
+static int cfqg_print_leaf_weight_device(struct cgroup_subsys_state *css,
 					 struct cftype *cft,
 					 struct seq_file *sf)
 {
-	blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp),
-			  cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq, 0,
-			  false);
+	blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_leaf_weight_device,
+			  &blkcg_policy_cfq, 0, false);
 	return 0;
 }
 
-static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft,
+static int cfq_print_weight(struct cgroup_subsys_state *css, struct cftype *cft,
 			    struct seq_file *sf)
 {
-	seq_printf(sf, "%u\n", cgroup_to_blkcg(cgrp)->cfq_weight);
+	seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_weight);
 	return 0;
 }
 
-static int cfq_print_leaf_weight(struct cgroup *cgrp, struct cftype *cft,
-				 struct seq_file *sf)
+static int cfq_print_leaf_weight(struct cgroup_subsys_state *css,
+				 struct cftype *cft, struct seq_file *sf)
 {
-	seq_printf(sf, "%u\n",
-		   cgroup_to_blkcg(cgrp)->cfq_leaf_weight);
+	seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_leaf_weight);
 	return 0;
 }
 
-static int __cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
-				    const char *buf, bool is_leaf_weight)
+static int __cfqg_set_weight_device(struct cgroup_subsys_state *css,
+				    struct cftype *cft, const char *buf,
+				    bool is_leaf_weight)
 {
-	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+	struct blkcg *blkcg = css_to_blkcg(css);
 	struct blkg_conf_ctx ctx;
 	struct cfq_group *cfqg;
 	int ret;
@@ -1680,22 +1678,22 @@ static int __cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
 	return ret;
 }
 
-static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
-				  const char *buf)
+static int cfqg_set_weight_device(struct cgroup_subsys_state *css,
+				  struct cftype *cft, const char *buf)
 {
-	return __cfqg_set_weight_device(cgrp, cft, buf, false);
+	return __cfqg_set_weight_device(css, cft, buf, false);
 }
 
-static int cfqg_set_leaf_weight_device(struct cgroup *cgrp, struct cftype *cft,
-				       const char *buf)
+static int cfqg_set_leaf_weight_device(struct cgroup_subsys_state *css,
+				       struct cftype *cft, const char *buf)
 {
-	return __cfqg_set_weight_device(cgrp, cft, buf, true);
+	return __cfqg_set_weight_device(css, cft, buf, true);
 }
 
-static int __cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val,
-			    bool is_leaf_weight)
+static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
+			    u64 val, bool is_leaf_weight)
 {
-	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+	struct blkcg *blkcg = css_to_blkcg(css);
 	struct blkcg_gq *blkg;
 
 	if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
@@ -1727,30 +1725,32 @@ static int __cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val,
 	return 0;
 }
 
-static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
+static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
+			  u64 val)
 {
-	return __cfq_set_weight(cgrp, cft, val, false);
+	return __cfq_set_weight(css, cft, val, false);
 }
 
-static int cfq_set_leaf_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
+static int cfq_set_leaf_weight(struct cgroup_subsys_state *css,
+			       struct cftype *cft, u64 val)
 {
-	return __cfq_set_weight(cgrp, cft, val, true);
+	return __cfq_set_weight(css, cft, val, true);
 }
 
-static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft,
-			   struct seq_file *sf)
+static int cfqg_print_stat(struct cgroup_subsys_state *css, struct cftype *cft,
+			   struct seq_file *sf)
 {
-	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+	struct blkcg *blkcg = css_to_blkcg(css);
 
 	blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkcg_policy_cfq,
 			  cft->private, false);
 	return 0;
 }
 
-static int cfqg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
-			     struct seq_file *sf)
+static int cfqg_print_rwstat(struct cgroup_subsys_state *css,
+			     struct cftype *cft, struct seq_file *sf)
 {
-	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+	struct blkcg *blkcg = css_to_blkcg(css);
 
 	blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkcg_policy_cfq,
 			  cft->private, true);
@@ -1773,20 +1773,20 @@ static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
 	return __blkg_prfill_rwstat(sf, pd, &sum);
 }
 
-static int cfqg_print_stat_recursive(struct cgroup *cgrp, struct cftype *cft,
-				     struct seq_file *sf)
+static int cfqg_print_stat_recursive(struct cgroup_subsys_state *css,
+				     struct cftype *cft, struct seq_file *sf)
 {
-	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+	struct blkcg *blkcg = css_to_blkcg(css);
 
 	blkcg_print_blkgs(sf, blkcg, cfqg_prfill_stat_recursive,
 			  &blkcg_policy_cfq, cft->private, false);
 	return 0;
 }
 
-static int cfqg_print_rwstat_recursive(struct cgroup *cgrp, struct cftype *cft,
-				       struct seq_file *sf)
+static int cfqg_print_rwstat_recursive(struct cgroup_subsys_state *css,
				       struct cftype *cft, struct seq_file *sf)
 {
-	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+	struct blkcg *blkcg = css_to_blkcg(css);
 
 	blkcg_print_blkgs(sf, blkcg, cfqg_prfill_rwstat_recursive,
 			  &blkcg_policy_cfq, cft->private, true);
@@ -1810,10 +1810,10 @@ static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
 }
 
 /* print avg_queue_size */
-static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft,
-				     struct seq_file *sf)
+static int cfqg_print_avg_queue_size(struct cgroup_subsys_state *css,
+				     struct cftype *cft, struct seq_file *sf)
 {
-	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+	struct blkcg *blkcg = css_to_blkcg(css);
 
 	blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size,
 			  &blkcg_policy_cfq, 0, false);
diff --git a/fs/bio.c b/fs/bio.c
index c5eae7251490..b3b20ed9510e 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1956,7 +1956,7 @@ int bio_associate_current(struct bio *bio)
 
 	/* associate blkcg if exists */
 	rcu_read_lock();
-	css = task_subsys_state(current, blkio_subsys_id);
+	css = task_css(current, blkio_subsys_id);
 	if (css && css_tryget(css))
 		bio->bi_css = css;
 	rcu_read_unlock();
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index e9ac882868c0..3561d305b1e0 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -66,22 +66,25 @@ enum cgroup_subsys_id {
66 66
67/* Per-subsystem/per-cgroup state maintained by the system. */ 67/* Per-subsystem/per-cgroup state maintained by the system. */
68struct cgroup_subsys_state { 68struct cgroup_subsys_state {
69 /* 69 /* the cgroup that this css is attached to */
70 * The cgroup that this subsystem is attached to. Useful
71 * for subsystems that want to know about the cgroup
72 * hierarchy structure
73 */
74 struct cgroup *cgroup; 70 struct cgroup *cgroup;
75 71
72 /* the cgroup subsystem that this css is attached to */
73 struct cgroup_subsys *ss;
74
76 /* reference count - access via css_[try]get() and css_put() */ 75 /* reference count - access via css_[try]get() and css_put() */
77 struct percpu_ref refcnt; 76 struct percpu_ref refcnt;
78 77
78 /* the parent css */
79 struct cgroup_subsys_state *parent;
80
79 unsigned long flags; 81 unsigned long flags;
80 /* ID for this css, if possible */ 82 /* ID for this css, if possible */
81 struct css_id __rcu *id; 83 struct css_id __rcu *id;
82 84
83 /* Used to put @cgroup->dentry on the last css_put() */ 85 /* percpu_ref killing and RCU release */
84 struct work_struct dput_work; 86 struct rcu_head rcu_head;
87 struct work_struct destroy_work;
85}; 88};
86 89
87/* bits in struct cgroup_subsys_state flags field */ 90/* bits in struct cgroup_subsys_state flags field */
@@ -161,7 +164,16 @@ struct cgroup_name {
161struct cgroup { 164struct cgroup {
162 unsigned long flags; /* "unsigned long" so bitops work */ 165 unsigned long flags; /* "unsigned long" so bitops work */
163 166
164 int id; /* ida allocated in-hierarchy ID */ 167 /*
168 * idr allocated in-hierarchy ID.
169 *
170 * The ID of the root cgroup is always 0, and a new cgroup
171 * will be assigned with a smallest available ID.
172 */
173 int id;
174
175 /* the number of attached css's */
176 int nr_css;
165 177
166 /* 178 /*
167 * We link our 'sibling' struct into our parent's 'children'. 179 * We link our 'sibling' struct into our parent's 'children'.
@@ -196,7 +208,7 @@ struct cgroup {
196 struct cgroup_name __rcu *name; 208 struct cgroup_name __rcu *name;
197 209
198 /* Private pointers for each registered subsystem */ 210 /* Private pointers for each registered subsystem */
199 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; 211 struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT];
200 212
201 struct cgroupfs_root *root; 213 struct cgroupfs_root *root;
202 214
@@ -220,10 +232,12 @@ struct cgroup {
220 struct list_head pidlists; 232 struct list_head pidlists;
221 struct mutex pidlist_mutex; 233 struct mutex pidlist_mutex;
222 234
235 /* dummy css with NULL ->ss, points back to this cgroup */
236 struct cgroup_subsys_state dummy_css;
237
223 /* For css percpu_ref killing and RCU-protected deletion */ 238 /* For css percpu_ref killing and RCU-protected deletion */
224 struct rcu_head rcu_head; 239 struct rcu_head rcu_head;
225 struct work_struct destroy_work; 240 struct work_struct destroy_work;
226 atomic_t css_kill_cnt;
227 241
228 /* List of events which userspace want to receive */ 242 /* List of events which userspace want to receive */
229 struct list_head event_list; 243 struct list_head event_list;
@@ -322,7 +336,7 @@ struct cgroupfs_root {
322 unsigned long flags; 336 unsigned long flags;
323 337
324 /* IDs for cgroups in this hierarchy */ 338 /* IDs for cgroups in this hierarchy */
325 struct ida cgroup_ida; 339 struct idr cgroup_idr;
326 340
327 /* The path to use for release notifications. */ 341 /* The path to use for release notifications. */
328 char release_agent_path[PATH_MAX]; 342 char release_agent_path[PATH_MAX];
@@ -394,9 +408,10 @@ struct cgroup_map_cb {
394 408
395/* cftype->flags */ 409/* cftype->flags */
396enum { 410enum {
397 CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cg */ 411 CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */
398 CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cg */ 412 CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */
399 CFTYPE_INSANE = (1 << 2), /* don't create if sane_behavior */ 413 CFTYPE_INSANE = (1 << 2), /* don't create if sane_behavior */
414 CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */
400}; 415};
401 416
402#define MAX_CFTYPE_NAME 64 417#define MAX_CFTYPE_NAME 64
@@ -424,35 +439,41 @@ struct cftype {
424 /* CFTYPE_* flags */ 439 /* CFTYPE_* flags */
425 unsigned int flags; 440 unsigned int flags;
426 441
442 /*
443 * The subsys this file belongs to. Initialized automatically
444 * during registration. NULL for cgroup core files.
445 */
446 struct cgroup_subsys *ss;
447
427 int (*open)(struct inode *inode, struct file *file); 448 int (*open)(struct inode *inode, struct file *file);
428 ssize_t (*read)(struct cgroup *cgrp, struct cftype *cft, 449 ssize_t (*read)(struct cgroup_subsys_state *css, struct cftype *cft,
429 struct file *file, 450 struct file *file,
430 char __user *buf, size_t nbytes, loff_t *ppos); 451 char __user *buf, size_t nbytes, loff_t *ppos);
431 /* 452 /*
432 * read_u64() is a shortcut for the common case of returning a 453 * read_u64() is a shortcut for the common case of returning a
433 * single integer. Use it in place of read() 454 * single integer. Use it in place of read()
434 */ 455 */
435 u64 (*read_u64)(struct cgroup *cgrp, struct cftype *cft); 456 u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft);
436 /* 457 /*
437 * read_s64() is a signed version of read_u64() 458 * read_s64() is a signed version of read_u64()
438 */ 459 */
439 s64 (*read_s64)(struct cgroup *cgrp, struct cftype *cft); 460 s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft);
440 /* 461 /*
441 * read_map() is used for defining a map of key/value 462 * read_map() is used for defining a map of key/value
442 * pairs. It should call cb->fill(cb, key, value) for each 463 * pairs. It should call cb->fill(cb, key, value) for each
443 * entry. The key/value pairs (and their ordering) should not 464 * entry. The key/value pairs (and their ordering) should not
444 * change between reboots. 465 * change between reboots.
445 */ 466 */
446 int (*read_map)(struct cgroup *cgrp, struct cftype *cft, 467 int (*read_map)(struct cgroup_subsys_state *css, struct cftype *cft,
447 struct cgroup_map_cb *cb); 468 struct cgroup_map_cb *cb);
448 /* 469 /*
449 * read_seq_string() is used for outputting a simple sequence 470 * read_seq_string() is used for outputting a simple sequence
450 * using seqfile. 471 * using seqfile.
451 */ 472 */
452 int (*read_seq_string)(struct cgroup *cgrp, struct cftype *cft, 473 int (*read_seq_string)(struct cgroup_subsys_state *css,
453 struct seq_file *m); 474 struct cftype *cft, struct seq_file *m);
454 475
455 ssize_t (*write)(struct cgroup *cgrp, struct cftype *cft, 476 ssize_t (*write)(struct cgroup_subsys_state *css, struct cftype *cft,
456 struct file *file, 477 struct file *file,
457 const char __user *buf, size_t nbytes, loff_t *ppos); 478 const char __user *buf, size_t nbytes, loff_t *ppos);
458 479
@@ -461,18 +482,20 @@ struct cftype {
461 * a single integer (as parsed by simple_strtoull) from 482 * a single integer (as parsed by simple_strtoull) from
462 * userspace. Use in place of write(); return 0 or error. 483 * userspace. Use in place of write(); return 0 or error.
463 */ 484 */
464 int (*write_u64)(struct cgroup *cgrp, struct cftype *cft, u64 val); 485 int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft,
486 u64 val);
465 /* 487 /*
466 * write_s64() is a signed version of write_u64() 488 * write_s64() is a signed version of write_u64()
467 */ 489 */
468 int (*write_s64)(struct cgroup *cgrp, struct cftype *cft, s64 val); 490 int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft,
491 s64 val);
469 492
470 /* 493 /*
471 * write_string() is passed a nul-terminated kernelspace 494 * write_string() is passed a nul-terminated kernelspace
472 * buffer of maximum length determined by max_write_len. 495 * buffer of maximum length determined by max_write_len.
473 * Returns 0 or -ve error code. 496 * Returns 0 or -ve error code.
474 */ 497 */
475 int (*write_string)(struct cgroup *cgrp, struct cftype *cft, 498 int (*write_string)(struct cgroup_subsys_state *css, struct cftype *cft,
476 const char *buffer); 499 const char *buffer);
477 /* 500 /*
478 * trigger() callback can be used to get some kick from the 501 * trigger() callback can be used to get some kick from the
@@ -480,7 +503,7 @@ struct cftype {
480 * at all. The private field can be used to determine the 503 * at all. The private field can be used to determine the
481 * kick type for multiplexing. 504 * kick type for multiplexing.
482 */ 505 */
483 int (*trigger)(struct cgroup *cgrp, unsigned int event); 506 int (*trigger)(struct cgroup_subsys_state *css, unsigned int event);
484 507
485 int (*release)(struct inode *inode, struct file *file); 508 int (*release)(struct inode *inode, struct file *file);
486 509
@@ -490,16 +513,18 @@ struct cftype {
490 * you want to provide this functionality. Use eventfd_signal() 513 * you want to provide this functionality. Use eventfd_signal()
491 * on eventfd to send notification to userspace. 514 * on eventfd to send notification to userspace.
492 */ 515 */
493 int (*register_event)(struct cgroup *cgrp, struct cftype *cft, 516 int (*register_event)(struct cgroup_subsys_state *css,
494 struct eventfd_ctx *eventfd, const char *args); 517 struct cftype *cft, struct eventfd_ctx *eventfd,
518 const char *args);
495 /* 519 /*
496 * unregister_event() callback will be called when userspace 520 * unregister_event() callback will be called when userspace
497 * closes the eventfd or on cgroup removing. 521 * closes the eventfd or on cgroup removing.
498 * This callback must be implemented, if you want provide 522 * This callback must be implemented, if you want provide
499 * notification functionality. 523 * notification functionality.
500 */ 524 */
501 void (*unregister_event)(struct cgroup *cgrp, struct cftype *cft, 525 void (*unregister_event)(struct cgroup_subsys_state *css,
502 struct eventfd_ctx *eventfd); 526 struct cftype *cft,
527 struct eventfd_ctx *eventfd);
503}; 528};
504 529
505/* 530/*
@@ -512,15 +537,6 @@ struct cftype_set {
512 struct cftype *cfts; 537 struct cftype *cfts;
513}; 538};
514 539
515struct cgroup_scanner {
516 struct cgroup *cg;
517 int (*test_task)(struct task_struct *p, struct cgroup_scanner *scan);
518 void (*process_task)(struct task_struct *p,
519 struct cgroup_scanner *scan);
520 struct ptr_heap *heap;
521 void *data;
522};
523
524/* 540/*
525 * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details. This 541 * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details. This
526 * function can be called as long as @cgrp is accessible. 542 * function can be called as long as @cgrp is accessible.
@@ -537,7 +553,7 @@ static inline const char *cgroup_name(const struct cgroup *cgrp)
537} 553}
538 554
539int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); 555int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
540int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); 556int cgroup_rm_cftypes(struct cftype *cfts);
541 557
542bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); 558bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor);
543 559
@@ -553,20 +569,22 @@ int cgroup_task_count(const struct cgroup *cgrp);
553struct cgroup_taskset; 569struct cgroup_taskset;
554struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset); 570struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset);
555struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset); 571struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset);
556struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset); 572struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset,
573 int subsys_id);
557int cgroup_taskset_size(struct cgroup_taskset *tset); 574int cgroup_taskset_size(struct cgroup_taskset *tset);
558 575
559/** 576/**
560 * cgroup_taskset_for_each - iterate cgroup_taskset 577 * cgroup_taskset_for_each - iterate cgroup_taskset
561 * @task: the loop cursor 578 * @task: the loop cursor
562 * @skip_cgrp: skip if task's cgroup matches this, %NULL to iterate through all 579 * @skip_css: skip if task's css matches this, %NULL to iterate through all
563 * @tset: taskset to iterate 580 * @tset: taskset to iterate
564 */ 581 */
565#define cgroup_taskset_for_each(task, skip_cgrp, tset) \ 582#define cgroup_taskset_for_each(task, skip_css, tset) \
566 for ((task) = cgroup_taskset_first((tset)); (task); \ 583 for ((task) = cgroup_taskset_first((tset)); (task); \
567 (task) = cgroup_taskset_next((tset))) \ 584 (task) = cgroup_taskset_next((tset))) \
568 if (!(skip_cgrp) || \ 585 if (!(skip_css) || \
569 cgroup_taskset_cur_cgroup((tset)) != (skip_cgrp)) 586 cgroup_taskset_cur_css((tset), \
587 (skip_css)->ss->subsys_id) != (skip_css))
570 588
571/* 589/*
572 * Control Group subsystem type. 590 * Control Group subsystem type.
@@ -574,18 +592,22 @@ int cgroup_taskset_size(struct cgroup_taskset *tset);
574 */ 592 */
575 593
576struct cgroup_subsys { 594struct cgroup_subsys {
577 struct cgroup_subsys_state *(*css_alloc)(struct cgroup *cgrp); 595 struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css);
578 int (*css_online)(struct cgroup *cgrp); 596 int (*css_online)(struct cgroup_subsys_state *css);
579 void (*css_offline)(struct cgroup *cgrp); 597 void (*css_offline)(struct cgroup_subsys_state *css);
580 void (*css_free)(struct cgroup *cgrp); 598 void (*css_free)(struct cgroup_subsys_state *css);
581 599
582 int (*can_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); 600 int (*can_attach)(struct cgroup_subsys_state *css,
583 void (*cancel_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); 601 struct cgroup_taskset *tset);
584 void (*attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); 602 void (*cancel_attach)(struct cgroup_subsys_state *css,
603 struct cgroup_taskset *tset);
604 void (*attach)(struct cgroup_subsys_state *css,
605 struct cgroup_taskset *tset);
585 void (*fork)(struct task_struct *task); 606 void (*fork)(struct task_struct *task);
586 void (*exit)(struct cgroup *cgrp, struct cgroup *old_cgrp, 607 void (*exit)(struct cgroup_subsys_state *css,
608 struct cgroup_subsys_state *old_css,
587 struct task_struct *task); 609 struct task_struct *task);
588 void (*bind)(struct cgroup *root); 610 void (*bind)(struct cgroup_subsys_state *root_css);
589 611
590 int subsys_id; 612 int subsys_id;
591 int disabled; 613 int disabled;
@@ -641,10 +663,17 @@ struct cgroup_subsys {
641#undef IS_SUBSYS_ENABLED 663#undef IS_SUBSYS_ENABLED
642#undef SUBSYS 664#undef SUBSYS
643 665
644static inline struct cgroup_subsys_state *cgroup_subsys_state( 666/**
645 struct cgroup *cgrp, int subsys_id) 667 * css_parent - find the parent css
668 * @css: the target cgroup_subsys_state
669 *
670 * Return the parent css of @css. This function is guaranteed to return
671 * non-NULL parent as long as @css isn't the root.
672 */
673static inline
674struct cgroup_subsys_state *css_parent(struct cgroup_subsys_state *css)
646{ 675{
647 return cgrp->subsys[subsys_id]; 676 return css->parent;
648} 677}
649 678
650/** 679/**
@@ -672,7 +701,7 @@ extern struct mutex cgroup_mutex;
672#endif 701#endif
673 702
674/** 703/**
675 * task_subsys_state_check - obtain css for (task, subsys) w/ extra access conds 704 * task_css_check - obtain css for (task, subsys) w/ extra access conds
676 * @task: the target task 705 * @task: the target task
677 * @subsys_id: the target subsystem ID 706 * @subsys_id: the target subsystem ID
678 * @__c: extra condition expression to be passed to rcu_dereference_check() 707 * @__c: extra condition expression to be passed to rcu_dereference_check()
@@ -680,7 +709,7 @@ extern struct mutex cgroup_mutex;
680 * Return the cgroup_subsys_state for the (@task, @subsys_id) pair. The 709 * Return the cgroup_subsys_state for the (@task, @subsys_id) pair. The
681 * synchronization rules are the same as task_css_set_check(). 710 * synchronization rules are the same as task_css_set_check().
682 */ 711 */
683#define task_subsys_state_check(task, subsys_id, __c) \ 712#define task_css_check(task, subsys_id, __c) \
684 task_css_set_check((task), (__c))->subsys[(subsys_id)] 713 task_css_set_check((task), (__c))->subsys[(subsys_id)]
685 714
686/** 715/**
@@ -695,87 +724,92 @@ static inline struct css_set *task_css_set(struct task_struct *task)
695} 724}
696 725
697/** 726/**
698 * task_subsys_state - obtain css for (task, subsys) 727 * task_css - obtain css for (task, subsys)
699 * @task: the target task 728 * @task: the target task
700 * @subsys_id: the target subsystem ID 729 * @subsys_id: the target subsystem ID
701 * 730 *
702 * See task_subsys_state_check(). 731 * See task_css_check().
703 */ 732 */
704static inline struct cgroup_subsys_state * 733static inline struct cgroup_subsys_state *task_css(struct task_struct *task,
705task_subsys_state(struct task_struct *task, int subsys_id) 734 int subsys_id)
706{ 735{
707 return task_subsys_state_check(task, subsys_id, false); 736 return task_css_check(task, subsys_id, false);
708} 737}
709 738
710static inline struct cgroup* task_cgroup(struct task_struct *task, 739static inline struct cgroup *task_cgroup(struct task_struct *task,
711 int subsys_id) 740 int subsys_id)
712{ 741{
713 return task_subsys_state(task, subsys_id)->cgroup; 742 return task_css(task, subsys_id)->cgroup;
714} 743}
715 744
716struct cgroup *cgroup_next_sibling(struct cgroup *pos); 745struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
746 struct cgroup_subsys_state *parent);
747
748struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss);
717 749
718/** 750/**
719 * cgroup_for_each_child - iterate through children of a cgroup 751 * css_for_each_child - iterate through children of a css
720 * @pos: the cgroup * to use as the loop cursor 752 * @pos: the css * to use as the loop cursor
721 * @cgrp: cgroup whose children to walk 753 * @parent: css whose children to walk
722 * 754 *
723 * Walk @cgrp's children. Must be called under rcu_read_lock(). A child 755 * Walk @parent's children. Must be called under rcu_read_lock(). A child
724 * cgroup which hasn't finished ->css_online() or already has finished 756 * css which hasn't finished ->css_online() or already has finished
725 * ->css_offline() may show up during traversal and it's each subsystem's 757 * ->css_offline() may show up during traversal and it's each subsystem's
726 * responsibility to verify that each @pos is alive. 758 * responsibility to verify that each @pos is alive.
727 * 759 *
728 * If a subsystem synchronizes against the parent in its ->css_online() and 760 * If a subsystem synchronizes against the parent in its ->css_online() and
729 * before starting iterating, a cgroup which finished ->css_online() is 761 * before starting iterating, a css which finished ->css_online() is
730 * guaranteed to be visible in the future iterations. 762 * guaranteed to be visible in the future iterations.
731 * 763 *
732 * It is allowed to temporarily drop RCU read lock during iteration. The 764 * It is allowed to temporarily drop RCU read lock during iteration. The
733 * caller is responsible for ensuring that @pos remains accessible until 765 * caller is responsible for ensuring that @pos remains accessible until
734 * the start of the next iteration by, for example, bumping the css refcnt. 766 * the start of the next iteration by, for example, bumping the css refcnt.
735 */ 767 */
736#define cgroup_for_each_child(pos, cgrp) \ 768#define css_for_each_child(pos, parent) \
737 for ((pos) = list_first_or_null_rcu(&(cgrp)->children, \ 769 for ((pos) = css_next_child(NULL, (parent)); (pos); \
738 struct cgroup, sibling); \ 770 (pos) = css_next_child((pos), (parent)))
739 (pos); (pos) = cgroup_next_sibling((pos))) 771
772struct cgroup_subsys_state *
773css_next_descendant_pre(struct cgroup_subsys_state *pos,
774 struct cgroup_subsys_state *css);
740 775
741struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, 776struct cgroup_subsys_state *
742 struct cgroup *cgroup); 777css_rightmost_descendant(struct cgroup_subsys_state *pos);
743struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos);
744 778
745/** 779/**
746 * cgroup_for_each_descendant_pre - pre-order walk of a cgroup's descendants 780 * css_for_each_descendant_pre - pre-order walk of a css's descendants
747 * @pos: the cgroup * to use as the loop cursor 781 * @pos: the css * to use as the loop cursor
748 * @cgroup: cgroup whose descendants to walk 782 * @root: css whose descendants to walk
749 * 783 *
750 * Walk @cgroup's descendants. Must be called under rcu_read_lock(). A 784 * Walk @root's descendants. @root is included in the iteration and the
751 * descendant cgroup which hasn't finished ->css_online() or already has 785 * first node to be visited. Must be called under rcu_read_lock(). A
786 * descendant css which hasn't finished ->css_online() or already has
752 * finished ->css_offline() may show up during traversal and it's each 787 * finished ->css_offline() may show up during traversal and it's each
753 * subsystem's responsibility to verify that each @pos is alive. 788 * subsystem's responsibility to verify that each @pos is alive.
754 * 789 *
755 * If a subsystem synchronizes against the parent in its ->css_online() and 790 * If a subsystem synchronizes against the parent in its ->css_online() and
756 * before starting iterating, and synchronizes against @pos on each 791 * before starting iterating, and synchronizes against @pos on each
757 * iteration, any descendant cgroup which finished ->css_online() is 792 * iteration, any descendant css which finished ->css_online() is
758 * guaranteed to be visible in the future iterations. 793 * guaranteed to be visible in the future iterations.
759 * 794 *
760 * In other words, the following guarantees that a descendant can't escape 795 * In other words, the following guarantees that a descendant can't escape
761 * state updates of its ancestors. 796 * state updates of its ancestors.
762 * 797 *
763 * my_online(@cgrp) 798 * my_online(@css)
764 * { 799 * {
765 * Lock @cgrp->parent and @cgrp; 800 * Lock @css's parent and @css;
766 * Inherit state from @cgrp->parent; 801 * Inherit state from the parent;
767 * Unlock both. 802 * Unlock both.
768 * } 803 * }
769 * 804 *
770 * my_update_state(@cgrp) 805 * my_update_state(@css)
771 * { 806 * {
772 * Lock @cgrp; 807 * css_for_each_descendant_pre(@pos, @css) {
773 * Update @cgrp's state;
774 * Unlock @cgrp;
775 *
776 * cgroup_for_each_descendant_pre(@pos, @cgrp) {
777 * Lock @pos; 808 * Lock @pos;
778 * Verify @pos is alive and inherit state from @pos->parent; 809 * if (@pos == @css)
810 * Update @css's state;
811 * else
812 * Verify @pos is alive and inherit state from its parent;
779 * Unlock @pos; 813 * Unlock @pos;
780 * } 814 * }
781 * } 815 * }
@@ -786,8 +820,7 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos);
786 * visible by walking order and, as long as inheriting operations to the 820 * visible by walking order and, as long as inheriting operations to the
787 * same @pos are atomic to each other, multiple updates racing each other 821 * same @pos are atomic to each other, multiple updates racing each other
788 * still result in the correct state. It's guaranateed that at least one 822 * still result in the correct state. It's guaranateed that at least one
789 * inheritance happens for any cgroup after the latest update to its 823 * inheritance happens for any css after the latest update to its parent.
790 * parent.
791 * 824 *
792 * If checking parent's state requires locking the parent, each inheriting 825 * If checking parent's state requires locking the parent, each inheriting
793 * iteration should lock and unlock both @pos->parent and @pos. 826 * iteration should lock and unlock both @pos->parent and @pos.
@@ -800,52 +833,45 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos);
800 * caller is responsible for ensuring that @pos remains accessible until 833 * caller is responsible for ensuring that @pos remains accessible until
801 * the start of the next iteration by, for example, bumping the css refcnt. 834 * the start of the next iteration by, for example, bumping the css refcnt.
802 */ 835 */
803#define cgroup_for_each_descendant_pre(pos, cgroup) \ 836#define css_for_each_descendant_pre(pos, css) \
804 for (pos = cgroup_next_descendant_pre(NULL, (cgroup)); (pos); \ 837 for ((pos) = css_next_descendant_pre(NULL, (css)); (pos); \
805 pos = cgroup_next_descendant_pre((pos), (cgroup))) 838 (pos) = css_next_descendant_pre((pos), (css)))
806 839
807struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, 840struct cgroup_subsys_state *
808 struct cgroup *cgroup); 841css_next_descendant_post(struct cgroup_subsys_state *pos,
842 struct cgroup_subsys_state *css);
809 843
810/** 844/**
811 * cgroup_for_each_descendant_post - post-order walk of a cgroup's descendants 845 * css_for_each_descendant_post - post-order walk of a css's descendants
812 * @pos: the cgroup * to use as the loop cursor 846 * @pos: the css * to use as the loop cursor
813 * @cgroup: cgroup whose descendants to walk 847 * @css: css whose descendants to walk
814 * 848 *
815 * Similar to cgroup_for_each_descendant_pre() but performs post-order 849 * Similar to css_for_each_descendant_pre() but performs post-order
816 * traversal instead. Note that the walk visibility guarantee described in 850 * traversal instead. @root is included in the iteration and the last
817 * pre-order walk doesn't apply the same to post-order walks. 851 * node to be visited. Note that the walk visibility guarantee described
852 * in pre-order walk doesn't apply the same to post-order walks.
818 */ 853 */
819#define cgroup_for_each_descendant_post(pos, cgroup) \ 854#define css_for_each_descendant_post(pos, css) \
820 for (pos = cgroup_next_descendant_post(NULL, (cgroup)); (pos); \ 855 for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \
821 pos = cgroup_next_descendant_post((pos), (cgroup))) 856 (pos) = css_next_descendant_post((pos), (css)))
822 857
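A complementary bottom-up pass with the post-order variant, which visits every child before its parent and @css itself last; again only a sketch with illustrative names:

static unsigned long my_total(struct cgroup_subsys_state *css)
{
	struct cgroup_subsys_state *pos;
	unsigned long sum = 0;

	rcu_read_lock();
	css_for_each_descendant_post(pos, css)
		sum += my_state(pos)->nr_items;	/* hypothetical per-css counter */
	rcu_read_unlock();
	return sum;
}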
823/* A cgroup_iter should be treated as an opaque object */ 858/* A css_task_iter should be treated as an opaque object */
824struct cgroup_iter { 859struct css_task_iter {
825 struct list_head *cset_link; 860 struct cgroup_subsys_state *origin_css;
826 struct list_head *task; 861 struct list_head *cset_link;
862 struct list_head *task;
827}; 863};
828 864
829/* 865void css_task_iter_start(struct cgroup_subsys_state *css,
830 * To iterate across the tasks in a cgroup: 866 struct css_task_iter *it);
831 * 867struct task_struct *css_task_iter_next(struct css_task_iter *it);
832 * 1) call cgroup_iter_start to initialize an iterator 868void css_task_iter_end(struct css_task_iter *it);
833 * 869
834 * 2) call cgroup_iter_next() to retrieve member tasks until it 870int css_scan_tasks(struct cgroup_subsys_state *css,
835 * returns NULL or until you want to end the iteration 871 bool (*test)(struct task_struct *, void *),
836 * 872 void (*process)(struct task_struct *, void *),
837 * 3) call cgroup_iter_end() to destroy the iterator. 873 void *data, struct ptr_heap *heap);
838 * 874
839 * Or, call cgroup_scan_tasks() to iterate through every task in a
840 * cgroup - cgroup_scan_tasks() holds the css_set_lock when calling
841 * the test_task() callback, but not while calling the process_task()
842 * callback.
843 */
844void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it);
845struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
846 struct cgroup_iter *it);
847void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it);
848int cgroup_scan_tasks(struct cgroup_scanner *scan);
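The replacement protocol has the same shape as the old one: start, call next until it returns NULL, then end. A minimal sketch of walking a css's member tasks with the new API (only the css_task_iter_* calls are from this patch):

static int my_count_tasks(struct cgroup_subsys_state *css)
{
	struct css_task_iter it;
	struct task_struct *task;
	int n = 0;

	css_task_iter_start(css, &it);
	while ((task = css_task_iter_next(&it)))
		n++;
	css_task_iter_end(&it);
	return n;
}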
849int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); 875int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);
850int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); 876int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);
851 877
@@ -878,7 +904,8 @@ bool css_is_ancestor(struct cgroup_subsys_state *cg,
878 904
879/* Get id and depth of css */ 905/* Get id and depth of css */
880unsigned short css_id(struct cgroup_subsys_state *css); 906unsigned short css_id(struct cgroup_subsys_state *css);
881struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id); 907struct cgroup_subsys_state *css_from_dir(struct dentry *dentry,
908 struct cgroup_subsys *ss);
882 909
883#else /* !CONFIG_CGROUPS */ 910#else /* !CONFIG_CGROUPS */
884 911
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 7b4d9d79570b..6c416092e324 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -85,7 +85,7 @@ extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
85extern struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm); 85extern struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm);
86 86
87extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg); 87extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
88extern struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont); 88extern struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css);
89 89
90static inline 90static inline
91bool mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *memcg) 91bool mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *memcg)
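Since mem_cgroup embeds its css, the converted accessor presumably reduces to a container_of(); a hedged sketch of the shape (the actual body is outside this hunk):

static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css)
{
	/* assumes struct mem_cgroup embeds its css as ->css */
	return css ? container_of(css, struct mem_cgroup, css) : NULL;
}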
diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h
index 7dc17e2456de..3f3788d49362 100644
--- a/include/linux/vmpressure.h
+++ b/include/linux/vmpressure.h
@@ -34,10 +34,12 @@ extern void vmpressure_cleanup(struct vmpressure *vmpr);
34extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg); 34extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg);
35extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr); 35extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr);
36extern struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css); 36extern struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css);
37extern int vmpressure_register_event(struct cgroup *cg, struct cftype *cft, 37extern int vmpressure_register_event(struct cgroup_subsys_state *css,
38 struct cftype *cft,
38 struct eventfd_ctx *eventfd, 39 struct eventfd_ctx *eventfd,
39 const char *args); 40 const char *args);
40extern void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft, 41extern void vmpressure_unregister_event(struct cgroup_subsys_state *css,
42 struct cftype *cft,
41 struct eventfd_ctx *eventfd); 43 struct eventfd_ctx *eventfd);
42#else 44#else
43static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, 45static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h
index 0fee0617fb7d..52adaa75dac9 100644
--- a/include/net/cls_cgroup.h
+++ b/include/net/cls_cgroup.h
@@ -35,7 +35,7 @@ static inline u32 task_cls_classid(struct task_struct *p)
35 return 0; 35 return 0;
36 36
37 rcu_read_lock(); 37 rcu_read_lock();
38 classid = container_of(task_subsys_state(p, net_cls_subsys_id), 38 classid = container_of(task_css(p, net_cls_subsys_id),
39 struct cgroup_cls_state, css)->classid; 39 struct cgroup_cls_state, css)->classid;
40 rcu_read_unlock(); 40 rcu_read_unlock();
41 41
@@ -51,7 +51,7 @@ static inline u32 task_cls_classid(struct task_struct *p)
51 return 0; 51 return 0;
52 52
53 rcu_read_lock(); 53 rcu_read_lock();
54 css = task_subsys_state(p, net_cls_subsys_id); 54 css = task_css(p, net_cls_subsys_id);
55 if (css) 55 if (css)
56 classid = container_of(css, 56 classid = container_of(css,
57 struct cgroup_cls_state, css)->classid; 57 struct cgroup_cls_state, css)->classid;
diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h
index 50ab8c26ab59..a24f8bb3ca47 100644
--- a/include/net/netprio_cgroup.h
+++ b/include/net/netprio_cgroup.h
@@ -25,10 +25,6 @@ struct netprio_map {
25 u32 priomap[]; 25 u32 priomap[];
26}; 26};
27 27
28struct cgroup_netprio_state {
29 struct cgroup_subsys_state css;
30};
31
32extern void sock_update_netprioidx(struct sock *sk); 28extern void sock_update_netprioidx(struct sock *sk);
33 29
34#if IS_BUILTIN(CONFIG_NETPRIO_CGROUP) 30#if IS_BUILTIN(CONFIG_NETPRIO_CGROUP)
@@ -39,7 +35,7 @@ static inline u32 task_netprioidx(struct task_struct *p)
39 u32 idx; 35 u32 idx;
40 36
41 rcu_read_lock(); 37 rcu_read_lock();
42 css = task_subsys_state(p, net_prio_subsys_id); 38 css = task_css(p, net_prio_subsys_id);
43 idx = css->cgroup->id; 39 idx = css->cgroup->id;
44 rcu_read_unlock(); 40 rcu_read_unlock();
45 return idx; 41 return idx;
@@ -53,7 +49,7 @@ static inline u32 task_netprioidx(struct task_struct *p)
53 u32 idx = 0; 49 u32 idx = 0;
54 50
55 rcu_read_lock(); 51 rcu_read_lock();
56 css = task_subsys_state(p, net_prio_subsys_id); 52 css = task_css(p, net_prio_subsys_id);
57 if (css) 53 if (css)
58 idx = css->cgroup->id; 54 idx = css->cgroup->id;
59 rcu_read_unlock(); 55 rcu_read_unlock();
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e91963302c0d..e0aeb32415ff 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -81,7 +81,7 @@
81 */ 81 */
82#ifdef CONFIG_PROVE_RCU 82#ifdef CONFIG_PROVE_RCU
83DEFINE_MUTEX(cgroup_mutex); 83DEFINE_MUTEX(cgroup_mutex);
84EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for task_subsys_state_check() */ 84EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */
85#else 85#else
86static DEFINE_MUTEX(cgroup_mutex); 86static DEFINE_MUTEX(cgroup_mutex);
87#endif 87#endif
@@ -117,6 +117,7 @@ struct cfent {
117 struct list_head node; 117 struct list_head node;
118 struct dentry *dentry; 118 struct dentry *dentry;
119 struct cftype *type; 119 struct cftype *type;
120 struct cgroup_subsys_state *css;
120 121
121 /* file xattrs */ 122 /* file xattrs */
122 struct simple_xattrs xattrs; 123 struct simple_xattrs xattrs;
@@ -159,9 +160,9 @@ struct css_id {
159 */ 160 */
160struct cgroup_event { 161struct cgroup_event {
161 /* 162 /*
162 * Cgroup which the event belongs to. 163 * css which the event belongs to.
163 */ 164 */
164 struct cgroup *cgrp; 165 struct cgroup_subsys_state *css;
165 /* 166 /*
166 * Control file with which the event is associated. 167 * Control file with which the event is associated.
167 */ 168 */
@@ -215,10 +216,33 @@ static u64 cgroup_serial_nr_next = 1;
215 */ 216 */
216static int need_forkexit_callback __read_mostly; 217static int need_forkexit_callback __read_mostly;
217 218
218static void cgroup_offline_fn(struct work_struct *work); 219static struct cftype cgroup_base_files[];
220
221static void cgroup_destroy_css_killed(struct cgroup *cgrp);
219static int cgroup_destroy_locked(struct cgroup *cgrp); 222static int cgroup_destroy_locked(struct cgroup *cgrp);
220static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, 223static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
221 struct cftype cfts[], bool is_add); 224 bool is_add);
225
226/**
227 * cgroup_css - obtain a cgroup's css for the specified subsystem
228 * @cgrp: the cgroup of interest
229 * @ss: the subsystem of interest (%NULL returns the dummy_css)
230 *
231 * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
232 * function must be called either under cgroup_mutex or rcu_read_lock() and
233 * the caller is responsible for pinning the returned css if it wants to
234 * keep accessing it outside the said locks. This function may return
 235 * %NULL if @cgrp doesn't have @ss enabled.
236 */
237static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
238 struct cgroup_subsys *ss)
239{
240 if (ss)
241 return rcu_dereference_check(cgrp->subsys[ss->subsys_id],
242 lockdep_is_held(&cgroup_mutex));
243 else
244 return &cgrp->dummy_css;
245}
222 246
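A typical lockless caller follows the rule in the comment above: look up under RCU and pin the css before dropping the lock. Sketch only; the same pattern appears verbatim in cgroup_file_open() further down:

	rcu_read_lock();
	css = cgroup_css(cgrp, ss);
	if (ss && !css_tryget(css))	/* tryget fails once @css is dying */
		css = NULL;
	rcu_read_unlock();

	if (!css)
		return -ENODEV;
	/* ... safe to use @css here ... */
	if (ss)
		css_put(css);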
223/* convenient tests for these bits */ 247/* convenient tests for these bits */
224static inline bool cgroup_is_dead(const struct cgroup *cgrp) 248static inline bool cgroup_is_dead(const struct cgroup *cgrp)
@@ -365,9 +389,11 @@ static struct cgrp_cset_link init_cgrp_cset_link;
365static int cgroup_init_idr(struct cgroup_subsys *ss, 389static int cgroup_init_idr(struct cgroup_subsys *ss,
366 struct cgroup_subsys_state *css); 390 struct cgroup_subsys_state *css);
367 391
368/* css_set_lock protects the list of css_set objects, and the 392/*
369 * chain of tasks off each css_set. Nests outside task->alloc_lock 393 * css_set_lock protects the list of css_set objects, and the chain of
370 * due to cgroup_iter_start() */ 394 * tasks off each css_set. Nests outside task->alloc_lock due to
395 * css_task_iter_start().
396 */
371static DEFINE_RWLOCK(css_set_lock); 397static DEFINE_RWLOCK(css_set_lock);
372static int css_set_count; 398static int css_set_count;
373 399
@@ -392,10 +418,12 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
392 return key; 418 return key;
393} 419}
394 420
395/* We don't maintain the lists running through each css_set to its 421/*
396 * task until after the first call to cgroup_iter_start(). This 422 * We don't maintain the lists running through each css_set to its task
397 * reduces the fork()/exit() overhead for people who have cgroups 423 * until after the first call to css_task_iter_start(). This reduces the
398 * compiled into their kernel but not actually in use */ 424 * fork()/exit() overhead for people who have cgroups compiled into their
425 * kernel but not actually in use.
426 */
399static int use_task_css_set_links __read_mostly; 427static int use_task_css_set_links __read_mostly;
400 428
401static void __put_css_set(struct css_set *cset, int taskexit) 429static void __put_css_set(struct css_set *cset, int taskexit)
@@ -464,7 +492,7 @@ static inline void put_css_set_taskexit(struct css_set *cset)
464 * @new_cgrp: cgroup that's being entered by the task 492 * @new_cgrp: cgroup that's being entered by the task
465 * @template: desired set of css pointers in css_set (pre-calculated) 493 * @template: desired set of css pointers in css_set (pre-calculated)
466 * 494 *
467 * Returns true if "cg" matches "old_cg" except for the hierarchy 495 * Returns true if "cset" matches "old_cset" except for the hierarchy
468 * which "new_cgrp" belongs to, for which it should match "new_cgrp". 496 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
469 */ 497 */
470static bool compare_css_sets(struct css_set *cset, 498static bool compare_css_sets(struct css_set *cset,
@@ -555,7 +583,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
555 /* Subsystem is in this hierarchy. So we want 583 /* Subsystem is in this hierarchy. So we want
556 * the subsystem state from the new 584 * the subsystem state from the new
557 * cgroup */ 585 * cgroup */
558 template[i] = cgrp->subsys[i]; 586 template[i] = cgroup_css(cgrp, ss);
559 } else { 587 } else {
560 /* Subsystem is not in this hierarchy, so we 588 /* Subsystem is not in this hierarchy, so we
561 * don't want to change the subsystem state */ 589 * don't want to change the subsystem state */
@@ -803,8 +831,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
803 831
804static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); 832static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
805static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 833static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
806static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, 834static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
807 unsigned long subsys_mask);
808static const struct inode_operations cgroup_dir_inode_operations; 835static const struct inode_operations cgroup_dir_inode_operations;
809static const struct file_operations proc_cgroupstats_operations; 836static const struct file_operations proc_cgroupstats_operations;
810 837
@@ -813,8 +840,7 @@ static struct backing_dev_info cgroup_backing_dev_info = {
813 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 840 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
814}; 841};
815 842
816static int alloc_css_id(struct cgroup_subsys *ss, 843static int alloc_css_id(struct cgroup_subsys_state *child_css);
817 struct cgroup *parent, struct cgroup *child);
818 844
819static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) 845static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
820{ 846{
@@ -845,15 +871,8 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
845static void cgroup_free_fn(struct work_struct *work) 871static void cgroup_free_fn(struct work_struct *work)
846{ 872{
847 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); 873 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
848 struct cgroup_subsys *ss;
849 874
850 mutex_lock(&cgroup_mutex); 875 mutex_lock(&cgroup_mutex);
851 /*
852 * Release the subsystem state objects.
853 */
854 for_each_root_subsys(cgrp->root, ss)
855 ss->css_free(cgrp);
856
857 cgrp->root->number_of_cgroups--; 876 cgrp->root->number_of_cgroups--;
858 mutex_unlock(&cgroup_mutex); 877 mutex_unlock(&cgroup_mutex);
859 878
@@ -864,8 +883,6 @@ static void cgroup_free_fn(struct work_struct *work)
864 */ 883 */
865 dput(cgrp->parent->dentry); 884 dput(cgrp->parent->dentry);
866 885
867 ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
868
869 /* 886 /*
870 * Drop the active superblock reference that we took when we 887 * Drop the active superblock reference that we took when we
871 * created the cgroup. This will free cgrp->root, if we are 888 * created the cgroup. This will free cgrp->root, if we are
@@ -956,27 +973,22 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
956} 973}
957 974
958/** 975/**
959 * cgroup_clear_directory - selective removal of base and subsystem files 976 * cgroup_clear_dir - remove subsys files in a cgroup directory
960 * @dir: directory containing the files 977 * @cgrp: target cgroup
961 * @base_files: true if the base files should be removed
962 * @subsys_mask: mask of the subsystem ids whose files should be removed 978 * @subsys_mask: mask of the subsystem ids whose files should be removed
963 */ 979 */
964static void cgroup_clear_directory(struct dentry *dir, bool base_files, 980static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
965 unsigned long subsys_mask)
966{ 981{
967 struct cgroup *cgrp = __d_cgrp(dir);
968 struct cgroup_subsys *ss; 982 struct cgroup_subsys *ss;
983 int i;
969 984
970 for_each_root_subsys(cgrp->root, ss) { 985 for_each_subsys(ss, i) {
971 struct cftype_set *set; 986 struct cftype_set *set;
972 if (!test_bit(ss->subsys_id, &subsys_mask)) 987
988 if (!test_bit(i, &subsys_mask))
973 continue; 989 continue;
974 list_for_each_entry(set, &ss->cftsets, node) 990 list_for_each_entry(set, &ss->cftsets, node)
975 cgroup_addrm_files(cgrp, NULL, set->cfts, false); 991 cgroup_addrm_files(cgrp, set->cfts, false);
976 }
977 if (base_files) {
978 while (!list_empty(&cgrp->files))
979 cgroup_rm_file(cgrp, NULL);
980 } 992 }
981} 993}
982 994
@@ -986,9 +998,6 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
986static void cgroup_d_remove_dir(struct dentry *dentry) 998static void cgroup_d_remove_dir(struct dentry *dentry)
987{ 999{
988 struct dentry *parent; 1000 struct dentry *parent;
989 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
990
991 cgroup_clear_directory(dentry, true, root->subsys_mask);
992 1001
993 parent = dentry->d_parent; 1002 parent = dentry->d_parent;
994 spin_lock(&parent->d_lock); 1003 spin_lock(&parent->d_lock);
@@ -1009,79 +1018,84 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1009{ 1018{
1010 struct cgroup *cgrp = &root->top_cgroup; 1019 struct cgroup *cgrp = &root->top_cgroup;
1011 struct cgroup_subsys *ss; 1020 struct cgroup_subsys *ss;
1012 int i; 1021 unsigned long pinned = 0;
1022 int i, ret;
1013 1023
1014 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 1024 BUG_ON(!mutex_is_locked(&cgroup_mutex));
1015 BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); 1025 BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
1016 1026
1017 /* Check that any added subsystems are currently free */ 1027 /* Check that any added subsystems are currently free */
1018 for_each_subsys(ss, i) { 1028 for_each_subsys(ss, i) {
1019 unsigned long bit = 1UL << i; 1029 if (!(added_mask & (1 << i)))
1020
1021 if (!(bit & added_mask))
1022 continue; 1030 continue;
1023 1031
1032 /* is the subsystem mounted elsewhere? */
1024 if (ss->root != &cgroup_dummy_root) { 1033 if (ss->root != &cgroup_dummy_root) {
1025 /* Subsystem isn't free */ 1034 ret = -EBUSY;
1026 return -EBUSY; 1035 goto out_put;
1036 }
1037
1038 /* pin the module */
1039 if (!try_module_get(ss->module)) {
1040 ret = -ENOENT;
1041 goto out_put;
1027 } 1042 }
1043 pinned |= 1 << i;
1028 } 1044 }
1029 1045
1030 /* Currently we don't handle adding/removing subsystems when 1046 /* subsys could be missing if unloaded between parsing and here */
1031 * any child cgroups exist. This is theoretically supportable 1047 if (added_mask != pinned) {
1032 * but involves complex error handling, so it's being left until 1048 ret = -ENOENT;
1033 * later */ 1049 goto out_put;
1034 if (root->number_of_cgroups > 1) 1050 }
1035 return -EBUSY; 1051
1052 ret = cgroup_populate_dir(cgrp, added_mask);
1053 if (ret)
1054 goto out_put;
1055
1056 /*
1057 * Nothing can fail from this point on. Remove files for the
1058 * removed subsystems and rebind each subsystem.
1059 */
1060 cgroup_clear_dir(cgrp, removed_mask);
1036 1061
1037 /* Process each subsystem */
1038 for_each_subsys(ss, i) { 1062 for_each_subsys(ss, i) {
1039 unsigned long bit = 1UL << i; 1063 unsigned long bit = 1UL << i;
1040 1064
1041 if (bit & added_mask) { 1065 if (bit & added_mask) {
1042 /* We're binding this subsystem to this hierarchy */ 1066 /* We're binding this subsystem to this hierarchy */
1043 BUG_ON(cgrp->subsys[i]); 1067 BUG_ON(cgroup_css(cgrp, ss));
1044 BUG_ON(!cgroup_dummy_top->subsys[i]); 1068 BUG_ON(!cgroup_css(cgroup_dummy_top, ss));
1045 BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top); 1069 BUG_ON(cgroup_css(cgroup_dummy_top, ss)->cgroup != cgroup_dummy_top);
1070
1071 rcu_assign_pointer(cgrp->subsys[i],
1072 cgroup_css(cgroup_dummy_top, ss));
1073 cgroup_css(cgrp, ss)->cgroup = cgrp;
1046 1074
1047 cgrp->subsys[i] = cgroup_dummy_top->subsys[i];
1048 cgrp->subsys[i]->cgroup = cgrp;
1049 list_move(&ss->sibling, &root->subsys_list); 1075 list_move(&ss->sibling, &root->subsys_list);
1050 ss->root = root; 1076 ss->root = root;
1051 if (ss->bind) 1077 if (ss->bind)
1052 ss->bind(cgrp); 1078 ss->bind(cgroup_css(cgrp, ss));
1053 1079
1054 /* refcount was already taken, and we're keeping it */ 1080 /* refcount was already taken, and we're keeping it */
1055 root->subsys_mask |= bit; 1081 root->subsys_mask |= bit;
1056 } else if (bit & removed_mask) { 1082 } else if (bit & removed_mask) {
1057 /* We're removing this subsystem */ 1083 /* We're removing this subsystem */
1058 BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]); 1084 BUG_ON(cgroup_css(cgrp, ss) != cgroup_css(cgroup_dummy_top, ss));
1059 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 1085 BUG_ON(cgroup_css(cgrp, ss)->cgroup != cgrp);
1060 1086
1061 if (ss->bind) 1087 if (ss->bind)
1062 ss->bind(cgroup_dummy_top); 1088 ss->bind(cgroup_css(cgroup_dummy_top, ss));
1063 cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top; 1089
1064 cgrp->subsys[i] = NULL; 1090 cgroup_css(cgroup_dummy_top, ss)->cgroup = cgroup_dummy_top;
1091 RCU_INIT_POINTER(cgrp->subsys[i], NULL);
1092
1065 cgroup_subsys[i]->root = &cgroup_dummy_root; 1093 cgroup_subsys[i]->root = &cgroup_dummy_root;
1066 list_move(&ss->sibling, &cgroup_dummy_root.subsys_list); 1094 list_move(&ss->sibling, &cgroup_dummy_root.subsys_list);
1067 1095
1068 /* subsystem is now free - drop reference on module */ 1096 /* subsystem is now free - drop reference on module */
1069 module_put(ss->module); 1097 module_put(ss->module);
1070 root->subsys_mask &= ~bit; 1098 root->subsys_mask &= ~bit;
1071 } else if (bit & root->subsys_mask) {
1072 /* Subsystem state should already exist */
1073 BUG_ON(!cgrp->subsys[i]);
1074 /*
1075 * a refcount was taken, but we already had one, so
1076 * drop the extra reference.
1077 */
1078 module_put(ss->module);
1079#ifdef CONFIG_MODULE_UNLOAD
1080 BUG_ON(ss->module && !module_refcount(ss->module));
1081#endif
1082 } else {
1083 /* Subsystem state shouldn't exist */
1084 BUG_ON(cgrp->subsys[i]);
1085 } 1099 }
1086 } 1100 }
1087 1101
@@ -1092,6 +1106,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1092 root->flags |= CGRP_ROOT_SUBSYS_BOUND; 1106 root->flags |= CGRP_ROOT_SUBSYS_BOUND;
1093 1107
1094 return 0; 1108 return 0;
1109
1110out_put:
1111 for_each_subsys(ss, i)
1112 if (pinned & (1 << i))
1113 module_put(ss->module);
1114 return ret;
1095} 1115}
1096 1116
1097static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) 1117static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
@@ -1142,7 +1162,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1142 char *token, *o = data; 1162 char *token, *o = data;
1143 bool all_ss = false, one_ss = false; 1163 bool all_ss = false, one_ss = false;
1144 unsigned long mask = (unsigned long)-1; 1164 unsigned long mask = (unsigned long)-1;
1145 bool module_pin_failed = false;
1146 struct cgroup_subsys *ss; 1165 struct cgroup_subsys *ss;
1147 int i; 1166 int i;
1148 1167
@@ -1285,52 +1304,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1285 if (!opts->subsys_mask && !opts->name) 1304 if (!opts->subsys_mask && !opts->name)
1286 return -EINVAL; 1305 return -EINVAL;
1287 1306
1288 /*
1289 * Grab references on all the modules we'll need, so the subsystems
1290 * don't dance around before rebind_subsystems attaches them. This may
1291 * take duplicate reference counts on a subsystem that's already used,
1292 * but rebind_subsystems handles this case.
1293 */
1294 for_each_subsys(ss, i) {
1295 if (!(opts->subsys_mask & (1UL << i)))
1296 continue;
1297 if (!try_module_get(cgroup_subsys[i]->module)) {
1298 module_pin_failed = true;
1299 break;
1300 }
1301 }
1302 if (module_pin_failed) {
1303 /*
1304 * oops, one of the modules was going away. this means that we
1305 * raced with a module_delete call, and to the user this is
1306 * essentially a "subsystem doesn't exist" case.
1307 */
1308 for (i--; i >= 0; i--) {
1309 /* drop refcounts only on the ones we took */
1310 unsigned long bit = 1UL << i;
1311
1312 if (!(bit & opts->subsys_mask))
1313 continue;
1314 module_put(cgroup_subsys[i]->module);
1315 }
1316 return -ENOENT;
1317 }
1318
1319 return 0; 1307 return 0;
1320} 1308}
1321 1309
1322static void drop_parsed_module_refcounts(unsigned long subsys_mask)
1323{
1324 struct cgroup_subsys *ss;
1325 int i;
1326
1327 mutex_lock(&cgroup_mutex);
1328 for_each_subsys(ss, i)
1329 if (subsys_mask & (1UL << i))
1330 module_put(cgroup_subsys[i]->module);
1331 mutex_unlock(&cgroup_mutex);
1332}
1333
1334static int cgroup_remount(struct super_block *sb, int *flags, char *data) 1310static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1335{ 1311{
1336 int ret = 0; 1312 int ret = 0;
@@ -1370,22 +1346,15 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1370 goto out_unlock; 1346 goto out_unlock;
1371 } 1347 }
1372 1348
1373 /* 1349 /* remounting is not allowed for populated hierarchies */
1374 * Clear out the files of subsystems that should be removed, do 1350 if (root->number_of_cgroups > 1) {
1375 * this before rebind_subsystems, since rebind_subsystems may 1351 ret = -EBUSY;
1376 * change this hierarchy's subsys_list.
1377 */
1378 cgroup_clear_directory(cgrp->dentry, false, removed_mask);
1379
1380 ret = rebind_subsystems(root, added_mask, removed_mask);
1381 if (ret) {
1382 /* rebind_subsystems failed, re-populate the removed files */
1383 cgroup_populate_dir(cgrp, false, removed_mask);
1384 goto out_unlock; 1352 goto out_unlock;
1385 } 1353 }
1386 1354
1387 /* re-populate subsystem files */ 1355 ret = rebind_subsystems(root, added_mask, removed_mask);
1388 cgroup_populate_dir(cgrp, false, added_mask); 1356 if (ret)
1357 goto out_unlock;
1389 1358
1390 if (opts.release_agent) 1359 if (opts.release_agent)
1391 strcpy(root->release_agent_path, opts.release_agent); 1360 strcpy(root->release_agent_path, opts.release_agent);
@@ -1395,8 +1364,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1395 mutex_unlock(&cgroup_root_mutex); 1364 mutex_unlock(&cgroup_root_mutex);
1396 mutex_unlock(&cgroup_mutex); 1365 mutex_unlock(&cgroup_mutex);
1397 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1366 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1398 if (ret)
1399 drop_parsed_module_refcounts(opts.subsys_mask);
1400 return ret; 1367 return ret;
1401} 1368}
1402 1369
@@ -1416,6 +1383,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1416 INIT_LIST_HEAD(&cgrp->release_list); 1383 INIT_LIST_HEAD(&cgrp->release_list);
1417 INIT_LIST_HEAD(&cgrp->pidlists); 1384 INIT_LIST_HEAD(&cgrp->pidlists);
1418 mutex_init(&cgrp->pidlist_mutex); 1385 mutex_init(&cgrp->pidlist_mutex);
1386 cgrp->dummy_css.cgroup = cgrp;
1419 INIT_LIST_HEAD(&cgrp->event_list); 1387 INIT_LIST_HEAD(&cgrp->event_list);
1420 spin_lock_init(&cgrp->event_list_lock); 1388 spin_lock_init(&cgrp->event_list_lock);
1421 simple_xattrs_init(&cgrp->xattrs); 1389 simple_xattrs_init(&cgrp->xattrs);
@@ -1431,6 +1399,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
1431 cgrp->root = root; 1399 cgrp->root = root;
1432 RCU_INIT_POINTER(cgrp->name, &root_cgroup_name); 1400 RCU_INIT_POINTER(cgrp->name, &root_cgroup_name);
1433 init_cgroup_housekeeping(cgrp); 1401 init_cgroup_housekeeping(cgrp);
1402 idr_init(&root->cgroup_idr);
1434} 1403}
1435 1404
1436static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) 1405static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
@@ -1503,7 +1472,6 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1503 */ 1472 */
1504 root->subsys_mask = opts->subsys_mask; 1473 root->subsys_mask = opts->subsys_mask;
1505 root->flags = opts->flags; 1474 root->flags = opts->flags;
1506 ida_init(&root->cgroup_ida);
1507 if (opts->release_agent) 1475 if (opts->release_agent)
1508 strcpy(root->release_agent_path, opts->release_agent); 1476 strcpy(root->release_agent_path, opts->release_agent);
1509 if (opts->name) 1477 if (opts->name)
@@ -1519,7 +1487,7 @@ static void cgroup_free_root(struct cgroupfs_root *root)
1519 /* hierarchy ID should already have been released */ 1487 /* hierarchy ID should already have been released */
1520 WARN_ON_ONCE(root->hierarchy_id); 1488 WARN_ON_ONCE(root->hierarchy_id);
1521 1489
1522 ida_destroy(&root->cgroup_ida); 1490 idr_destroy(&root->cgroup_idr);
1523 kfree(root); 1491 kfree(root);
1524 } 1492 }
1525} 1493}
@@ -1584,7 +1552,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1584 int ret = 0; 1552 int ret = 0;
1585 struct super_block *sb; 1553 struct super_block *sb;
1586 struct cgroupfs_root *new_root; 1554 struct cgroupfs_root *new_root;
1555 struct list_head tmp_links;
1587 struct inode *inode; 1556 struct inode *inode;
1557 const struct cred *cred;
1588 1558
1589 /* First find the desired set of subsystems */ 1559 /* First find the desired set of subsystems */
1590 mutex_lock(&cgroup_mutex); 1560 mutex_lock(&cgroup_mutex);
@@ -1600,7 +1570,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1600 new_root = cgroup_root_from_opts(&opts); 1570 new_root = cgroup_root_from_opts(&opts);
1601 if (IS_ERR(new_root)) { 1571 if (IS_ERR(new_root)) {
1602 ret = PTR_ERR(new_root); 1572 ret = PTR_ERR(new_root);
1603 goto drop_modules; 1573 goto out_err;
1604 } 1574 }
1605 opts.new_root = new_root; 1575 opts.new_root = new_root;
1606 1576
@@ -1609,17 +1579,15 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1609 if (IS_ERR(sb)) { 1579 if (IS_ERR(sb)) {
1610 ret = PTR_ERR(sb); 1580 ret = PTR_ERR(sb);
1611 cgroup_free_root(opts.new_root); 1581 cgroup_free_root(opts.new_root);
1612 goto drop_modules; 1582 goto out_err;
1613 } 1583 }
1614 1584
1615 root = sb->s_fs_info; 1585 root = sb->s_fs_info;
1616 BUG_ON(!root); 1586 BUG_ON(!root);
1617 if (root == opts.new_root) { 1587 if (root == opts.new_root) {
1618 /* We used the new root structure, so this is a new hierarchy */ 1588 /* We used the new root structure, so this is a new hierarchy */
1619 struct list_head tmp_links;
1620 struct cgroup *root_cgrp = &root->top_cgroup; 1589 struct cgroup *root_cgrp = &root->top_cgroup;
1621 struct cgroupfs_root *existing_root; 1590 struct cgroupfs_root *existing_root;
1622 const struct cred *cred;
1623 int i; 1591 int i;
1624 struct css_set *cset; 1592 struct css_set *cset;
1625 1593
@@ -1634,6 +1602,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1634 mutex_lock(&cgroup_mutex); 1602 mutex_lock(&cgroup_mutex);
1635 mutex_lock(&cgroup_root_mutex); 1603 mutex_lock(&cgroup_root_mutex);
1636 1604
1605 root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp,
1606 0, 1, GFP_KERNEL);
1607 if (root_cgrp->id < 0)
1608 goto unlock_drop;
1609
1637 /* Check for name clashes with existing mounts */ 1610 /* Check for name clashes with existing mounts */
1638 ret = -EBUSY; 1611 ret = -EBUSY;
1639 if (strlen(root->name)) 1612 if (strlen(root->name))
@@ -1657,26 +1630,37 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1657 if (ret) 1630 if (ret)
1658 goto unlock_drop; 1631 goto unlock_drop;
1659 1632
1633 sb->s_root->d_fsdata = root_cgrp;
1634 root_cgrp->dentry = sb->s_root;
1635
1636 /*
1637 * We're inside get_sb() and will call lookup_one_len() to
1638 * create the root files, which doesn't work if SELinux is
1639 * in use. The following cred dancing somehow works around
1640 * it. See 2ce9738ba ("cgroupfs: use init_cred when
1641 * populating new cgroupfs mount") for more details.
1642 */
1643 cred = override_creds(&init_cred);
1644
1645 ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
1646 if (ret)
1647 goto rm_base_files;
1648
1660 ret = rebind_subsystems(root, root->subsys_mask, 0); 1649 ret = rebind_subsystems(root, root->subsys_mask, 0);
1661 if (ret == -EBUSY) { 1650 if (ret)
1662 free_cgrp_cset_links(&tmp_links); 1651 goto rm_base_files;
1663 goto unlock_drop; 1652
1664 } 1653 revert_creds(cred);
1654
1665 /* 1655 /*
1666 * There must be no failure case after here, since rebinding 1656 * There must be no failure case after here, since rebinding
1667 * takes care of subsystems' refcounts, which are explicitly 1657 * takes care of subsystems' refcounts, which are explicitly
1668 * dropped in the failure exit path. 1658 * dropped in the failure exit path.
1669 */ 1659 */
1670 1660
1671 /* EBUSY should be the only error here */
1672 BUG_ON(ret);
1673
1674 list_add(&root->root_list, &cgroup_roots); 1661 list_add(&root->root_list, &cgroup_roots);
1675 cgroup_root_count++; 1662 cgroup_root_count++;
1676 1663
1677 sb->s_root->d_fsdata = root_cgrp;
1678 root->top_cgroup.dentry = sb->s_root;
1679
1680 /* Link the top cgroup in this hierarchy into all 1664 /* Link the top cgroup in this hierarchy into all
1681 * the css_set objects */ 1665 * the css_set objects */
1682 write_lock(&css_set_lock); 1666 write_lock(&css_set_lock);
@@ -1689,9 +1673,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1689 BUG_ON(!list_empty(&root_cgrp->children)); 1673 BUG_ON(!list_empty(&root_cgrp->children));
1690 BUG_ON(root->number_of_cgroups != 1); 1674 BUG_ON(root->number_of_cgroups != 1);
1691 1675
1692 cred = override_creds(&init_cred);
1693 cgroup_populate_dir(root_cgrp, true, root->subsys_mask);
1694 revert_creds(cred);
1695 mutex_unlock(&cgroup_root_mutex); 1676 mutex_unlock(&cgroup_root_mutex);
1696 mutex_unlock(&cgroup_mutex); 1677 mutex_unlock(&cgroup_mutex);
1697 mutex_unlock(&inode->i_mutex); 1678 mutex_unlock(&inode->i_mutex);
@@ -1711,15 +1692,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1711 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); 1692 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
1712 } 1693 }
1713 } 1694 }
1714
1715 /* no subsys rebinding, so refcounts don't change */
1716 drop_parsed_module_refcounts(opts.subsys_mask);
1717 } 1695 }
1718 1696
1719 kfree(opts.release_agent); 1697 kfree(opts.release_agent);
1720 kfree(opts.name); 1698 kfree(opts.name);
1721 return dget(sb->s_root); 1699 return dget(sb->s_root);
1722 1700
1701 rm_base_files:
1702 free_cgrp_cset_links(&tmp_links);
1703 cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false);
1704 revert_creds(cred);
1723 unlock_drop: 1705 unlock_drop:
1724 cgroup_exit_root_id(root); 1706 cgroup_exit_root_id(root);
1725 mutex_unlock(&cgroup_root_mutex); 1707 mutex_unlock(&cgroup_root_mutex);
@@ -1727,8 +1709,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1727 mutex_unlock(&inode->i_mutex); 1709 mutex_unlock(&inode->i_mutex);
1728 drop_new_super: 1710 drop_new_super:
1729 deactivate_locked_super(sb); 1711 deactivate_locked_super(sb);
1730 drop_modules:
1731 drop_parsed_module_refcounts(opts.subsys_mask);
1732 out_err: 1712 out_err:
1733 kfree(opts.release_agent); 1713 kfree(opts.release_agent);
1734 kfree(opts.name); 1714 kfree(opts.name);
@@ -1746,6 +1726,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1746 BUG_ON(root->number_of_cgroups != 1); 1726 BUG_ON(root->number_of_cgroups != 1);
1747 BUG_ON(!list_empty(&cgrp->children)); 1727 BUG_ON(!list_empty(&cgrp->children));
1748 1728
1729 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1749 mutex_lock(&cgroup_mutex); 1730 mutex_lock(&cgroup_mutex);
1750 mutex_lock(&cgroup_root_mutex); 1731 mutex_lock(&cgroup_root_mutex);
1751 1732
@@ -1778,6 +1759,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1778 1759
1779 mutex_unlock(&cgroup_root_mutex); 1760 mutex_unlock(&cgroup_root_mutex);
1780 mutex_unlock(&cgroup_mutex); 1761 mutex_unlock(&cgroup_mutex);
1762 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1781 1763
1782 simple_xattrs_free(&cgrp->xattrs); 1764 simple_xattrs_free(&cgrp->xattrs);
1783 1765
@@ -1889,7 +1871,7 @@ EXPORT_SYMBOL_GPL(task_cgroup_path);
1889struct task_and_cgroup { 1871struct task_and_cgroup {
1890 struct task_struct *task; 1872 struct task_struct *task;
1891 struct cgroup *cgrp; 1873 struct cgroup *cgrp;
1892 struct css_set *cg; 1874 struct css_set *cset;
1893}; 1875};
1894 1876
1895struct cgroup_taskset { 1877struct cgroup_taskset {
@@ -1939,18 +1921,20 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1939EXPORT_SYMBOL_GPL(cgroup_taskset_next); 1921EXPORT_SYMBOL_GPL(cgroup_taskset_next);
1940 1922
1941/** 1923/**
1942 * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task 1924 * cgroup_taskset_cur_css - return the matching css for the current task
1943 * @tset: taskset of interest 1925 * @tset: taskset of interest
1926 * @subsys_id: the ID of the target subsystem
1944 * 1927 *
1945 * Return the cgroup for the current (last returned) task of @tset. This 1928 * Return the css for the current (last returned) task of @tset for
1946 * function must be preceded by either cgroup_taskset_first() or 1929 * subsystem specified by @subsys_id. This function must be preceded by
1947 * cgroup_taskset_next(). 1930 * either cgroup_taskset_first() or cgroup_taskset_next().
1948 */ 1931 */
1949struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset) 1932struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset,
1933 int subsys_id)
1950{ 1934{
1951 return tset->cur_cgrp; 1935 return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]);
1952} 1936}
1953EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup); 1937EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css);
1954 1938
1955/** 1939/**
1956 * cgroup_taskset_size - return the number of tasks in taskset 1940 * cgroup_taskset_size - return the number of tasks in taskset
@@ -2089,8 +2073,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2089 * step 1: check that we can legitimately attach to the cgroup. 2073 * step 1: check that we can legitimately attach to the cgroup.
2090 */ 2074 */
2091 for_each_root_subsys(root, ss) { 2075 for_each_root_subsys(root, ss) {
2076 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
2077
2092 if (ss->can_attach) { 2078 if (ss->can_attach) {
2093 retval = ss->can_attach(cgrp, &tset); 2079 retval = ss->can_attach(css, &tset);
2094 if (retval) { 2080 if (retval) {
2095 failed_ss = ss; 2081 failed_ss = ss;
2096 goto out_cancel_attach; 2082 goto out_cancel_attach;
@@ -2107,8 +2093,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2107 2093
2108 tc = flex_array_get(group, i); 2094 tc = flex_array_get(group, i);
2109 old_cset = task_css_set(tc->task); 2095 old_cset = task_css_set(tc->task);
2110 tc->cg = find_css_set(old_cset, cgrp); 2096 tc->cset = find_css_set(old_cset, cgrp);
2111 if (!tc->cg) { 2097 if (!tc->cset) {
2112 retval = -ENOMEM; 2098 retval = -ENOMEM;
2113 goto out_put_css_set_refs; 2099 goto out_put_css_set_refs;
2114 } 2100 }
@@ -2121,7 +2107,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2121 */ 2107 */
2122 for (i = 0; i < group_size; i++) { 2108 for (i = 0; i < group_size; i++) {
2123 tc = flex_array_get(group, i); 2109 tc = flex_array_get(group, i);
2124 cgroup_task_migrate(tc->cgrp, tc->task, tc->cg); 2110 cgroup_task_migrate(tc->cgrp, tc->task, tc->cset);
2125 } 2111 }
2126 /* nothing is sensitive to fork() after this point. */ 2112 /* nothing is sensitive to fork() after this point. */
2127 2113
@@ -2129,8 +2115,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2129 * step 4: do subsystem attach callbacks. 2115 * step 4: do subsystem attach callbacks.
2130 */ 2116 */
2131 for_each_root_subsys(root, ss) { 2117 for_each_root_subsys(root, ss) {
2118 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
2119
2132 if (ss->attach) 2120 if (ss->attach)
2133 ss->attach(cgrp, &tset); 2121 ss->attach(css, &tset);
2134 } 2122 }
2135 2123
2136 /* 2124 /*
@@ -2141,18 +2129,20 @@ out_put_css_set_refs:
2141 if (retval) { 2129 if (retval) {
2142 for (i = 0; i < group_size; i++) { 2130 for (i = 0; i < group_size; i++) {
2143 tc = flex_array_get(group, i); 2131 tc = flex_array_get(group, i);
2144 if (!tc->cg) 2132 if (!tc->cset)
2145 break; 2133 break;
2146 put_css_set(tc->cg); 2134 put_css_set(tc->cset);
2147 } 2135 }
2148 } 2136 }
2149out_cancel_attach: 2137out_cancel_attach:
2150 if (retval) { 2138 if (retval) {
2151 for_each_root_subsys(root, ss) { 2139 for_each_root_subsys(root, ss) {
2140 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
2141
2152 if (ss == failed_ss) 2142 if (ss == failed_ss)
2153 break; 2143 break;
2154 if (ss->cancel_attach) 2144 if (ss->cancel_attach)
2155 ss->cancel_attach(cgrp, &tset); 2145 ss->cancel_attach(css, &tset);
2156 } 2146 }
2157 } 2147 }
2158out_free_group_list: 2148out_free_group_list:
@@ -2253,9 +2243,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2253 2243
2254 mutex_lock(&cgroup_mutex); 2244 mutex_lock(&cgroup_mutex);
2255 for_each_active_root(root) { 2245 for_each_active_root(root) {
2256 struct cgroup *from_cg = task_cgroup_from_root(from, root); 2246 struct cgroup *from_cgrp = task_cgroup_from_root(from, root);
2257 2247
2258 retval = cgroup_attach_task(from_cg, tsk, false); 2248 retval = cgroup_attach_task(from_cgrp, tsk, false);
2259 if (retval) 2249 if (retval)
2260 break; 2250 break;
2261 } 2251 }
@@ -2265,34 +2255,38 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2265} 2255}
2266EXPORT_SYMBOL_GPL(cgroup_attach_task_all); 2256EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
2267 2257
2268static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) 2258static int cgroup_tasks_write(struct cgroup_subsys_state *css,
2259 struct cftype *cft, u64 pid)
2269{ 2260{
2270 return attach_task_by_pid(cgrp, pid, false); 2261 return attach_task_by_pid(css->cgroup, pid, false);
2271} 2262}
2272 2263
2273static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) 2264static int cgroup_procs_write(struct cgroup_subsys_state *css,
2265 struct cftype *cft, u64 tgid)
2274{ 2266{
2275 return attach_task_by_pid(cgrp, tgid, true); 2267 return attach_task_by_pid(css->cgroup, tgid, true);
2276} 2268}
2277 2269
2278static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, 2270static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
2279 const char *buffer) 2271 struct cftype *cft, const char *buffer)
2280{ 2272{
2281 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); 2273 BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX);
2282 if (strlen(buffer) >= PATH_MAX) 2274 if (strlen(buffer) >= PATH_MAX)
2283 return -EINVAL; 2275 return -EINVAL;
2284 if (!cgroup_lock_live_group(cgrp)) 2276 if (!cgroup_lock_live_group(css->cgroup))
2285 return -ENODEV; 2277 return -ENODEV;
2286 mutex_lock(&cgroup_root_mutex); 2278 mutex_lock(&cgroup_root_mutex);
2287 strcpy(cgrp->root->release_agent_path, buffer); 2279 strcpy(css->cgroup->root->release_agent_path, buffer);
2288 mutex_unlock(&cgroup_root_mutex); 2280 mutex_unlock(&cgroup_root_mutex);
2289 mutex_unlock(&cgroup_mutex); 2281 mutex_unlock(&cgroup_mutex);
2290 return 0; 2282 return 0;
2291} 2283}
2292 2284
2293static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, 2285static int cgroup_release_agent_show(struct cgroup_subsys_state *css,
2294 struct seq_file *seq) 2286 struct cftype *cft, struct seq_file *seq)
2295{ 2287{
2288 struct cgroup *cgrp = css->cgroup;
2289
2296 if (!cgroup_lock_live_group(cgrp)) 2290 if (!cgroup_lock_live_group(cgrp))
2297 return -ENODEV; 2291 return -ENODEV;
2298 seq_puts(seq, cgrp->root->release_agent_path); 2292 seq_puts(seq, cgrp->root->release_agent_path);
@@ -2301,20 +2295,20 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
2301 return 0; 2295 return 0;
2302} 2296}
2303 2297
2304static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft, 2298static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css,
2305 struct seq_file *seq) 2299 struct cftype *cft, struct seq_file *seq)
2306{ 2300{
2307 seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); 2301 seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup));
2308 return 0; 2302 return 0;
2309} 2303}
2310 2304
2311/* A buffer size big enough for numbers or short strings */ 2305/* A buffer size big enough for numbers or short strings */
2312#define CGROUP_LOCAL_BUFFER_SIZE 64 2306#define CGROUP_LOCAL_BUFFER_SIZE 64
2313 2307
2314static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, 2308static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css,
2315 struct file *file, 2309 struct cftype *cft, struct file *file,
2316 const char __user *userbuf, 2310 const char __user *userbuf, size_t nbytes,
2317 size_t nbytes, loff_t *unused_ppos) 2311 loff_t *unused_ppos)
2318{ 2312{
2319 char buffer[CGROUP_LOCAL_BUFFER_SIZE]; 2313 char buffer[CGROUP_LOCAL_BUFFER_SIZE];
2320 int retval = 0; 2314 int retval = 0;
@@ -2332,22 +2326,22 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
2332 u64 val = simple_strtoull(strstrip(buffer), &end, 0); 2326 u64 val = simple_strtoull(strstrip(buffer), &end, 0);
2333 if (*end) 2327 if (*end)
2334 return -EINVAL; 2328 return -EINVAL;
2335 retval = cft->write_u64(cgrp, cft, val); 2329 retval = cft->write_u64(css, cft, val);
2336 } else { 2330 } else {
2337 s64 val = simple_strtoll(strstrip(buffer), &end, 0); 2331 s64 val = simple_strtoll(strstrip(buffer), &end, 0);
2338 if (*end) 2332 if (*end)
2339 return -EINVAL; 2333 return -EINVAL;
2340 retval = cft->write_s64(cgrp, cft, val); 2334 retval = cft->write_s64(css, cft, val);
2341 } 2335 }
2342 if (!retval) 2336 if (!retval)
2343 retval = nbytes; 2337 retval = nbytes;
2344 return retval; 2338 return retval;
2345} 2339}
2346 2340
2347static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, 2341static ssize_t cgroup_write_string(struct cgroup_subsys_state *css,
2348 struct file *file, 2342 struct cftype *cft, struct file *file,
2349 const char __user *userbuf, 2343 const char __user *userbuf, size_t nbytes,
2350 size_t nbytes, loff_t *unused_ppos) 2344 loff_t *unused_ppos)
2351{ 2345{
2352 char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; 2346 char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
2353 int retval = 0; 2347 int retval = 0;
@@ -2370,7 +2364,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
2370 } 2364 }
2371 2365
2372 buffer[nbytes] = 0; /* nul-terminate */ 2366 buffer[nbytes] = 0; /* nul-terminate */
2373 retval = cft->write_string(cgrp, cft, strstrip(buffer)); 2367 retval = cft->write_string(css, cft, strstrip(buffer));
2374 if (!retval) 2368 if (!retval)
2375 retval = nbytes; 2369 retval = nbytes;
2376out: 2370out:
@@ -2380,65 +2374,60 @@ out:
2380} 2374}
2381 2375
2382static ssize_t cgroup_file_write(struct file *file, const char __user *buf, 2376static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
2383 size_t nbytes, loff_t *ppos) 2377 size_t nbytes, loff_t *ppos)
2384{ 2378{
2379 struct cfent *cfe = __d_cfe(file->f_dentry);
2385 struct cftype *cft = __d_cft(file->f_dentry); 2380 struct cftype *cft = __d_cft(file->f_dentry);
2386 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2381 struct cgroup_subsys_state *css = cfe->css;
2387 2382
2388 if (cgroup_is_dead(cgrp))
2389 return -ENODEV;
2390 if (cft->write) 2383 if (cft->write)
2391 return cft->write(cgrp, cft, file, buf, nbytes, ppos); 2384 return cft->write(css, cft, file, buf, nbytes, ppos);
2392 if (cft->write_u64 || cft->write_s64) 2385 if (cft->write_u64 || cft->write_s64)
2393 return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos); 2386 return cgroup_write_X64(css, cft, file, buf, nbytes, ppos);
2394 if (cft->write_string) 2387 if (cft->write_string)
2395 return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos); 2388 return cgroup_write_string(css, cft, file, buf, nbytes, ppos);
2396 if (cft->trigger) { 2389 if (cft->trigger) {
2397 int ret = cft->trigger(cgrp, (unsigned int)cft->private); 2390 int ret = cft->trigger(css, (unsigned int)cft->private);
2398 return ret ? ret : nbytes; 2391 return ret ? ret : nbytes;
2399 } 2392 }
2400 return -EINVAL; 2393 return -EINVAL;
2401} 2394}
2402 2395
2403static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft, 2396static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css,
2404 struct file *file, 2397 struct cftype *cft, struct file *file,
2405 char __user *buf, size_t nbytes, 2398 char __user *buf, size_t nbytes, loff_t *ppos)
2406 loff_t *ppos)
2407{ 2399{
2408 char tmp[CGROUP_LOCAL_BUFFER_SIZE]; 2400 char tmp[CGROUP_LOCAL_BUFFER_SIZE];
2409 u64 val = cft->read_u64(cgrp, cft); 2401 u64 val = cft->read_u64(css, cft);
2410 int len = sprintf(tmp, "%llu\n", (unsigned long long) val); 2402 int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
2411 2403
2412 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 2404 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
2413} 2405}
2414 2406
2415static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft, 2407static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css,
2416 struct file *file, 2408 struct cftype *cft, struct file *file,
2417 char __user *buf, size_t nbytes, 2409 char __user *buf, size_t nbytes, loff_t *ppos)
2418 loff_t *ppos)
2419{ 2410{
2420 char tmp[CGROUP_LOCAL_BUFFER_SIZE]; 2411 char tmp[CGROUP_LOCAL_BUFFER_SIZE];
2421 s64 val = cft->read_s64(cgrp, cft); 2412 s64 val = cft->read_s64(css, cft);
2422 int len = sprintf(tmp, "%lld\n", (long long) val); 2413 int len = sprintf(tmp, "%lld\n", (long long) val);
2423 2414
2424 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 2415 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
2425} 2416}
2426 2417
2427static ssize_t cgroup_file_read(struct file *file, char __user *buf, 2418static ssize_t cgroup_file_read(struct file *file, char __user *buf,
2428 size_t nbytes, loff_t *ppos) 2419 size_t nbytes, loff_t *ppos)
2429{ 2420{
2421 struct cfent *cfe = __d_cfe(file->f_dentry);
2430 struct cftype *cft = __d_cft(file->f_dentry); 2422 struct cftype *cft = __d_cft(file->f_dentry);
2431 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2423 struct cgroup_subsys_state *css = cfe->css;
2432
2433 if (cgroup_is_dead(cgrp))
2434 return -ENODEV;
2435 2424
2436 if (cft->read) 2425 if (cft->read)
2437 return cft->read(cgrp, cft, file, buf, nbytes, ppos); 2426 return cft->read(css, cft, file, buf, nbytes, ppos);
2438 if (cft->read_u64) 2427 if (cft->read_u64)
2439 return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos); 2428 return cgroup_read_u64(css, cft, file, buf, nbytes, ppos);
2440 if (cft->read_s64) 2429 if (cft->read_s64)
2441 return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos); 2430 return cgroup_read_s64(css, cft, file, buf, nbytes, ppos);
2442 return -EINVAL; 2431 return -EINVAL;
2443} 2432}
2444 2433
@@ -2447,11 +2436,6 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
2447 * supports string->u64 maps, but can be extended in future. 2436 * supports string->u64 maps, but can be extended in future.
2448 */ 2437 */
2449 2438
2450struct cgroup_seqfile_state {
2451 struct cftype *cft;
2452 struct cgroup *cgroup;
2453};
2454
2455static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) 2439static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
2456{ 2440{
2457 struct seq_file *sf = cb->state; 2441 struct seq_file *sf = cb->state;
@@ -2460,69 +2444,86 @@ static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
2460 2444
2461static int cgroup_seqfile_show(struct seq_file *m, void *arg) 2445static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2462{ 2446{
2463 struct cgroup_seqfile_state *state = m->private; 2447 struct cfent *cfe = m->private;
2464 struct cftype *cft = state->cft; 2448 struct cftype *cft = cfe->type;
2449 struct cgroup_subsys_state *css = cfe->css;
2450
2465 if (cft->read_map) { 2451 if (cft->read_map) {
2466 struct cgroup_map_cb cb = { 2452 struct cgroup_map_cb cb = {
2467 .fill = cgroup_map_add, 2453 .fill = cgroup_map_add,
2468 .state = m, 2454 .state = m,
2469 }; 2455 };
2470 return cft->read_map(state->cgroup, cft, &cb); 2456 return cft->read_map(css, cft, &cb);
2471 } 2457 }
2472 return cft->read_seq_string(state->cgroup, cft, m); 2458 return cft->read_seq_string(css, cft, m);
2473}
2474
2475static int cgroup_seqfile_release(struct inode *inode, struct file *file)
2476{
2477 struct seq_file *seq = file->private_data;
2478 kfree(seq->private);
2479 return single_release(inode, file);
2480} 2459}
2481 2460
2482static const struct file_operations cgroup_seqfile_operations = { 2461static const struct file_operations cgroup_seqfile_operations = {
2483 .read = seq_read, 2462 .read = seq_read,
2484 .write = cgroup_file_write, 2463 .write = cgroup_file_write,
2485 .llseek = seq_lseek, 2464 .llseek = seq_lseek,
2486 .release = cgroup_seqfile_release, 2465 .release = single_release,
2487}; 2466};
2488 2467
2489static int cgroup_file_open(struct inode *inode, struct file *file) 2468static int cgroup_file_open(struct inode *inode, struct file *file)
2490{ 2469{
2470 struct cfent *cfe = __d_cfe(file->f_dentry);
2471 struct cftype *cft = __d_cft(file->f_dentry);
2472 struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent);
2473 struct cgroup_subsys_state *css;
2491 int err; 2474 int err;
2492 struct cftype *cft;
2493 2475
2494 err = generic_file_open(inode, file); 2476 err = generic_file_open(inode, file);
2495 if (err) 2477 if (err)
2496 return err; 2478 return err;
2497 cft = __d_cft(file->f_dentry);
2498 2479
2499 if (cft->read_map || cft->read_seq_string) { 2480 /*
2500 struct cgroup_seqfile_state *state; 2481 * If the file belongs to a subsystem, pin the css. Will be
2482 * unpinned either on open failure or release. This ensures that
2483 * @css stays alive for all file operations.
2484 */
2485 rcu_read_lock();
2486 css = cgroup_css(cgrp, cft->ss);
2487 if (cft->ss && !css_tryget(css))
2488 css = NULL;
2489 rcu_read_unlock();
2501 2490
2502 state = kzalloc(sizeof(*state), GFP_USER); 2491 if (!css)
2503 if (!state) 2492 return -ENODEV;
2504 return -ENOMEM; 2493
2494 /*
2495 * @cfe->css is used by read/write/close to determine the
2496 * associated css. @file->private_data would be a better place but
2497 * that's already used by seqfile. Multiple accessors may use it
2498 * simultaneously which is okay as the association never changes.
2499 */
2500 WARN_ON_ONCE(cfe->css && cfe->css != css);
2501 cfe->css = css;
2505 2502
2506 state->cft = cft; 2503 if (cft->read_map || cft->read_seq_string) {
2507 state->cgroup = __d_cgrp(file->f_dentry->d_parent);
2508 file->f_op = &cgroup_seqfile_operations; 2504 file->f_op = &cgroup_seqfile_operations;
2509 err = single_open(file, cgroup_seqfile_show, state); 2505 err = single_open(file, cgroup_seqfile_show, cfe);
2510 if (err < 0) 2506 } else if (cft->open) {
2511 kfree(state);
2512 } else if (cft->open)
2513 err = cft->open(inode, file); 2507 err = cft->open(inode, file);
2514 else 2508 }
2515 err = 0;
2516 2509
2510 if (css->ss && err)
2511 css_put(css);
2517 return err; 2512 return err;
2518} 2513}
2519 2514
2520static int cgroup_file_release(struct inode *inode, struct file *file) 2515static int cgroup_file_release(struct inode *inode, struct file *file)
2521{ 2516{
2517 struct cfent *cfe = __d_cfe(file->f_dentry);
2522 struct cftype *cft = __d_cft(file->f_dentry); 2518 struct cftype *cft = __d_cft(file->f_dentry);
2519 struct cgroup_subsys_state *css = cfe->css;
2520 int ret = 0;
2521
2523 if (cft->release) 2522 if (cft->release)
2524 return cft->release(inode, file); 2523 ret = cft->release(inode, file);
2525 return 0; 2524 if (css->ss)
2525 css_put(css);
2526 return ret;
2526} 2527}
2527 2528
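Note the asymmetry the new open path preserves: files belonging to a subsystem pin their css with css_tryget() and drop the reference in release, while cgroup core files (cft->ss == NULL) resolve to the cgroup's dummy css, which is never pinned since the open dentry already keeps the cgroup alive. This is presumably why both paths guard the put with a css->ss check.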
2528/* 2529/*
@@ -2736,8 +2737,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
2736 return mode; 2737 return mode;
2737} 2738}
2738 2739
2739static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, 2740static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
2740 struct cftype *cft)
2741{ 2741{
2742 struct dentry *dir = cgrp->dentry; 2742 struct dentry *dir = cgrp->dentry;
2743 struct cgroup *parent = __d_cgrp(dir); 2743 struct cgroup *parent = __d_cgrp(dir);
@@ -2747,8 +2747,9 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2747 umode_t mode; 2747 umode_t mode;
2748 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; 2748 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2749 2749
2750 if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { 2750 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
2751 strcpy(name, subsys->name); 2751 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
2752 strcpy(name, cft->ss->name);
2752 strcat(name, "."); 2753 strcat(name, ".");
2753 } 2754 }
2754 strcat(name, cft->name); 2755 strcat(name, cft->name);
@@ -2782,11 +2783,25 @@ out:
2782 return error; 2783 return error;
2783} 2784}
2784 2785
2785static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, 2786/**
2786 struct cftype cfts[], bool is_add) 2787 * cgroup_addrm_files - add or remove files to a cgroup directory
2788 * @cgrp: the target cgroup
2789 * @cfts: array of cftypes to be added
2790 * @is_add: whether to add or remove
2791 *
2792 * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
2793 * For removals, this function never fails. If addition fails, this
2794 * function doesn't remove files already added. The caller is responsible
2795 * for cleaning up.
2796 */
2797static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2798 bool is_add)
2787{ 2799{
2788 struct cftype *cft; 2800 struct cftype *cft;
2789 int err, ret = 0; 2801 int ret;
2802
2803 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
2804 lockdep_assert_held(&cgroup_mutex);
2790 2805
2791 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2806 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2792 /* does cft->flags tell us to skip this file on @cgrp? */ 2807 /* does cft->flags tell us to skip this file on @cgrp? */
@@ -2798,16 +2813,17 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2798 continue; 2813 continue;
2799 2814
2800 if (is_add) { 2815 if (is_add) {
2801 err = cgroup_add_file(cgrp, subsys, cft); 2816 ret = cgroup_add_file(cgrp, cft);
2802 if (err) 2817 if (ret) {
2803 pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", 2818 pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
2804 cft->name, err); 2819 cft->name, ret);
2805 ret = err; 2820 return ret;
2821 }
2806 } else { 2822 } else {
2807 cgroup_rm_file(cgrp, cft); 2823 cgroup_rm_file(cgrp, cft);
2808 } 2824 }
2809 } 2825 }
2810 return ret; 2826 return 0;
2811} 2827}
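A sketch of the cleanup contract spelled out in the comment above, assuming (as the lookup-based removal path implies) that removing a file that was never added is a harmless no-op:

	/* hedged sketch: unwind a partially successful addition */
	ret = cgroup_addrm_files(cgrp, cfts, true);
	if (ret)
		/* removal never fails; files never added are skipped */
		cgroup_addrm_files(cgrp, cfts, false);

cgroup_populate_dir() below does the equivalent through cgroup_clear_dir().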
2812 2828
2813static void cgroup_cfts_prepare(void) 2829static void cgroup_cfts_prepare(void)
@@ -2816,28 +2832,30 @@ static void cgroup_cfts_prepare(void)
2816 /* 2832 /*
2817 * Thanks to the entanglement with vfs inode locking, we can't walk 2833 * Thanks to the entanglement with vfs inode locking, we can't walk
2818 * the existing cgroups under cgroup_mutex and create files. 2834 * the existing cgroups under cgroup_mutex and create files.
2819 * Instead, we use cgroup_for_each_descendant_pre() and drop RCU 2835 * Instead, we use css_for_each_descendant_pre() and drop RCU read
2820 * read lock before calling cgroup_addrm_files(). 2836 * lock before calling cgroup_addrm_files().
2821 */ 2837 */
2822 mutex_lock(&cgroup_mutex); 2838 mutex_lock(&cgroup_mutex);
2823} 2839}
2824 2840
2825static void cgroup_cfts_commit(struct cgroup_subsys *ss, 2841static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
2826 struct cftype *cfts, bool is_add)
2827 __releases(&cgroup_mutex) 2842 __releases(&cgroup_mutex)
2828{ 2843{
2829 LIST_HEAD(pending); 2844 LIST_HEAD(pending);
2830 struct cgroup *cgrp, *root = &ss->root->top_cgroup; 2845 struct cgroup_subsys *ss = cfts[0].ss;
2846 struct cgroup *root = &ss->root->top_cgroup;
2831 struct super_block *sb = ss->root->sb; 2847 struct super_block *sb = ss->root->sb;
2832 struct dentry *prev = NULL; 2848 struct dentry *prev = NULL;
2833 struct inode *inode; 2849 struct inode *inode;
2850 struct cgroup_subsys_state *css;
2834 u64 update_before; 2851 u64 update_before;
2852 int ret = 0;
2835 2853
2836 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ 2854 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
2837 if (!cfts || ss->root == &cgroup_dummy_root || 2855 if (!cfts || ss->root == &cgroup_dummy_root ||
2838 !atomic_inc_not_zero(&sb->s_active)) { 2856 !atomic_inc_not_zero(&sb->s_active)) {
2839 mutex_unlock(&cgroup_mutex); 2857 mutex_unlock(&cgroup_mutex);
2840 return; 2858 return 0;
2841 } 2859 }
2842 2860
2843 /* 2861 /*
@@ -2849,17 +2867,11 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2849 2867
2850 mutex_unlock(&cgroup_mutex); 2868 mutex_unlock(&cgroup_mutex);
2851 2869
2852 /* @root always needs to be updated */
2853 inode = root->dentry->d_inode;
2854 mutex_lock(&inode->i_mutex);
2855 mutex_lock(&cgroup_mutex);
2856 cgroup_addrm_files(root, ss, cfts, is_add);
2857 mutex_unlock(&cgroup_mutex);
2858 mutex_unlock(&inode->i_mutex);
2859
2860 /* add/rm files for all cgroups created before */ 2870 /* add/rm files for all cgroups created before */
2861 rcu_read_lock(); 2871 rcu_read_lock();
2862 cgroup_for_each_descendant_pre(cgrp, root) { 2872 css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
2873 struct cgroup *cgrp = css->cgroup;
2874
2863 if (cgroup_is_dead(cgrp)) 2875 if (cgroup_is_dead(cgrp))
2864 continue; 2876 continue;
2865 2877
@@ -2873,15 +2885,18 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2873 mutex_lock(&inode->i_mutex); 2885 mutex_lock(&inode->i_mutex);
2874 mutex_lock(&cgroup_mutex); 2886 mutex_lock(&cgroup_mutex);
2875 if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) 2887 if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
2876 cgroup_addrm_files(cgrp, ss, cfts, is_add); 2888 ret = cgroup_addrm_files(cgrp, cfts, is_add);
2877 mutex_unlock(&cgroup_mutex); 2889 mutex_unlock(&cgroup_mutex);
2878 mutex_unlock(&inode->i_mutex); 2890 mutex_unlock(&inode->i_mutex);
2879 2891
2880 rcu_read_lock(); 2892 rcu_read_lock();
2893 if (ret)
2894 break;
2881 } 2895 }
2882 rcu_read_unlock(); 2896 rcu_read_unlock();
2883 dput(prev); 2897 dput(prev);
2884 deactivate_super(sb); 2898 deactivate_super(sb);
2899 return ret;
2885} 2900}
2886 2901
2887/** 2902/**
@@ -2901,49 +2916,56 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2901int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 2916int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2902{ 2917{
2903 struct cftype_set *set; 2918 struct cftype_set *set;
2919 struct cftype *cft;
2920 int ret;
2904 2921
2905 set = kzalloc(sizeof(*set), GFP_KERNEL); 2922 set = kzalloc(sizeof(*set), GFP_KERNEL);
2906 if (!set) 2923 if (!set)
2907 return -ENOMEM; 2924 return -ENOMEM;
2908 2925
2926 for (cft = cfts; cft->name[0] != '\0'; cft++)
2927 cft->ss = ss;
2928
2909 cgroup_cfts_prepare(); 2929 cgroup_cfts_prepare();
2910 set->cfts = cfts; 2930 set->cfts = cfts;
2911 list_add_tail(&set->node, &ss->cftsets); 2931 list_add_tail(&set->node, &ss->cftsets);
2912 cgroup_cfts_commit(ss, cfts, true); 2932 ret = cgroup_cfts_commit(cfts, true);
2913 2933 if (ret)
2914 return 0; 2934 cgroup_rm_cftypes(cfts);
2935 return ret;
2915} 2936}
2916EXPORT_SYMBOL_GPL(cgroup_add_cftypes); 2937EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
2917 2938
2918/** 2939/**
2919 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem 2940 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
2920 * @ss: target cgroup subsystem
2921 * @cfts: zero-length name terminated array of cftypes 2941 * @cfts: zero-length name terminated array of cftypes
2922 * 2942 *
2923 * Unregister @cfts from @ss. Files described by @cfts are removed from 2943 * Unregister @cfts. Files described by @cfts are removed from all
2924 * all existing cgroups to which @ss is attached and all future cgroups 2944 * existing cgroups and all future cgroups won't have them either. This
2925 * won't have them either. This function can be called anytime whether @ss 2945 * function can be called anytime whether @cfts' subsys is attached or not.
2926 * is attached or not.
2927 * 2946 *
2928 * Returns 0 on successful unregistration, -ENOENT if @cfts is not 2947 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
2929 * registered with @ss. 2948 * registered.
2930 */ 2949 */
2931int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 2950int cgroup_rm_cftypes(struct cftype *cfts)
2932{ 2951{
2933 struct cftype_set *set; 2952 struct cftype_set *set;
2934 2953
2954 if (!cfts || !cfts[0].ss)
2955 return -ENOENT;
2956
2935 cgroup_cfts_prepare(); 2957 cgroup_cfts_prepare();
2936 2958
2937 list_for_each_entry(set, &ss->cftsets, node) { 2959 list_for_each_entry(set, &cfts[0].ss->cftsets, node) {
2938 if (set->cfts == cfts) { 2960 if (set->cfts == cfts) {
2939 list_del(&set->node); 2961 list_del(&set->node);
2940 kfree(set); 2962 kfree(set);
2941 cgroup_cfts_commit(ss, cfts, false); 2963 cgroup_cfts_commit(cfts, false);
2942 return 0; 2964 return 0;
2943 } 2965 }
2944 } 2966 }
2945 2967
2946 cgroup_cfts_commit(ss, NULL, false); 2968 cgroup_cfts_commit(NULL, false);
2947 return -ENOENT; 2969 return -ENOENT;
2948} 2970}
2949 2971
@@ -2966,34 +2988,10 @@ int cgroup_task_count(const struct cgroup *cgrp)
2966} 2988}
2967 2989
2968/* 2990/*
2969 * Advance a list_head iterator. The iterator should be positioned at 2991 * To reduce the fork() overhead for systems that are not actually using
2970 * the start of a css_set 2992 * their cgroups capability, we don't maintain the lists running through
2971 */ 2993 * each css_set to its tasks until we see the list actually used - in other
2972static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it) 2994 * words after the first call to css_task_iter_start().
2973{
2974 struct list_head *l = it->cset_link;
2975 struct cgrp_cset_link *link;
2976 struct css_set *cset;
2977
2978 /* Advance to the next non-empty css_set */
2979 do {
2980 l = l->next;
2981 if (l == &cgrp->cset_links) {
2982 it->cset_link = NULL;
2983 return;
2984 }
2985 link = list_entry(l, struct cgrp_cset_link, cset_link);
2986 cset = link->cset;
2987 } while (list_empty(&cset->tasks));
2988 it->cset_link = l;
2989 it->task = cset->tasks.next;
2990}
2991
2992/*
2993 * To reduce the fork() overhead for systems that are not actually
2994 * using their cgroups capability, we don't maintain the lists running
2995 * through each css_set to its tasks until we see the list actually
2996 * used - in other words after the first call to cgroup_iter_start().
2997 */ 2995 */
2998static void cgroup_enable_task_cg_lists(void) 2996static void cgroup_enable_task_cg_lists(void)
2999{ 2997{
@@ -3024,16 +3022,21 @@ static void cgroup_enable_task_cg_lists(void)
3024} 3022}
3025 3023
3026/** 3024/**
3027 * cgroup_next_sibling - find the next sibling of a given cgroup 3025 * css_next_child - find the next child of a given css
3028 * @pos: the current cgroup 3026 * @pos_css: the current position (%NULL to initiate traversal)
3027 * @parent_css: css whose children to walk
3029 * 3028 *
3030 * This function returns the next sibling of @pos and should be called 3029 * This function returns the next child of @parent_css and should be called
3031 * under RCU read lock. The only requirement is that @pos is accessible. 3030 * under RCU read lock. The only requirement is that @parent_css and
3032 * The next sibling is guaranteed to be returned regardless of @pos's 3031 * @pos_css are accessible. The next sibling is guaranteed to be returned
3033 * state. 3032 * regardless of their states.
3034 */ 3033 */
3035struct cgroup *cgroup_next_sibling(struct cgroup *pos) 3034struct cgroup_subsys_state *
3035css_next_child(struct cgroup_subsys_state *pos_css,
3036 struct cgroup_subsys_state *parent_css)
3036{ 3037{
3038 struct cgroup *pos = pos_css ? pos_css->cgroup : NULL;
3039 struct cgroup *cgrp = parent_css->cgroup;
3037 struct cgroup *next; 3040 struct cgroup *next;
3038 3041
3039 WARN_ON_ONCE(!rcu_read_lock_held()); 3042 WARN_ON_ONCE(!rcu_read_lock_held());
@@ -3048,78 +3051,81 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos)
3048 * safe to dereference from this RCU critical section. If 3051 * safe to dereference from this RCU critical section. If
3049 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed 3052 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
3050 * to be visible as %true here. 3053 * to be visible as %true here.
3054 *
3055 * If @pos is dead, its next pointer can't be dereferenced;
3056 * however, as each cgroup is given a monotonically increasing
3057 * unique serial number and always appended to the sibling list,
3058 * the next one can be found by walking the parent's children until
3059 * we see a cgroup with higher serial number than @pos's. While
3060 * this path can be slower, it's taken only when either the current
3061 * cgroup is removed or iteration and removal race.
3051 */ 3062 */
3052 if (likely(!cgroup_is_dead(pos))) { 3063 if (!pos) {
3064 next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling);
3065 } else if (likely(!cgroup_is_dead(pos))) {
3053 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); 3066 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
3054 if (&next->sibling != &pos->parent->children) 3067 } else {
3055 return next; 3068 list_for_each_entry_rcu(next, &cgrp->children, sibling)
3056 return NULL; 3069 if (next->serial_nr > pos->serial_nr)
3070 break;
3057 } 3071 }
3058 3072
3059 /* 3073 if (&next->sibling == &cgrp->children)
3060 * Can't dereference the next pointer. Each cgroup is given a 3074 return NULL;
3061 * monotonically increasing unique serial number and always 3075
3062 * appended to the sibling list, so the next one can be found by 3076 return cgroup_css(next, parent_css->ss);
3063 * walking the parent's children until we see a cgroup with higher
3064 * serial number than @pos's.
3065 *
3066 * While this path can be slow, it's taken only when either the
3067 * current cgroup is removed or iteration and removal race.
3068 */
3069 list_for_each_entry_rcu(next, &pos->parent->children, sibling)
3070 if (next->serial_nr > pos->serial_nr)
3071 return next;
3072 return NULL;
3073} 3077}
3074EXPORT_SYMBOL_GPL(cgroup_next_sibling); 3078EXPORT_SYMBOL_GPL(css_next_child);
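A minimal sketch of the new iterator's calling convention, assuming only what the function above guarantees (a NULL @pos_css starts the walk; a dead position is skipped via the serial-number path):

	struct cgroup_subsys_state *child = NULL;

	rcu_read_lock();
	while ((child = css_next_child(child, parent_css)))
		inspect(child);		/* inspect() is hypothetical */
	rcu_read_unlock();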
3075 3079
3076/** 3080/**
3077 * cgroup_next_descendant_pre - find the next descendant for pre-order walk 3081 * css_next_descendant_pre - find the next descendant for pre-order walk
3078 * @pos: the current position (%NULL to initiate traversal) 3082 * @pos: the current position (%NULL to initiate traversal)
3079 * @cgroup: cgroup whose descendants to walk 3083 * @root: css whose descendants to walk
3080 * 3084 *
3081 * To be used by cgroup_for_each_descendant_pre(). Find the next 3085 * To be used by css_for_each_descendant_pre(). Find the next descendant
3082 * descendant to visit for pre-order traversal of @cgroup's descendants. 3086 * to visit for pre-order traversal of @root's descendants. @root is
3087 * included in the iteration and the first node to be visited.
3083 * 3088 *
3084 * While this function requires RCU read locking, it doesn't require the 3089 * While this function requires RCU read locking, it doesn't require the
3085 * whole traversal to be contained in a single RCU critical section. This 3090 * whole traversal to be contained in a single RCU critical section. This
3086 * function will return the correct next descendant as long as both @pos 3091 * function will return the correct next descendant as long as both @pos
3087 * and @cgroup are accessible and @pos is a descendant of @cgroup. 3092 * and @root are accessible and @pos is a descendant of @root.
3088 */ 3093 */
3089struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, 3094struct cgroup_subsys_state *
3090 struct cgroup *cgroup) 3095css_next_descendant_pre(struct cgroup_subsys_state *pos,
3096 struct cgroup_subsys_state *root)
3091{ 3097{
3092 struct cgroup *next; 3098 struct cgroup_subsys_state *next;
3093 3099
3094 WARN_ON_ONCE(!rcu_read_lock_held()); 3100 WARN_ON_ONCE(!rcu_read_lock_held());
3095 3101
3096 /* if first iteration, pretend we just visited @cgroup */ 3102 /* if first iteration, visit @root */
3097 if (!pos) 3103 if (!pos)
3098 pos = cgroup; 3104 return root;
3099 3105
3100 /* visit the first child if exists */ 3106 /* visit the first child if exists */
3101 next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); 3107 next = css_next_child(NULL, pos);
3102 if (next) 3108 if (next)
3103 return next; 3109 return next;
3104 3110
3105 /* no child, visit my or the closest ancestor's next sibling */ 3111 /* no child, visit my or the closest ancestor's next sibling */
3106 while (pos != cgroup) { 3112 while (pos != root) {
3107 next = cgroup_next_sibling(pos); 3113 next = css_next_child(pos, css_parent(pos));
3108 if (next) 3114 if (next)
3109 return next; 3115 return next;
3110 pos = pos->parent; 3116 pos = css_parent(pos);
3111 } 3117 }
3112 3118
3113 return NULL; 3119 return NULL;
3114} 3120}
3115EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); 3121EXPORT_SYMBOL_GPL(css_next_descendant_pre);
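Usage keeps the shape of the old cgroup walk, e.g. via the css_for_each_descendant_pre() wrapper this series introduces; unlike before, @root itself is now the first node visited:

	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_descendant_pre(pos, root) {
		/* @root comes first, then children in pre-order */
		update_one(pos);	/* update_one() is hypothetical */
	}
	rcu_read_unlock();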
3116 3122
3117/** 3123/**
3118 * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup 3124 * css_rightmost_descendant - return the rightmost descendant of a css
3119 * @pos: cgroup of interest 3125 * @pos: css of interest
3120 * 3126 *
3121 * Return the rightmost descendant of @pos. If there's no descendant, 3127 * Return the rightmost descendant of @pos. If there's no descendant, @pos
3122 * @pos is returned. This can be used during pre-order traversal to skip 3128 * is returned. This can be used during pre-order traversal to skip
3123 * subtree of @pos. 3129 * subtree of @pos.
3124 * 3130 *
3125 * While this function requires RCU read locking, it doesn't require the 3131 * While this function requires RCU read locking, it doesn't require the
@@ -3127,9 +3133,10 @@ EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
3127 * function will return the correct rightmost descendant as long as @pos is 3133 * function will return the correct rightmost descendant as long as @pos is
3128 * accessible. 3134 * accessible.
3129 */ 3135 */
3130struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) 3136struct cgroup_subsys_state *
3137css_rightmost_descendant(struct cgroup_subsys_state *pos)
3131{ 3138{
3132 struct cgroup *last, *tmp; 3139 struct cgroup_subsys_state *last, *tmp;
3133 3140
3134 WARN_ON_ONCE(!rcu_read_lock_held()); 3141 WARN_ON_ONCE(!rcu_read_lock_held());
3135 3142
@@ -3137,82 +3144,138 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
3137 last = pos; 3144 last = pos;
3138 /* ->prev isn't RCU safe, walk ->next till the end */ 3145 /* ->prev isn't RCU safe, walk ->next till the end */
3139 pos = NULL; 3146 pos = NULL;
3140 list_for_each_entry_rcu(tmp, &last->children, sibling) 3147 css_for_each_child(tmp, last)
3141 pos = tmp; 3148 pos = tmp;
3142 } while (pos); 3149 } while (pos);
3143 3150
3144 return last; 3151 return last;
3145} 3152}
3146EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant); 3153EXPORT_SYMBOL_GPL(css_rightmost_descendant);
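The subtree-skip use case mentioned above looks roughly like this inside the pre-order walk from the previous sketch (skip_subtree() is a hypothetical predicate):

	css_for_each_descendant_pre(pos, root) {
		if (skip_subtree(pos)) {
			/* jump to the subtree's last node; the next
			 * iteration continues past the whole subtree */
			pos = css_rightmost_descendant(pos);
			continue;
		}
		/* ... visit pos ... */
	}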
3147 3154
3148static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) 3155static struct cgroup_subsys_state *
3156css_leftmost_descendant(struct cgroup_subsys_state *pos)
3149{ 3157{
3150 struct cgroup *last; 3158 struct cgroup_subsys_state *last;
3151 3159
3152 do { 3160 do {
3153 last = pos; 3161 last = pos;
3154 pos = list_first_or_null_rcu(&pos->children, struct cgroup, 3162 pos = css_next_child(NULL, pos);
3155 sibling);
3156 } while (pos); 3163 } while (pos);
3157 3164
3158 return last; 3165 return last;
3159} 3166}
3160 3167
3161/** 3168/**
3162 * cgroup_next_descendant_post - find the next descendant for post-order walk 3169 * css_next_descendant_post - find the next descendant for post-order walk
3163 * @pos: the current position (%NULL to initiate traversal) 3170 * @pos: the current position (%NULL to initiate traversal)
3164 * @cgroup: cgroup whose descendants to walk 3171 * @root: css whose descendants to walk
3165 * 3172 *
3166 * To be used by cgroup_for_each_descendant_post(). Find the next 3173 * To be used by css_for_each_descendant_post(). Find the next descendant
3167 * descendant to visit for post-order traversal of @cgroup's descendants. 3174 * to visit for post-order traversal of @root's descendants. @root is
3175 * included in the iteration and the last node to be visited.
3168 * 3176 *
3169 * While this function requires RCU read locking, it doesn't require the 3177 * While this function requires RCU read locking, it doesn't require the
3170 * whole traversal to be contained in a single RCU critical section. This 3178 * whole traversal to be contained in a single RCU critical section. This
3171 * function will return the correct next descendant as long as both @pos 3179 * function will return the correct next descendant as long as both @pos
 3172 * and @cgroup are accessible and @pos is a descendant of @cgroup. 3180 * and @root are accessible and @pos is a descendant of @root.
3173 */ 3181 */
3174struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, 3182struct cgroup_subsys_state *
3175 struct cgroup *cgroup) 3183css_next_descendant_post(struct cgroup_subsys_state *pos,
3184 struct cgroup_subsys_state *root)
3176{ 3185{
3177 struct cgroup *next; 3186 struct cgroup_subsys_state *next;
3178 3187
3179 WARN_ON_ONCE(!rcu_read_lock_held()); 3188 WARN_ON_ONCE(!rcu_read_lock_held());
3180 3189
3181 /* if first iteration, visit the leftmost descendant */ 3190 /* if first iteration, visit the leftmost descendant */
3182 if (!pos) { 3191 if (!pos) {
3183 next = cgroup_leftmost_descendant(cgroup); 3192 next = css_leftmost_descendant(root);
3184 return next != cgroup ? next : NULL; 3193 return next != root ? next : NULL;
3185 } 3194 }
3186 3195
3196 /* if we visited @root, we're done */
3197 if (pos == root)
3198 return NULL;
3199
3187 /* if there's an unvisited sibling, visit its leftmost descendant */ 3200 /* if there's an unvisited sibling, visit its leftmost descendant */
3188 next = cgroup_next_sibling(pos); 3201 next = css_next_child(pos, css_parent(pos));
3189 if (next) 3202 if (next)
3190 return cgroup_leftmost_descendant(next); 3203 return css_leftmost_descendant(next);
3191 3204
3192 /* no sibling left, visit parent */ 3205 /* no sibling left, visit parent */
3193 next = pos->parent; 3206 return css_parent(pos);
3194 return next != cgroup ? next : NULL;
3195} 3207}
3196EXPORT_SYMBOL_GPL(cgroup_next_descendant_post); 3208EXPORT_SYMBOL_GPL(css_next_descendant_post);
3197 3209
3198void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) 3210/**
 3211 * css_advance_task_iter - advance a task iterator to the next css_set
3212 * @it: the iterator to advance
3213 *
3214 * Advance @it to the next css_set to walk.
3215 */
3216static void css_advance_task_iter(struct css_task_iter *it)
3217{
3218 struct list_head *l = it->cset_link;
3219 struct cgrp_cset_link *link;
3220 struct css_set *cset;
3221
3222 /* Advance to the next non-empty css_set */
3223 do {
3224 l = l->next;
3225 if (l == &it->origin_css->cgroup->cset_links) {
3226 it->cset_link = NULL;
3227 return;
3228 }
3229 link = list_entry(l, struct cgrp_cset_link, cset_link);
3230 cset = link->cset;
3231 } while (list_empty(&cset->tasks));
3232 it->cset_link = l;
3233 it->task = cset->tasks.next;
3234}
3235
3236/**
3237 * css_task_iter_start - initiate task iteration
3238 * @css: the css to walk tasks of
3239 * @it: the task iterator to use
3240 *
3241 * Initiate iteration through the tasks of @css. The caller can call
3242 * css_task_iter_next() to walk through the tasks until the function
3243 * returns NULL. On completion of iteration, css_task_iter_end() must be
3244 * called.
3245 *
3246 * Note that this function acquires a lock which is released when the
3247 * iteration finishes. The caller can't sleep while iteration is in
3248 * progress.
3249 */
3250void css_task_iter_start(struct cgroup_subsys_state *css,
3251 struct css_task_iter *it)
3199 __acquires(css_set_lock) 3252 __acquires(css_set_lock)
3200{ 3253{
3201 /* 3254 /*
3202 * The first time anyone tries to iterate across a cgroup, 3255 * The first time anyone tries to iterate across a css, we need to
3203 * we need to enable the list linking each css_set to its 3256 * enable the list linking each css_set to its tasks, and fix up
3204 * tasks, and fix up all existing tasks. 3257 * all existing tasks.
3205 */ 3258 */
3206 if (!use_task_css_set_links) 3259 if (!use_task_css_set_links)
3207 cgroup_enable_task_cg_lists(); 3260 cgroup_enable_task_cg_lists();
3208 3261
3209 read_lock(&css_set_lock); 3262 read_lock(&css_set_lock);
3210 it->cset_link = &cgrp->cset_links; 3263
3211 cgroup_advance_iter(cgrp, it); 3264 it->origin_css = css;
3265 it->cset_link = &css->cgroup->cset_links;
3266
3267 css_advance_task_iter(it);
3212} 3268}
3213 3269
3214struct task_struct *cgroup_iter_next(struct cgroup *cgrp, 3270/**
3215 struct cgroup_iter *it) 3271 * css_task_iter_next - return the next task for the iterator
3272 * @it: the task iterator being iterated
3273 *
3274 * The "next" function for task iteration. @it should have been
3275 * initialized via css_task_iter_start(). Returns NULL when the iteration
3276 * reaches the end.
3277 */
3278struct task_struct *css_task_iter_next(struct css_task_iter *it)
3216{ 3279{
3217 struct task_struct *res; 3280 struct task_struct *res;
3218 struct list_head *l = it->task; 3281 struct list_head *l = it->task;
@@ -3226,16 +3289,24 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
3226 l = l->next; 3289 l = l->next;
3227 link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); 3290 link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link);
3228 if (l == &link->cset->tasks) { 3291 if (l == &link->cset->tasks) {
3229 /* We reached the end of this task list - move on to 3292 /*
3230 * the next cg_cgroup_link */ 3293 * We reached the end of this task list - move on to the
3231 cgroup_advance_iter(cgrp, it); 3294 * next cgrp_cset_link.
3295 */
3296 css_advance_task_iter(it);
3232 } else { 3297 } else {
3233 it->task = l; 3298 it->task = l;
3234 } 3299 }
3235 return res; 3300 return res;
3236} 3301}
3237 3302
3238void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) 3303/**
3304 * css_task_iter_end - finish task iteration
3305 * @it: the task iterator to finish
3306 *
3307 * Finish task iteration started by css_task_iter_start().
3308 */
3309void css_task_iter_end(struct css_task_iter *it)
3239 __releases(css_set_lock) 3310 __releases(css_set_lock)
3240{ 3311{
3241 read_unlock(&css_set_lock); 3312 read_unlock(&css_set_lock);
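The start/next/end protocol documented above, e.g. counting the tasks attached to a css; css_set_lock is read-held for the whole walk, so the loop body must not sleep:

	struct css_task_iter it;
	struct task_struct *task;
	unsigned int nr = 0;

	css_task_iter_start(css, &it);
	while ((task = css_task_iter_next(&it)))
		nr++;
	css_task_iter_end(&it);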
@@ -3276,46 +3347,49 @@ static inline int started_after(void *p1, void *p2)
3276} 3347}
3277 3348
3278/** 3349/**
3279 * cgroup_scan_tasks - iterate though all the tasks in a cgroup 3350 * css_scan_tasks - iterate though all the tasks in a css
3280 * @scan: struct cgroup_scanner containing arguments for the scan 3351 * @css: the css to iterate tasks of
3352 * @test: optional test callback
3353 * @process: process callback
3354 * @data: data passed to @test and @process
3355 * @heap: optional pre-allocated heap used for task iteration
3356 *
3357 * Iterate through all the tasks in @css, calling @test for each, and if it
3358 * returns %true, call @process for it also.
3359 *
3360 * @test may be NULL, meaning always true (select all tasks), which
3361 * effectively duplicates css_task_iter_{start,next,end}() but does not
3362 * lock css_set_lock for the call to @process.
3363 *
3364 * It is guaranteed that @process will act on every task that is a member
3365 * of @css for the duration of this call. This function may or may not
3366 * call @process for tasks that exit or move to a different css during the
3367 * call, or are forked or move into the css during the call.
3281 * 3368 *
3282 * Arguments include pointers to callback functions test_task() and 3369 * Note that @test may be called with locks held, and may in some
3283 * process_task(). 3370 * situations be called multiple times for the same task, so it should be
3284 * Iterate through all the tasks in a cgroup, calling test_task() for each, 3371 * cheap.
3285 * and if it returns true, call process_task() for it also.
3286 * The test_task pointer may be NULL, meaning always true (select all tasks).
3287 * Effectively duplicates cgroup_iter_{start,next,end}()
3288 * but does not lock css_set_lock for the call to process_task().
3289 * The struct cgroup_scanner may be embedded in any structure of the caller's
3290 * creation.
3291 * It is guaranteed that process_task() will act on every task that
3292 * is a member of the cgroup for the duration of this call. This
3293 * function may or may not call process_task() for tasks that exit
3294 * or move to a different cgroup during the call, or are forked or
3295 * move into the cgroup during the call.
3296 * 3372 *
3297 * Note that test_task() may be called with locks held, and may in some 3373 * If @heap is non-NULL, a heap has been pre-allocated and will be used for
3298 * situations be called multiple times for the same task, so it should 3374 * heap operations (and its "gt" member will be overwritten), else a
3299 * be cheap. 3375 * temporary heap will be used (allocation of which may cause this function
3300 * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been 3376 * to fail).
3301 * pre-allocated and will be used for heap operations (and its "gt" member will
3302 * be overwritten), else a temporary heap will be used (allocation of which
3303 * may cause this function to fail).
3304 */ 3377 */
3305int cgroup_scan_tasks(struct cgroup_scanner *scan) 3378int css_scan_tasks(struct cgroup_subsys_state *css,
3379 bool (*test)(struct task_struct *, void *),
3380 void (*process)(struct task_struct *, void *),
3381 void *data, struct ptr_heap *heap)
3306{ 3382{
3307 int retval, i; 3383 int retval, i;
3308 struct cgroup_iter it; 3384 struct css_task_iter it;
3309 struct task_struct *p, *dropped; 3385 struct task_struct *p, *dropped;
3310 /* Never dereference latest_task, since it's not refcounted */ 3386 /* Never dereference latest_task, since it's not refcounted */
3311 struct task_struct *latest_task = NULL; 3387 struct task_struct *latest_task = NULL;
3312 struct ptr_heap tmp_heap; 3388 struct ptr_heap tmp_heap;
3313 struct ptr_heap *heap;
3314 struct timespec latest_time = { 0, 0 }; 3389 struct timespec latest_time = { 0, 0 };
3315 3390
3316 if (scan->heap) { 3391 if (heap) {
3317 /* The caller supplied our heap and pre-allocated its memory */ 3392 /* The caller supplied our heap and pre-allocated its memory */
3318 heap = scan->heap;
3319 heap->gt = &started_after; 3393 heap->gt = &started_after;
3320 } else { 3394 } else {
3321 /* We need to allocate our own heap memory */ 3395 /* We need to allocate our own heap memory */
@@ -3328,25 +3402,24 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3328 3402
3329 again: 3403 again:
3330 /* 3404 /*
3331 * Scan tasks in the cgroup, using the scanner's "test_task" callback 3405 * Scan tasks in the css, using the @test callback to determine
3332 * to determine which are of interest, and using the scanner's 3406 * which are of interest, and invoking @process callback on the
3333 * "process_task" callback to process any of them that need an update. 3407 * ones which need an update. Since we don't want to hold any
3334 * Since we don't want to hold any locks during the task updates, 3408 * locks during the task updates, gather tasks to be processed in a
3335 * gather tasks to be processed in a heap structure. 3409 * heap structure. The heap is sorted by descending task start
3336 * The heap is sorted by descending task start time. 3410 * time. If the statically-sized heap fills up, we overflow tasks
3337 * If the statically-sized heap fills up, we overflow tasks that 3411 * that started later, and in future iterations only consider tasks
3338 * started later, and in future iterations only consider tasks that 3412 * that started after the latest task in the previous pass. This
3339 * started after the latest task in the previous pass. This
3340 * guarantees forward progress and that we don't miss any tasks. 3413 * guarantees forward progress and that we don't miss any tasks.
3341 */ 3414 */
3342 heap->size = 0; 3415 heap->size = 0;
3343 cgroup_iter_start(scan->cg, &it); 3416 css_task_iter_start(css, &it);
3344 while ((p = cgroup_iter_next(scan->cg, &it))) { 3417 while ((p = css_task_iter_next(&it))) {
3345 /* 3418 /*
3346 * Only affect tasks that qualify per the caller's callback, 3419 * Only affect tasks that qualify per the caller's callback,
 3347 * if one was provided 3420 * if one was provided
3348 */ 3421 */
3349 if (scan->test_task && !scan->test_task(p, scan)) 3422 if (test && !test(p, data))
3350 continue; 3423 continue;
3351 /* 3424 /*
3352 * Only process tasks that started after the last task 3425 * Only process tasks that started after the last task
@@ -3374,7 +3447,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3374 * the heap and wasn't inserted 3447 * the heap and wasn't inserted
3375 */ 3448 */
3376 } 3449 }
3377 cgroup_iter_end(scan->cg, &it); 3450 css_task_iter_end(&it);
3378 3451
3379 if (heap->size) { 3452 if (heap->size) {
3380 for (i = 0; i < heap->size; i++) { 3453 for (i = 0; i < heap->size; i++) {
@@ -3384,7 +3457,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3384 latest_task = q; 3457 latest_task = q;
3385 } 3458 }
3386 /* Process the task per the caller's callback */ 3459 /* Process the task per the caller's callback */
3387 scan->process_task(q, scan); 3460 process(q, data);
3388 put_task_struct(q); 3461 put_task_struct(q);
3389 } 3462 }
3390 /* 3463 /*
@@ -3401,10 +3474,9 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3401 return 0; 3474 return 0;
3402} 3475}
3403 3476
3404static void cgroup_transfer_one_task(struct task_struct *task, 3477static void cgroup_transfer_one_task(struct task_struct *task, void *data)
3405 struct cgroup_scanner *scan)
3406{ 3478{
3407 struct cgroup *new_cgroup = scan->data; 3479 struct cgroup *new_cgroup = data;
3408 3480
3409 mutex_lock(&cgroup_mutex); 3481 mutex_lock(&cgroup_mutex);
3410 cgroup_attach_task(new_cgroup, task, false); 3482 cgroup_attach_task(new_cgroup, task, false);
@@ -3418,15 +3490,8 @@ static void cgroup_transfer_one_task(struct task_struct *task,
3418 */ 3490 */
3419int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) 3491int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3420{ 3492{
3421 struct cgroup_scanner scan; 3493 return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task,
3422 3494 to, NULL);
3423 scan.cg = from;
3424 scan.test_task = NULL; /* select all tasks in cgroup */
3425 scan.process_task = cgroup_transfer_one_task;
3426 scan.heap = NULL;
3427 scan.data = to;
3428
3429 return cgroup_scan_tasks(&scan);
3430} 3495}
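cgroup_transfer_tasks() above is the simplest user (no @test, no pre-allocated heap). A hypothetical scan with both callbacks makes the split concrete: @test runs under css_set_lock and must stay cheap, while @process runs unlocked on the tasks gathered in the heap.

	static bool test_is_kthread(struct task_struct *task, void *data)
	{
		return task->flags & PF_KTHREAD;	/* cheap, lock held */
	}

	static void count_one(struct task_struct *task, void *data)
	{
		(*(unsigned long *)data)++;		/* runs unlocked */
	}

	unsigned long nr_kthreads = 0;

	/* NULL heap: a temporary heap is allocated internally */
	ret = css_scan_tasks(css, test_is_kthread, count_one,
			     &nr_kthreads, NULL);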
3431 3496
3432/* 3497/*
@@ -3468,7 +3533,7 @@ struct cgroup_pidlist {
3468 /* pointer to the cgroup we belong to, for list removal purposes */ 3533 /* pointer to the cgroup we belong to, for list removal purposes */
3469 struct cgroup *owner; 3534 struct cgroup *owner;
3470 /* protects the other fields */ 3535 /* protects the other fields */
3471 struct rw_semaphore mutex; 3536 struct rw_semaphore rwsem;
3472}; 3537};
3473 3538
3474/* 3539/*
@@ -3541,7 +3606,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3541 struct pid_namespace *ns = task_active_pid_ns(current); 3606 struct pid_namespace *ns = task_active_pid_ns(current);
3542 3607
3543 /* 3608 /*
3544 * We can't drop the pidlist_mutex before taking the l->mutex in case 3609 * We can't drop the pidlist_mutex before taking the l->rwsem in case
3545 * the last ref-holder is trying to remove l from the list at the same 3610 * the last ref-holder is trying to remove l from the list at the same
3546 * time. Holding the pidlist_mutex precludes somebody taking whichever 3611 * time. Holding the pidlist_mutex precludes somebody taking whichever
3547 * list we find out from under us - compare release_pid_array(). 3612 * list we find out from under us - compare release_pid_array().
@@ -3550,7 +3615,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3550 list_for_each_entry(l, &cgrp->pidlists, links) { 3615 list_for_each_entry(l, &cgrp->pidlists, links) {
3551 if (l->key.type == type && l->key.ns == ns) { 3616 if (l->key.type == type && l->key.ns == ns) {
3552 /* make sure l doesn't vanish out from under us */ 3617 /* make sure l doesn't vanish out from under us */
3553 down_write(&l->mutex); 3618 down_write(&l->rwsem);
3554 mutex_unlock(&cgrp->pidlist_mutex); 3619 mutex_unlock(&cgrp->pidlist_mutex);
3555 return l; 3620 return l;
3556 } 3621 }
@@ -3561,8 +3626,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3561 mutex_unlock(&cgrp->pidlist_mutex); 3626 mutex_unlock(&cgrp->pidlist_mutex);
3562 return l; 3627 return l;
3563 } 3628 }
3564 init_rwsem(&l->mutex); 3629 init_rwsem(&l->rwsem);
3565 down_write(&l->mutex); 3630 down_write(&l->rwsem);
3566 l->key.type = type; 3631 l->key.type = type;
3567 l->key.ns = get_pid_ns(ns); 3632 l->key.ns = get_pid_ns(ns);
3568 l->owner = cgrp; 3633 l->owner = cgrp;
@@ -3580,7 +3645,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3580 pid_t *array; 3645 pid_t *array;
3581 int length; 3646 int length;
3582 int pid, n = 0; /* used for populating the array */ 3647 int pid, n = 0; /* used for populating the array */
3583 struct cgroup_iter it; 3648 struct css_task_iter it;
3584 struct task_struct *tsk; 3649 struct task_struct *tsk;
3585 struct cgroup_pidlist *l; 3650 struct cgroup_pidlist *l;
3586 3651
@@ -3595,8 +3660,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3595 if (!array) 3660 if (!array)
3596 return -ENOMEM; 3661 return -ENOMEM;
3597 /* now, populate the array */ 3662 /* now, populate the array */
3598 cgroup_iter_start(cgrp, &it); 3663 css_task_iter_start(&cgrp->dummy_css, &it);
3599 while ((tsk = cgroup_iter_next(cgrp, &it))) { 3664 while ((tsk = css_task_iter_next(&it))) {
3600 if (unlikely(n == length)) 3665 if (unlikely(n == length))
3601 break; 3666 break;
3602 /* get tgid or pid for procs or tasks file respectively */ 3667 /* get tgid or pid for procs or tasks file respectively */
@@ -3607,7 +3672,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3607 if (pid > 0) /* make sure to only use valid results */ 3672 if (pid > 0) /* make sure to only use valid results */
3608 array[n++] = pid; 3673 array[n++] = pid;
3609 } 3674 }
3610 cgroup_iter_end(cgrp, &it); 3675 css_task_iter_end(&it);
3611 length = n; 3676 length = n;
3612 /* now sort & (if procs) strip out duplicates */ 3677 /* now sort & (if procs) strip out duplicates */
3613 sort(array, length, sizeof(pid_t), cmppid, NULL); 3678 sort(array, length, sizeof(pid_t), cmppid, NULL);
@@ -3623,7 +3688,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3623 l->list = array; 3688 l->list = array;
3624 l->length = length; 3689 l->length = length;
3625 l->use_count++; 3690 l->use_count++;
3626 up_write(&l->mutex); 3691 up_write(&l->rwsem);
3627 *lp = l; 3692 *lp = l;
3628 return 0; 3693 return 0;
3629} 3694}
@@ -3641,7 +3706,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3641{ 3706{
3642 int ret = -EINVAL; 3707 int ret = -EINVAL;
3643 struct cgroup *cgrp; 3708 struct cgroup *cgrp;
3644 struct cgroup_iter it; 3709 struct css_task_iter it;
3645 struct task_struct *tsk; 3710 struct task_struct *tsk;
3646 3711
3647 /* 3712 /*
@@ -3655,8 +3720,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3655 ret = 0; 3720 ret = 0;
3656 cgrp = dentry->d_fsdata; 3721 cgrp = dentry->d_fsdata;
3657 3722
3658 cgroup_iter_start(cgrp, &it); 3723 css_task_iter_start(&cgrp->dummy_css, &it);
3659 while ((tsk = cgroup_iter_next(cgrp, &it))) { 3724 while ((tsk = css_task_iter_next(&it))) {
3660 switch (tsk->state) { 3725 switch (tsk->state) {
3661 case TASK_RUNNING: 3726 case TASK_RUNNING:
3662 stats->nr_running++; 3727 stats->nr_running++;
@@ -3676,7 +3741,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3676 break; 3741 break;
3677 } 3742 }
3678 } 3743 }
3679 cgroup_iter_end(cgrp, &it); 3744 css_task_iter_end(&it);
3680 3745
3681err: 3746err:
3682 return ret; 3747 return ret;
@@ -3701,7 +3766,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3701 int index = 0, pid = *pos; 3766 int index = 0, pid = *pos;
3702 int *iter; 3767 int *iter;
3703 3768
3704 down_read(&l->mutex); 3769 down_read(&l->rwsem);
3705 if (pid) { 3770 if (pid) {
3706 int end = l->length; 3771 int end = l->length;
3707 3772
@@ -3728,7 +3793,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3728static void cgroup_pidlist_stop(struct seq_file *s, void *v) 3793static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3729{ 3794{
3730 struct cgroup_pidlist *l = s->private; 3795 struct cgroup_pidlist *l = s->private;
3731 up_read(&l->mutex); 3796 up_read(&l->rwsem);
3732} 3797}
3733 3798
3734static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) 3799static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
@@ -3774,7 +3839,7 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l)
3774 * pidlist_mutex, we have to take pidlist_mutex first. 3839 * pidlist_mutex, we have to take pidlist_mutex first.
3775 */ 3840 */
3776 mutex_lock(&l->owner->pidlist_mutex); 3841 mutex_lock(&l->owner->pidlist_mutex);
3777 down_write(&l->mutex); 3842 down_write(&l->rwsem);
3778 BUG_ON(!l->use_count); 3843 BUG_ON(!l->use_count);
3779 if (!--l->use_count) { 3844 if (!--l->use_count) {
3780 /* we're the last user if refcount is 0; remove and free */ 3845 /* we're the last user if refcount is 0; remove and free */
@@ -3782,12 +3847,12 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l)
3782 mutex_unlock(&l->owner->pidlist_mutex); 3847 mutex_unlock(&l->owner->pidlist_mutex);
3783 pidlist_free(l->list); 3848 pidlist_free(l->list);
3784 put_pid_ns(l->key.ns); 3849 put_pid_ns(l->key.ns);
3785 up_write(&l->mutex); 3850 up_write(&l->rwsem);
3786 kfree(l); 3851 kfree(l);
3787 return; 3852 return;
3788 } 3853 }
3789 mutex_unlock(&l->owner->pidlist_mutex); 3854 mutex_unlock(&l->owner->pidlist_mutex);
3790 up_write(&l->mutex); 3855 up_write(&l->rwsem);
3791} 3856}
3792 3857
3793static int cgroup_pidlist_release(struct inode *inode, struct file *file) 3858static int cgroup_pidlist_release(struct inode *inode, struct file *file)
@@ -3851,21 +3916,20 @@ static int cgroup_procs_open(struct inode *unused, struct file *file)
3851 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); 3916 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
3852} 3917}
3853 3918
3854static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, 3919static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
3855 struct cftype *cft) 3920 struct cftype *cft)
3856{ 3921{
3857 return notify_on_release(cgrp); 3922 return notify_on_release(css->cgroup);
3858} 3923}
3859 3924
3860static int cgroup_write_notify_on_release(struct cgroup *cgrp, 3925static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
3861 struct cftype *cft, 3926 struct cftype *cft, u64 val)
3862 u64 val)
3863{ 3927{
3864 clear_bit(CGRP_RELEASABLE, &cgrp->flags); 3928 clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
3865 if (val) 3929 if (val)
3866 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3930 set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3867 else 3931 else
3868 clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3932 clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3869 return 0; 3933 return 0;
3870} 3934}
3871 3935
@@ -3895,18 +3959,18 @@ static void cgroup_event_remove(struct work_struct *work)
3895{ 3959{
3896 struct cgroup_event *event = container_of(work, struct cgroup_event, 3960 struct cgroup_event *event = container_of(work, struct cgroup_event,
3897 remove); 3961 remove);
3898 struct cgroup *cgrp = event->cgrp; 3962 struct cgroup_subsys_state *css = event->css;
3899 3963
3900 remove_wait_queue(event->wqh, &event->wait); 3964 remove_wait_queue(event->wqh, &event->wait);
3901 3965
3902 event->cft->unregister_event(cgrp, event->cft, event->eventfd); 3966 event->cft->unregister_event(css, event->cft, event->eventfd);
3903 3967
3904 /* Notify userspace the event is going away. */ 3968 /* Notify userspace the event is going away. */
3905 eventfd_signal(event->eventfd, 1); 3969 eventfd_signal(event->eventfd, 1);
3906 3970
3907 eventfd_ctx_put(event->eventfd); 3971 eventfd_ctx_put(event->eventfd);
3908 kfree(event); 3972 kfree(event);
3909 cgroup_dput(cgrp); 3973 css_put(css);
3910} 3974}
3911 3975
3912/* 3976/*
@@ -3919,7 +3983,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3919{ 3983{
3920 struct cgroup_event *event = container_of(wait, 3984 struct cgroup_event *event = container_of(wait,
3921 struct cgroup_event, wait); 3985 struct cgroup_event, wait);
3922 struct cgroup *cgrp = event->cgrp; 3986 struct cgroup *cgrp = event->css->cgroup;
3923 unsigned long flags = (unsigned long)key; 3987 unsigned long flags = (unsigned long)key;
3924 3988
3925 if (flags & POLLHUP) { 3989 if (flags & POLLHUP) {
@@ -3963,14 +4027,15 @@ static void cgroup_event_ptable_queue_proc(struct file *file,
3963 * Input must be in format '<event_fd> <control_fd> <args>'. 4027 * Input must be in format '<event_fd> <control_fd> <args>'.
3964 * Interpretation of args is defined by control file implementation. 4028 * Interpretation of args is defined by control file implementation.
3965 */ 4029 */
3966static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, 4030static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
3967 const char *buffer) 4031 struct cftype *cft, const char *buffer)
3968{ 4032{
3969 struct cgroup_event *event = NULL; 4033 struct cgroup *cgrp = dummy_css->cgroup;
3970 struct cgroup *cgrp_cfile; 4034 struct cgroup_event *event;
4035 struct cgroup_subsys_state *cfile_css;
3971 unsigned int efd, cfd; 4036 unsigned int efd, cfd;
3972 struct file *efile = NULL; 4037 struct file *efile;
3973 struct file *cfile = NULL; 4038 struct file *cfile;
3974 char *endp; 4039 char *endp;
3975 int ret; 4040 int ret;
3976 4041
@@ -3987,7 +4052,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3987 event = kzalloc(sizeof(*event), GFP_KERNEL); 4052 event = kzalloc(sizeof(*event), GFP_KERNEL);
3988 if (!event) 4053 if (!event)
3989 return -ENOMEM; 4054 return -ENOMEM;
3990 event->cgrp = cgrp; 4055
3991 INIT_LIST_HEAD(&event->list); 4056 INIT_LIST_HEAD(&event->list);
3992 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); 4057 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
3993 init_waitqueue_func_entry(&event->wait, cgroup_event_wake); 4058 init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
@@ -3996,62 +4061,68 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3996 efile = eventfd_fget(efd); 4061 efile = eventfd_fget(efd);
3997 if (IS_ERR(efile)) { 4062 if (IS_ERR(efile)) {
3998 ret = PTR_ERR(efile); 4063 ret = PTR_ERR(efile);
3999 goto fail; 4064 goto out_kfree;
4000 } 4065 }
4001 4066
4002 event->eventfd = eventfd_ctx_fileget(efile); 4067 event->eventfd = eventfd_ctx_fileget(efile);
4003 if (IS_ERR(event->eventfd)) { 4068 if (IS_ERR(event->eventfd)) {
4004 ret = PTR_ERR(event->eventfd); 4069 ret = PTR_ERR(event->eventfd);
4005 goto fail; 4070 goto out_put_efile;
4006 } 4071 }
4007 4072
4008 cfile = fget(cfd); 4073 cfile = fget(cfd);
4009 if (!cfile) { 4074 if (!cfile) {
4010 ret = -EBADF; 4075 ret = -EBADF;
4011 goto fail; 4076 goto out_put_eventfd;
4012 } 4077 }
4013 4078
 4014 /* the process needs read permission on control file */ 4079 /* the process needs read permission on control file */
4015 /* AV: shouldn't we check that it's been opened for read instead? */ 4080 /* AV: shouldn't we check that it's been opened for read instead? */
4016 ret = inode_permission(file_inode(cfile), MAY_READ); 4081 ret = inode_permission(file_inode(cfile), MAY_READ);
4017 if (ret < 0) 4082 if (ret < 0)
4018 goto fail; 4083 goto out_put_cfile;
4019 4084
4020 event->cft = __file_cft(cfile); 4085 event->cft = __file_cft(cfile);
4021 if (IS_ERR(event->cft)) { 4086 if (IS_ERR(event->cft)) {
4022 ret = PTR_ERR(event->cft); 4087 ret = PTR_ERR(event->cft);
4023 goto fail; 4088 goto out_put_cfile;
4089 }
4090
4091 if (!event->cft->ss) {
4092 ret = -EBADF;
4093 goto out_put_cfile;
4024 } 4094 }
4025 4095
4026 /* 4096 /*
4027 * The file to be monitored must be in the same cgroup as 4097 * Determine the css of @cfile, verify it belongs to the same
4028 * cgroup.event_control is. 4098 * cgroup as cgroup.event_control, and associate @event with it.
4099 * Remaining events are automatically removed on cgroup destruction
4100 * but the removal is asynchronous, so take an extra ref.
4029 */ 4101 */
4030 cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent); 4102 rcu_read_lock();
4031 if (cgrp_cfile != cgrp) { 4103
4032 ret = -EINVAL; 4104 ret = -EINVAL;
4033 goto fail; 4105 event->css = cgroup_css(cgrp, event->cft->ss);
4034 } 4106 cfile_css = css_from_dir(cfile->f_dentry->d_parent, event->cft->ss);
4107 if (event->css && event->css == cfile_css && css_tryget(event->css))
4108 ret = 0;
4109
4110 rcu_read_unlock();
4111 if (ret)
4112 goto out_put_cfile;
4035 4113
4036 if (!event->cft->register_event || !event->cft->unregister_event) { 4114 if (!event->cft->register_event || !event->cft->unregister_event) {
4037 ret = -EINVAL; 4115 ret = -EINVAL;
4038 goto fail; 4116 goto out_put_css;
4039 } 4117 }
4040 4118
4041 ret = event->cft->register_event(cgrp, event->cft, 4119 ret = event->cft->register_event(event->css, event->cft,
4042 event->eventfd, buffer); 4120 event->eventfd, buffer);
4043 if (ret) 4121 if (ret)
4044 goto fail; 4122 goto out_put_css;
4045 4123
4046 efile->f_op->poll(efile, &event->pt); 4124 efile->f_op->poll(efile, &event->pt);
4047 4125
4048 /*
4049 * Events should be removed after rmdir of cgroup directory, but before
4050 * destroying subsystem state objects. Let's take reference to cgroup
4051 * directory dentry to do that.
4052 */
4053 dget(cgrp->dentry);
4054
4055 spin_lock(&cgrp->event_list_lock); 4126 spin_lock(&cgrp->event_list_lock);
4056 list_add(&event->list, &cgrp->event_list); 4127 list_add(&event->list, &cgrp->event_list);
4057 spin_unlock(&cgrp->event_list_lock); 4128 spin_unlock(&cgrp->event_list_lock);
@@ -4061,35 +4132,33 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
4061 4132
4062 return 0; 4133 return 0;
4063 4134
4064fail: 4135out_put_css:
4065 if (cfile) 4136 css_put(event->css);
4066 fput(cfile); 4137out_put_cfile:
4067 4138 fput(cfile);
4068 if (event && event->eventfd && !IS_ERR(event->eventfd)) 4139out_put_eventfd:
4069 eventfd_ctx_put(event->eventfd); 4140 eventfd_ctx_put(event->eventfd);
4070 4141out_put_efile:
4071 if (!IS_ERR_OR_NULL(efile)) 4142 fput(efile);
4072 fput(efile); 4143out_kfree:
4073
4074 kfree(event); 4144 kfree(event);
4075 4145
4076 return ret; 4146 return ret;
4077} 4147}
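From userspace the '<event_fd> <control_fd> <args>' protocol is unchanged by this refactor. An illustrative memcg threshold registration (error handling omitted; needs the usual unistd/fcntl/eventfd headers; the mount path and threshold value are examples only):

	int efd = eventfd(0, 0);
	int cfd = open("/sys/fs/cgroup/memory/grp/memory.usage_in_bytes",
		       O_RDONLY);
	int ctl = open("/sys/fs/cgroup/memory/grp/cgroup.event_control",
		       O_WRONLY);
	char buf[64];
	uint64_t hits;

	snprintf(buf, sizeof(buf), "%d %d %llu", efd, cfd,
		 100ULL << 20);			/* notify at 100M usage */
	write(ctl, buf, strlen(buf));
	read(efd, &hits, sizeof(hits));		/* blocks until it fires */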
4078 4148
4079static u64 cgroup_clone_children_read(struct cgroup *cgrp, 4149static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
4080 struct cftype *cft) 4150 struct cftype *cft)
4081{ 4151{
4082 return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4152 return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4083} 4153}
4084 4154
4085static int cgroup_clone_children_write(struct cgroup *cgrp, 4155static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
4086 struct cftype *cft, 4156 struct cftype *cft, u64 val)
4087 u64 val)
4088{ 4157{
4089 if (val) 4158 if (val)
4090 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4159 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4091 else 4160 else
4092 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4161 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4093 return 0; 4162 return 0;
4094} 4163}
4095 4164
@@ -4148,36 +4217,34 @@ static struct cftype cgroup_base_files[] = {
4148}; 4217};
4149 4218
4150/** 4219/**
4151 * cgroup_populate_dir - selectively creation of files in a directory 4220 * cgroup_populate_dir - create subsys files in a cgroup directory
4152 * @cgrp: target cgroup 4221 * @cgrp: target cgroup
4153 * @base_files: true if the base files should be added
4154 * @subsys_mask: mask of the subsystem ids whose files should be added 4222 * @subsys_mask: mask of the subsystem ids whose files should be added
4223 *
4224 * On failure, no file is added.
4155 */ 4225 */
4156static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, 4226static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
4157 unsigned long subsys_mask)
4158{ 4227{
4159 int err;
4160 struct cgroup_subsys *ss; 4228 struct cgroup_subsys *ss;
4161 4229 int i, ret = 0;
4162 if (base_files) {
4163 err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true);
4164 if (err < 0)
4165 return err;
4166 }
4167 4230
4168 /* process cftsets of each subsystem */ 4231 /* process cftsets of each subsystem */
4169 for_each_root_subsys(cgrp->root, ss) { 4232 for_each_subsys(ss, i) {
4170 struct cftype_set *set; 4233 struct cftype_set *set;
4171 if (!test_bit(ss->subsys_id, &subsys_mask)) 4234
4235 if (!test_bit(i, &subsys_mask))
4172 continue; 4236 continue;
4173 4237
4174 list_for_each_entry(set, &ss->cftsets, node) 4238 list_for_each_entry(set, &ss->cftsets, node) {
4175 cgroup_addrm_files(cgrp, ss, set->cfts, true); 4239 ret = cgroup_addrm_files(cgrp, set->cfts, true);
4240 if (ret < 0)
4241 goto err;
4242 }
4176 } 4243 }
4177 4244
4178 /* This cgroup is ready now */ 4245 /* This cgroup is ready now */
4179 for_each_root_subsys(cgrp->root, ss) { 4246 for_each_root_subsys(cgrp->root, ss) {
4180 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4247 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
4181 struct css_id *id = rcu_dereference_protected(css->id, true); 4248 struct css_id *id = rcu_dereference_protected(css->id, true);
4182 4249
4183 /* 4250 /*
@@ -4190,14 +4257,57 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
4190 } 4257 }
4191 4258
4192 return 0; 4259 return 0;
4260err:
4261 cgroup_clear_dir(cgrp, subsys_mask);
4262 return ret;
4193} 4263}
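@subsys_mask is a bitmask of subsystem ids; a caller enabling only two controllers' files would do roughly the following (the *_subsys_id enum names are illustrative):

	unsigned long subsys_mask = (1UL << cpu_cgroup_subsys_id) |
				    (1UL << mem_cgroup_subsys_id);

	ret = cgroup_populate_dir(cgrp, subsys_mask);
	/* on failure the new error path has already cleared the dir */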
4194 4264
4195static void css_dput_fn(struct work_struct *work) 4265/*
4266 * css destruction is four-stage process.
4267 *
4268 * 1. Destruction starts. Killing of the percpu_ref is initiated.
4269 * Implemented in kill_css().
4270 *
4271 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
4272 * and thus css_tryget() is guaranteed to fail, the css can be offlined
4273 * by invoking offline_css(). After offlining, the base ref is put.
4274 * Implemented in css_killed_work_fn().
4275 *
4276 * 3. When the percpu_ref reaches zero, the only possible remaining
4277 * accessors are inside RCU read sections. css_release() schedules the
4278 * RCU callback.
4279 *
4280 * 4. After the grace period, the css can be freed. Implemented in
4281 * css_free_work_fn().
4282 *
 4283 * It is actually hairier because steps 2 and 4 both require process
 4284 * context and thus involve punting to css->destroy_work, adding two
 4285 * additional steps to the already complex sequence.
4286 */
4287static void css_free_work_fn(struct work_struct *work)
4196{ 4288{
4197 struct cgroup_subsys_state *css = 4289 struct cgroup_subsys_state *css =
4198 container_of(work, struct cgroup_subsys_state, dput_work); 4290 container_of(work, struct cgroup_subsys_state, destroy_work);
4291 struct cgroup *cgrp = css->cgroup;
4199 4292
4200 cgroup_dput(css->cgroup); 4293 if (css->parent)
4294 css_put(css->parent);
4295
4296 css->ss->css_free(css);
4297 cgroup_dput(cgrp);
4298}
4299
4300static void css_free_rcu_fn(struct rcu_head *rcu_head)
4301{
4302 struct cgroup_subsys_state *css =
4303 container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
4304
4305 /*
4306 * css holds an extra ref to @cgrp->dentry which is put on the last
4307 * css_put(). dput() requires process context which we don't have.
4308 */
4309 INIT_WORK(&css->destroy_work, css_free_work_fn);
4310 schedule_work(&css->destroy_work);
4201} 4311}
4202 4312
4203static void css_release(struct percpu_ref *ref) 4313static void css_release(struct percpu_ref *ref)
@@ -4205,49 +4315,47 @@ static void css_release(struct percpu_ref *ref)
4205 struct cgroup_subsys_state *css = 4315 struct cgroup_subsys_state *css =
4206 container_of(ref, struct cgroup_subsys_state, refcnt); 4316 container_of(ref, struct cgroup_subsys_state, refcnt);
4207 4317
4208 schedule_work(&css->dput_work); 4318 call_rcu(&css->rcu_head, css_free_rcu_fn);
4209} 4319}
4210 4320
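css_release() can fire from whatever context drops the last reference, so the free path above bounces twice: call_rcu() waits out any remaining RCU readers, and because RCU callbacks run in softirq context the callback punts once more to a workqueue, where dput() and ->css_free() may safely run. A self-contained sketch of that double bounce for a generic percpu-ref-counted object follows; struct obj and its helpers are illustrative, while percpu_ref_init(), call_rcu() and schedule_work() are the real interfaces used in the patch.

#include <linux/percpu-refcount.h>
#include <linux/rcupdate.h>
#include <linux/workqueue.h>
#include <linux/slab.h>

struct obj {
        struct percpu_ref refcnt;
        struct rcu_head rcu_head;
        struct work_struct destroy_work;
};

/* stage 3: process context at last; sleeping operations are fine here */
static void obj_free_work_fn(struct work_struct *work)
{
        struct obj *o = container_of(work, struct obj, destroy_work);

        kfree(o);
}

/* stage 2: grace period over, but RCU callbacks run in softirq context */
static void obj_free_rcu_fn(struct rcu_head *head)
{
        struct obj *o = container_of(head, struct obj, rcu_head);

        INIT_WORK(&o->destroy_work, obj_free_work_fn);
        schedule_work(&o->destroy_work);
}

/* stage 1: last reference dropped; wait out RCU readers before freeing */
static void obj_release(struct percpu_ref *ref)
{
        struct obj *o = container_of(ref, struct obj, refcnt);

        call_rcu(&o->rcu_head, obj_free_rcu_fn);
}

static struct obj *obj_alloc(void)
{
        struct obj *o = kzalloc(sizeof(*o), GFP_KERNEL);

        if (o && percpu_ref_init(&o->refcnt, obj_release)) {
                kfree(o);
                o = NULL;
        }
        return o;
}
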
4211static void init_cgroup_css(struct cgroup_subsys_state *css, 4321static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss,
4212 struct cgroup_subsys *ss, 4322 struct cgroup *cgrp)
4213 struct cgroup *cgrp)
4214{ 4323{
4215 css->cgroup = cgrp; 4324 css->cgroup = cgrp;
4325 css->ss = ss;
4216 css->flags = 0; 4326 css->flags = 0;
4217 css->id = NULL; 4327 css->id = NULL;
4218 if (cgrp == cgroup_dummy_top) 4328
4329 if (cgrp->parent)
4330 css->parent = cgroup_css(cgrp->parent, ss);
4331 else
4219 css->flags |= CSS_ROOT; 4332 css->flags |= CSS_ROOT;
4220 BUG_ON(cgrp->subsys[ss->subsys_id]);
4221 cgrp->subsys[ss->subsys_id] = css;
4222 4333
4223 /* 4334 BUG_ON(cgroup_css(cgrp, ss));
4224 * css holds an extra ref to @cgrp->dentry which is put on the last
4225 * css_put(). dput() requires process context, which css_put() may
4226 * be called without. @css->dput_work will be used to invoke
4227 * dput() asynchronously from css_put().
4228 */
4229 INIT_WORK(&css->dput_work, css_dput_fn);
4230} 4335}
4231 4336
4232/* invoke ->post_create() on a new CSS and mark it online if successful */ 4337/* invoke ->css_online() on a new CSS and mark it online if successful */
4233static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) 4338static int online_css(struct cgroup_subsys_state *css)
4234{ 4339{
4340 struct cgroup_subsys *ss = css->ss;
4235 int ret = 0; 4341 int ret = 0;
4236 4342
4237 lockdep_assert_held(&cgroup_mutex); 4343 lockdep_assert_held(&cgroup_mutex);
4238 4344
4239 if (ss->css_online) 4345 if (ss->css_online)
4240 ret = ss->css_online(cgrp); 4346 ret = ss->css_online(css);
4241 if (!ret) 4347 if (!ret) {
4242 cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; 4348 css->flags |= CSS_ONLINE;
4349 css->cgroup->nr_css++;
4350 rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css);
4351 }
4243 return ret; 4352 return ret;
4244} 4353}
4245 4354
4246/* if the CSS is online, invoke ->pre_destory() on it and mark it offline */ 4355/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
4247static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) 4356static void offline_css(struct cgroup_subsys_state *css)
4248 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4249{ 4357{
4250 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4358 struct cgroup_subsys *ss = css->ss;
4251 4359
4252 lockdep_assert_held(&cgroup_mutex); 4360 lockdep_assert_held(&cgroup_mutex);
4253 4361
@@ -4255,9 +4363,11 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4255 return; 4363 return;
4256 4364
4257 if (ss->css_offline) 4365 if (ss->css_offline)
4258 ss->css_offline(cgrp); 4366 ss->css_offline(css);
4259 4367
4260 cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; 4368 css->flags &= ~CSS_ONLINE;
4369 css->cgroup->nr_css--;
4370 RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css);
4261} 4371}
4262 4372
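Note the asymmetry above: online_css() publishes with rcu_assign_pointer(), whose barrier orders all of init_css()'s stores before the pointer becomes visible, while offline_css() may use RCU_INIT_POINTER() because it stores back a value readers could already have seen, so no ordering is required. A hedged sketch of the general publish/unpublish pattern; cur_cfg and the helpers are illustrative, not part of the patch.

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct cfg { int val; };

static struct cfg __rcu *cur_cfg;

/* initialize fully first; rcu_assign_pointer() orders the stores */
static int cfg_publish(int val)
{
        struct cfg *c = kmalloc(sizeof(*c), GFP_KERNEL);

        if (!c)
                return -ENOMEM;
        c->val = val;
        rcu_assign_pointer(cur_cfg, c);
        return 0;
}

/* storing NULL (or an already-visible value) needs no barrier */
static struct cfg *cfg_unpublish(void)
{
        struct cfg *c = rcu_dereference_protected(cur_cfg, 1);

        RCU_INIT_POINTER(cur_cfg, NULL);
        /* caller must wait a grace period (synchronize_rcu()) before kfree */
        return c;
}
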
4263/* 4373/*
@@ -4271,6 +4381,7 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4271static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 4381static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4272 umode_t mode) 4382 umode_t mode)
4273{ 4383{
4384 struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { };
4274 struct cgroup *cgrp; 4385 struct cgroup *cgrp;
4275 struct cgroup_name *name; 4386 struct cgroup_name *name;
4276 struct cgroupfs_root *root = parent->root; 4387 struct cgroupfs_root *root = parent->root;
@@ -4288,7 +4399,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4288 goto err_free_cgrp; 4399 goto err_free_cgrp;
4289 rcu_assign_pointer(cgrp->name, name); 4400 rcu_assign_pointer(cgrp->name, name);
4290 4401
4291 cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); 4402 /*
4403 * Temporarily set the pointer to NULL, so idr_find() won't return
4404 * a half-baked cgroup.
4405 */
4406 cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
4292 if (cgrp->id < 0) 4407 if (cgrp->id < 0)
4293 goto err_free_name; 4408 goto err_free_name;
4294 4409
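Allocating the id against a NULL pointer and installing the real pointer only after setup has fully succeeded (the matching idr_replace() call appears further down in this function) guarantees a concurrent idr_find() can never return a half-built cgroup. A sketch of the reserve-then-publish idiom, assuming creators are serialized externally; struct obj and obj_create() are illustrative.

#include <linux/idr.h>
#include <linux/slab.h>

static DEFINE_IDR(obj_idr);

struct obj { int id; };

static struct obj *obj_create(void)
{
        struct obj *o = kzalloc(sizeof(*o), GFP_KERNEL);

        if (!o)
                return NULL;

        /* reserve an id but map it to NULL so lookups see nothing yet */
        o->id = idr_alloc(&obj_idr, NULL, 1, 0, GFP_KERNEL);
        if (o->id < 0) {
                kfree(o);
                return NULL;
        }

        /* ... long, failure-prone initialization ... */

        /* fully constructed; make it visible to idr_find() */
        idr_replace(&obj_idr, o, o->id);
        return o;
}
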
@@ -4317,6 +4432,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4317 cgrp->dentry = dentry; 4432 cgrp->dentry = dentry;
4318 4433
4319 cgrp->parent = parent; 4434 cgrp->parent = parent;
4435 cgrp->dummy_css.parent = &parent->dummy_css;
4320 cgrp->root = parent->root; 4436 cgrp->root = parent->root;
4321 4437
4322 if (notify_on_release(parent)) 4438 if (notify_on_release(parent))
@@ -4328,22 +4444,21 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4328 for_each_root_subsys(root, ss) { 4444 for_each_root_subsys(root, ss) {
4329 struct cgroup_subsys_state *css; 4445 struct cgroup_subsys_state *css;
4330 4446
4331 css = ss->css_alloc(cgrp); 4447 css = ss->css_alloc(cgroup_css(parent, ss));
4332 if (IS_ERR(css)) { 4448 if (IS_ERR(css)) {
4333 err = PTR_ERR(css); 4449 err = PTR_ERR(css);
4334 goto err_free_all; 4450 goto err_free_all;
4335 } 4451 }
4452 css_ar[ss->subsys_id] = css;
4336 4453
4337 err = percpu_ref_init(&css->refcnt, css_release); 4454 err = percpu_ref_init(&css->refcnt, css_release);
4338 if (err) { 4455 if (err)
4339 ss->css_free(cgrp);
4340 goto err_free_all; 4456 goto err_free_all;
4341 }
4342 4457
4343 init_cgroup_css(css, ss, cgrp); 4458 init_css(css, ss, cgrp);
4344 4459
4345 if (ss->use_id) { 4460 if (ss->use_id) {
4346 err = alloc_css_id(ss, parent, cgrp); 4461 err = alloc_css_id(css);
4347 if (err) 4462 if (err)
4348 goto err_free_all; 4463 goto err_free_all;
4349 } 4464 }
@@ -4365,16 +4480,22 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4365 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 4480 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4366 root->number_of_cgroups++; 4481 root->number_of_cgroups++;
4367 4482
4368 /* each css holds a ref to the cgroup's dentry */ 4483 /* each css holds a ref to the cgroup's dentry and the parent css */
4369 for_each_root_subsys(root, ss) 4484 for_each_root_subsys(root, ss) {
4485 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4486
4370 dget(dentry); 4487 dget(dentry);
4488 css_get(css->parent);
4489 }
4371 4490
4372 /* hold a ref to the parent's dentry */ 4491 /* hold a ref to the parent's dentry */
4373 dget(parent->dentry); 4492 dget(parent->dentry);
4374 4493
4375 /* creation succeeded, notify subsystems */ 4494 /* creation succeeded, notify subsystems */
4376 for_each_root_subsys(root, ss) { 4495 for_each_root_subsys(root, ss) {
4377 err = online_css(ss, cgrp); 4496 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4497
4498 err = online_css(css);
4378 if (err) 4499 if (err)
4379 goto err_destroy; 4500 goto err_destroy;
4380 4501
@@ -4388,7 +4509,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4388 } 4509 }
4389 } 4510 }
4390 4511
4391 err = cgroup_populate_dir(cgrp, true, root->subsys_mask); 4512 idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
4513
4514 err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
4515 if (err)
4516 goto err_destroy;
4517
4518 err = cgroup_populate_dir(cgrp, root->subsys_mask);
4392 if (err) 4519 if (err)
4393 goto err_destroy; 4520 goto err_destroy;
4394 4521
@@ -4399,18 +4526,18 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4399 4526
4400err_free_all: 4527err_free_all:
4401 for_each_root_subsys(root, ss) { 4528 for_each_root_subsys(root, ss) {
4402 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4529 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4403 4530
4404 if (css) { 4531 if (css) {
4405 percpu_ref_cancel_init(&css->refcnt); 4532 percpu_ref_cancel_init(&css->refcnt);
4406 ss->css_free(cgrp); 4533 ss->css_free(css);
4407 } 4534 }
4408 } 4535 }
4409 mutex_unlock(&cgroup_mutex); 4536 mutex_unlock(&cgroup_mutex);
4410 /* Release the reference count that we took on the superblock */ 4537 /* Release the reference count that we took on the superblock */
4411 deactivate_super(sb); 4538 deactivate_super(sb);
4412err_free_id: 4539err_free_id:
4413 ida_simple_remove(&root->cgroup_ida, cgrp->id); 4540 idr_remove(&root->cgroup_idr, cgrp->id);
4414err_free_name: 4541err_free_name:
4415 kfree(rcu_dereference_raw(cgrp->name)); 4542 kfree(rcu_dereference_raw(cgrp->name));
4416err_free_cgrp: 4543err_free_cgrp:
@@ -4432,22 +4559,84 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
4432 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 4559 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
4433} 4560}
4434 4561
4435static void cgroup_css_killed(struct cgroup *cgrp) 4562/*
4563 * This is called when the refcnt of a css is confirmed to be killed.
4564 * css_tryget() is now guaranteed to fail.
4565 */
4566static void css_killed_work_fn(struct work_struct *work)
4436{ 4567{
4437 if (!atomic_dec_and_test(&cgrp->css_kill_cnt)) 4568 struct cgroup_subsys_state *css =
4438 return; 4569 container_of(work, struct cgroup_subsys_state, destroy_work);
4570 struct cgroup *cgrp = css->cgroup;
4439 4571
4440 /* percpu ref's of all css's are killed, kick off the next step */ 4572 mutex_lock(&cgroup_mutex);
4441 INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn); 4573
4442 schedule_work(&cgrp->destroy_work); 4574 /*
4575 * css_tryget() is guaranteed to fail now. Tell subsystems to
4576 * initiate destruction.
4577 */
4578 offline_css(css);
4579
4580 /*
4581 * If @cgrp is marked dead, it's waiting for refs of all css's to
4582 * be disabled before proceeding to the second phase of cgroup
4583 * destruction. If we are the last one, kick it off.
4584 */
4585 if (!cgrp->nr_css && cgroup_is_dead(cgrp))
4586 cgroup_destroy_css_killed(cgrp);
4587
4588 mutex_unlock(&cgroup_mutex);
4589
4590 /*
4591 * Put the css refs from kill_css(). Each css holds an extra
4592 * reference to the cgroup's dentry and cgroup removal proceeds
4593 * regardless of css refs. On the last put of each css, whenever
4594 * that may be, the extra dentry ref is put so that dentry
4595 * destruction happens only after all css's are released.
4596 */
4597 css_put(css);
4443} 4598}
4444 4599
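The nr_css test above is a "last one out" gate: every offlining css decrements the counter under cgroup_mutex, and whichever work item zeroes it on a cgroup already marked dead kicks off the second destruction stage. A minimal sketch of the gate, assuming a mutex-serialized counter; all names are illustrative.

#include <linux/mutex.h>

static DEFINE_MUTEX(state_mutex);

struct parent {
        int nr_children;        /* protected by state_mutex */
        bool dead;              /* teardown has been requested */
};

/* illustrative second-stage teardown, cf. cgroup_destroy_css_killed() */
static void second_stage(struct parent *p)
{
        /* unlink @p from global structures, drop remaining refs, ... */
}

/* called once per child as it finishes dying */
static void child_gone(struct parent *p)
{
        mutex_lock(&state_mutex);
        p->nr_children--;
        /* the last child of a dying parent triggers the next stage */
        if (!p->nr_children && p->dead)
                second_stage(p);
        mutex_unlock(&state_mutex);
}
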
4445static void css_ref_killed_fn(struct percpu_ref *ref) 4600/* css kill confirmation processing requires process context, bounce */
4601static void css_killed_ref_fn(struct percpu_ref *ref)
4446{ 4602{
4447 struct cgroup_subsys_state *css = 4603 struct cgroup_subsys_state *css =
4448 container_of(ref, struct cgroup_subsys_state, refcnt); 4604 container_of(ref, struct cgroup_subsys_state, refcnt);
4449 4605
4450 cgroup_css_killed(css->cgroup); 4606 INIT_WORK(&css->destroy_work, css_killed_work_fn);
4607 schedule_work(&css->destroy_work);
4608}
4609
4610/**
4611 * kill_css - destroy a css
4612 * @css: css to destroy
4613 *
4614 * This function initiates destruction of @css by removing cgroup interface
4615 * files and putting its base reference. ->css_offline() will be invoked
4616 * asynchronously once css_tryget() is guaranteed to fail, and when the
4617 * reference count reaches zero, @css will be released.
4618 */
4619static void kill_css(struct cgroup_subsys_state *css)
4620{
4621 cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id);
4622
4623 /*
4624 * Killing would put the base ref, but we need to keep it alive
4625 * until after ->css_offline().
4626 */
4627 css_get(css);
4628
4629 /*
4630 * cgroup core guarantees that, by the time ->css_offline() is
4631 * invoked, no new css reference will be given out via
4632 * css_tryget(). We can't simply call percpu_ref_kill() and
4633 * proceed to offlining css's because percpu_ref_kill() doesn't
4634 * guarantee that the ref is seen as killed on all CPUs on return.
4635 *
4636 * Use percpu_ref_kill_and_confirm() to get notifications as each
4637 * css is confirmed to be seen as killed on all CPUs.
4638 */
4639 percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
4451} 4640}
4452 4641
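The subtlety kill_css() documents is that a plain percpu_ref_kill() returns before remote CPUs are guaranteed to observe the switch out of percpu mode, so a racing css_tryget() could still briefly succeed. percpu_ref_kill_and_confirm() instead fires a callback once the kill is visible everywhere; that callback runs from an RCU callback, which is why css_killed_ref_fn() above immediately bounces to a work item. A hedged sketch of the handshake; struct obj and its functions are illustrative, and the percpu_ref_init() wiring is as in the earlier sketch.

#include <linux/percpu-refcount.h>

struct obj { struct percpu_ref refcnt; };

/*
 * Runs (from an RCU callback) once the kill is visible on all CPUs,
 * i.e. once percpu_ref_tryget() is guaranteed to fail; bounce to a
 * workqueue here if the rest of teardown needs process context.
 */
static void obj_killed_confirm(struct percpu_ref *ref)
{
}

static void obj_kill(struct obj *o)
{
        /*
         * percpu_ref_kill() alone would return before remote CPUs see
         * the mode switch; the _and_confirm variant tells us when they
         * all have.
         */
        percpu_ref_kill_and_confirm(&o->refcnt, obj_killed_confirm);
}
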
4453/** 4642/**
@@ -4513,41 +4702,19 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4513 return -EBUSY; 4702 return -EBUSY;
4514 4703
4515 /* 4704 /*
4516 * Block new css_tryget() by killing css refcnts. cgroup core 4705 * Initiate massacre of all css's. cgroup_destroy_css_killed()
4517 * guarantees that, by the time ->css_offline() is invoked, no new 4706 * will be invoked to perform the rest of destruction once the
4518 * css reference will be given out via css_tryget(). We can't 4707 * percpu refs of all css's are confirmed to be killed.
4519 * simply call percpu_ref_kill() and proceed to offlining css's
4520 * because percpu_ref_kill() doesn't guarantee that the ref is seen
4521 * as killed on all CPUs on return.
4522 *
4523 * Use percpu_ref_kill_and_confirm() to get notifications as each
4524 * css is confirmed to be seen as killed on all CPUs. The
4525 * notification callback keeps track of the number of css's to be
4526 * killed and schedules cgroup_offline_fn() to perform the rest of
4527 * destruction once the percpu refs of all css's are confirmed to
4528 * be killed.
4529 */ 4708 */
4530 atomic_set(&cgrp->css_kill_cnt, 1); 4709 for_each_root_subsys(cgrp->root, ss)
4531 for_each_root_subsys(cgrp->root, ss) { 4710 kill_css(cgroup_css(cgrp, ss));
4532 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4533
4534 /*
4535 * Killing would put the base ref, but we need to keep it
4536 * alive until after ->css_offline.
4537 */
4538 percpu_ref_get(&css->refcnt);
4539
4540 atomic_inc(&cgrp->css_kill_cnt);
4541 percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn);
4542 }
4543 cgroup_css_killed(cgrp);
4544 4711
4545 /* 4712 /*
4546 * Mark @cgrp dead. This prevents further task migration and child 4713 * Mark @cgrp dead. This prevents further task migration and child
4547 * creation by disabling cgroup_lock_live_group(). Note that 4714 * creation by disabling cgroup_lock_live_group(). Note that
4548 * CGRP_DEAD assertion is depended upon by cgroup_next_sibling() to 4715 * CGRP_DEAD assertion is depended upon by css_next_child() to
4549 * resume iteration after dropping RCU read lock. See 4716 * resume iteration after dropping RCU read lock. See
4550 * cgroup_next_sibling() for details. 4717 * css_next_child() for details.
4551 */ 4718 */
4552 set_bit(CGRP_DEAD, &cgrp->flags); 4719 set_bit(CGRP_DEAD, &cgrp->flags);
4553 4720
@@ -4558,9 +4725,20 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4558 raw_spin_unlock(&release_list_lock); 4725 raw_spin_unlock(&release_list_lock);
4559 4726
4560 /* 4727 /*
4561 * Remove @cgrp directory. The removal puts the base ref but we 4728 * If @cgrp has css's attached, the second stage of cgroup
4562 * aren't quite done with @cgrp yet, so hold onto it. 4729 * destruction is kicked off from css_killed_work_fn() after the
4730 * refs of all attached css's are killed. If @cgrp doesn't have
4731 * any css, we kick it off here.
4732 */
4733 if (!cgrp->nr_css)
4734 cgroup_destroy_css_killed(cgrp);
4735
4736 /*
4737 * Clear the base files and remove @cgrp directory. The removal
4738 * puts the base ref but we aren't quite done with @cgrp yet, so
4739 * hold onto it.
4563 */ 4740 */
4741 cgroup_addrm_files(cgrp, cgroup_base_files, false);
4564 dget(d); 4742 dget(d);
4565 cgroup_d_remove_dir(d); 4743 cgroup_d_remove_dir(d);
4566 4744
@@ -4580,50 +4758,36 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4580}; 4758};
4581 4759
4582/** 4760/**
4583 * cgroup_offline_fn - the second step of cgroup destruction 4761 * cgroup_destroy_css_killed - the second step of cgroup destruction
4584 * @work: cgroup->destroy_free_work 4762 * @cgrp: cgroup being destroyed
4585 * 4763 *
4586 * This function is invoked from a work item for a cgroup which is being 4764 * This function is invoked from a work item for a cgroup which is being
4587 * destroyed after the percpu refcnts of all css's are guaranteed to be 4765 * destroyed after all css's are offlined and performs the rest of
4588 * seen as killed on all CPUs, and performs the rest of destruction. This 4766 * destruction. This is the second step of destruction described in the
4589 * is the second step of destruction described in the comment above 4767 * comment above cgroup_destroy_locked().
4590 * cgroup_destroy_locked().
4591 */ 4768 */
4592static void cgroup_offline_fn(struct work_struct *work) 4769static void cgroup_destroy_css_killed(struct cgroup *cgrp)
4593{ 4770{
4594 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
4595 struct cgroup *parent = cgrp->parent; 4771 struct cgroup *parent = cgrp->parent;
4596 struct dentry *d = cgrp->dentry; 4772 struct dentry *d = cgrp->dentry;
4597 struct cgroup_subsys *ss;
4598 4773
4599 mutex_lock(&cgroup_mutex); 4774 lockdep_assert_held(&cgroup_mutex);
4600 4775
4601 /* 4776 /* delete this cgroup from parent->children */
4602 * css_tryget() is guaranteed to fail now. Tell subsystems to 4777 list_del_rcu(&cgrp->sibling);
4603 * initate destruction.
4604 */
4605 for_each_root_subsys(cgrp->root, ss)
4606 offline_css(ss, cgrp);
4607 4778
4608 /* 4779 /*
4609 * Put the css refs from cgroup_destroy_locked(). Each css holds 4780 * We should remove the cgroup object from idr before its grace
4610 * an extra reference to the cgroup's dentry and cgroup removal 4781 * period starts, so we won't be looking up a cgroup while the
4611 * proceeds regardless of css refs. On the last put of each css, 4782 * cgroup is being freed.
4612 * whenever that may be, the extra dentry ref is put so that dentry
4613 * destruction happens only after all css's are released.
4614 */ 4783 */
4615 for_each_root_subsys(cgrp->root, ss) 4784 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
4616 css_put(cgrp->subsys[ss->subsys_id]); 4785 cgrp->id = -1;
4617
4618 /* delete this cgroup from parent->children */
4619 list_del_rcu(&cgrp->sibling);
4620 4786
4621 dput(d); 4787 dput(d);
4622 4788
4623 set_bit(CGRP_RELEASABLE, &parent->flags); 4789 set_bit(CGRP_RELEASABLE, &parent->flags);
4624 check_for_release(parent); 4790 check_for_release(parent);
4625
4626 mutex_unlock(&cgroup_mutex);
4627} 4791}
4628 4792
4629static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) 4793static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
@@ -4646,6 +4810,11 @@ static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
4646 * deregistration. 4810 * deregistration.
4647 */ 4811 */
4648 if (ss->base_cftypes) { 4812 if (ss->base_cftypes) {
4813 struct cftype *cft;
4814
4815 for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++)
4816 cft->ss = ss;
4817
4649 ss->base_cftset.cfts = ss->base_cftypes; 4818 ss->base_cftset.cfts = ss->base_cftypes;
4650 list_add_tail(&ss->base_cftset.node, &ss->cftsets); 4819 list_add_tail(&ss->base_cftset.node, &ss->cftsets);
4651 } 4820 }
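The new loop works because base_cftypes arrays end with a zero-initialized sentinel entry, so an empty name terminates the walk. A small illustrative sketch of the idiom; struct ftype and its fields stand in for struct cftype and the cft->ss back-pointer.

struct ftype {
        char name[64];
        void *owner;
};

/* the array is terminated by an entry whose name is empty */
static struct ftype base_ftypes[] = {
        { .name = "tasks" },
        { .name = "notify_on_release" },
        { }     /* sentinel */
};

static void link_ftypes(struct ftype *fts, void *owner)
{
        struct ftype *ft;

        for (ft = fts; ft->name[0] != '\0'; ft++)
                ft->owner = owner;      /* back-link, as cft->ss = ss above */
}
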
@@ -4665,10 +4834,10 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4665 /* Create the top cgroup state for this subsystem */ 4834 /* Create the top cgroup state for this subsystem */
4666 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); 4835 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
4667 ss->root = &cgroup_dummy_root; 4836 ss->root = &cgroup_dummy_root;
4668 css = ss->css_alloc(cgroup_dummy_top); 4837 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
4669 /* We don't handle early failures gracefully */ 4838 /* We don't handle early failures gracefully */
4670 BUG_ON(IS_ERR(css)); 4839 BUG_ON(IS_ERR(css));
4671 init_cgroup_css(css, ss, cgroup_dummy_top); 4840 init_css(css, ss, cgroup_dummy_top);
4672 4841
4673 /* Update the init_css_set to contain a subsys 4842 /* Update the init_css_set to contain a subsys
4674 * pointer to this state - since the subsystem is 4843 * pointer to this state - since the subsystem is
@@ -4683,7 +4852,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4683 * need to invoke fork callbacks here. */ 4852 * need to invoke fork callbacks here. */
4684 BUG_ON(!list_empty(&init_task.tasks)); 4853 BUG_ON(!list_empty(&init_task.tasks));
4685 4854
4686 BUG_ON(online_css(ss, cgroup_dummy_top)); 4855 BUG_ON(online_css(css));
4687 4856
4688 mutex_unlock(&cgroup_mutex); 4857 mutex_unlock(&cgroup_mutex);
4689 4858
@@ -4744,7 +4913,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4744 * struct, so this can happen first (i.e. before the dummy root 4913 * struct, so this can happen first (i.e. before the dummy root
4745 * attachment). 4914 * attachment).
4746 */ 4915 */
4747 css = ss->css_alloc(cgroup_dummy_top); 4916 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
4748 if (IS_ERR(css)) { 4917 if (IS_ERR(css)) {
4749 /* failure case - need to deassign the cgroup_subsys[] slot. */ 4918 /* failure case - need to deassign the cgroup_subsys[] slot. */
4750 cgroup_subsys[ss->subsys_id] = NULL; 4919 cgroup_subsys[ss->subsys_id] = NULL;
@@ -4756,8 +4925,8 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4756 ss->root = &cgroup_dummy_root; 4925 ss->root = &cgroup_dummy_root;
4757 4926
4758 /* our new subsystem will be attached to the dummy hierarchy. */ 4927 /* our new subsystem will be attached to the dummy hierarchy. */
4759 init_cgroup_css(css, ss, cgroup_dummy_top); 4928 init_css(css, ss, cgroup_dummy_top);
4760 /* init_idr must be after init_cgroup_css because it sets css->id. */ 4929 /* init_idr must be after init_css() because it sets css->id. */
4761 if (ss->use_id) { 4930 if (ss->use_id) {
4762 ret = cgroup_init_idr(ss, css); 4931 ret = cgroup_init_idr(ss, css);
4763 if (ret) 4932 if (ret)
@@ -4787,7 +4956,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4787 } 4956 }
4788 write_unlock(&css_set_lock); 4957 write_unlock(&css_set_lock);
4789 4958
4790 ret = online_css(ss, cgroup_dummy_top); 4959 ret = online_css(css);
4791 if (ret) 4960 if (ret)
4792 goto err_unload; 4961 goto err_unload;
4793 4962
@@ -4819,14 +4988,14 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4819 4988
4820 /* 4989 /*
4821 * we shouldn't be called if the subsystem is in use, and the use of 4990 * we shouldn't be called if the subsystem is in use, and the use of
4822 * try_module_get in parse_cgroupfs_options should ensure that it 4991 * try_module_get() in rebind_subsystems() should ensure that it
4823 * doesn't start being used while we're killing it off. 4992 * doesn't start being used while we're killing it off.
4824 */ 4993 */
4825 BUG_ON(ss->root != &cgroup_dummy_root); 4994 BUG_ON(ss->root != &cgroup_dummy_root);
4826 4995
4827 mutex_lock(&cgroup_mutex); 4996 mutex_lock(&cgroup_mutex);
4828 4997
4829 offline_css(ss, cgroup_dummy_top); 4998 offline_css(cgroup_css(cgroup_dummy_top, ss));
4830 4999
4831 if (ss->use_id) 5000 if (ss->use_id)
4832 idr_destroy(&ss->idr); 5001 idr_destroy(&ss->idr);
@@ -4860,8 +5029,8 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4860 * the cgrp->subsys pointer to find their state. note that this 5029 * the cgrp->subsys pointer to find their state. note that this
4861 * also takes care of freeing the css_id. 5030 * also takes care of freeing the css_id.
4862 */ 5031 */
4863 ss->css_free(cgroup_dummy_top); 5032 ss->css_free(cgroup_css(cgroup_dummy_top, ss));
4864 cgroup_dummy_top->subsys[ss->subsys_id] = NULL; 5033 RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
4865 5034
4866 mutex_unlock(&cgroup_mutex); 5035 mutex_unlock(&cgroup_mutex);
4867} 5036}
@@ -4943,6 +5112,10 @@ int __init cgroup_init(void)
4943 5112
4944 BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1)); 5113 BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1));
4945 5114
5115 err = idr_alloc(&cgroup_dummy_root.cgroup_idr, cgroup_dummy_top,
5116 0, 1, GFP_KERNEL);
5117 BUG_ON(err < 0);
5118
4946 mutex_unlock(&cgroup_root_mutex); 5119 mutex_unlock(&cgroup_root_mutex);
4947 mutex_unlock(&cgroup_mutex); 5120 mutex_unlock(&cgroup_mutex);
4948 5121
@@ -5099,7 +5272,7 @@ void cgroup_fork(struct task_struct *child)
5099 * Adds the task to the list running through its css_set if necessary and 5272 * Adds the task to the list running through its css_set if necessary and
5100 * call the subsystem fork() callbacks. Has to be after the task is 5273 * call the subsystem fork() callbacks. Has to be after the task is
5101 * visible on the task list in case we race with the first call to 5274 * visible on the task list in case we race with the first call to
5102 * cgroup_iter_start() - to guarantee that the new task ends up on its 5275 * cgroup_task_iter_start() - to guarantee that the new task ends up on its
5103 * list. 5276 * list.
5104 */ 5277 */
5105void cgroup_post_fork(struct task_struct *child) 5278void cgroup_post_fork(struct task_struct *child)
@@ -5212,10 +5385,10 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
5212 */ 5385 */
5213 for_each_builtin_subsys(ss, i) { 5386 for_each_builtin_subsys(ss, i) {
5214 if (ss->exit) { 5387 if (ss->exit) {
5215 struct cgroup *old_cgrp = cset->subsys[i]->cgroup; 5388 struct cgroup_subsys_state *old_css = cset->subsys[i];
5216 struct cgroup *cgrp = task_cgroup(tsk, i); 5389 struct cgroup_subsys_state *css = task_css(tsk, i);
5217 5390
5218 ss->exit(cgrp, old_cgrp, tsk); 5391 ss->exit(css, old_css, tsk);
5219 } 5392 }
5220 } 5393 }
5221 } 5394 }
@@ -5474,20 +5647,16 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
5474 return 0; 5647 return 0;
5475} 5648}
5476 5649
5477static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, 5650static int alloc_css_id(struct cgroup_subsys_state *child_css)
5478 struct cgroup *child)
5479{ 5651{
5480 int subsys_id, i, depth = 0; 5652 struct cgroup_subsys_state *parent_css = css_parent(child_css);
5481 struct cgroup_subsys_state *parent_css, *child_css;
5482 struct css_id *child_id, *parent_id; 5653 struct css_id *child_id, *parent_id;
5654 int i, depth;
5483 5655
5484 subsys_id = ss->subsys_id;
5485 parent_css = parent->subsys[subsys_id];
5486 child_css = child->subsys[subsys_id];
5487 parent_id = rcu_dereference_protected(parent_css->id, true); 5656 parent_id = rcu_dereference_protected(parent_css->id, true);
5488 depth = parent_id->depth + 1; 5657 depth = parent_id->depth + 1;
5489 5658
5490 child_id = get_new_cssid(ss, depth); 5659 child_id = get_new_cssid(child_css->ss, depth);
5491 if (IS_ERR(child_id)) 5660 if (IS_ERR(child_id))
5492 return PTR_ERR(child_id); 5661 return PTR_ERR(child_id);
5493 5662
@@ -5525,31 +5694,56 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
5525} 5694}
5526EXPORT_SYMBOL_GPL(css_lookup); 5695EXPORT_SYMBOL_GPL(css_lookup);
5527 5696
5528/* 5697/**
5529 * get corresponding css from file open on cgroupfs directory 5698 * css_from_dir - get corresponding css from the dentry of a cgroup dir
5699 * @dentry: directory dentry of interest
5700 * @ss: subsystem of interest
5701 *
5702 * Must be called under RCU read lock. The caller is responsible for
5703 * pinning the returned css if it needs to be accessed outside the RCU
5704 * critical section.
5530 */ 5705 */
5531struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) 5706struct cgroup_subsys_state *css_from_dir(struct dentry *dentry,
5707 struct cgroup_subsys *ss)
5532{ 5708{
5533 struct cgroup *cgrp; 5709 struct cgroup *cgrp;
5534 struct inode *inode;
5535 struct cgroup_subsys_state *css;
5536 5710
5537 inode = file_inode(f); 5711 WARN_ON_ONCE(!rcu_read_lock_held());
5538 /* check in cgroup filesystem dir */ 5712
5539 if (inode->i_op != &cgroup_dir_inode_operations) 5713 /* is @dentry a cgroup dir? */
5714 if (!dentry->d_inode ||
5715 dentry->d_inode->i_op != &cgroup_dir_inode_operations)
5540 return ERR_PTR(-EBADF); 5716 return ERR_PTR(-EBADF);
5541 5717
5542 if (id < 0 || id >= CGROUP_SUBSYS_COUNT) 5718 cgrp = __d_cgrp(dentry);
5543 return ERR_PTR(-EINVAL); 5719 return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT);
5720}
5544 5721
5545 /* get cgroup */ 5722/**
5546 cgrp = __d_cgrp(f->f_dentry); 5723 * css_from_id - lookup css by id
5547 css = cgrp->subsys[id]; 5724 * @id: the cgroup id
5548 return css ? css : ERR_PTR(-ENOENT); 5725 * @ss: cgroup subsys to be looked into
5726 *
5727 * Returns the css if there's a valid one with @id, otherwise returns NULL.
5728 * Should be called under rcu_read_lock().
5729 */
5730struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
5731{
5732 struct cgroup *cgrp;
5733
5734 rcu_lockdep_assert(rcu_read_lock_held() ||
5735 lockdep_is_held(&cgroup_mutex),
5736 "css_from_id() needs proper protection");
5737
5738 cgrp = idr_find(&ss->root->cgroup_idr, id);
5739 if (cgrp)
5740 return cgroup_css(cgrp, ss);
5741 return NULL;
5549} 5742}
5550 5743
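Both lookup helpers return a css that is only guaranteed to live for the duration of the RCU read section; a caller wanting to use it afterwards must pin it before unlocking, as the css_from_dir() comment says. A hedged usage sketch built on css_from_id() plus the css_tryget()/css_put() pair from this series; get_css_by_id() itself is illustrative.

#include <linux/cgroup.h>
#include <linux/rcupdate.h>

/* look up a css by cgroup id and pin it for use outside the RCU section */
static struct cgroup_subsys_state *get_css_by_id(int id,
                                                 struct cgroup_subsys *ss)
{
        struct cgroup_subsys_state *css;

        rcu_read_lock();
        css = css_from_id(id, ss);
        if (css && !css_tryget(css))
                css = NULL;     /* found, but already being destroyed */
        rcu_read_unlock();

        return css;             /* caller does css_put() when done */
}
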
5551#ifdef CONFIG_CGROUP_DEBUG 5744#ifdef CONFIG_CGROUP_DEBUG
5552static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp) 5745static struct cgroup_subsys_state *
5746debug_css_alloc(struct cgroup_subsys_state *parent_css)
5553{ 5747{
5554 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 5748 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5555 5749
@@ -5559,22 +5753,24 @@ static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp)
5559 return css; 5753 return css;
5560} 5754}
5561 5755
5562static void debug_css_free(struct cgroup *cgrp) 5756static void debug_css_free(struct cgroup_subsys_state *css)
5563{ 5757{
5564 kfree(cgrp->subsys[debug_subsys_id]); 5758 kfree(css);
5565} 5759}
5566 5760
5567static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft) 5761static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
5762 struct cftype *cft)
5568{ 5763{
5569 return cgroup_task_count(cgrp); 5764 return cgroup_task_count(css->cgroup);
5570} 5765}
5571 5766
5572static u64 current_css_set_read(struct cgroup *cgrp, struct cftype *cft) 5767static u64 current_css_set_read(struct cgroup_subsys_state *css,
5768 struct cftype *cft)
5573{ 5769{
5574 return (u64)(unsigned long)current->cgroups; 5770 return (u64)(unsigned long)current->cgroups;
5575} 5771}
5576 5772
5577static u64 current_css_set_refcount_read(struct cgroup *cgrp, 5773static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
5578 struct cftype *cft) 5774 struct cftype *cft)
5579{ 5775{
5580 u64 count; 5776 u64 count;
@@ -5585,7 +5781,7 @@ static u64 current_css_set_refcount_read(struct cgroup *cgrp,
5585 return count; 5781 return count;
5586} 5782}
5587 5783
5588static int current_css_set_cg_links_read(struct cgroup *cgrp, 5784static int current_css_set_cg_links_read(struct cgroup_subsys_state *css,
5589 struct cftype *cft, 5785 struct cftype *cft,
5590 struct seq_file *seq) 5786 struct seq_file *seq)
5591{ 5787{
@@ -5612,14 +5808,13 @@ static int current_css_set_cg_links_read(struct cgroup *cgrp,
5612} 5808}
5613 5809
5614#define MAX_TASKS_SHOWN_PER_CSS 25 5810#define MAX_TASKS_SHOWN_PER_CSS 25
5615static int cgroup_css_links_read(struct cgroup *cgrp, 5811static int cgroup_css_links_read(struct cgroup_subsys_state *css,
5616 struct cftype *cft, 5812 struct cftype *cft, struct seq_file *seq)
5617 struct seq_file *seq)
5618{ 5813{
5619 struct cgrp_cset_link *link; 5814 struct cgrp_cset_link *link;
5620 5815
5621 read_lock(&css_set_lock); 5816 read_lock(&css_set_lock);
5622 list_for_each_entry(link, &cgrp->cset_links, cset_link) { 5817 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
5623 struct css_set *cset = link->cset; 5818 struct css_set *cset = link->cset;
5624 struct task_struct *task; 5819 struct task_struct *task;
5625 int count = 0; 5820 int count = 0;
@@ -5638,9 +5833,9 @@ static int cgroup_css_links_read(struct cgroup *cgrp,
5638 return 0; 5833 return 0;
5639} 5834}
5640 5835
5641static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) 5836static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
5642{ 5837{
5643 return test_bit(CGRP_RELEASABLE, &cgrp->flags); 5838 return test_bit(CGRP_RELEASABLE, &css->cgroup->flags);
5644} 5839}
5645 5840
5646static struct cftype debug_files[] = { 5841static struct cftype debug_files[] = {
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 75dda1ea5026..f0ff64d0ebaa 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -45,25 +45,19 @@ struct freezer {
45 spinlock_t lock; 45 spinlock_t lock;
46}; 46};
47 47
48static inline struct freezer *cgroup_freezer(struct cgroup *cgroup) 48static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
49{ 49{
50 return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id), 50 return css ? container_of(css, struct freezer, css) : NULL;
51 struct freezer, css);
52} 51}
53 52
54static inline struct freezer *task_freezer(struct task_struct *task) 53static inline struct freezer *task_freezer(struct task_struct *task)
55{ 54{
56 return container_of(task_subsys_state(task, freezer_subsys_id), 55 return css_freezer(task_css(task, freezer_subsys_id));
57 struct freezer, css);
58} 56}
59 57
60static struct freezer *parent_freezer(struct freezer *freezer) 58static struct freezer *parent_freezer(struct freezer *freezer)
61{ 59{
62 struct cgroup *pcg = freezer->css.cgroup->parent; 60 return css_freezer(css_parent(&freezer->css));
63
64 if (pcg)
65 return cgroup_freezer(pcg);
66 return NULL;
67} 61}
68 62
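The three helpers above show the shape every controller takes in this series: embed the css in the controller's private state, recover it with container_of(), and have the cast helper propagate NULL so that css_parent() at the root needs no special case, which is what collapses parent_freezer() to one line. A generic sketch with a hypothetical controller state:

#include <linux/cgroup.h>

struct myctl {
        struct cgroup_subsys_state css;         /* embedded handle */
        int setting;
};

static inline struct myctl *css_myctl(struct cgroup_subsys_state *css)
{
        /* NULL in, NULL out: the root's missing parent falls out naturally */
        return css ? container_of(css, struct myctl, css) : NULL;
}

static inline struct myctl *parent_myctl(struct myctl *m)
{
        return css_myctl(css_parent(&m->css));
}
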
69bool cgroup_freezing(struct task_struct *task) 63bool cgroup_freezing(struct task_struct *task)
@@ -92,7 +86,8 @@ static const char *freezer_state_strs(unsigned int state)
92 86
93struct cgroup_subsys freezer_subsys; 87struct cgroup_subsys freezer_subsys;
94 88
95static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup) 89static struct cgroup_subsys_state *
90freezer_css_alloc(struct cgroup_subsys_state *parent_css)
96{ 91{
97 struct freezer *freezer; 92 struct freezer *freezer;
98 93
@@ -105,22 +100,22 @@ static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup)
105} 100}
106 101
107/** 102/**
108 * freezer_css_online - commit creation of a freezer cgroup 103 * freezer_css_online - commit creation of a freezer css
109 * @cgroup: cgroup being created 104 * @css: css being created
110 * 105 *
111 * We're committing to creation of @cgroup. Mark it online and inherit 106 * We're committing to creation of @css. Mark it online and inherit
112 * parent's freezing state while holding both parent's and our 107 * parent's freezing state while holding both parent's and our
113 * freezer->lock. 108 * freezer->lock.
114 */ 109 */
115static int freezer_css_online(struct cgroup *cgroup) 110static int freezer_css_online(struct cgroup_subsys_state *css)
116{ 111{
117 struct freezer *freezer = cgroup_freezer(cgroup); 112 struct freezer *freezer = css_freezer(css);
118 struct freezer *parent = parent_freezer(freezer); 113 struct freezer *parent = parent_freezer(freezer);
119 114
120 /* 115 /*
121 * The following double locking and freezing state inheritance 116 * The following double locking and freezing state inheritance
122 * guarantee that @cgroup can never escape ancestors' freezing 117 * guarantee that @cgroup can never escape ancestors' freezing
123 * states. See cgroup_for_each_descendant_pre() for details. 118 * states. See css_for_each_descendant_pre() for details.
124 */ 119 */
125 if (parent) 120 if (parent)
126 spin_lock_irq(&parent->lock); 121 spin_lock_irq(&parent->lock);
@@ -141,15 +136,15 @@ static int freezer_css_online(struct cgroup *cgroup)
141} 136}
142 137
143/** 138/**
144 * freezer_css_offline - initiate destruction of @cgroup 139 * freezer_css_offline - initiate destruction of a freezer css
145 * @cgroup: cgroup being destroyed 140 * @css: css being destroyed
146 * 141 *
147 * @cgroup is going away. Mark it dead and decrement system_freezing_count 142 * @css is going away. Mark it dead and decrement system_freezing_count if
148 * if it was holding one. 143 * it was holding one.
149 */ 144 */
150static void freezer_css_offline(struct cgroup *cgroup) 145static void freezer_css_offline(struct cgroup_subsys_state *css)
151{ 146{
152 struct freezer *freezer = cgroup_freezer(cgroup); 147 struct freezer *freezer = css_freezer(css);
153 148
154 spin_lock_irq(&freezer->lock); 149 spin_lock_irq(&freezer->lock);
155 150
@@ -161,9 +156,9 @@ static void freezer_css_offline(struct cgroup *cgroup)
161 spin_unlock_irq(&freezer->lock); 156 spin_unlock_irq(&freezer->lock);
162} 157}
163 158
164static void freezer_css_free(struct cgroup *cgroup) 159static void freezer_css_free(struct cgroup_subsys_state *css)
165{ 160{
166 kfree(cgroup_freezer(cgroup)); 161 kfree(css_freezer(css));
167} 162}
168 163
169/* 164/*
@@ -175,25 +170,26 @@ static void freezer_css_free(struct cgroup *cgroup)
175 * @freezer->lock. freezer_attach() makes the new tasks conform to the 170 * @freezer->lock. freezer_attach() makes the new tasks conform to the
176 * current state and all following state changes can see the new tasks. 171 * current state and all following state changes can see the new tasks.
177 */ 172 */
178static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset) 173static void freezer_attach(struct cgroup_subsys_state *new_css,
174 struct cgroup_taskset *tset)
179{ 175{
180 struct freezer *freezer = cgroup_freezer(new_cgrp); 176 struct freezer *freezer = css_freezer(new_css);
181 struct task_struct *task; 177 struct task_struct *task;
182 bool clear_frozen = false; 178 bool clear_frozen = false;
183 179
184 spin_lock_irq(&freezer->lock); 180 spin_lock_irq(&freezer->lock);
185 181
186 /* 182 /*
187 * Make the new tasks conform to the current state of @new_cgrp. 183 * Make the new tasks conform to the current state of @new_css.
188 * For simplicity, when migrating any task to a FROZEN cgroup, we 184 * For simplicity, when migrating any task to a FROZEN cgroup, we
189 * revert it to FREEZING and let update_if_frozen() determine the 185 * revert it to FREEZING and let update_if_frozen() determine the
190 * correct state later. 186 * correct state later.
191 * 187 *
192 * Tasks in @tset are on @new_cgrp but may not conform to its 188 * Tasks in @tset are on @new_css but may not conform to its
193 * current state before executing the following - !frozen tasks may 189 * current state before executing the following - !frozen tasks may
194 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. 190 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
195 */ 191 */
196 cgroup_taskset_for_each(task, new_cgrp, tset) { 192 cgroup_taskset_for_each(task, new_css, tset) {
197 if (!(freezer->state & CGROUP_FREEZING)) { 193 if (!(freezer->state & CGROUP_FREEZING)) {
198 __thaw_task(task); 194 __thaw_task(task);
199 } else { 195 } else {
@@ -231,7 +227,7 @@ static void freezer_fork(struct task_struct *task)
231 * The root cgroup is non-freezable, so we can skip the 227 * The root cgroup is non-freezable, so we can skip the
232 * following check. 228 * following check.
233 */ 229 */
234 if (!freezer->css.cgroup->parent) 230 if (!parent_freezer(freezer))
235 goto out; 231 goto out;
236 232
237 spin_lock_irq(&freezer->lock); 233 spin_lock_irq(&freezer->lock);
@@ -244,7 +240,7 @@ out:
244 240
245/** 241/**
246 * update_if_frozen - update whether a cgroup finished freezing 242 * update_if_frozen - update whether a cgroup finished freezing
247 * @cgroup: cgroup of interest 243 * @css: css of interest
248 * 244 *
249 * Once FREEZING is initiated, transition to FROZEN is lazily updated by 245 * Once FREEZING is initiated, transition to FROZEN is lazily updated by
250 * calling this function. If the current state is FREEZING but not FROZEN, 246 * calling this function. If the current state is FREEZING but not FROZEN,
@@ -255,14 +251,14 @@ out:
255 * update_if_frozen() on all descendants prior to invoking this function. 251 * update_if_frozen() on all descendants prior to invoking this function.
256 * 252 *
257 * Task states and freezer state might disagree while tasks are being 253 * Task states and freezer state might disagree while tasks are being
258 * migrated into or out of @cgroup, so we can't verify task states against 254 * migrated into or out of @css, so we can't verify task states against
259 * @freezer state here. See freezer_attach() for details. 255 * @freezer state here. See freezer_attach() for details.
260 */ 256 */
261static void update_if_frozen(struct cgroup *cgroup) 257static void update_if_frozen(struct cgroup_subsys_state *css)
262{ 258{
263 struct freezer *freezer = cgroup_freezer(cgroup); 259 struct freezer *freezer = css_freezer(css);
264 struct cgroup *pos; 260 struct cgroup_subsys_state *pos;
265 struct cgroup_iter it; 261 struct css_task_iter it;
266 struct task_struct *task; 262 struct task_struct *task;
267 263
268 WARN_ON_ONCE(!rcu_read_lock_held()); 264 WARN_ON_ONCE(!rcu_read_lock_held());
@@ -274,8 +270,8 @@ static void update_if_frozen(struct cgroup *cgroup)
274 goto out_unlock; 270 goto out_unlock;
275 271
276 /* are all (live) children frozen? */ 272 /* are all (live) children frozen? */
277 cgroup_for_each_child(pos, cgroup) { 273 css_for_each_child(pos, css) {
278 struct freezer *child = cgroup_freezer(pos); 274 struct freezer *child = css_freezer(pos);
279 275
280 if ((child->state & CGROUP_FREEZER_ONLINE) && 276 if ((child->state & CGROUP_FREEZER_ONLINE) &&
281 !(child->state & CGROUP_FROZEN)) 277 !(child->state & CGROUP_FROZEN))
@@ -283,9 +279,9 @@ static void update_if_frozen(struct cgroup *cgroup)
283 } 279 }
284 280
285 /* are all tasks frozen? */ 281 /* are all tasks frozen? */
286 cgroup_iter_start(cgroup, &it); 282 css_task_iter_start(css, &it);
287 283
288 while ((task = cgroup_iter_next(cgroup, &it))) { 284 while ((task = css_task_iter_next(&it))) {
289 if (freezing(task)) { 285 if (freezing(task)) {
290 /* 286 /*
291 * freezer_should_skip() indicates that the task 287 * freezer_should_skip() indicates that the task
@@ -300,52 +296,49 @@ static void update_if_frozen(struct cgroup *cgroup)
300 296
301 freezer->state |= CGROUP_FROZEN; 297 freezer->state |= CGROUP_FROZEN;
302out_iter_end: 298out_iter_end:
303 cgroup_iter_end(cgroup, &it); 299 css_task_iter_end(&it);
304out_unlock: 300out_unlock:
305 spin_unlock_irq(&freezer->lock); 301 spin_unlock_irq(&freezer->lock);
306} 302}
307 303
308static int freezer_read(struct cgroup *cgroup, struct cftype *cft, 304static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft,
309 struct seq_file *m) 305 struct seq_file *m)
310{ 306{
311 struct cgroup *pos; 307 struct cgroup_subsys_state *pos;
312 308
313 rcu_read_lock(); 309 rcu_read_lock();
314 310
315 /* update states bottom-up */ 311 /* update states bottom-up */
316 cgroup_for_each_descendant_post(pos, cgroup) 312 css_for_each_descendant_post(pos, css)
317 update_if_frozen(pos); 313 update_if_frozen(pos);
318 update_if_frozen(cgroup);
319 314
320 rcu_read_unlock(); 315 rcu_read_unlock();
321 316
322 seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state)); 317 seq_puts(m, freezer_state_strs(css_freezer(css)->state));
323 seq_putc(m, '\n'); 318 seq_putc(m, '\n');
324 return 0; 319 return 0;
325} 320}
326 321
327static void freeze_cgroup(struct freezer *freezer) 322static void freeze_cgroup(struct freezer *freezer)
328{ 323{
329 struct cgroup *cgroup = freezer->css.cgroup; 324 struct css_task_iter it;
330 struct cgroup_iter it;
331 struct task_struct *task; 325 struct task_struct *task;
332 326
333 cgroup_iter_start(cgroup, &it); 327 css_task_iter_start(&freezer->css, &it);
334 while ((task = cgroup_iter_next(cgroup, &it))) 328 while ((task = css_task_iter_next(&it)))
335 freeze_task(task); 329 freeze_task(task);
336 cgroup_iter_end(cgroup, &it); 330 css_task_iter_end(&it);
337} 331}
338 332
339static void unfreeze_cgroup(struct freezer *freezer) 333static void unfreeze_cgroup(struct freezer *freezer)
340{ 334{
341 struct cgroup *cgroup = freezer->css.cgroup; 335 struct css_task_iter it;
342 struct cgroup_iter it;
343 struct task_struct *task; 336 struct task_struct *task;
344 337
345 cgroup_iter_start(cgroup, &it); 338 css_task_iter_start(&freezer->css, &it);
346 while ((task = cgroup_iter_next(cgroup, &it))) 339 while ((task = css_task_iter_next(&it)))
347 __thaw_task(task); 340 __thaw_task(task);
348 cgroup_iter_end(cgroup, &it); 341 css_task_iter_end(&it);
349} 342}
350 343
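freeze_cgroup() and unfreeze_cgroup() above show the new iteration shape: the iterator is keyed off a css and carries all cursor state itself, so start/next/end no longer need the cgroup repeated at each call. A generic sketch; the wrapper and its callback are hypothetical.

#include <linux/cgroup.h>
#include <linux/sched.h>

/* apply @fn to every task in @css; @fn is a hypothetical callback */
static void for_each_task_in_css(struct cgroup_subsys_state *css,
                                 void (*fn)(struct task_struct *))
{
        struct css_task_iter it;
        struct task_struct *task;

        css_task_iter_start(css, &it);
        while ((task = css_task_iter_next(&it)))
                fn(task);
        css_task_iter_end(&it);
}
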
351/** 344/**
@@ -395,12 +388,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
395 */ 388 */
396static void freezer_change_state(struct freezer *freezer, bool freeze) 389static void freezer_change_state(struct freezer *freezer, bool freeze)
397{ 390{
398 struct cgroup *pos; 391 struct cgroup_subsys_state *pos;
399
400 /* update @freezer */
401 spin_lock_irq(&freezer->lock);
402 freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF);
403 spin_unlock_irq(&freezer->lock);
404 392
405 /* 393 /*
406 * Update all its descendants in pre-order traversal. Each 394 * Update all its descendants in pre-order traversal. Each
@@ -408,24 +396,33 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
408 * CGROUP_FREEZING_PARENT. 396 * CGROUP_FREEZING_PARENT.
409 */ 397 */
410 rcu_read_lock(); 398 rcu_read_lock();
411 cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) { 399 css_for_each_descendant_pre(pos, &freezer->css) {
412 struct freezer *pos_f = cgroup_freezer(pos); 400 struct freezer *pos_f = css_freezer(pos);
413 struct freezer *parent = parent_freezer(pos_f); 401 struct freezer *parent = parent_freezer(pos_f);
414 402
415 /*
416 * Our update to @parent->state is already visible which is
417 * all we need. No need to lock @parent. For more info on
418 * synchronization, see freezer_post_create().
419 */
420 spin_lock_irq(&pos_f->lock); 403 spin_lock_irq(&pos_f->lock);
421 freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING, 404
422 CGROUP_FREEZING_PARENT); 405 if (pos_f == freezer) {
406 freezer_apply_state(pos_f, freeze,
407 CGROUP_FREEZING_SELF);
408 } else {
409 /*
410 * Our update to @parent->state is already visible
411 * which is all we need. No need to lock @parent.
412 * For more info on synchronization, see
413 * freezer_post_create().
414 */
415 freezer_apply_state(pos_f,
416 parent->state & CGROUP_FREEZING,
417 CGROUP_FREEZING_PARENT);
418 }
419
423 spin_unlock_irq(&pos_f->lock); 420 spin_unlock_irq(&pos_f->lock);
424 } 421 }
425 rcu_read_unlock(); 422 rcu_read_unlock();
426} 423}
427 424
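Folding the self-update into the walk works because css_for_each_descendant_pre() visits @root first and always visits a parent before its children, so by the time a child is reached its parent's state is already updated and can be read without taking the parent's lock. A hedged sketch of the same top-down propagation for a hypothetical controller (locking simplified; the freezer itself uses the irq-disabling lock variants):

#include <linux/cgroup.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct myctl {
        struct cgroup_subsys_state css;
        spinlock_t lock;
        bool frozen;            /* protected by ->lock */
};

static struct myctl *css_myctl(struct cgroup_subsys_state *css)
{
        return css ? container_of(css, struct myctl, css) : NULL;
}

/* propagate @frozen down the subtree rooted at @root */
static void myctl_set_subtree(struct myctl *root, bool frozen)
{
        struct cgroup_subsys_state *pos;

        rcu_read_lock();
        css_for_each_descendant_pre(pos, &root->css) {
                struct myctl *m = css_myctl(pos);
                struct myctl *parent = css_myctl(css_parent(pos));

                spin_lock(&m->lock);
                /* the root takes the new value; children inherit */
                m->frozen = (m == root) ? frozen : parent->frozen;
                spin_unlock(&m->lock);
        }
        rcu_read_unlock();
}
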
428static int freezer_write(struct cgroup *cgroup, struct cftype *cft, 425static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft,
429 const char *buffer) 426 const char *buffer)
430{ 427{
431 bool freeze; 428 bool freeze;
@@ -437,20 +434,22 @@ static int freezer_write(struct cgroup *cgroup, struct cftype *cft,
437 else 434 else
438 return -EINVAL; 435 return -EINVAL;
439 436
440 freezer_change_state(cgroup_freezer(cgroup), freeze); 437 freezer_change_state(css_freezer(css), freeze);
441 return 0; 438 return 0;
442} 439}
443 440
444static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft) 441static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css,
442 struct cftype *cft)
445{ 443{
446 struct freezer *freezer = cgroup_freezer(cgroup); 444 struct freezer *freezer = css_freezer(css);
447 445
448 return (bool)(freezer->state & CGROUP_FREEZING_SELF); 446 return (bool)(freezer->state & CGROUP_FREEZING_SELF);
449} 447}
450 448
451static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft) 449static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css,
450 struct cftype *cft)
452{ 451{
453 struct freezer *freezer = cgroup_freezer(cgroup); 452 struct freezer *freezer = css_freezer(css);
454 453
455 return (bool)(freezer->state & CGROUP_FREEZING_PARENT); 454 return (bool)(freezer->state & CGROUP_FREEZING_PARENT);
456} 455}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ea1966db34f2..6bf981e13c43 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -68,10 +68,6 @@
68 */ 68 */
69int number_of_cpusets __read_mostly; 69int number_of_cpusets __read_mostly;
70 70
71/* Forward declare cgroup structures */
72struct cgroup_subsys cpuset_subsys;
73struct cpuset;
74
75/* See "Frequency meter" comments, below. */ 71/* See "Frequency meter" comments, below. */
76 72
77struct fmeter { 73struct fmeter {
@@ -115,27 +111,20 @@ struct cpuset {
115 int relax_domain_level; 111 int relax_domain_level;
116}; 112};
117 113
118/* Retrieve the cpuset for a cgroup */ 114static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
119static inline struct cpuset *cgroup_cs(struct cgroup *cgrp)
120{ 115{
121 return container_of(cgroup_subsys_state(cgrp, cpuset_subsys_id), 116 return css ? container_of(css, struct cpuset, css) : NULL;
122 struct cpuset, css);
123} 117}
124 118
125/* Retrieve the cpuset for a task */ 119/* Retrieve the cpuset for a task */
126static inline struct cpuset *task_cs(struct task_struct *task) 120static inline struct cpuset *task_cs(struct task_struct *task)
127{ 121{
128 return container_of(task_subsys_state(task, cpuset_subsys_id), 122 return css_cs(task_css(task, cpuset_subsys_id));
129 struct cpuset, css);
130} 123}
131 124
132static inline struct cpuset *parent_cs(const struct cpuset *cs) 125static inline struct cpuset *parent_cs(struct cpuset *cs)
133{ 126{
134 struct cgroup *pcgrp = cs->css.cgroup->parent; 127 return css_cs(css_parent(&cs->css));
135
136 if (pcgrp)
137 return cgroup_cs(pcgrp);
138 return NULL;
139} 128}
140 129
141#ifdef CONFIG_NUMA 130#ifdef CONFIG_NUMA
@@ -212,29 +201,30 @@ static struct cpuset top_cpuset = {
212/** 201/**
213 * cpuset_for_each_child - traverse online children of a cpuset 202 * cpuset_for_each_child - traverse online children of a cpuset
214 * @child_cs: loop cursor pointing to the current child 203 * @child_cs: loop cursor pointing to the current child
215 * @pos_cgrp: used for iteration 204 * @pos_css: used for iteration
216 * @parent_cs: target cpuset to walk children of 205 * @parent_cs: target cpuset to walk children of
217 * 206 *
218 * Walk @child_cs through the online children of @parent_cs. Must be used 207 * Walk @child_cs through the online children of @parent_cs. Must be used
219 * with RCU read locked. 208 * with RCU read locked.
220 */ 209 */
221#define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs) \ 210#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \
222 cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \ 211 css_for_each_child((pos_css), &(parent_cs)->css) \
223 if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp))))) 212 if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
224 213
225/** 214/**
226 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants 215 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
227 * @des_cs: loop cursor pointing to the current descendant 216 * @des_cs: loop cursor pointing to the current descendant
228 * @pos_cgrp: used for iteration 217 * @pos_css: used for iteration
229 * @root_cs: target cpuset to walk ancestor of 218 * @root_cs: target cpuset to walk descendants of
230 * 219 *
231 * Walk @des_cs through the online descendants of @root_cs. Must be used 220 * Walk @des_cs through the online descendants of @root_cs. Must be used
232 * with RCU read locked. The caller may modify @pos_cgrp by calling 221 * with RCU read locked. The caller may modify @pos_css by calling
233 * cgroup_rightmost_descendant() to skip subtree. 222 * css_rightmost_descendant() to skip a subtree. @root_cs is included in the
223 * iteration and is the first node to be visited.
234 */ 224 */
235#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \ 225#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \
236 cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \ 226 css_for_each_descendant_pre((pos_css), &(root_cs)->css) \
237 if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp))))) 227 if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
238 228
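Because the walk is pre-order, a visitor that decides a node's entire subtree is irrelevant can prune it by advancing the cursor to css_rightmost_descendant(), which is exactly what update_domain_attr_tree() and generate_sched_domains() do below for CPU-less cpusets. A generic sketch of the pruned walk; want() is a hypothetical predicate.

#include <linux/cgroup.h>
#include <linux/rcupdate.h>

/* visit every css under @root, skipping subtrees that fail want() */
static void walk_pruned(struct cgroup_subsys_state *root,
                        bool (*want)(struct cgroup_subsys_state *))
{
        struct cgroup_subsys_state *pos;

        rcu_read_lock();
        css_for_each_descendant_pre(pos, root) {
                if (!want(pos)) {
                        /* skip everything below @pos in a single step */
                        pos = css_rightmost_descendant(pos);
                        continue;
                }
                /* ... process @pos ... */
        }
        rcu_read_unlock();
}
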
239/* 229/*
240 * There are two global mutexes guarding cpuset structures - cpuset_mutex 230 * There are two global mutexes guarding cpuset structures - cpuset_mutex
@@ -320,8 +310,7 @@ static struct file_system_type cpuset_fs_type = {
320 * 310 *
321 * Call with callback_mutex held. 311 * Call with callback_mutex held.
322 */ 312 */
323static void guarantee_online_cpus(const struct cpuset *cs, 313static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
324 struct cpumask *pmask)
325{ 314{
326 while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) 315 while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
327 cs = parent_cs(cs); 316 cs = parent_cs(cs);
@@ -339,7 +328,7 @@ static void guarantee_online_cpus(const struct cpuset *cs,
339 * 328 *
340 * Call with callback_mutex held. 329 * Call with callback_mutex held.
341 */ 330 */
342static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) 331static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
343{ 332{
344 while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) 333 while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY]))
345 cs = parent_cs(cs); 334 cs = parent_cs(cs);
@@ -384,7 +373,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
384 * alloc_trial_cpuset - allocate a trial cpuset 373 * alloc_trial_cpuset - allocate a trial cpuset
385 * @cs: the cpuset that the trial cpuset duplicates 374 * @cs: the cpuset that the trial cpuset duplicates
386 */ 375 */
387static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs) 376static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
388{ 377{
389 struct cpuset *trial; 378 struct cpuset *trial;
390 379
@@ -431,9 +420,9 @@ static void free_trial_cpuset(struct cpuset *trial)
431 * Return 0 if valid, -errno if not. 420 * Return 0 if valid, -errno if not.
432 */ 421 */
433 422
434static int validate_change(const struct cpuset *cur, const struct cpuset *trial) 423static int validate_change(struct cpuset *cur, struct cpuset *trial)
435{ 424{
436 struct cgroup *cgrp; 425 struct cgroup_subsys_state *css;
437 struct cpuset *c, *par; 426 struct cpuset *c, *par;
438 int ret; 427 int ret;
439 428
@@ -441,7 +430,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
441 430
442 /* Each of our child cpusets must be a subset of us */ 431 /* Each of our child cpusets must be a subset of us */
443 ret = -EBUSY; 432 ret = -EBUSY;
444 cpuset_for_each_child(c, cgrp, cur) 433 cpuset_for_each_child(c, css, cur)
445 if (!is_cpuset_subset(c, trial)) 434 if (!is_cpuset_subset(c, trial))
446 goto out; 435 goto out;
447 436
@@ -462,7 +451,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
462 * overlap 451 * overlap
463 */ 452 */
464 ret = -EINVAL; 453 ret = -EINVAL;
465 cpuset_for_each_child(c, cgrp, par) { 454 cpuset_for_each_child(c, css, par) {
466 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && 455 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
467 c != cur && 456 c != cur &&
468 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) 457 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
@@ -515,13 +504,16 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
515 struct cpuset *root_cs) 504 struct cpuset *root_cs)
516{ 505{
517 struct cpuset *cp; 506 struct cpuset *cp;
518 struct cgroup *pos_cgrp; 507 struct cgroup_subsys_state *pos_css;
519 508
520 rcu_read_lock(); 509 rcu_read_lock();
521 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { 510 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
511 if (cp == root_cs)
512 continue;
513
522 /* skip the whole subtree if @cp doesn't have any CPU */ 514 /* skip the whole subtree if @cp doesn't have any CPU */
523 if (cpumask_empty(cp->cpus_allowed)) { 515 if (cpumask_empty(cp->cpus_allowed)) {
524 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); 516 pos_css = css_rightmost_descendant(pos_css);
525 continue; 517 continue;
526 } 518 }
527 519
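
The css_rightmost_descendant() calls above are how this series prunes an entire subtree during a pre-order walk: the iterator is advanced to the subtree's rightmost descendant so that the next step lands on the following sibling. A rough, runnable userspace sketch of the same visit-or-prune control flow (tree layout and names are invented for illustration; the kernel iterator is not recursive):

#include <stdio.h>

struct node {
        const char *name;
        int has_cpus;               /* stands in for !cpumask_empty() */
        struct node *child, *next;  /* first child / next sibling */
};

/* Pre-order visit; a nonzero return from visit() prunes that subtree,
 * playing the role of css_rightmost_descendant(). */
static void walk(struct node *n, int (*visit)(struct node *))
{
        if (!n)
                return;
        if (!visit(n))
                walk(n->child, visit);  /* descend only if not pruned */
        walk(n->next, visit);           /* siblings are always reached */
}

static int visit_cs(struct node *n)
{
        printf("%s\n", n->name);
        return !n->has_cpus;            /* empty cpuset: skip its subtree */
}

int main(void)
{
        struct node c = { "grandchild", 1, NULL, NULL };
        struct node b = { "empty",      0, &c,   NULL };
        struct node a = { "root",       1, &b,   NULL };

        walk(&a, visit_cs);             /* prints root, empty; never grandchild */
        return 0;
}
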
@@ -596,7 +588,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
596 struct sched_domain_attr *dattr; /* attributes for custom domains */ 588 struct sched_domain_attr *dattr; /* attributes for custom domains */
597 int ndoms = 0; /* number of sched domains in result */ 589 int ndoms = 0; /* number of sched domains in result */
598 int nslot; /* next empty doms[] struct cpumask slot */ 590 int nslot; /* next empty doms[] struct cpumask slot */
599 struct cgroup *pos_cgrp; 591 struct cgroup_subsys_state *pos_css;
600 592
601 doms = NULL; 593 doms = NULL;
602 dattr = NULL; 594 dattr = NULL;
@@ -625,7 +617,9 @@ static int generate_sched_domains(cpumask_var_t **domains,
625 csn = 0; 617 csn = 0;
626 618
627 rcu_read_lock(); 619 rcu_read_lock();
628 cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) { 620 cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
621 if (cp == &top_cpuset)
622 continue;
629 /* 623 /*
630 * Continue traversing beyond @cp iff @cp has some CPUs and 624 * Continue traversing beyond @cp iff @cp has some CPUs and
631 * isn't load balancing. The former is obvious. The 625 * isn't load balancing. The former is obvious. The
@@ -642,7 +636,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
642 csa[csn++] = cp; 636 csa[csn++] = cp;
643 637
644 /* skip @cp's subtree */ 638 /* skip @cp's subtree */
645 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); 639 pos_css = css_rightmost_descendant(pos_css);
646 } 640 }
647 rcu_read_unlock(); 641 rcu_read_unlock();
648 642
@@ -837,52 +831,45 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
837/** 831/**
838 * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's 832 * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
839 * @tsk: task to test 833 * @tsk: task to test
 840 * @scan: struct cgroup_scanner containing the cgroup of the task 834 * @data: cpuset to which @tsk belongs
841 * 835 *
842 * Called by cgroup_scan_tasks() for each task in a cgroup whose 836 * Called by css_scan_tasks() for each task in a cgroup whose cpus_allowed
843 * cpus_allowed mask needs to be changed. 837 * mask needs to be changed.
844 * 838 *
845 * We don't need to re-check for the cgroup/cpuset membership, since we're 839 * We don't need to re-check for the cgroup/cpuset membership, since we're
846 * holding cpuset_mutex at this point. 840 * holding cpuset_mutex at this point.
847 */ 841 */
848static void cpuset_change_cpumask(struct task_struct *tsk, 842static void cpuset_change_cpumask(struct task_struct *tsk, void *data)
849 struct cgroup_scanner *scan)
850{ 843{
851 struct cpuset *cpus_cs; 844 struct cpuset *cs = data;
845 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
852 846
853 cpus_cs = effective_cpumask_cpuset(cgroup_cs(scan->cg));
854 set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed); 847 set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed);
855} 848}
856 849
857/** 850/**
858 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. 851 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
859 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed 852 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
860 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 853 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
861 * 854 *
862 * Called with cpuset_mutex held 855 * Called with cpuset_mutex held
863 * 856 *
864 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 857 * The css_scan_tasks() function will scan all the tasks in a cgroup,
865 * calling callback functions for each. 858 * calling callback functions for each.
866 * 859 *
867 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 860 * No return value. It's guaranteed that css_scan_tasks() always returns 0
868 * if @heap != NULL. 861 * if @heap != NULL.
869 */ 862 */
870static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) 863static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
871{ 864{
872 struct cgroup_scanner scan; 865 css_scan_tasks(&cs->css, NULL, cpuset_change_cpumask, cs, heap);
873
874 scan.cg = cs->css.cgroup;
875 scan.test_task = NULL;
876 scan.process_task = cpuset_change_cpumask;
877 scan.heap = heap;
878 cgroup_scan_tasks(&scan);
879} 866}
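
The conversion above collapses the five-field struct cgroup_scanner setup into a single css_scan_tasks(css, test, process, data, heap) call, with the per-task callback receiving an opaque void *data instead of digging its cpuset back out of scan->cg. A minimal standalone analogue of that callback shape (scan_tasks() and the task array are invented; the real helper also runs under cgroup locking and may use the ptr_heap):

#include <stdio.h>

struct task { const char *comm; int cpu; };

/* Shape of the new helper: @test may be NULL, @data is handed to both
 * callbacks untouched. */
static void scan_tasks(struct task *tasks, int n,
                       int (*test)(struct task *, void *),
                       void (*process)(struct task *, void *),
                       void *data)
{
        for (int i = 0; i < n; i++)
                if (!test || test(&tasks[i], data))
                        process(&tasks[i], data);
}

static void set_cpu(struct task *t, void *data)
{
        t->cpu = *(int *)data;          /* cpuset state arrives via @data */
        printf("%s -> cpu %d\n", t->comm, t->cpu);
}

int main(void)
{
        struct task tasks[] = { { "a", 0 }, { "b", 1 } };
        int newcpu = 3;

        scan_tasks(tasks, 2, NULL, set_cpu, &newcpu);
        return 0;
}
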
880 867
881/* 868/*
882 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. 869 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
883 * @root_cs: the root cpuset of the hierarchy 870 * @root_cs: the root cpuset of the hierarchy
884 * @update_root: update root cpuset or not? 871 * @update_root: update root cpuset or not?
885 * @heap: the heap used by cgroup_scan_tasks() 872 * @heap: the heap used by css_scan_tasks()
886 * 873 *
887 * This will update cpumasks of tasks in @root_cs and all other empty cpusets 874 * This will update cpumasks of tasks in @root_cs and all other empty cpusets
888 * which take on cpumask of @root_cs. 875 * which take on cpumask of @root_cs.
@@ -893,17 +880,19 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs,
893 bool update_root, struct ptr_heap *heap) 880 bool update_root, struct ptr_heap *heap)
894{ 881{
895 struct cpuset *cp; 882 struct cpuset *cp;
896 struct cgroup *pos_cgrp; 883 struct cgroup_subsys_state *pos_css;
897
898 if (update_root)
899 update_tasks_cpumask(root_cs, heap);
900 884
901 rcu_read_lock(); 885 rcu_read_lock();
902 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { 886 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
 903 /* skip the whole subtree if @cp has some CPU */ 887 if (cp == root_cs) {
904 if (!cpumask_empty(cp->cpus_allowed)) { 888 if (!update_root)
905 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); 889 continue;
906 continue; 890 } else {
 891 /* skip the whole subtree if @cp has some CPU */
892 if (!cpumask_empty(cp->cpus_allowed)) {
893 pos_css = css_rightmost_descendant(pos_css);
894 continue;
895 }
907 } 896 }
908 if (!css_tryget(&cp->css)) 897 if (!css_tryget(&cp->css))
909 continue; 898 continue;
@@ -1059,20 +1048,24 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
1059 task_unlock(tsk); 1048 task_unlock(tsk);
1060} 1049}
1061 1050
1051struct cpuset_change_nodemask_arg {
1052 struct cpuset *cs;
1053 nodemask_t *newmems;
1054};
1055
1062/* 1056/*
1063 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy 1057 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
1064 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if 1058 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
1065 * memory_migrate flag is set. Called with cpuset_mutex held. 1059 * memory_migrate flag is set. Called with cpuset_mutex held.
1066 */ 1060 */
1067static void cpuset_change_nodemask(struct task_struct *p, 1061static void cpuset_change_nodemask(struct task_struct *p, void *data)
1068 struct cgroup_scanner *scan)
1069{ 1062{
1070 struct cpuset *cs = cgroup_cs(scan->cg); 1063 struct cpuset_change_nodemask_arg *arg = data;
1064 struct cpuset *cs = arg->cs;
1071 struct mm_struct *mm; 1065 struct mm_struct *mm;
1072 int migrate; 1066 int migrate;
1073 nodemask_t *newmems = scan->data;
1074 1067
1075 cpuset_change_task_nodemask(p, newmems); 1068 cpuset_change_task_nodemask(p, arg->newmems);
1076 1069
1077 mm = get_task_mm(p); 1070 mm = get_task_mm(p);
1078 if (!mm) 1071 if (!mm)
@@ -1082,7 +1075,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
1082 1075
1083 mpol_rebind_mm(mm, &cs->mems_allowed); 1076 mpol_rebind_mm(mm, &cs->mems_allowed);
1084 if (migrate) 1077 if (migrate)
1085 cpuset_migrate_mm(mm, &cs->old_mems_allowed, newmems); 1078 cpuset_migrate_mm(mm, &cs->old_mems_allowed, arg->newmems);
1086 mmput(mm); 1079 mmput(mm);
1087} 1080}
1088 1081
@@ -1091,28 +1084,22 @@ static void *cpuset_being_rebound;
1091/** 1084/**
1092 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. 1085 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
1093 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed 1086 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
1094 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 1087 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
1095 * 1088 *
1096 * Called with cpuset_mutex held 1089 * Called with cpuset_mutex held. No return value. It's guaranteed that
1097 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 1090 * css_scan_tasks() always returns 0 if @heap != NULL.
1098 * if @heap != NULL.
1099 */ 1091 */
1100static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) 1092static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1101{ 1093{
1102 static nodemask_t newmems; /* protected by cpuset_mutex */ 1094 static nodemask_t newmems; /* protected by cpuset_mutex */
1103 struct cgroup_scanner scan;
1104 struct cpuset *mems_cs = effective_nodemask_cpuset(cs); 1095 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1096 struct cpuset_change_nodemask_arg arg = { .cs = cs,
1097 .newmems = &newmems };
1105 1098
1106 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1099 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
1107 1100
1108 guarantee_online_mems(mems_cs, &newmems); 1101 guarantee_online_mems(mems_cs, &newmems);
1109 1102
1110 scan.cg = cs->css.cgroup;
1111 scan.test_task = NULL;
1112 scan.process_task = cpuset_change_nodemask;
1113 scan.heap = heap;
1114 scan.data = &newmems;
1115
1116 /* 1103 /*
1117 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't 1104 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
1118 * take while holding tasklist_lock. Forks can happen - the 1105 * take while holding tasklist_lock. Forks can happen - the
@@ -1123,7 +1110,7 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1123 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 1110 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
1124 * is idempotent. Also migrate pages in each mm to new nodes. 1111 * is idempotent. Also migrate pages in each mm to new nodes.
1125 */ 1112 */
1126 cgroup_scan_tasks(&scan); 1113 css_scan_tasks(&cs->css, NULL, cpuset_change_nodemask, &arg, heap);
1127 1114
1128 /* 1115 /*
1129 * All the tasks' nodemasks have been updated, update 1116 * All the tasks' nodemasks have been updated, update
@@ -1139,7 +1126,7 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1139 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. 1126 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
1140 * @cs: the root cpuset of the hierarchy 1127 * @cs: the root cpuset of the hierarchy
1141 * @update_root: update the root cpuset or not? 1128 * @update_root: update the root cpuset or not?
1142 * @heap: the heap used by cgroup_scan_tasks() 1129 * @heap: the heap used by css_scan_tasks()
1143 * 1130 *
1144 * This will update nodemasks of tasks in @root_cs and all other empty cpusets 1131 * This will update nodemasks of tasks in @root_cs and all other empty cpusets
1145 * which take on nodemask of @root_cs. 1132 * which take on nodemask of @root_cs.
@@ -1150,17 +1137,19 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs,
1150 bool update_root, struct ptr_heap *heap) 1137 bool update_root, struct ptr_heap *heap)
1151{ 1138{
1152 struct cpuset *cp; 1139 struct cpuset *cp;
1153 struct cgroup *pos_cgrp; 1140 struct cgroup_subsys_state *pos_css;
1154
1155 if (update_root)
1156 update_tasks_nodemask(root_cs, heap);
1157 1141
1158 rcu_read_lock(); 1142 rcu_read_lock();
1159 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { 1143 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
 1160 /* skip the whole subtree if @cp has some memory */ 1144 if (cp == root_cs) {
1161 if (!nodes_empty(cp->mems_allowed)) { 1145 if (!update_root)
1162 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); 1146 continue;
1163 continue; 1147 } else {
 1148 /* skip the whole subtree if @cp has some memory */
1149 if (!nodes_empty(cp->mems_allowed)) {
1150 pos_css = css_rightmost_descendant(pos_css);
1151 continue;
1152 }
1164 } 1153 }
1165 if (!css_tryget(&cp->css)) 1154 if (!css_tryget(&cp->css))
1166 continue; 1155 continue;
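
The reshaped loop also encodes a behavioral change in this series: the pre-order iterator now yields @root_cs itself first, so the old pre-loop call to update_tasks_nodemask(root_cs, heap) moves inside the loop behind an explicit cp == root_cs check. Schematically (a runnable toy with the visit order hard-coded):

#include <stdio.h>

int main(void)
{
        /* pre-order visit sequence now starts with the root itself */
        const char *visit[] = { "root", "child-a", "child-b" };
        int update_root = 0;

        for (int i = 0; i < 3; i++) {
                if (i == 0 && !update_root)
                        continue;       /* root handled only when requested */
                printf("update %s\n", visit[i]);
        }
        return 0;
}
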
@@ -1267,44 +1256,39 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1267 return 0; 1256 return 0;
1268} 1257}
1269 1258
1270/* 1259/**
1271 * cpuset_change_flag - make a task's spread flags the same as its cpuset's 1260 * cpuset_change_flag - make a task's spread flags the same as its cpuset's
1272 * @tsk: task to be updated 1261 * @tsk: task to be updated
 1273 * @scan: struct cgroup_scanner containing the cgroup of the task 1262 * @data: cpuset to which @tsk belongs
1274 * 1263 *
1275 * Called by cgroup_scan_tasks() for each task in a cgroup. 1264 * Called by css_scan_tasks() for each task in a cgroup.
1276 * 1265 *
1277 * We don't need to re-check for the cgroup/cpuset membership, since we're 1266 * We don't need to re-check for the cgroup/cpuset membership, since we're
1278 * holding cpuset_mutex at this point. 1267 * holding cpuset_mutex at this point.
1279 */ 1268 */
1280static void cpuset_change_flag(struct task_struct *tsk, 1269static void cpuset_change_flag(struct task_struct *tsk, void *data)
1281 struct cgroup_scanner *scan)
1282{ 1270{
1283 cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk); 1271 struct cpuset *cs = data;
1272
1273 cpuset_update_task_spread_flag(cs, tsk);
1284} 1274}
1285 1275
1286/* 1276/**
1287 * update_tasks_flags - update the spread flags of tasks in the cpuset. 1277 * update_tasks_flags - update the spread flags of tasks in the cpuset.
1288 * @cs: the cpuset in which each task's spread flags needs to be changed 1278 * @cs: the cpuset in which each task's spread flags needs to be changed
1289 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 1279 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
1290 * 1280 *
1291 * Called with cpuset_mutex held 1281 * Called with cpuset_mutex held
1292 * 1282 *
1293 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 1283 * The css_scan_tasks() function will scan all the tasks in a cgroup,
1294 * calling callback functions for each. 1284 * calling callback functions for each.
1295 * 1285 *
1296 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 1286 * No return value. It's guaranteed that css_scan_tasks() always returns 0
1297 * if @heap != NULL. 1287 * if @heap != NULL.
1298 */ 1288 */
1299static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) 1289static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
1300{ 1290{
1301 struct cgroup_scanner scan; 1291 css_scan_tasks(&cs->css, NULL, cpuset_change_flag, cs, heap);
1302
1303 scan.cg = cs->css.cgroup;
1304 scan.test_task = NULL;
1305 scan.process_task = cpuset_change_flag;
1306 scan.heap = heap;
1307 cgroup_scan_tasks(&scan);
1308} 1292}
1309 1293
1310/* 1294/*
@@ -1462,9 +1446,10 @@ static int fmeter_getrate(struct fmeter *fmp)
1462} 1446}
1463 1447
1464/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ 1448/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
1465static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1449static int cpuset_can_attach(struct cgroup_subsys_state *css,
1450 struct cgroup_taskset *tset)
1466{ 1451{
1467 struct cpuset *cs = cgroup_cs(cgrp); 1452 struct cpuset *cs = css_cs(css);
1468 struct task_struct *task; 1453 struct task_struct *task;
1469 int ret; 1454 int ret;
1470 1455
@@ -1475,11 +1460,11 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1475 * flag is set. 1460 * flag is set.
1476 */ 1461 */
1477 ret = -ENOSPC; 1462 ret = -ENOSPC;
1478 if (!cgroup_sane_behavior(cgrp) && 1463 if (!cgroup_sane_behavior(css->cgroup) &&
1479 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) 1464 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
1480 goto out_unlock; 1465 goto out_unlock;
1481 1466
1482 cgroup_taskset_for_each(task, cgrp, tset) { 1467 cgroup_taskset_for_each(task, css, tset) {
1483 /* 1468 /*
1484 * Kthreads which disallow setaffinity shouldn't be moved 1469 * Kthreads which disallow setaffinity shouldn't be moved
1485 * to a new cpuset; we don't want to change their cpu 1470 * to a new cpuset; we don't want to change their cpu
@@ -1508,11 +1493,11 @@ out_unlock:
1508 return ret; 1493 return ret;
1509} 1494}
1510 1495
1511static void cpuset_cancel_attach(struct cgroup *cgrp, 1496static void cpuset_cancel_attach(struct cgroup_subsys_state *css,
1512 struct cgroup_taskset *tset) 1497 struct cgroup_taskset *tset)
1513{ 1498{
1514 mutex_lock(&cpuset_mutex); 1499 mutex_lock(&cpuset_mutex);
1515 cgroup_cs(cgrp)->attach_in_progress--; 1500 css_cs(css)->attach_in_progress--;
1516 mutex_unlock(&cpuset_mutex); 1501 mutex_unlock(&cpuset_mutex);
1517} 1502}
1518 1503
@@ -1523,16 +1508,18 @@ static void cpuset_cancel_attach(struct cgroup *cgrp,
1523 */ 1508 */
1524static cpumask_var_t cpus_attach; 1509static cpumask_var_t cpus_attach;
1525 1510
1526static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1511static void cpuset_attach(struct cgroup_subsys_state *css,
1512 struct cgroup_taskset *tset)
1527{ 1513{
1528 /* static buf protected by cpuset_mutex */ 1514 /* static buf protected by cpuset_mutex */
1529 static nodemask_t cpuset_attach_nodemask_to; 1515 static nodemask_t cpuset_attach_nodemask_to;
1530 struct mm_struct *mm; 1516 struct mm_struct *mm;
1531 struct task_struct *task; 1517 struct task_struct *task;
1532 struct task_struct *leader = cgroup_taskset_first(tset); 1518 struct task_struct *leader = cgroup_taskset_first(tset);
1533 struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); 1519 struct cgroup_subsys_state *oldcss = cgroup_taskset_cur_css(tset,
1534 struct cpuset *cs = cgroup_cs(cgrp); 1520 cpuset_subsys_id);
1535 struct cpuset *oldcs = cgroup_cs(oldcgrp); 1521 struct cpuset *cs = css_cs(css);
1522 struct cpuset *oldcs = css_cs(oldcss);
1536 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); 1523 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
1537 struct cpuset *mems_cs = effective_nodemask_cpuset(cs); 1524 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1538 1525
@@ -1546,7 +1533,7 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1546 1533
1547 guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); 1534 guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);
1548 1535
1549 cgroup_taskset_for_each(task, cgrp, tset) { 1536 cgroup_taskset_for_each(task, css, tset) {
1550 /* 1537 /*
1551 * can_attach beforehand should guarantee that this doesn't 1538 * can_attach beforehand should guarantee that this doesn't
1552 * fail. TODO: have a better way to handle failure here 1539 * fail. TODO: have a better way to handle failure here
@@ -1608,9 +1595,10 @@ typedef enum {
1608 FILE_SPREAD_SLAB, 1595 FILE_SPREAD_SLAB,
1609} cpuset_filetype_t; 1596} cpuset_filetype_t;
1610 1597
1611static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) 1598static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
1599 u64 val)
1612{ 1600{
1613 struct cpuset *cs = cgroup_cs(cgrp); 1601 struct cpuset *cs = css_cs(css);
1614 cpuset_filetype_t type = cft->private; 1602 cpuset_filetype_t type = cft->private;
1615 int retval = 0; 1603 int retval = 0;
1616 1604
@@ -1657,9 +1645,10 @@ out_unlock:
1657 return retval; 1645 return retval;
1658} 1646}
1659 1647
1660static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) 1648static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
1649 s64 val)
1661{ 1650{
1662 struct cpuset *cs = cgroup_cs(cgrp); 1651 struct cpuset *cs = css_cs(css);
1663 cpuset_filetype_t type = cft->private; 1652 cpuset_filetype_t type = cft->private;
1664 int retval = -ENODEV; 1653 int retval = -ENODEV;
1665 1654
@@ -1683,10 +1672,10 @@ out_unlock:
1683/* 1672/*
1684 * Common handling for a write to a "cpus" or "mems" file. 1673 * Common handling for a write to a "cpus" or "mems" file.
1685 */ 1674 */
1686static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, 1675static int cpuset_write_resmask(struct cgroup_subsys_state *css,
1687 const char *buf) 1676 struct cftype *cft, const char *buf)
1688{ 1677{
1689 struct cpuset *cs = cgroup_cs(cgrp); 1678 struct cpuset *cs = css_cs(css);
1690 struct cpuset *trialcs; 1679 struct cpuset *trialcs;
1691 int retval = -ENODEV; 1680 int retval = -ENODEV;
1692 1681
@@ -1765,13 +1754,12 @@ static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1765 return count; 1754 return count;
1766} 1755}
1767 1756
1768static ssize_t cpuset_common_file_read(struct cgroup *cgrp, 1757static ssize_t cpuset_common_file_read(struct cgroup_subsys_state *css,
1769 struct cftype *cft, 1758 struct cftype *cft, struct file *file,
1770 struct file *file, 1759 char __user *buf, size_t nbytes,
1771 char __user *buf, 1760 loff_t *ppos)
1772 size_t nbytes, loff_t *ppos)
1773{ 1761{
1774 struct cpuset *cs = cgroup_cs(cgrp); 1762 struct cpuset *cs = css_cs(css);
1775 cpuset_filetype_t type = cft->private; 1763 cpuset_filetype_t type = cft->private;
1776 char *page; 1764 char *page;
1777 ssize_t retval = 0; 1765 ssize_t retval = 0;
@@ -1801,9 +1789,9 @@ out:
1801 return retval; 1789 return retval;
1802} 1790}
1803 1791
1804static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft) 1792static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
1805{ 1793{
1806 struct cpuset *cs = cgroup_cs(cgrp); 1794 struct cpuset *cs = css_cs(css);
1807 cpuset_filetype_t type = cft->private; 1795 cpuset_filetype_t type = cft->private;
1808 switch (type) { 1796 switch (type) {
1809 case FILE_CPU_EXCLUSIVE: 1797 case FILE_CPU_EXCLUSIVE:
@@ -1832,9 +1820,9 @@ static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft)
1832 return 0; 1820 return 0;
1833} 1821}
1834 1822
1835static s64 cpuset_read_s64(struct cgroup *cgrp, struct cftype *cft) 1823static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
1836{ 1824{
1837 struct cpuset *cs = cgroup_cs(cgrp); 1825 struct cpuset *cs = css_cs(css);
1838 cpuset_filetype_t type = cft->private; 1826 cpuset_filetype_t type = cft->private;
1839 switch (type) { 1827 switch (type) {
1840 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 1828 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1949,11 +1937,12 @@ static struct cftype files[] = {
1949 * cgrp: control group that the new cpuset will be part of 1937 * cgrp: control group that the new cpuset will be part of
1950 */ 1938 */
1951 1939
1952static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp) 1940static struct cgroup_subsys_state *
1941cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
1953{ 1942{
1954 struct cpuset *cs; 1943 struct cpuset *cs;
1955 1944
1956 if (!cgrp->parent) 1945 if (!parent_css)
1957 return &top_cpuset.css; 1946 return &top_cpuset.css;
1958 1947
1959 cs = kzalloc(sizeof(*cs), GFP_KERNEL); 1948 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
@@ -1973,12 +1962,12 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp)
1973 return &cs->css; 1962 return &cs->css;
1974} 1963}
1975 1964
1976static int cpuset_css_online(struct cgroup *cgrp) 1965static int cpuset_css_online(struct cgroup_subsys_state *css)
1977{ 1966{
1978 struct cpuset *cs = cgroup_cs(cgrp); 1967 struct cpuset *cs = css_cs(css);
1979 struct cpuset *parent = parent_cs(cs); 1968 struct cpuset *parent = parent_cs(cs);
1980 struct cpuset *tmp_cs; 1969 struct cpuset *tmp_cs;
1981 struct cgroup *pos_cg; 1970 struct cgroup_subsys_state *pos_css;
1982 1971
1983 if (!parent) 1972 if (!parent)
1984 return 0; 1973 return 0;
@@ -1993,7 +1982,7 @@ static int cpuset_css_online(struct cgroup *cgrp)
1993 1982
1994 number_of_cpusets++; 1983 number_of_cpusets++;
1995 1984
1996 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags)) 1985 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
1997 goto out_unlock; 1986 goto out_unlock;
1998 1987
1999 /* 1988 /*
@@ -2010,7 +1999,7 @@ static int cpuset_css_online(struct cgroup *cgrp)
2010 * (and likewise for mems) to the new cgroup. 1999 * (and likewise for mems) to the new cgroup.
2011 */ 2000 */
2012 rcu_read_lock(); 2001 rcu_read_lock();
2013 cpuset_for_each_child(tmp_cs, pos_cg, parent) { 2002 cpuset_for_each_child(tmp_cs, pos_css, parent) {
2014 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { 2003 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
2015 rcu_read_unlock(); 2004 rcu_read_unlock();
2016 goto out_unlock; 2005 goto out_unlock;
@@ -2027,9 +2016,15 @@ out_unlock:
2027 return 0; 2016 return 0;
2028} 2017}
2029 2018
2030static void cpuset_css_offline(struct cgroup *cgrp) 2019/*
2020 * If the cpuset being removed has its flag 'sched_load_balance'
2021 * enabled, then simulate turning sched_load_balance off, which
2022 * will call rebuild_sched_domains_locked().
2023 */
2024
2025static void cpuset_css_offline(struct cgroup_subsys_state *css)
2031{ 2026{
2032 struct cpuset *cs = cgroup_cs(cgrp); 2027 struct cpuset *cs = css_cs(css);
2033 2028
2034 mutex_lock(&cpuset_mutex); 2029 mutex_lock(&cpuset_mutex);
2035 2030
@@ -2042,15 +2037,9 @@ static void cpuset_css_offline(struct cgroup *cgrp)
2042 mutex_unlock(&cpuset_mutex); 2037 mutex_unlock(&cpuset_mutex);
2043} 2038}
2044 2039
2045/* 2040static void cpuset_css_free(struct cgroup_subsys_state *css)
2046 * If the cpuset being removed has its flag 'sched_load_balance'
2047 * enabled, then simulate turning sched_load_balance off, which
2048 * will call rebuild_sched_domains_locked().
2049 */
2050
2051static void cpuset_css_free(struct cgroup *cgrp)
2052{ 2041{
2053 struct cpuset *cs = cgroup_cs(cgrp); 2042 struct cpuset *cs = css_cs(css);
2054 2043
2055 free_cpumask_var(cs->cpus_allowed); 2044 free_cpumask_var(cs->cpus_allowed);
2056 kfree(cs); 2045 kfree(cs);
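
Across the css_alloc conversions in this pull, the root is now recognized by a NULL parent_css rather than by cgrp->parent, and the statically allocated root object (top_cpuset here) is returned instead of a fresh allocation. A standalone analogue of that alloc/free pairing (names invented):

#include <stdio.h>
#include <stdlib.h>

struct state { struct state *parent; int depth; };

static struct state root_state;         /* statically allocated root */

/* Like the css_alloc hooks: a NULL parent means "this is the root". */
static struct state *state_alloc(struct state *parent)
{
        struct state *s;

        if (!parent)
                return &root_state;
        s = calloc(1, sizeof(*s));
        if (!s)
                return NULL;
        s->parent = parent;
        s->depth = parent->depth + 1;
        return s;
}

static void state_free(struct state *s)
{
        if (s && s != &root_state)      /* the root is never freed */
                free(s);
}

int main(void)
{
        struct state *root = state_alloc(NULL);
        struct state *child = state_alloc(root);

        printf("child depth %d\n", child->depth);
        state_free(child);
        state_free(root);
        return 0;
}
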
@@ -2257,11 +2246,11 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2257 /* if cpus or mems changed, we need to propagate to descendants */ 2246 /* if cpus or mems changed, we need to propagate to descendants */
2258 if (cpus_updated || mems_updated) { 2247 if (cpus_updated || mems_updated) {
2259 struct cpuset *cs; 2248 struct cpuset *cs;
2260 struct cgroup *pos_cgrp; 2249 struct cgroup_subsys_state *pos_css;
2261 2250
2262 rcu_read_lock(); 2251 rcu_read_lock();
2263 cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) { 2252 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
2264 if (!css_tryget(&cs->css)) 2253 if (cs == &top_cpuset || !css_tryget(&cs->css))
2265 continue; 2254 continue;
2266 rcu_read_unlock(); 2255 rcu_read_unlock();
2267 2256
@@ -2350,7 +2339,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2350 2339
2351void cpuset_cpus_allowed_fallback(struct task_struct *tsk) 2340void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2352{ 2341{
2353 const struct cpuset *cpus_cs; 2342 struct cpuset *cpus_cs;
2354 2343
2355 rcu_read_lock(); 2344 rcu_read_lock();
2356 cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); 2345 cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
@@ -2423,7 +2412,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
2423 * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall 2412 * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall
2424 * (an unusual configuration), then returns the root cpuset. 2413 * (an unusual configuration), then returns the root cpuset.
2425 */ 2414 */
2426static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) 2415static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
2427{ 2416{
2428 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) 2417 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
2429 cs = parent_cs(cs); 2418 cs = parent_cs(cs);
@@ -2493,7 +2482,7 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
2493 */ 2482 */
2494int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) 2483int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
2495{ 2484{
2496 const struct cpuset *cs; /* current cpuset ancestors */ 2485 struct cpuset *cs; /* current cpuset ancestors */
2497 int allowed; /* is allocation in zone z allowed? */ 2486 int allowed; /* is allocation in zone z allowed? */
2498 2487
2499 if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) 2488 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
@@ -2731,7 +2720,7 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v)
2731 goto out_free; 2720 goto out_free;
2732 2721
2733 rcu_read_lock(); 2722 rcu_read_lock();
2734 css = task_subsys_state(tsk, cpuset_subsys_id); 2723 css = task_css(tsk, cpuset_subsys_id);
2735 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); 2724 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
2736 rcu_read_unlock(); 2725 rcu_read_unlock();
2737 if (retval < 0) 2726 if (retval < 0)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f86599e8c123..9300f5226077 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -340,8 +340,8 @@ struct perf_cgroup {
340static inline struct perf_cgroup * 340static inline struct perf_cgroup *
341perf_cgroup_from_task(struct task_struct *task) 341perf_cgroup_from_task(struct task_struct *task)
342{ 342{
343 return container_of(task_subsys_state(task, perf_subsys_id), 343 return container_of(task_css(task, perf_subsys_id),
344 struct perf_cgroup, css); 344 struct perf_cgroup, css);
345} 345}
346 346
347static inline bool 347static inline bool
@@ -591,7 +591,9 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
591 if (!f.file) 591 if (!f.file)
592 return -EBADF; 592 return -EBADF;
593 593
594 css = cgroup_css_from_dir(f.file, perf_subsys_id); 594 rcu_read_lock();
595
596 css = css_from_dir(f.file->f_dentry, &perf_subsys);
595 if (IS_ERR(css)) { 597 if (IS_ERR(css)) {
596 ret = PTR_ERR(css); 598 ret = PTR_ERR(css);
597 goto out; 599 goto out;
@@ -617,6 +619,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
617 ret = -EINVAL; 619 ret = -EINVAL;
618 } 620 }
619out: 621out:
622 rcu_read_unlock();
620 fdput(f); 623 fdput(f);
621 return ret; 624 return ret;
622} 625}
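
The rcu_read_lock()/rcu_read_unlock() pair added around css_from_dir() exists because the returned pointer is RCU-protected: it is only guaranteed valid inside the read-side critical section, so any reference-taking or inspection has to happen before unlocking. A loose userspace analogue that substitutes a rwlock for RCU (all structure invented; RCU has no userspace equivalent this small):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_rwlock_t lookup_lock = PTHREAD_RWLOCK_INITIALIZER;

struct obj { atomic_int refcnt; int id; };
static struct obj table[1] = { { 1, 42 } };

/* The looked-up pointer is only stable while the read lock is held,
 * the same contract css_from_dir() has under rcu_read_lock(), so a
 * reference is pinned before unlocking (cf. css_tryget() elsewhere
 * in this series). */
static struct obj *lookup_and_get(int id)
{
        struct obj *o = NULL;

        pthread_rwlock_rdlock(&lookup_lock);
        if (table[0].id == id) {
                o = &table[0];
                atomic_fetch_add(&o->refcnt, 1);
        }
        pthread_rwlock_unlock(&lookup_lock);
        return o;
}

int main(void)
{
        struct obj *o = lookup_and_get(42);

        if (o)
                printf("obj %d refcnt %d\n", o->id, atomic_load(&o->refcnt));
        return 0;
}
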
@@ -7798,7 +7801,8 @@ unlock:
7798device_initcall(perf_event_sysfs_init); 7801device_initcall(perf_event_sysfs_init);
7799 7802
7800#ifdef CONFIG_CGROUP_PERF 7803#ifdef CONFIG_CGROUP_PERF
7801static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) 7804static struct cgroup_subsys_state *
7805perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
7802{ 7806{
7803 struct perf_cgroup *jc; 7807 struct perf_cgroup *jc;
7804 7808
@@ -7815,11 +7819,10 @@ static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont)
7815 return &jc->css; 7819 return &jc->css;
7816} 7820}
7817 7821
7818static void perf_cgroup_css_free(struct cgroup *cont) 7822static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
7819{ 7823{
7820 struct perf_cgroup *jc; 7824 struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
7821 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), 7825
7822 struct perf_cgroup, css);
7823 free_percpu(jc->info); 7826 free_percpu(jc->info);
7824 kfree(jc); 7827 kfree(jc);
7825} 7828}
@@ -7831,15 +7834,17 @@ static int __perf_cgroup_move(void *info)
7831 return 0; 7834 return 0;
7832} 7835}
7833 7836
7834static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 7837static void perf_cgroup_attach(struct cgroup_subsys_state *css,
7838 struct cgroup_taskset *tset)
7835{ 7839{
7836 struct task_struct *task; 7840 struct task_struct *task;
7837 7841
7838 cgroup_taskset_for_each(task, cgrp, tset) 7842 cgroup_taskset_for_each(task, css, tset)
7839 task_function_call(task, __perf_cgroup_move, task); 7843 task_function_call(task, __perf_cgroup_move, task);
7840} 7844}
7841 7845
7842static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, 7846static void perf_cgroup_exit(struct cgroup_subsys_state *css,
7847 struct cgroup_subsys_state *old_css,
7843 struct task_struct *task) 7848 struct task_struct *task)
7844{ 7849{
7845 /* 7850 /*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 05c39f030314..e53bda3ff2f1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6815,7 +6815,7 @@ void sched_move_task(struct task_struct *tsk)
6815 if (unlikely(running)) 6815 if (unlikely(running))
6816 tsk->sched_class->put_prev_task(rq, tsk); 6816 tsk->sched_class->put_prev_task(rq, tsk);
6817 6817
6818 tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id, 6818 tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id,
6819 lockdep_is_held(&tsk->sighand->siglock)), 6819 lockdep_is_held(&tsk->sighand->siglock)),
6820 struct task_group, css); 6820 struct task_group, css);
6821 tg = autogroup_task_group(tsk, tg); 6821 tg = autogroup_task_group(tsk, tg);
@@ -7137,23 +7137,22 @@ int sched_rt_handler(struct ctl_table *table, int write,
7137 7137
7138#ifdef CONFIG_CGROUP_SCHED 7138#ifdef CONFIG_CGROUP_SCHED
7139 7139
7140/* return corresponding task_group object of a cgroup */ 7140static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
7141static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7142{ 7141{
7143 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), 7142 return css ? container_of(css, struct task_group, css) : NULL;
7144 struct task_group, css);
7145} 7143}
7146 7144
7147static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) 7145static struct cgroup_subsys_state *
7146cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
7148{ 7147{
7149 struct task_group *tg, *parent; 7148 struct task_group *parent = css_tg(parent_css);
7149 struct task_group *tg;
7150 7150
7151 if (!cgrp->parent) { 7151 if (!parent) {
7152 /* This is early initialization for the top cgroup */ 7152 /* This is early initialization for the top cgroup */
7153 return &root_task_group.css; 7153 return &root_task_group.css;
7154 } 7154 }
7155 7155
7156 parent = cgroup_tg(cgrp->parent);
7157 tg = sched_create_group(parent); 7156 tg = sched_create_group(parent);
7158 if (IS_ERR(tg)) 7157 if (IS_ERR(tg))
7159 return ERR_PTR(-ENOMEM); 7158 return ERR_PTR(-ENOMEM);
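
css_tg() above, like css_ca(), hugetlb_cgroup_from_css() and mem_cgroup_from_css() later in the series, is a NULL-tolerant container_of() wrapper: feeding it the NULL that css_parent() returns for the root yields NULL rather than a garbage pointer, which is what lets cpu_cgroup_css_online() drop its !cgrp->parent special case. Spelled out as standalone C:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct css { int id; };
struct task_group { long shares; struct css css; };

/* NULL in, NULL out: safe to feed the css_parent() of the root. */
static struct task_group *css_tg(struct css *css)
{
        return css ? container_of(css, struct task_group, css) : NULL;
}

int main(void)
{
        struct task_group tg = { .shares = 1024 };

        printf("shares %ld\n", css_tg(&tg.css)->shares);
        printf("parent of root: %p\n", (void *)css_tg(NULL));
        return 0;
}
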
@@ -7161,41 +7160,38 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
7161 return &tg->css; 7160 return &tg->css;
7162} 7161}
7163 7162
7164static int cpu_cgroup_css_online(struct cgroup *cgrp) 7163static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
7165{ 7164{
7166 struct task_group *tg = cgroup_tg(cgrp); 7165 struct task_group *tg = css_tg(css);
7167 struct task_group *parent; 7166 struct task_group *parent = css_tg(css_parent(css));
7168
7169 if (!cgrp->parent)
7170 return 0;
7171 7167
7172 parent = cgroup_tg(cgrp->parent); 7168 if (parent)
7173 sched_online_group(tg, parent); 7169 sched_online_group(tg, parent);
7174 return 0; 7170 return 0;
7175} 7171}
7176 7172
7177static void cpu_cgroup_css_free(struct cgroup *cgrp) 7173static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
7178{ 7174{
7179 struct task_group *tg = cgroup_tg(cgrp); 7175 struct task_group *tg = css_tg(css);
7180 7176
7181 sched_destroy_group(tg); 7177 sched_destroy_group(tg);
7182} 7178}
7183 7179
7184static void cpu_cgroup_css_offline(struct cgroup *cgrp) 7180static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
7185{ 7181{
7186 struct task_group *tg = cgroup_tg(cgrp); 7182 struct task_group *tg = css_tg(css);
7187 7183
7188 sched_offline_group(tg); 7184 sched_offline_group(tg);
7189} 7185}
7190 7186
7191static int cpu_cgroup_can_attach(struct cgroup *cgrp, 7187static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
7192 struct cgroup_taskset *tset) 7188 struct cgroup_taskset *tset)
7193{ 7189{
7194 struct task_struct *task; 7190 struct task_struct *task;
7195 7191
7196 cgroup_taskset_for_each(task, cgrp, tset) { 7192 cgroup_taskset_for_each(task, css, tset) {
7197#ifdef CONFIG_RT_GROUP_SCHED 7193#ifdef CONFIG_RT_GROUP_SCHED
7198 if (!sched_rt_can_attach(cgroup_tg(cgrp), task)) 7194 if (!sched_rt_can_attach(css_tg(css), task))
7199 return -EINVAL; 7195 return -EINVAL;
7200#else 7196#else
7201 /* We don't support RT-tasks being in separate groups */ 7197 /* We don't support RT-tasks being in separate groups */
@@ -7206,18 +7202,18 @@ static int cpu_cgroup_can_attach(struct cgroup *cgrp,
7206 return 0; 7202 return 0;
7207} 7203}
7208 7204
7209static void cpu_cgroup_attach(struct cgroup *cgrp, 7205static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
7210 struct cgroup_taskset *tset) 7206 struct cgroup_taskset *tset)
7211{ 7207{
7212 struct task_struct *task; 7208 struct task_struct *task;
7213 7209
7214 cgroup_taskset_for_each(task, cgrp, tset) 7210 cgroup_taskset_for_each(task, css, tset)
7215 sched_move_task(task); 7211 sched_move_task(task);
7216} 7212}
7217 7213
7218static void 7214static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
7219cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, 7215 struct cgroup_subsys_state *old_css,
7220 struct task_struct *task) 7216 struct task_struct *task)
7221{ 7217{
7222 /* 7218 /*
7223 * cgroup_exit() is called in the copy_process() failure path. 7219 * cgroup_exit() is called in the copy_process() failure path.
@@ -7231,15 +7227,16 @@ cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
7231} 7227}
7232 7228
7233#ifdef CONFIG_FAIR_GROUP_SCHED 7229#ifdef CONFIG_FAIR_GROUP_SCHED
7234static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 7230static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
7235 u64 shareval) 7231 struct cftype *cftype, u64 shareval)
7236{ 7232{
7237 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); 7233 return sched_group_set_shares(css_tg(css), scale_load(shareval));
7238} 7234}
7239 7235
7240static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 7236static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
7237 struct cftype *cft)
7241{ 7238{
7242 struct task_group *tg = cgroup_tg(cgrp); 7239 struct task_group *tg = css_tg(css);
7243 7240
7244 return (u64) scale_load_down(tg->shares); 7241 return (u64) scale_load_down(tg->shares);
7245} 7242}
@@ -7361,26 +7358,28 @@ long tg_get_cfs_period(struct task_group *tg)
7361 return cfs_period_us; 7358 return cfs_period_us;
7362} 7359}
7363 7360
7364static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) 7361static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
7362 struct cftype *cft)
7365{ 7363{
7366 return tg_get_cfs_quota(cgroup_tg(cgrp)); 7364 return tg_get_cfs_quota(css_tg(css));
7367} 7365}
7368 7366
7369static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, 7367static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
7370 s64 cfs_quota_us) 7368 struct cftype *cftype, s64 cfs_quota_us)
7371{ 7369{
7372 return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); 7370 return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
7373} 7371}
7374 7372
7375static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) 7373static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
7374 struct cftype *cft)
7376{ 7375{
7377 return tg_get_cfs_period(cgroup_tg(cgrp)); 7376 return tg_get_cfs_period(css_tg(css));
7378} 7377}
7379 7378
7380static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, 7379static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
7381 u64 cfs_period_us) 7380 struct cftype *cftype, u64 cfs_period_us)
7382{ 7381{
7383 return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); 7382 return tg_set_cfs_period(css_tg(css), cfs_period_us);
7384} 7383}
7385 7384
7386struct cfs_schedulable_data { 7385struct cfs_schedulable_data {
@@ -7461,10 +7460,10 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
7461 return ret; 7460 return ret;
7462} 7461}
7463 7462
7464static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, 7463static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft,
7465 struct cgroup_map_cb *cb) 7464 struct cgroup_map_cb *cb)
7466{ 7465{
7467 struct task_group *tg = cgroup_tg(cgrp); 7466 struct task_group *tg = css_tg(css);
7468 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7467 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7469 7468
7470 cb->fill(cb, "nr_periods", cfs_b->nr_periods); 7469 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
@@ -7477,26 +7476,28 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
7477#endif /* CONFIG_FAIR_GROUP_SCHED */ 7476#endif /* CONFIG_FAIR_GROUP_SCHED */
7478 7477
7479#ifdef CONFIG_RT_GROUP_SCHED 7478#ifdef CONFIG_RT_GROUP_SCHED
7480static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 7479static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
7481 s64 val) 7480 struct cftype *cft, s64 val)
7482{ 7481{
7483 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); 7482 return sched_group_set_rt_runtime(css_tg(css), val);
7484} 7483}
7485 7484
7486static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) 7485static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
7486 struct cftype *cft)
7487{ 7487{
7488 return sched_group_rt_runtime(cgroup_tg(cgrp)); 7488 return sched_group_rt_runtime(css_tg(css));
7489} 7489}
7490 7490
7491static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, 7491static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
7492 u64 rt_period_us) 7492 struct cftype *cftype, u64 rt_period_us)
7493{ 7493{
7494 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); 7494 return sched_group_set_rt_period(css_tg(css), rt_period_us);
7495} 7495}
7496 7496
7497static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) 7497static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
7498 struct cftype *cft)
7498{ 7499{
7499 return sched_group_rt_period(cgroup_tg(cgrp)); 7500 return sched_group_rt_period(css_tg(css));
7500} 7501}
7501#endif /* CONFIG_RT_GROUP_SCHED */ 7502#endif /* CONFIG_RT_GROUP_SCHED */
7502 7503
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index dbb7e2cd95eb..f64722ff0299 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -33,30 +33,20 @@ struct cpuacct {
33 struct kernel_cpustat __percpu *cpustat; 33 struct kernel_cpustat __percpu *cpustat;
34}; 34};
35 35
36/* return cpu accounting group corresponding to this container */ 36static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
37static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
38{ 37{
39 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), 38 return css ? container_of(css, struct cpuacct, css) : NULL;
40 struct cpuacct, css);
41} 39}
42 40
43/* return cpu accounting group to which this task belongs */ 41/* return cpu accounting group to which this task belongs */
44static inline struct cpuacct *task_ca(struct task_struct *tsk) 42static inline struct cpuacct *task_ca(struct task_struct *tsk)
45{ 43{
46 return container_of(task_subsys_state(tsk, cpuacct_subsys_id), 44 return css_ca(task_css(tsk, cpuacct_subsys_id));
47 struct cpuacct, css);
48}
49
50static inline struct cpuacct *__parent_ca(struct cpuacct *ca)
51{
52 return cgroup_ca(ca->css.cgroup->parent);
53} 45}
54 46
55static inline struct cpuacct *parent_ca(struct cpuacct *ca) 47static inline struct cpuacct *parent_ca(struct cpuacct *ca)
56{ 48{
57 if (!ca->css.cgroup->parent) 49 return css_ca(css_parent(&ca->css));
58 return NULL;
59 return cgroup_ca(ca->css.cgroup->parent);
60} 50}
61 51
62static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); 52static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
@@ -66,11 +56,12 @@ static struct cpuacct root_cpuacct = {
66}; 56};
67 57
68/* create a new cpu accounting group */ 58/* create a new cpu accounting group */
69static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) 59static struct cgroup_subsys_state *
60cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
70{ 61{
71 struct cpuacct *ca; 62 struct cpuacct *ca;
72 63
73 if (!cgrp->parent) 64 if (!parent_css)
74 return &root_cpuacct.css; 65 return &root_cpuacct.css;
75 66
76 ca = kzalloc(sizeof(*ca), GFP_KERNEL); 67 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
@@ -96,9 +87,9 @@ out:
96} 87}
97 88
98/* destroy an existing cpu accounting group */ 89/* destroy an existing cpu accounting group */
99static void cpuacct_css_free(struct cgroup *cgrp) 90static void cpuacct_css_free(struct cgroup_subsys_state *css)
100{ 91{
101 struct cpuacct *ca = cgroup_ca(cgrp); 92 struct cpuacct *ca = css_ca(css);
102 93
103 free_percpu(ca->cpustat); 94 free_percpu(ca->cpustat);
104 free_percpu(ca->cpuusage); 95 free_percpu(ca->cpuusage);
@@ -141,9 +132,9 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
141} 132}
142 133
143/* return total cpu usage (in nanoseconds) of a group */ 134/* return total cpu usage (in nanoseconds) of a group */
144static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) 135static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
145{ 136{
146 struct cpuacct *ca = cgroup_ca(cgrp); 137 struct cpuacct *ca = css_ca(css);
147 u64 totalcpuusage = 0; 138 u64 totalcpuusage = 0;
148 int i; 139 int i;
149 140
@@ -153,10 +144,10 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
153 return totalcpuusage; 144 return totalcpuusage;
154} 145}
155 146
156static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, 147static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
157 u64 reset) 148 u64 reset)
158{ 149{
159 struct cpuacct *ca = cgroup_ca(cgrp); 150 struct cpuacct *ca = css_ca(css);
160 int err = 0; 151 int err = 0;
161 int i; 152 int i;
162 153
@@ -172,10 +163,10 @@ out:
172 return err; 163 return err;
173} 164}
174 165
175static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, 166static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css,
176 struct seq_file *m) 167 struct cftype *cft, struct seq_file *m)
177{ 168{
178 struct cpuacct *ca = cgroup_ca(cgroup); 169 struct cpuacct *ca = css_ca(css);
179 u64 percpu; 170 u64 percpu;
180 int i; 171 int i;
181 172
@@ -192,10 +183,10 @@ static const char * const cpuacct_stat_desc[] = {
192 [CPUACCT_STAT_SYSTEM] = "system", 183 [CPUACCT_STAT_SYSTEM] = "system",
193}; 184};
194 185
195static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, 186static int cpuacct_stats_show(struct cgroup_subsys_state *css,
196 struct cgroup_map_cb *cb) 187 struct cftype *cft, struct cgroup_map_cb *cb)
197{ 188{
198 struct cpuacct *ca = cgroup_ca(cgrp); 189 struct cpuacct *ca = css_ca(css);
199 int cpu; 190 int cpu;
200 s64 val = 0; 191 s64 val = 0;
201 192
@@ -281,7 +272,7 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val)
281 while (ca != &root_cpuacct) { 272 while (ca != &root_cpuacct) {
282 kcpustat = this_cpu_ptr(ca->cpustat); 273 kcpustat = this_cpu_ptr(ca->cpustat);
283 kcpustat->cpustat[index] += val; 274 kcpustat->cpustat[index] += val;
284 ca = __parent_ca(ca); 275 ca = parent_ca(ca);
285 } 276 }
286 rcu_read_unlock(); 277 rcu_read_unlock();
287} 278}
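
Dropping __parent_ca() is safe because the walk in cpuacct_account_field() terminates at &root_cpuacct before the NULL that the unified parent_ca() returns for the root could ever be dereferenced. The loop shape in miniature (runnable; names invented):

#include <stdio.h>

struct ca { struct ca *parent; long usage; };

static struct ca root;

static struct ca *parent_ca(struct ca *ca)
{
        return ca->parent;              /* NULL above the root */
}

/* Same shape as cpuacct_account_field(): charge each ancestor, stopping
 * at the root before parent_ca() can ever hand us a NULL. */
static void account(struct ca *ca, long val)
{
        while (ca != &root) {
                ca->usage += val;
                ca = parent_ca(ca);
        }
}

int main(void)
{
        struct ca child = { .parent = &root };

        account(&child, 5);
        printf("child=%ld root=%ld\n", child.usage, root.usage);
        return 0;
}
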
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ef0a7b2439dd..471a56db05ea 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -665,9 +665,9 @@ extern int group_balance_cpu(struct sched_group *sg);
665/* 665/*
 666 * Return the group to which this task belongs. 666 * Return the group to which this task belongs.
667 * 667 *
668 * We cannot use task_subsys_state() and friends because the cgroup 668 * We cannot use task_css() and friends because the cgroup subsystem
669 * subsystem changes that value before the cgroup_subsys::attach() method 669 * changes that value before the cgroup_subsys::attach() method is called,
670 * is called, therefore we cannot pin it and might observe the wrong value. 670 * therefore we cannot pin it and might observe the wrong value.
671 * 671 *
672 * The same is true for autogroup's p->signal->autogroup->tg, the autogroup 672 * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
673 * core changes this before calling sched_move_task(). 673 * core changes this before calling sched_move_task().
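
task_css_check(), as used by sched_move_task() above, is the checked variant of the accessor: rather than requiring an RCU read-side section, the caller supplies a lockdep expression stating why the unlocked dereference is safe (holding siglock in that path). A toy analogue of such a checked accessor (entirely invented; real lockdep is far richer than an assert):

#include <assert.h>
#include <stdio.h>

struct group { const char *name; };
struct task { struct group *grp; int siglock_held; };

/* Checked accessor in miniature: the caller states the condition that
 * makes the unlocked read safe, and debug builds verify it. */
#define task_group_check(t, cond) \
        (assert(cond), (t)->grp)

int main(void)
{
        struct group g = { "tg0" };
        struct task t = { &g, 1 };

        printf("%s\n", task_group_check(&t, t.siglock_held)->name);
        return 0;
}
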
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 9cea7de22ffb..bda8e44f6fde 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -36,21 +36,13 @@ static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
36static inline 36static inline
37struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) 37struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
38{ 38{
39 return container_of(s, struct hugetlb_cgroup, css); 39 return s ? container_of(s, struct hugetlb_cgroup, css) : NULL;
40}
41
42static inline
43struct hugetlb_cgroup *hugetlb_cgroup_from_cgroup(struct cgroup *cgroup)
44{
45 return hugetlb_cgroup_from_css(cgroup_subsys_state(cgroup,
46 hugetlb_subsys_id));
47} 40}
48 41
49static inline 42static inline
50struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task) 43struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
51{ 44{
52 return hugetlb_cgroup_from_css(task_subsys_state(task, 45 return hugetlb_cgroup_from_css(task_css(task, hugetlb_subsys_id));
53 hugetlb_subsys_id));
54} 46}
55 47
56static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) 48static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
@@ -58,17 +50,15 @@ static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
58 return (h_cg == root_h_cgroup); 50 return (h_cg == root_h_cgroup);
59} 51}
60 52
61static inline struct hugetlb_cgroup *parent_hugetlb_cgroup(struct cgroup *cg) 53static inline struct hugetlb_cgroup *
54parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg)
62{ 55{
63 if (!cg->parent) 56 return hugetlb_cgroup_from_css(css_parent(&h_cg->css));
64 return NULL;
65 return hugetlb_cgroup_from_cgroup(cg->parent);
66} 57}
67 58
68static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg) 59static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
69{ 60{
70 int idx; 61 int idx;
71 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cg);
72 62
73 for (idx = 0; idx < hugetlb_max_hstate; idx++) { 63 for (idx = 0; idx < hugetlb_max_hstate; idx++) {
74 if ((res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE)) > 0) 64 if ((res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE)) > 0)
@@ -77,19 +67,18 @@ static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg)
77 return false; 67 return false;
78} 68}
79 69
80static struct cgroup_subsys_state *hugetlb_cgroup_css_alloc(struct cgroup *cgroup) 70static struct cgroup_subsys_state *
71hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
81{ 72{
73 struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
74 struct hugetlb_cgroup *h_cgroup;
82 int idx; 75 int idx;
83 struct cgroup *parent_cgroup;
84 struct hugetlb_cgroup *h_cgroup, *parent_h_cgroup;
85 76
86 h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL); 77 h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
87 if (!h_cgroup) 78 if (!h_cgroup)
88 return ERR_PTR(-ENOMEM); 79 return ERR_PTR(-ENOMEM);
89 80
90 parent_cgroup = cgroup->parent; 81 if (parent_h_cgroup) {
91 if (parent_cgroup) {
92 parent_h_cgroup = hugetlb_cgroup_from_cgroup(parent_cgroup);
93 for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) 82 for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
94 res_counter_init(&h_cgroup->hugepage[idx], 83 res_counter_init(&h_cgroup->hugepage[idx],
95 &parent_h_cgroup->hugepage[idx]); 84 &parent_h_cgroup->hugepage[idx]);
@@ -101,11 +90,11 @@ static struct cgroup_subsys_state *hugetlb_cgroup_css_alloc(struct cgroup *cgrou
101 return &h_cgroup->css; 90 return &h_cgroup->css;
102} 91}
103 92
104static void hugetlb_cgroup_css_free(struct cgroup *cgroup) 93static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
105{ 94{
106 struct hugetlb_cgroup *h_cgroup; 95 struct hugetlb_cgroup *h_cgroup;
107 96
108 h_cgroup = hugetlb_cgroup_from_cgroup(cgroup); 97 h_cgroup = hugetlb_cgroup_from_css(css);
109 kfree(h_cgroup); 98 kfree(h_cgroup);
110} 99}
111 100
@@ -117,15 +106,14 @@ static void hugetlb_cgroup_css_free(struct cgroup *cgroup)
117 * page reference and test for page active here. This function 106 * page reference and test for page active here. This function
118 * cannot fail. 107 * cannot fail.
119 */ 108 */
120static void hugetlb_cgroup_move_parent(int idx, struct cgroup *cgroup, 109static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
121 struct page *page) 110 struct page *page)
122{ 111{
123 int csize; 112 int csize;
124 struct res_counter *counter; 113 struct res_counter *counter;
125 struct res_counter *fail_res; 114 struct res_counter *fail_res;
126 struct hugetlb_cgroup *page_hcg; 115 struct hugetlb_cgroup *page_hcg;
127 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); 116 struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
128 struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(cgroup);
129 117
130 page_hcg = hugetlb_cgroup_from_page(page); 118 page_hcg = hugetlb_cgroup_from_page(page);
131 /* 119 /*
@@ -155,8 +143,9 @@ out:
155 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to 143 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
156 * the parent cgroup. 144 * the parent cgroup.
157 */ 145 */
158static void hugetlb_cgroup_css_offline(struct cgroup *cgroup) 146static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
159{ 147{
148 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
160 struct hstate *h; 149 struct hstate *h;
161 struct page *page; 150 struct page *page;
162 int idx = 0; 151 int idx = 0;
@@ -165,13 +154,13 @@ static void hugetlb_cgroup_css_offline(struct cgroup *cgroup)
165 for_each_hstate(h) { 154 for_each_hstate(h) {
166 spin_lock(&hugetlb_lock); 155 spin_lock(&hugetlb_lock);
167 list_for_each_entry(page, &h->hugepage_activelist, lru) 156 list_for_each_entry(page, &h->hugepage_activelist, lru)
168 hugetlb_cgroup_move_parent(idx, cgroup, page); 157 hugetlb_cgroup_move_parent(idx, h_cg, page);
169 158
170 spin_unlock(&hugetlb_lock); 159 spin_unlock(&hugetlb_lock);
171 idx++; 160 idx++;
172 } 161 }
173 cond_resched(); 162 cond_resched();
174 } while (hugetlb_cgroup_have_usage(cgroup)); 163 } while (hugetlb_cgroup_have_usage(h_cg));
175} 164}
176 165
177int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, 166int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
@@ -253,14 +242,15 @@ void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
253 return; 242 return;
254} 243}
255 244
256static ssize_t hugetlb_cgroup_read(struct cgroup *cgroup, struct cftype *cft, 245static ssize_t hugetlb_cgroup_read(struct cgroup_subsys_state *css,
257 struct file *file, char __user *buf, 246 struct cftype *cft, struct file *file,
258 size_t nbytes, loff_t *ppos) 247 char __user *buf, size_t nbytes,
248 loff_t *ppos)
259{ 249{
260 u64 val; 250 u64 val;
261 char str[64]; 251 char str[64];
262 int idx, name, len; 252 int idx, name, len;
263 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); 253 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
264 254
265 idx = MEMFILE_IDX(cft->private); 255 idx = MEMFILE_IDX(cft->private);
266 name = MEMFILE_ATTR(cft->private); 256 name = MEMFILE_ATTR(cft->private);
@@ -270,12 +260,12 @@ static ssize_t hugetlb_cgroup_read(struct cgroup *cgroup, struct cftype *cft,
270 return simple_read_from_buffer(buf, nbytes, ppos, str, len); 260 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
271} 261}
272 262
273static int hugetlb_cgroup_write(struct cgroup *cgroup, struct cftype *cft, 263static int hugetlb_cgroup_write(struct cgroup_subsys_state *css,
274 const char *buffer) 264 struct cftype *cft, const char *buffer)
275{ 265{
276 int idx, name, ret; 266 int idx, name, ret;
277 unsigned long long val; 267 unsigned long long val;
278 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); 268 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
279 269
280 idx = MEMFILE_IDX(cft->private); 270 idx = MEMFILE_IDX(cft->private);
281 name = MEMFILE_ATTR(cft->private); 271 name = MEMFILE_ATTR(cft->private);
@@ -300,10 +290,11 @@ static int hugetlb_cgroup_write(struct cgroup *cgroup, struct cftype *cft,
300 return ret; 290 return ret;
301} 291}
302 292
303static int hugetlb_cgroup_reset(struct cgroup *cgroup, unsigned int event) 293static int hugetlb_cgroup_reset(struct cgroup_subsys_state *css,
294 unsigned int event)
304{ 295{
305 int idx, name, ret = 0; 296 int idx, name, ret = 0;
306 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); 297 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
307 298
308 idx = MEMFILE_IDX(event); 299 idx = MEMFILE_IDX(event);
309 name = MEMFILE_ATTR(event); 300 name = MEMFILE_ATTR(event);
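
The hugetlb_cgroup hunk above leans on one accessor idiom that repeats through the rest of this series: controller state embeds its css, and the state is recovered from a css pointer with container_of(), with a NULL css mapping to NULL. A minimal self-contained sketch of that idiom, with the struct layout reduced to what the hunk actually uses (the per-hstate res_counter fields are assumed, not spelled out here):

	#include <stddef.h>

	/* Reduced stand-ins for the kernel types used in the hunk above. */
	struct cgroup_subsys_state { int id; };

	struct hugetlb_cgroup {
		struct cgroup_subsys_state css;	/* embedded css */
		/* the real struct carries a res_counter per hstate */
	};

	/* Recover controller state from a css; a NULL css maps to NULL. */
	static inline struct hugetlb_cgroup *
	hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
	{
		return s ? (struct hugetlb_cgroup *)((char *)s -
				offsetof(struct hugetlb_cgroup, css)) : NULL;
	}

This is why hugetlb_cgroup_move_parent() and hugetlb_cgroup_css_offline() can take the hugetlb_cgroup (or css) directly instead of re-deriving it from a struct cgroup each time.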
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0878ff7c26a9..3b83957b6439 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -483,10 +483,9 @@ enum res_type {
483 */ 483 */
484static DEFINE_MUTEX(memcg_create_mutex); 484static DEFINE_MUTEX(memcg_create_mutex);
485 485
486static inline
487struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) 486struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
488{ 487{
489 return container_of(s, struct mem_cgroup, css); 488 return s ? container_of(s, struct mem_cgroup, css) : NULL;
490} 489}
491 490
492/* Some nice accessors for the vmpressure. */ 491/* Some nice accessors for the vmpressure. */
@@ -1035,12 +1034,6 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
1035 preempt_enable(); 1034 preempt_enable();
1036} 1035}
1037 1036
1038struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
1039{
1040 return mem_cgroup_from_css(
1041 cgroup_subsys_state(cont, mem_cgroup_subsys_id));
1042}
1043
1044struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 1037struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
1045{ 1038{
1046 /* 1039 /*
@@ -1051,7 +1044,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
1051 if (unlikely(!p)) 1044 if (unlikely(!p))
1052 return NULL; 1045 return NULL;
1053 1046
1054 return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id)); 1047 return mem_cgroup_from_css(task_css(p, mem_cgroup_subsys_id));
1055} 1048}
1056 1049
1057struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 1050struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
@@ -1084,20 +1077,11 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
1084static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, 1077static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
1085 struct mem_cgroup *last_visited) 1078 struct mem_cgroup *last_visited)
1086{ 1079{
1087 struct cgroup *prev_cgroup, *next_cgroup; 1080 struct cgroup_subsys_state *prev_css, *next_css;
1088 1081
1089 /* 1082 prev_css = last_visited ? &last_visited->css : NULL;
1090 * Root is not visited by cgroup iterators so it needs an
1091 * explicit visit.
1092 */
1093 if (!last_visited)
1094 return root;
1095
1096 prev_cgroup = (last_visited == root) ? NULL
1097 : last_visited->css.cgroup;
1098skip_node: 1083skip_node:
1099 next_cgroup = cgroup_next_descendant_pre( 1084 next_css = css_next_descendant_pre(prev_css, &root->css);
1100 prev_cgroup, root->css.cgroup);
1101 1085
1102 /* 1086 /*
1103 * Even if we found a group we have to make sure it is 1087 * Even if we found a group we have to make sure it is
@@ -1106,13 +1090,13 @@ skip_node:
1106 * last_visited css is safe to use because it is 1090 * last_visited css is safe to use because it is
1107 * protected by css_get and the tree walk is rcu safe. 1091 * protected by css_get and the tree walk is rcu safe.
1108 */ 1092 */
1109 if (next_cgroup) { 1093 if (next_css) {
1110 struct mem_cgroup *mem = mem_cgroup_from_cont( 1094 struct mem_cgroup *mem = mem_cgroup_from_css(next_css);
1111 next_cgroup); 1095
1112 if (css_tryget(&mem->css)) 1096 if (css_tryget(&mem->css))
1113 return mem; 1097 return mem;
1114 else { 1098 else {
1115 prev_cgroup = next_cgroup; 1099 prev_css = next_css;
1116 goto skip_node; 1100 goto skip_node;
1117 } 1101 }
1118 } 1102 }
@@ -1525,10 +1509,8 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1525 1509
1526int mem_cgroup_swappiness(struct mem_cgroup *memcg) 1510int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1527{ 1511{
1528 struct cgroup *cgrp = memcg->css.cgroup;
1529
1530 /* root ? */ 1512 /* root ? */
1531 if (cgrp->parent == NULL) 1513 if (!css_parent(&memcg->css))
1532 return vm_swappiness; 1514 return vm_swappiness;
1533 1515
1534 return memcg->swappiness; 1516 return memcg->swappiness;
@@ -1805,12 +1787,11 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1805 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); 1787 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
1806 totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; 1788 totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
1807 for_each_mem_cgroup_tree(iter, memcg) { 1789 for_each_mem_cgroup_tree(iter, memcg) {
1808 struct cgroup *cgroup = iter->css.cgroup; 1790 struct css_task_iter it;
1809 struct cgroup_iter it;
1810 struct task_struct *task; 1791 struct task_struct *task;
1811 1792
1812 cgroup_iter_start(cgroup, &it); 1793 css_task_iter_start(&iter->css, &it);
1813 while ((task = cgroup_iter_next(cgroup, &it))) { 1794 while ((task = css_task_iter_next(&it))) {
1814 switch (oom_scan_process_thread(task, totalpages, NULL, 1795 switch (oom_scan_process_thread(task, totalpages, NULL,
1815 false)) { 1796 false)) {
1816 case OOM_SCAN_SELECT: 1797 case OOM_SCAN_SELECT:
@@ -1823,7 +1804,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1823 case OOM_SCAN_CONTINUE: 1804 case OOM_SCAN_CONTINUE:
1824 continue; 1805 continue;
1825 case OOM_SCAN_ABORT: 1806 case OOM_SCAN_ABORT:
1826 cgroup_iter_end(cgroup, &it); 1807 css_task_iter_end(&it);
1827 mem_cgroup_iter_break(memcg, iter); 1808 mem_cgroup_iter_break(memcg, iter);
1828 if (chosen) 1809 if (chosen)
1829 put_task_struct(chosen); 1810 put_task_struct(chosen);
@@ -1840,7 +1821,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1840 get_task_struct(chosen); 1821 get_task_struct(chosen);
1841 } 1822 }
1842 } 1823 }
1843 cgroup_iter_end(cgroup, &it); 1824 css_task_iter_end(&it);
1844 } 1825 }
1845 1826
1846 if (!chosen) 1827 if (!chosen)
@@ -2954,10 +2935,10 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
2954} 2935}
2955 2936
2956#ifdef CONFIG_SLABINFO 2937#ifdef CONFIG_SLABINFO
2957static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft, 2938static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css,
2958 struct seq_file *m) 2939 struct cftype *cft, struct seq_file *m)
2959{ 2940{
2960 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 2941 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
2961 struct memcg_cache_params *params; 2942 struct memcg_cache_params *params;
2962 2943
2963 if (!memcg_can_account_kmem(memcg)) 2944 if (!memcg_can_account_kmem(memcg))
@@ -4943,10 +4924,10 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
4943 */ 4924 */
4944static inline bool __memcg_has_children(struct mem_cgroup *memcg) 4925static inline bool __memcg_has_children(struct mem_cgroup *memcg)
4945{ 4926{
4946 struct cgroup *pos; 4927 struct cgroup_subsys_state *pos;
4947 4928
4948 /* bounce at first found */ 4929 /* bounce at first found */
4949 cgroup_for_each_child(pos, memcg->css.cgroup) 4930 css_for_each_child(pos, &memcg->css)
4950 return true; 4931 return true;
4951 return false; 4932 return false;
4952} 4933}
@@ -5002,9 +4983,10 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
5002 return 0; 4983 return 0;
5003} 4984}
5004 4985
5005static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 4986static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css,
4987 unsigned int event)
5006{ 4988{
5007 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4989 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5008 int ret; 4990 int ret;
5009 4991
5010 if (mem_cgroup_is_root(memcg)) 4992 if (mem_cgroup_is_root(memcg))
@@ -5017,21 +4999,18 @@ static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
5017} 4999}
5018 5000
5019 5001
5020static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft) 5002static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
5003 struct cftype *cft)
5021{ 5004{
5022 return mem_cgroup_from_cont(cont)->use_hierarchy; 5005 return mem_cgroup_from_css(css)->use_hierarchy;
5023} 5006}
5024 5007
5025static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, 5008static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
5026 u64 val) 5009 struct cftype *cft, u64 val)
5027{ 5010{
5028 int retval = 0; 5011 int retval = 0;
5029 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5012 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5030 struct cgroup *parent = cont->parent; 5013 struct mem_cgroup *parent_memcg = mem_cgroup_from_css(css_parent(&memcg->css));
5031 struct mem_cgroup *parent_memcg = NULL;
5032
5033 if (parent)
5034 parent_memcg = mem_cgroup_from_cont(parent);
5035 5014
5036 mutex_lock(&memcg_create_mutex); 5015 mutex_lock(&memcg_create_mutex);
5037 5016
@@ -5101,11 +5080,11 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
5101 return val << PAGE_SHIFT; 5080 return val << PAGE_SHIFT;
5102} 5081}
5103 5082
5104static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, 5083static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css,
5105 struct file *file, char __user *buf, 5084 struct cftype *cft, struct file *file,
5106 size_t nbytes, loff_t *ppos) 5085 char __user *buf, size_t nbytes, loff_t *ppos)
5107{ 5086{
5108 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5087 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5109 char str[64]; 5088 char str[64];
5110 u64 val; 5089 u64 val;
5111 int name, len; 5090 int name, len;
@@ -5138,11 +5117,11 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
5138 return simple_read_from_buffer(buf, nbytes, ppos, str, len); 5117 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
5139} 5118}
5140 5119
5141static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) 5120static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
5142{ 5121{
5143 int ret = -EINVAL; 5122 int ret = -EINVAL;
5144#ifdef CONFIG_MEMCG_KMEM 5123#ifdef CONFIG_MEMCG_KMEM
5145 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5124 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5146 /* 5125 /*
5147 * For simplicity, we won't allow this to be disabled. It also can't 5126 * For simplicity, we won't allow this to be disabled. It also can't
5148 * be changed if the cgroup has children already, or if tasks had 5127 * be changed if the cgroup has children already, or if tasks had
@@ -5158,7 +5137,7 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
5158 mutex_lock(&memcg_create_mutex); 5137 mutex_lock(&memcg_create_mutex);
5159 mutex_lock(&set_limit_mutex); 5138 mutex_lock(&set_limit_mutex);
5160 if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { 5139 if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
5161 if (cgroup_task_count(cont) || memcg_has_children(memcg)) { 5140 if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) {
5162 ret = -EBUSY; 5141 ret = -EBUSY;
5163 goto out; 5142 goto out;
5164 } 5143 }
@@ -5228,10 +5207,10 @@ out:
5228 * The user of this function is... 5207 * The user of this function is...
5229 * RES_LIMIT. 5208 * RES_LIMIT.
5230 */ 5209 */
5231static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, 5210static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
5232 const char *buffer) 5211 const char *buffer)
5233{ 5212{
5234 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5213 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5235 enum res_type type; 5214 enum res_type type;
5236 int name; 5215 int name;
5237 unsigned long long val; 5216 unsigned long long val;
@@ -5255,7 +5234,7 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
5255 else if (type == _MEMSWAP) 5234 else if (type == _MEMSWAP)
5256 ret = mem_cgroup_resize_memsw_limit(memcg, val); 5235 ret = mem_cgroup_resize_memsw_limit(memcg, val);
5257 else if (type == _KMEM) 5236 else if (type == _KMEM)
5258 ret = memcg_update_kmem_limit(cont, val); 5237 ret = memcg_update_kmem_limit(css, val);
5259 else 5238 else
5260 return -EINVAL; 5239 return -EINVAL;
5261 break; 5240 break;
@@ -5283,18 +5262,15 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
5283static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, 5262static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
5284 unsigned long long *mem_limit, unsigned long long *memsw_limit) 5263 unsigned long long *mem_limit, unsigned long long *memsw_limit)
5285{ 5264{
5286 struct cgroup *cgroup;
5287 unsigned long long min_limit, min_memsw_limit, tmp; 5265 unsigned long long min_limit, min_memsw_limit, tmp;
5288 5266
5289 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 5267 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
5290 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 5268 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
5291 cgroup = memcg->css.cgroup;
5292 if (!memcg->use_hierarchy) 5269 if (!memcg->use_hierarchy)
5293 goto out; 5270 goto out;
5294 5271
5295 while (cgroup->parent) { 5272 while (css_parent(&memcg->css)) {
5296 cgroup = cgroup->parent; 5273 memcg = mem_cgroup_from_css(css_parent(&memcg->css));
5297 memcg = mem_cgroup_from_cont(cgroup);
5298 if (!memcg->use_hierarchy) 5274 if (!memcg->use_hierarchy)
5299 break; 5275 break;
5300 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); 5276 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
@@ -5307,9 +5283,9 @@ out:
5307 *memsw_limit = min_memsw_limit; 5283 *memsw_limit = min_memsw_limit;
5308} 5284}
5309 5285
5310static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 5286static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event)
5311{ 5287{
5312 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5288 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5313 int name; 5289 int name;
5314 enum res_type type; 5290 enum res_type type;
5315 5291
@@ -5342,17 +5318,17 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
5342 return 0; 5318 return 0;
5343} 5319}
5344 5320
5345static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, 5321static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
5346 struct cftype *cft) 5322 struct cftype *cft)
5347{ 5323{
5348 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; 5324 return mem_cgroup_from_css(css)->move_charge_at_immigrate;
5349} 5325}
5350 5326
5351#ifdef CONFIG_MMU 5327#ifdef CONFIG_MMU
5352static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 5328static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
5353 struct cftype *cft, u64 val) 5329 struct cftype *cft, u64 val)
5354{ 5330{
5355 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5331 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5356 5332
5357 if (val >= (1 << NR_MOVE_TYPE)) 5333 if (val >= (1 << NR_MOVE_TYPE))
5358 return -EINVAL; 5334 return -EINVAL;
@@ -5367,7 +5343,7 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
5367 return 0; 5343 return 0;
5368} 5344}
5369#else 5345#else
5370static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 5346static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
5371 struct cftype *cft, u64 val) 5347 struct cftype *cft, u64 val)
5372{ 5348{
5373 return -ENOSYS; 5349 return -ENOSYS;
@@ -5375,13 +5351,13 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
5375#endif 5351#endif
5376 5352
5377#ifdef CONFIG_NUMA 5353#ifdef CONFIG_NUMA
5378static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, 5354static int memcg_numa_stat_show(struct cgroup_subsys_state *css,
5379 struct seq_file *m) 5355 struct cftype *cft, struct seq_file *m)
5380{ 5356{
5381 int nid; 5357 int nid;
5382 unsigned long total_nr, file_nr, anon_nr, unevictable_nr; 5358 unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
5383 unsigned long node_nr; 5359 unsigned long node_nr;
5384 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5360 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5385 5361
5386 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); 5362 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
5387 seq_printf(m, "total=%lu", total_nr); 5363 seq_printf(m, "total=%lu", total_nr);
@@ -5426,10 +5402,10 @@ static inline void mem_cgroup_lru_names_not_uptodate(void)
5426 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 5402 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
5427} 5403}
5428 5404
5429static int memcg_stat_show(struct cgroup *cont, struct cftype *cft, 5405static int memcg_stat_show(struct cgroup_subsys_state *css, struct cftype *cft,
5430 struct seq_file *m) 5406 struct seq_file *m)
5431{ 5407{
5432 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5408 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5433 struct mem_cgroup *mi; 5409 struct mem_cgroup *mi;
5434 unsigned int i; 5410 unsigned int i;
5435 5411
@@ -5513,27 +5489,23 @@ static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,
5513 return 0; 5489 return 0;
5514} 5490}
5515 5491
5516static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) 5492static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
5493 struct cftype *cft)
5517{ 5494{
5518 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5495 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5519 5496
5520 return mem_cgroup_swappiness(memcg); 5497 return mem_cgroup_swappiness(memcg);
5521} 5498}
5522 5499
5523static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, 5500static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
5524 u64 val) 5501 struct cftype *cft, u64 val)
5525{ 5502{
5526 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5503 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5527 struct mem_cgroup *parent; 5504 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css));
5528
5529 if (val > 100)
5530 return -EINVAL;
5531 5505
5532 if (cgrp->parent == NULL) 5506 if (val > 100 || !parent)
5533 return -EINVAL; 5507 return -EINVAL;
5534 5508
5535 parent = mem_cgroup_from_cont(cgrp->parent);
5536
5537 mutex_lock(&memcg_create_mutex); 5509 mutex_lock(&memcg_create_mutex);
5538 5510
5539 /* If under hierarchy, only empty-root can set this value */ 5511 /* If under hierarchy, only empty-root can set this value */
@@ -5636,10 +5608,10 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
5636 mem_cgroup_oom_notify_cb(iter); 5608 mem_cgroup_oom_notify_cb(iter);
5637} 5609}
5638 5610
5639static int mem_cgroup_usage_register_event(struct cgroup *cgrp, 5611static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css,
5640 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 5612 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
5641{ 5613{
5642 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5614 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5643 struct mem_cgroup_thresholds *thresholds; 5615 struct mem_cgroup_thresholds *thresholds;
5644 struct mem_cgroup_threshold_ary *new; 5616 struct mem_cgroup_threshold_ary *new;
5645 enum res_type type = MEMFILE_TYPE(cft->private); 5617 enum res_type type = MEMFILE_TYPE(cft->private);
@@ -5719,10 +5691,10 @@ unlock:
5719 return ret; 5691 return ret;
5720} 5692}
5721 5693
5722static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, 5694static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css,
5723 struct cftype *cft, struct eventfd_ctx *eventfd) 5695 struct cftype *cft, struct eventfd_ctx *eventfd)
5724{ 5696{
5725 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5697 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5726 struct mem_cgroup_thresholds *thresholds; 5698 struct mem_cgroup_thresholds *thresholds;
5727 struct mem_cgroup_threshold_ary *new; 5699 struct mem_cgroup_threshold_ary *new;
5728 enum res_type type = MEMFILE_TYPE(cft->private); 5700 enum res_type type = MEMFILE_TYPE(cft->private);
@@ -5798,10 +5770,10 @@ unlock:
5798 mutex_unlock(&memcg->thresholds_lock); 5770 mutex_unlock(&memcg->thresholds_lock);
5799} 5771}
5800 5772
5801static int mem_cgroup_oom_register_event(struct cgroup *cgrp, 5773static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css,
5802 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 5774 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
5803{ 5775{
5804 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5776 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5805 struct mem_cgroup_eventfd_list *event; 5777 struct mem_cgroup_eventfd_list *event;
5806 enum res_type type = MEMFILE_TYPE(cft->private); 5778 enum res_type type = MEMFILE_TYPE(cft->private);
5807 5779
@@ -5823,10 +5795,10 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
5823 return 0; 5795 return 0;
5824} 5796}
5825 5797
5826static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, 5798static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css,
5827 struct cftype *cft, struct eventfd_ctx *eventfd) 5799 struct cftype *cft, struct eventfd_ctx *eventfd)
5828{ 5800{
5829 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5801 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5830 struct mem_cgroup_eventfd_list *ev, *tmp; 5802 struct mem_cgroup_eventfd_list *ev, *tmp;
5831 enum res_type type = MEMFILE_TYPE(cft->private); 5803 enum res_type type = MEMFILE_TYPE(cft->private);
5832 5804
@@ -5844,10 +5816,10 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
5844 spin_unlock(&memcg_oom_lock); 5816 spin_unlock(&memcg_oom_lock);
5845} 5817}
5846 5818
5847static int mem_cgroup_oom_control_read(struct cgroup *cgrp, 5819static int mem_cgroup_oom_control_read(struct cgroup_subsys_state *css,
5848 struct cftype *cft, struct cgroup_map_cb *cb) 5820 struct cftype *cft, struct cgroup_map_cb *cb)
5849{ 5821{
5850 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5822 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5851 5823
5852 cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable); 5824 cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);
5853 5825
@@ -5858,18 +5830,16 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
5858 return 0; 5830 return 0;
5859} 5831}
5860 5832
5861static int mem_cgroup_oom_control_write(struct cgroup *cgrp, 5833static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
5862 struct cftype *cft, u64 val) 5834 struct cftype *cft, u64 val)
5863{ 5835{
5864 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5836 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5865 struct mem_cgroup *parent; 5837 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css));
5866 5838
5867 /* cannot set to root cgroup and only 0 and 1 are allowed */ 5839 /* cannot set to root cgroup and only 0 and 1 are allowed */
5868 if (!cgrp->parent || !((val == 0) || (val == 1))) 5840 if (!parent || !((val == 0) || (val == 1)))
5869 return -EINVAL; 5841 return -EINVAL;
5870 5842
5871 parent = mem_cgroup_from_cont(cgrp->parent);
5872
5873 mutex_lock(&memcg_create_mutex); 5843 mutex_lock(&memcg_create_mutex);
5874 /* oom-kill-disable is a flag for subhierarchy. */ 5844 /* oom-kill-disable is a flag for subhierarchy. */
5875 if ((parent->use_hierarchy) || memcg_has_children(memcg)) { 5845 if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
@@ -6228,7 +6198,7 @@ static void __init mem_cgroup_soft_limit_tree_init(void)
6228} 6198}
6229 6199
6230static struct cgroup_subsys_state * __ref 6200static struct cgroup_subsys_state * __ref
6231mem_cgroup_css_alloc(struct cgroup *cont) 6201mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
6232{ 6202{
6233 struct mem_cgroup *memcg; 6203 struct mem_cgroup *memcg;
6234 long error = -ENOMEM; 6204 long error = -ENOMEM;
@@ -6243,7 +6213,7 @@ mem_cgroup_css_alloc(struct cgroup *cont)
6243 goto free_out; 6213 goto free_out;
6244 6214
6245 /* root ? */ 6215 /* root ? */
6246 if (cont->parent == NULL) { 6216 if (parent_css == NULL) {
6247 root_mem_cgroup = memcg; 6217 root_mem_cgroup = memcg;
6248 res_counter_init(&memcg->res, NULL); 6218 res_counter_init(&memcg->res, NULL);
6249 res_counter_init(&memcg->memsw, NULL); 6219 res_counter_init(&memcg->memsw, NULL);
@@ -6265,17 +6235,16 @@ free_out:
6265} 6235}
6266 6236
6267static int 6237static int
6268mem_cgroup_css_online(struct cgroup *cont) 6238mem_cgroup_css_online(struct cgroup_subsys_state *css)
6269{ 6239{
6270 struct mem_cgroup *memcg, *parent; 6240 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6241 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css));
6271 int error = 0; 6242 int error = 0;
6272 6243
6273 if (!cont->parent) 6244 if (!parent)
6274 return 0; 6245 return 0;
6275 6246
6276 mutex_lock(&memcg_create_mutex); 6247 mutex_lock(&memcg_create_mutex);
6277 memcg = mem_cgroup_from_cont(cont);
6278 parent = mem_cgroup_from_cont(cont->parent);
6279 6248
6280 memcg->use_hierarchy = parent->use_hierarchy; 6249 memcg->use_hierarchy = parent->use_hierarchy;
6281 memcg->oom_kill_disable = parent->oom_kill_disable; 6250 memcg->oom_kill_disable = parent->oom_kill_disable;
@@ -6326,9 +6295,9 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
6326 mem_cgroup_iter_invalidate(root_mem_cgroup); 6295 mem_cgroup_iter_invalidate(root_mem_cgroup);
6327} 6296}
6328 6297
6329static void mem_cgroup_css_offline(struct cgroup *cont) 6298static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
6330{ 6299{
6331 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 6300 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6332 6301
6333 kmem_cgroup_css_offline(memcg); 6302 kmem_cgroup_css_offline(memcg);
6334 6303
@@ -6338,9 +6307,9 @@ static void mem_cgroup_css_offline(struct cgroup *cont)
6338 vmpressure_cleanup(&memcg->vmpressure); 6307 vmpressure_cleanup(&memcg->vmpressure);
6339} 6308}
6340 6309
6341static void mem_cgroup_css_free(struct cgroup *cont) 6310static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
6342{ 6311{
6343 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 6312 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6344 6313
6345 memcg_destroy_kmem(memcg); 6314 memcg_destroy_kmem(memcg);
6346 __mem_cgroup_free(memcg); 6315 __mem_cgroup_free(memcg);
@@ -6710,12 +6679,12 @@ static void mem_cgroup_clear_mc(void)
6710 mem_cgroup_end_move(from); 6679 mem_cgroup_end_move(from);
6711} 6680}
6712 6681
6713static int mem_cgroup_can_attach(struct cgroup *cgroup, 6682static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
6714 struct cgroup_taskset *tset) 6683 struct cgroup_taskset *tset)
6715{ 6684{
6716 struct task_struct *p = cgroup_taskset_first(tset); 6685 struct task_struct *p = cgroup_taskset_first(tset);
6717 int ret = 0; 6686 int ret = 0;
6718 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup); 6687 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6719 unsigned long move_charge_at_immigrate; 6688 unsigned long move_charge_at_immigrate;
6720 6689
6721 /* 6690 /*
@@ -6757,7 +6726,7 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup,
6757 return ret; 6726 return ret;
6758} 6727}
6759 6728
6760static void mem_cgroup_cancel_attach(struct cgroup *cgroup, 6729static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
6761 struct cgroup_taskset *tset) 6730 struct cgroup_taskset *tset)
6762{ 6731{
6763 mem_cgroup_clear_mc(); 6732 mem_cgroup_clear_mc();
@@ -6905,7 +6874,7 @@ retry:
6905 up_read(&mm->mmap_sem); 6874 up_read(&mm->mmap_sem);
6906} 6875}
6907 6876
6908static void mem_cgroup_move_task(struct cgroup *cont, 6877static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
6909 struct cgroup_taskset *tset) 6878 struct cgroup_taskset *tset)
6910{ 6879{
6911 struct task_struct *p = cgroup_taskset_first(tset); 6880 struct task_struct *p = cgroup_taskset_first(tset);
@@ -6920,16 +6889,16 @@ static void mem_cgroup_move_task(struct cgroup *cont,
6920 mem_cgroup_clear_mc(); 6889 mem_cgroup_clear_mc();
6921} 6890}
6922#else /* !CONFIG_MMU */ 6891#else /* !CONFIG_MMU */
6923static int mem_cgroup_can_attach(struct cgroup *cgroup, 6892static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
6924 struct cgroup_taskset *tset) 6893 struct cgroup_taskset *tset)
6925{ 6894{
6926 return 0; 6895 return 0;
6927} 6896}
6928static void mem_cgroup_cancel_attach(struct cgroup *cgroup, 6897static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
6929 struct cgroup_taskset *tset) 6898 struct cgroup_taskset *tset)
6930{ 6899{
6931} 6900}
6932static void mem_cgroup_move_task(struct cgroup *cont, 6901static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
6933 struct cgroup_taskset *tset) 6902 struct cgroup_taskset *tset)
6934{ 6903{
6935} 6904}
@@ -6939,15 +6908,15 @@ static void mem_cgroup_move_task(struct cgroup *cont,
6939 * Cgroup retains root cgroups across [un]mount cycles making it necessary 6908 * Cgroup retains root cgroups across [un]mount cycles making it necessary
6940 * to verify sane_behavior flag on each mount attempt. 6909 * to verify sane_behavior flag on each mount attempt.
6941 */ 6910 */
6942static void mem_cgroup_bind(struct cgroup *root) 6911static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
6943{ 6912{
6944 /* 6913 /*
6945 * use_hierarchy is forced with sane_behavior. cgroup core 6914 * use_hierarchy is forced with sane_behavior. cgroup core
6946 * guarantees that @root doesn't have any children, so turning it 6915 * guarantees that @root doesn't have any children, so turning it
6947 * on for the root memcg is enough. 6916 * on for the root memcg is enough.
6948 */ 6917 */
6949 if (cgroup_sane_behavior(root)) 6918 if (cgroup_sane_behavior(root_css->cgroup))
6950 mem_cgroup_from_cont(root)->use_hierarchy = true; 6919 mem_cgroup_from_css(root_css)->use_hierarchy = true;
6951} 6920}
6952 6921
6953struct cgroup_subsys mem_cgroup_subsys = { 6922struct cgroup_subsys mem_cgroup_subsys = {
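
Two shapes dominate the memcontrol hunk above: parent lookups route through css_parent() instead of dereferencing cgroup->parent, and per-group task scans use the css_task_iter API. A sketch of the task-scan skeleton as it appears in mem_cgroup_out_of_memory() (the prototypes mirror the calls in the hunk; the bodies are kernel-internal, and the stand-in types exist only to make the sketch self-contained):

	struct cgroup_subsys_state;
	struct task_struct;
	struct css_task_iter { int space[8]; };	/* opaque stand-in */

	/* Prototypes as called in the hunk; implemented by cgroup core. */
	void css_task_iter_start(struct cgroup_subsys_state *css,
				 struct css_task_iter *it);
	struct task_struct *css_task_iter_next(struct css_task_iter *it);
	void css_task_iter_end(struct css_task_iter *it);

	/* Visit every task attached to @css -- the shape of the OOM scan above. */
	static void scan_css_tasks(struct cgroup_subsys_state *css)
	{
		struct css_task_iter it;
		struct task_struct *task;

		css_task_iter_start(css, &it);
		while ((task = css_task_iter_next(&it))) {
			/* the memcg OOM path scores @task here */
		}
		css_task_iter_end(&it);
	}

Because the iterator is keyed on the css rather than the cgroup, callers like the OOM scan no longer need to hold a struct cgroup at all.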
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index 0c1e37d829fa..e0f62837c3f4 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -74,15 +74,10 @@ static struct vmpressure *work_to_vmpressure(struct work_struct *work)
74 return container_of(work, struct vmpressure, work); 74 return container_of(work, struct vmpressure, work);
75} 75}
76 76
77static struct vmpressure *cg_to_vmpressure(struct cgroup *cg)
78{
79 return css_to_vmpressure(cgroup_subsys_state(cg, mem_cgroup_subsys_id));
80}
81
82static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr) 77static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
83{ 78{
84 struct cgroup *cg = vmpressure_to_css(vmpr)->cgroup; 79 struct cgroup_subsys_state *css = vmpressure_to_css(vmpr);
85 struct mem_cgroup *memcg = mem_cgroup_from_cont(cg); 80 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
86 81
87 memcg = parent_mem_cgroup(memcg); 82 memcg = parent_mem_cgroup(memcg);
88 if (!memcg) 83 if (!memcg)
@@ -283,7 +278,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
283 278
284/** 279/**
285 * vmpressure_register_event() - Bind vmpressure notifications to an eventfd 280 * vmpressure_register_event() - Bind vmpressure notifications to an eventfd
286 * @cg: cgroup that is interested in vmpressure notifications 281 * @css: css that is interested in vmpressure notifications
287 * @cft: cgroup control files handle 282 * @cft: cgroup control files handle
288 * @eventfd: eventfd context to link notifications with 283 * @eventfd: eventfd context to link notifications with
289 * @args: event arguments (used to set up a pressure level threshold) 284 * @args: event arguments (used to set up a pressure level threshold)
@@ -298,10 +293,11 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
298 * cftype).register_event, and then cgroup core will handle everything by 293 * cftype).register_event, and then cgroup core will handle everything by
299 * itself. 294 * itself.
300 */ 295 */
301int vmpressure_register_event(struct cgroup *cg, struct cftype *cft, 296int vmpressure_register_event(struct cgroup_subsys_state *css,
302 struct eventfd_ctx *eventfd, const char *args) 297 struct cftype *cft, struct eventfd_ctx *eventfd,
298 const char *args)
303{ 299{
304 struct vmpressure *vmpr = cg_to_vmpressure(cg); 300 struct vmpressure *vmpr = css_to_vmpressure(css);
305 struct vmpressure_event *ev; 301 struct vmpressure_event *ev;
306 int level; 302 int level;
307 303
@@ -329,7 +325,7 @@ int vmpressure_register_event(struct cgroup *cg, struct cftype *cft,
329 325
330/** 326/**
331 * vmpressure_unregister_event() - Unbind eventfd from vmpressure 327 * vmpressure_unregister_event() - Unbind eventfd from vmpressure
332 * @cg: cgroup handle 328 * @css: css handle
333 * @cft: cgroup control files handle 329 * @cft: cgroup control files handle
334 * @eventfd: eventfd context that was used to link vmpressure with the @cg 330 * @eventfd: eventfd context that was used to link vmpressure with the @cg
335 * 331 *
@@ -341,10 +337,11 @@ int vmpressure_register_event(struct cgroup *cg, struct cftype *cft,
341 * cftype).unregister_event, and then cgroup core will handle everything 337 * cftype).unregister_event, and then cgroup core will handle everything
342 * by itself. 338 * by itself.
343 */ 339 */
344void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft, 340void vmpressure_unregister_event(struct cgroup_subsys_state *css,
341 struct cftype *cft,
345 struct eventfd_ctx *eventfd) 342 struct eventfd_ctx *eventfd)
346{ 343{
347 struct vmpressure *vmpr = cg_to_vmpressure(cg); 344 struct vmpressure *vmpr = css_to_vmpressure(css);
348 struct vmpressure_event *ev; 345 struct vmpressure_event *ev;
349 346
350 mutex_lock(&vmpr->events_lock); 347 mutex_lock(&vmpr->events_lock);
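
As the docstrings above say, vmpressure_register_event() and vmpressure_unregister_event() are meant to be plugged into a cftype's event methods, and after this series they receive the css directly. A hedged sketch of that wiring, assuming kernel context (include/linux/cgroup.h and include/linux/vmpressure.h); the "pressure_level" entry mirrors the memcg cftype that actually hooks these callbacks up:

	static struct cftype mem_cgroup_files[] = {
		{
			.name = "pressure_level",
			.register_event = vmpressure_register_event,
			.unregister_event = vmpressure_unregister_event,
		},
		{ },	/* terminator */
	};

With cg_to_vmpressure() gone, css_to_vmpressure() is the only translation step left between the cftype callback and the vmpressure state.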
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index e533259dce3c..d9cd627e6a16 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -29,12 +29,6 @@
29 29
30#define PRIOMAP_MIN_SZ 128 30#define PRIOMAP_MIN_SZ 128
31 31
32static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgrp)
33{
34 return container_of(cgroup_subsys_state(cgrp, net_prio_subsys_id),
35 struct cgroup_netprio_state, css);
36}
37
38/* 32/*
39 * Extend @dev->priomap so that it's large enough to accommodate 33
40 * @target_idx. @dev->priomap.priomap_len > @target_idx after successful 34 * @target_idx. @dev->priomap.priomap_len > @target_idx after successful
@@ -87,67 +81,70 @@ static int extend_netdev_table(struct net_device *dev, u32 target_idx)
87 81
88/** 82/**
89 * netprio_prio - return the effective netprio of a cgroup-net_device pair 83 * netprio_prio - return the effective netprio of a cgroup-net_device pair
90 * @cgrp: cgroup part of the target pair 84 * @css: css part of the target pair
91 * @dev: net_device part of the target pair 85 * @dev: net_device part of the target pair
92 * 86 *
93 * Should be called under RCU read or rtnl lock. 87 * Should be called under RCU read or rtnl lock.
94 */ 88 */
95static u32 netprio_prio(struct cgroup *cgrp, struct net_device *dev) 89static u32 netprio_prio(struct cgroup_subsys_state *css, struct net_device *dev)
96{ 90{
97 struct netprio_map *map = rcu_dereference_rtnl(dev->priomap); 91 struct netprio_map *map = rcu_dereference_rtnl(dev->priomap);
92 int id = css->cgroup->id;
98 93
99 if (map && cgrp->id < map->priomap_len) 94 if (map && id < map->priomap_len)
100 return map->priomap[cgrp->id]; 95 return map->priomap[id];
101 return 0; 96 return 0;
102} 97}
103 98
104/** 99/**
105 * netprio_set_prio - set netprio on a cgroup-net_device pair 100 * netprio_set_prio - set netprio on a cgroup-net_device pair
106 * @cgrp: cgroup part of the target pair 101 * @css: css part of the target pair
107 * @dev: net_device part of the target pair 102 * @dev: net_device part of the target pair
108 * @prio: prio to set 103 * @prio: prio to set
109 * 104 *
110 * Set netprio to @prio on @cgrp-@dev pair. Should be called under rtnl 105 * Set netprio to @prio on @css-@dev pair. Should be called under rtnl
111 * lock and may fail under memory pressure for non-zero @prio. 106 * lock and may fail under memory pressure for non-zero @prio.
112 */ 107 */
113static int netprio_set_prio(struct cgroup *cgrp, struct net_device *dev, 108static int netprio_set_prio(struct cgroup_subsys_state *css,
114 u32 prio) 109 struct net_device *dev, u32 prio)
115{ 110{
116 struct netprio_map *map; 111 struct netprio_map *map;
112 int id = css->cgroup->id;
117 int ret; 113 int ret;
118 114
119 /* avoid extending priomap for zero writes */ 115 /* avoid extending priomap for zero writes */
120 map = rtnl_dereference(dev->priomap); 116 map = rtnl_dereference(dev->priomap);
121 if (!prio && (!map || map->priomap_len <= cgrp->id)) 117 if (!prio && (!map || map->priomap_len <= id))
122 return 0; 118 return 0;
123 119
124 ret = extend_netdev_table(dev, cgrp->id); 120 ret = extend_netdev_table(dev, id);
125 if (ret) 121 if (ret)
126 return ret; 122 return ret;
127 123
128 map = rtnl_dereference(dev->priomap); 124 map = rtnl_dereference(dev->priomap);
129 map->priomap[cgrp->id] = prio; 125 map->priomap[id] = prio;
130 return 0; 126 return 0;
131} 127}
132 128
133static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp) 129static struct cgroup_subsys_state *
130cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
134{ 131{
135 struct cgroup_netprio_state *cs; 132 struct cgroup_subsys_state *css;
136 133
137 cs = kzalloc(sizeof(*cs), GFP_KERNEL); 134 css = kzalloc(sizeof(*css), GFP_KERNEL);
138 if (!cs) 135 if (!css)
139 return ERR_PTR(-ENOMEM); 136 return ERR_PTR(-ENOMEM);
140 137
141 return &cs->css; 138 return css;
142} 139}
143 140
144static int cgrp_css_online(struct cgroup *cgrp) 141static int cgrp_css_online(struct cgroup_subsys_state *css)
145{ 142{
146 struct cgroup *parent = cgrp->parent; 143 struct cgroup_subsys_state *parent_css = css_parent(css);
147 struct net_device *dev; 144 struct net_device *dev;
148 int ret = 0; 145 int ret = 0;
149 146
150 if (!parent) 147 if (!parent_css)
151 return 0; 148 return 0;
152 149
153 rtnl_lock(); 150 rtnl_lock();
@@ -156,9 +153,9 @@ static int cgrp_css_online(struct cgroup *cgrp)
156 * onlining, there is no need to clear them on offline. 153 * onlining, there is no need to clear them on offline.
157 */ 154 */
158 for_each_netdev(&init_net, dev) { 155 for_each_netdev(&init_net, dev) {
159 u32 prio = netprio_prio(parent, dev); 156 u32 prio = netprio_prio(parent_css, dev);
160 157
161 ret = netprio_set_prio(cgrp, dev, prio); 158 ret = netprio_set_prio(css, dev, prio);
162 if (ret) 159 if (ret)
163 break; 160 break;
164 } 161 }
@@ -166,29 +163,29 @@ static int cgrp_css_online(struct cgroup *cgrp)
166 return ret; 163 return ret;
167} 164}
168 165
169static void cgrp_css_free(struct cgroup *cgrp) 166static void cgrp_css_free(struct cgroup_subsys_state *css)
170{ 167{
171 kfree(cgrp_netprio_state(cgrp)); 168 kfree(css);
172} 169}
173 170
174static u64 read_prioidx(struct cgroup *cgrp, struct cftype *cft) 171static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft)
175{ 172{
176 return cgrp->id; 173 return css->cgroup->id;
177} 174}
178 175
179static int read_priomap(struct cgroup *cont, struct cftype *cft, 176static int read_priomap(struct cgroup_subsys_state *css, struct cftype *cft,
180 struct cgroup_map_cb *cb) 177 struct cgroup_map_cb *cb)
181{ 178{
182 struct net_device *dev; 179 struct net_device *dev;
183 180
184 rcu_read_lock(); 181 rcu_read_lock();
185 for_each_netdev_rcu(&init_net, dev) 182 for_each_netdev_rcu(&init_net, dev)
186 cb->fill(cb, dev->name, netprio_prio(cont, dev)); 183 cb->fill(cb, dev->name, netprio_prio(css, dev));
187 rcu_read_unlock(); 184 rcu_read_unlock();
188 return 0; 185 return 0;
189} 186}
190 187
191static int write_priomap(struct cgroup *cgrp, struct cftype *cft, 188static int write_priomap(struct cgroup_subsys_state *css, struct cftype *cft,
192 const char *buffer) 189 const char *buffer)
193{ 190{
194 char devname[IFNAMSIZ + 1]; 191 char devname[IFNAMSIZ + 1];
@@ -205,7 +202,7 @@ static int write_priomap(struct cgroup *cgrp, struct cftype *cft,
205 202
206 rtnl_lock(); 203 rtnl_lock();
207 204
208 ret = netprio_set_prio(cgrp, dev, prio); 205 ret = netprio_set_prio(css, dev, prio);
209 206
210 rtnl_unlock(); 207 rtnl_unlock();
211 dev_put(dev); 208 dev_put(dev);
@@ -221,12 +218,13 @@ static int update_netprio(const void *v, struct file *file, unsigned n)
221 return 0; 218 return 0;
222} 219}
223 220
224static void net_prio_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 221static void net_prio_attach(struct cgroup_subsys_state *css,
222 struct cgroup_taskset *tset)
225{ 223{
226 struct task_struct *p; 224 struct task_struct *p;
227 void *v; 225 void *v;
228 226
229 cgroup_taskset_for_each(p, cgrp, tset) { 227 cgroup_taskset_for_each(p, css, tset) {
230 task_lock(p); 228 task_lock(p);
231 v = (void *)(unsigned long)task_netprioidx(p); 229 v = (void *)(unsigned long)task_netprioidx(p);
232 iterate_fd(p->files, 0, update_netprio, v); 230 iterate_fd(p->files, 0, update_netprio, v);
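
The netprio conversion keeps the per-device priomap indexed by cgroup ID; only the way the ID is obtained changes (css->cgroup->id instead of cgrp->id). The bounds-checked lookup in netprio_prio() above reduces to this self-contained sketch (the real struct netprio_map also carries an rcu_head and uses u32 slots):

	struct netprio_map {
		unsigned int priomap_len;	/* number of valid slots */
		unsigned int priomap[];		/* indexed by cgroup ID */
	};

	/* Effective priority for @id; 0 when the map has no slot for it. */
	static unsigned int netprio_lookup(const struct netprio_map *map,
					   unsigned int id)
	{
		if (map && id < map->priomap_len)
			return map->priomap[id];
		return 0;
	}

The zero default is also why netprio_set_prio() can skip extending the table for zero writes, as the hunk's "avoid extending priomap for zero writes" check shows.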
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index da14436c1735..8a57d79b0b16 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -132,10 +132,10 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
132 return 0; 132 return 0;
133} 133}
134 134
135static int tcp_cgroup_write(struct cgroup *cont, struct cftype *cft, 135static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
136 const char *buffer) 136 const char *buffer)
137{ 137{
138 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 138 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
139 unsigned long long val; 139 unsigned long long val;
140 int ret = 0; 140 int ret = 0;
141 141
@@ -180,9 +180,9 @@ static u64 tcp_read_usage(struct mem_cgroup *memcg)
180 return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE); 180 return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE);
181} 181}
182 182
183static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft) 183static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft)
184{ 184{
185 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 185 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
186 u64 val; 186 u64 val;
187 187
188 switch (cft->private) { 188 switch (cft->private) {
@@ -202,13 +202,13 @@ static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft)
202 return val; 202 return val;
203} 203}
204 204
205static int tcp_cgroup_reset(struct cgroup *cont, unsigned int event) 205static int tcp_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event)
206{ 206{
207 struct mem_cgroup *memcg; 207 struct mem_cgroup *memcg;
208 struct tcp_memcontrol *tcp; 208 struct tcp_memcontrol *tcp;
209 struct cg_proto *cg_proto; 209 struct cg_proto *cg_proto;
210 210
211 memcg = mem_cgroup_from_cont(cont); 211 memcg = mem_cgroup_from_css(css);
212 cg_proto = tcp_prot.proto_cgroup(memcg); 212 cg_proto = tcp_prot.proto_cgroup(memcg);
213 if (!cg_proto) 213 if (!cg_proto)
214 return 0; 214 return 0;
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 3a294eb98d61..867b4a3e3980 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -23,19 +23,18 @@
23#include <net/sock.h> 23#include <net/sock.h>
24#include <net/cls_cgroup.h> 24#include <net/cls_cgroup.h>
25 25
26static inline struct cgroup_cls_state *cgrp_cls_state(struct cgroup *cgrp) 26static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state *css)
27{ 27{
28 return container_of(cgroup_subsys_state(cgrp, net_cls_subsys_id), 28 return css ? container_of(css, struct cgroup_cls_state, css) : NULL;
29 struct cgroup_cls_state, css);
30} 29}
31 30
32static inline struct cgroup_cls_state *task_cls_state(struct task_struct *p) 31static inline struct cgroup_cls_state *task_cls_state(struct task_struct *p)
33{ 32{
34 return container_of(task_subsys_state(p, net_cls_subsys_id), 33 return css_cls_state(task_css(p, net_cls_subsys_id));
35 struct cgroup_cls_state, css);
36} 34}
37 35
38static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp) 36static struct cgroup_subsys_state *
37cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
39{ 38{
40 struct cgroup_cls_state *cs; 39 struct cgroup_cls_state *cs;
41 40
@@ -45,17 +44,19 @@ static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp)
45 return &cs->css; 44 return &cs->css;
46} 45}
47 46
48static int cgrp_css_online(struct cgroup *cgrp) 47static int cgrp_css_online(struct cgroup_subsys_state *css)
49{ 48{
50 if (cgrp->parent) 49 struct cgroup_cls_state *cs = css_cls_state(css);
51 cgrp_cls_state(cgrp)->classid = 50 struct cgroup_cls_state *parent = css_cls_state(css_parent(css));
52 cgrp_cls_state(cgrp->parent)->classid; 51
52 if (parent)
53 cs->classid = parent->classid;
53 return 0; 54 return 0;
54} 55}
55 56
56static void cgrp_css_free(struct cgroup *cgrp) 57static void cgrp_css_free(struct cgroup_subsys_state *css)
57{ 58{
58 kfree(cgrp_cls_state(cgrp)); 59 kfree(css_cls_state(css));
59} 60}
60 61
61static int update_classid(const void *v, struct file *file, unsigned n) 62static int update_classid(const void *v, struct file *file, unsigned n)
@@ -67,12 +68,13 @@ static int update_classid(const void *v, struct file *file, unsigned n)
67 return 0; 68 return 0;
68} 69}
69 70
70static void cgrp_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 71static void cgrp_attach(struct cgroup_subsys_state *css,
72 struct cgroup_taskset *tset)
71{ 73{
72 struct task_struct *p; 74 struct task_struct *p;
73 void *v; 75 void *v;
74 76
75 cgroup_taskset_for_each(p, cgrp, tset) { 77 cgroup_taskset_for_each(p, css, tset) {
76 task_lock(p); 78 task_lock(p);
77 v = (void *)(unsigned long)task_cls_classid(p); 79 v = (void *)(unsigned long)task_cls_classid(p);
78 iterate_fd(p->files, 0, update_classid, v); 80 iterate_fd(p->files, 0, update_classid, v);
@@ -80,14 +82,15 @@ static void cgrp_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
80 } 82 }
81} 83}
82 84
83static u64 read_classid(struct cgroup *cgrp, struct cftype *cft) 85static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft)
84{ 86{
85 return cgrp_cls_state(cgrp)->classid; 87 return css_cls_state(css)->classid;
86} 88}
87 89
88static int write_classid(struct cgroup *cgrp, struct cftype *cft, u64 value) 90static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
91 u64 value)
89{ 92{
90 cgrp_cls_state(cgrp)->classid = (u32) value; 93 css_cls_state(css)->classid = (u32) value;
91 return 0; 94 return 0;
92} 95}
93 96
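
net_cls shows the parent-inheritance shape in its smallest form: with css_cls_state() tolerating NULL, cgrp_css_online() copies the parent's classid with no explicit root check. A reduced sketch of that flow; css_parent() is simplified here to a plain field read, where the kernel helper walks the cgroup linkage:

	#include <stddef.h>

	struct cgroup_subsys_state { struct cgroup_subsys_state *parent; };

	struct cgroup_cls_state {
		struct cgroup_subsys_state css;
		unsigned int classid;
	};

	static struct cgroup_cls_state *
	css_cls_state(struct cgroup_subsys_state *css)
	{
		return css ? (struct cgroup_cls_state *)((char *)css -
				offsetof(struct cgroup_cls_state, css)) : NULL;
	}

	/* Simplified stand-in for the kernel's css_parent(). */
	static struct cgroup_subsys_state *
	css_parent(struct cgroup_subsys_state *css)
	{
		return css->parent;
	}

	static int cls_online(struct cgroup_subsys_state *css)
	{
		struct cgroup_cls_state *cs = css_cls_state(css);
		struct cgroup_cls_state *parent = css_cls_state(css_parent(css));

		if (parent)
			cs->classid = parent->classid;	/* inherit on online */
		return 0;
	}

The root css simply has no parent, so css_cls_state(NULL) returns NULL and the copy is skipped, which is the same pattern devcgroup_online() and mem_cgroup_css_online() use below and above.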
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index e8aad69f0d69..c123628d3f84 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -53,22 +53,17 @@ struct dev_cgroup {
53 53
54static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s) 54static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s)
55{ 55{
56 return container_of(s, struct dev_cgroup, css); 56 return s ? container_of(s, struct dev_cgroup, css) : NULL;
57}
58
59static inline struct dev_cgroup *cgroup_to_devcgroup(struct cgroup *cgroup)
60{
61 return css_to_devcgroup(cgroup_subsys_state(cgroup, devices_subsys_id));
62} 57}
63 58
64static inline struct dev_cgroup *task_devcgroup(struct task_struct *task) 59static inline struct dev_cgroup *task_devcgroup(struct task_struct *task)
65{ 60{
66 return css_to_devcgroup(task_subsys_state(task, devices_subsys_id)); 61 return css_to_devcgroup(task_css(task, devices_subsys_id));
67} 62}
68 63
69struct cgroup_subsys devices_subsys; 64struct cgroup_subsys devices_subsys;
70 65
71static int devcgroup_can_attach(struct cgroup *new_cgrp, 66static int devcgroup_can_attach(struct cgroup_subsys_state *new_css,
72 struct cgroup_taskset *set) 67 struct cgroup_taskset *set)
73{ 68{
74 struct task_struct *task = cgroup_taskset_first(set); 69 struct task_struct *task = cgroup_taskset_first(set);
@@ -193,18 +188,16 @@ static inline bool is_devcg_online(const struct dev_cgroup *devcg)
193/** 188/**
194 * devcgroup_online - initializes devcgroup's behavior and exceptions based on 189 * devcgroup_online - initializes devcgroup's behavior and exceptions based on
195 * parent's 190 * parent's
196 * @cgroup: cgroup getting online 191 * @css: css getting online
197 * returns 0 in case of success, error code otherwise 192 * returns 0 in case of success, error code otherwise
198 */ 193 */
199static int devcgroup_online(struct cgroup *cgroup) 194static int devcgroup_online(struct cgroup_subsys_state *css)
200{ 195{
201 struct dev_cgroup *dev_cgroup, *parent_dev_cgroup = NULL; 196 struct dev_cgroup *dev_cgroup = css_to_devcgroup(css);
197 struct dev_cgroup *parent_dev_cgroup = css_to_devcgroup(css_parent(css));
202 int ret = 0; 198 int ret = 0;
203 199
204 mutex_lock(&devcgroup_mutex); 200 mutex_lock(&devcgroup_mutex);
205 dev_cgroup = cgroup_to_devcgroup(cgroup);
206 if (cgroup->parent)
207 parent_dev_cgroup = cgroup_to_devcgroup(cgroup->parent);
208 201
209 if (parent_dev_cgroup == NULL) 202 if (parent_dev_cgroup == NULL)
210 dev_cgroup->behavior = DEVCG_DEFAULT_ALLOW; 203 dev_cgroup->behavior = DEVCG_DEFAULT_ALLOW;
@@ -219,9 +212,9 @@ static int devcgroup_online(struct cgroup *cgroup)
219 return ret; 212 return ret;
220} 213}
221 214
222static void devcgroup_offline(struct cgroup *cgroup) 215static void devcgroup_offline(struct cgroup_subsys_state *css)
223{ 216{
224 struct dev_cgroup *dev_cgroup = cgroup_to_devcgroup(cgroup); 217 struct dev_cgroup *dev_cgroup = css_to_devcgroup(css);
225 218
226 mutex_lock(&devcgroup_mutex); 219 mutex_lock(&devcgroup_mutex);
227 dev_cgroup->behavior = DEVCG_DEFAULT_NONE; 220 dev_cgroup->behavior = DEVCG_DEFAULT_NONE;
@@ -231,7 +224,8 @@ static void devcgroup_offline(struct cgroup *cgroup)
231/* 224/*
232 * called from kernel/cgroup.c with cgroup_lock() held. 225 * called from kernel/cgroup.c with cgroup_lock() held.
233 */ 226 */
234static struct cgroup_subsys_state *devcgroup_css_alloc(struct cgroup *cgroup) 227static struct cgroup_subsys_state *
228devcgroup_css_alloc(struct cgroup_subsys_state *parent_css)
235{ 229{
236 struct dev_cgroup *dev_cgroup; 230 struct dev_cgroup *dev_cgroup;
237 231
@@ -244,11 +238,10 @@ static struct cgroup_subsys_state *devcgroup_css_alloc(struct cgroup *cgroup)
244 return &dev_cgroup->css; 238 return &dev_cgroup->css;
245} 239}
246 240
247static void devcgroup_css_free(struct cgroup *cgroup) 241static void devcgroup_css_free(struct cgroup_subsys_state *css)
248{ 242{
249 struct dev_cgroup *dev_cgroup; 243 struct dev_cgroup *dev_cgroup = css_to_devcgroup(css);
250 244
251 dev_cgroup = cgroup_to_devcgroup(cgroup);
252 __dev_exception_clean(dev_cgroup); 245 __dev_exception_clean(dev_cgroup);
253 kfree(dev_cgroup); 246 kfree(dev_cgroup);
254} 247}
@@ -291,10 +284,10 @@ static void set_majmin(char *str, unsigned m)
291 sprintf(str, "%u", m); 284 sprintf(str, "%u", m);
292} 285}
293 286
294static int devcgroup_seq_read(struct cgroup *cgroup, struct cftype *cft, 287static int devcgroup_seq_read(struct cgroup_subsys_state *css,
295 struct seq_file *m) 288 struct cftype *cft, struct seq_file *m)
296{ 289{
297 struct dev_cgroup *devcgroup = cgroup_to_devcgroup(cgroup); 290 struct dev_cgroup *devcgroup = css_to_devcgroup(css);
298 struct dev_exception_item *ex; 291 struct dev_exception_item *ex;
299 char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN]; 292 char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN];
300 293
@@ -394,12 +387,10 @@ static bool may_access(struct dev_cgroup *dev_cgroup,
394static int parent_has_perm(struct dev_cgroup *childcg, 387static int parent_has_perm(struct dev_cgroup *childcg,
395 struct dev_exception_item *ex) 388 struct dev_exception_item *ex)
396{ 389{
397 struct cgroup *pcg = childcg->css.cgroup->parent; 390 struct dev_cgroup *parent = css_to_devcgroup(css_parent(&childcg->css));
398 struct dev_cgroup *parent;
399 391
400 if (!pcg) 392 if (!parent)
401 return 1; 393 return 1;
402 parent = cgroup_to_devcgroup(pcg);
403 return may_access(parent, ex, childcg->behavior); 394 return may_access(parent, ex, childcg->behavior);
404} 395}
405 396
@@ -451,13 +442,13 @@ static void revalidate_active_exceptions(struct dev_cgroup *devcg)
451static int propagate_exception(struct dev_cgroup *devcg_root, 442static int propagate_exception(struct dev_cgroup *devcg_root,
452 struct dev_exception_item *ex) 443 struct dev_exception_item *ex)
453{ 444{
454 struct cgroup *root = devcg_root->css.cgroup, *pos; 445 struct cgroup_subsys_state *pos;
455 int rc = 0; 446 int rc = 0;
456 447
457 rcu_read_lock(); 448 rcu_read_lock();
458 449
459 cgroup_for_each_descendant_pre(pos, root) { 450 css_for_each_descendant_pre(pos, &devcg_root->css) {
460 struct dev_cgroup *devcg = cgroup_to_devcgroup(pos); 451 struct dev_cgroup *devcg = css_to_devcgroup(pos);
461 452
462 /* 453 /*
463 * Because devcgroup_mutex is held, no devcg will become 454 * Because devcgroup_mutex is held, no devcg will become
@@ -465,7 +456,7 @@ static int propagate_exception(struct dev_cgroup *devcg_root,
465 * methods), and online ones are safe to access outside RCU 456 * methods), and online ones are safe to access outside RCU
466 * read lock without bumping refcnt. 457 * read lock without bumping refcnt.
467 */ 458 */
468 if (!is_devcg_online(devcg)) 459 if (pos == &devcg_root->css || !is_devcg_online(devcg))
469 continue; 460 continue;
470 461
471 rcu_read_unlock(); 462 rcu_read_unlock();
@@ -524,15 +515,11 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
524 char temp[12]; /* 11 + 1 characters needed for a u32 */ 515 char temp[12]; /* 11 + 1 characters needed for a u32 */
525 int count, rc = 0; 516 int count, rc = 0;
526 struct dev_exception_item ex; 517 struct dev_exception_item ex;
527 struct cgroup *p = devcgroup->css.cgroup; 518 struct dev_cgroup *parent = css_to_devcgroup(css_parent(&devcgroup->css));
528 struct dev_cgroup *parent = NULL;
529 519
530 if (!capable(CAP_SYS_ADMIN)) 520 if (!capable(CAP_SYS_ADMIN))
531 return -EPERM; 521 return -EPERM;
532 522
533 if (p->parent)
534 parent = cgroup_to_devcgroup(p->parent);
535
536 memset(&ex, 0, sizeof(ex)); 523 memset(&ex, 0, sizeof(ex));
537 b = buffer; 524 b = buffer;
538 525
@@ -677,13 +664,13 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
677 return rc; 664 return rc;
678} 665}
679 666
680static int devcgroup_access_write(struct cgroup *cgrp, struct cftype *cft, 667static int devcgroup_access_write(struct cgroup_subsys_state *css,
681 const char *buffer) 668 struct cftype *cft, const char *buffer)
682{ 669{
683 int retval; 670 int retval;
684 671
685 mutex_lock(&devcgroup_mutex); 672 mutex_lock(&devcgroup_mutex);
686 retval = devcgroup_update_access(cgroup_to_devcgroup(cgrp), 673 retval = devcgroup_update_access(css_to_devcgroup(css),
687 cft->private, buffer); 674 cft->private, buffer);
688 mutex_unlock(&devcgroup_mutex); 675 mutex_unlock(&devcgroup_mutex);
689 return retval; 676 return retval;
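
Taken together, every controller hunk in this pull makes the same mechanical substitution in its cftype and cgroup_subsys callbacks: the first parameter changes from the cgroup to the css, and the controller derives its private state from the css directly. Schematically, using hypothetical before/after structs to show the shape (the real struct cftype and struct cgroup_subsys live in include/linux/cgroup.h):

	typedef unsigned long long u64;
	struct cgroup;
	struct cgroup_subsys_state;

	struct cftype_before {
		u64 (*read_u64)(struct cgroup *cgrp, struct cftype_before *cft);
		/* cgroup_subsys method, shown alongside for symmetry */
		int (*css_online)(struct cgroup *cgrp);
	};

	struct cftype_after {
		u64 (*read_u64)(struct cgroup_subsys_state *css,
				struct cftype_after *cft);
		int (*css_online)(struct cgroup_subsys_state *css);
	};

Once callbacks take the css, per-controller cgroup-to-state helpers (mem_cgroup_from_cont, cgroup_to_devcgroup, cgrp_cls_state, cgrp_netprio_state) become dead code, which is why each file's diff above deletes one.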