-rw-r--r--  block/blk-cgroup.c              49
-rw-r--r--  block/blk-cgroup.h              38
-rw-r--r--  block/blk-throttle.c            43
-rw-r--r--  block/cfq-iosched.c             90
-rw-r--r--  fs/bio.c                         2
-rw-r--r--  include/linux/cgroup.h         303
-rw-r--r--  include/linux/memcontrol.h       2
-rw-r--r--  include/linux/vmpressure.h       6
-rw-r--r--  include/net/cls_cgroup.h         4
-rw-r--r--  include/net/netprio_cgroup.h     8
-rw-r--r--  kernel/cgroup.c               1643
-rw-r--r--  kernel/cgroup_freezer.c        155
-rw-r--r--  kernel/cpuset.c                317
-rw-r--r--  kernel/events/core.c            27
-rw-r--r--  kernel/sched/core.c            113
-rw-r--r--  kernel/sched/cpuacct.c          51
-rw-r--r--  kernel/sched/sched.h             6
-rw-r--r--  mm/hugetlb_cgroup.c             69
-rw-r--r--  mm/memcontrol.c                223
-rw-r--r--  mm/vmpressure.c                 25
-rw-r--r--  net/core/netprio_cgroup.c       72
-rw-r--r--  net/ipv4/tcp_memcontrol.c       12
-rw-r--r--  net/sched/cls_cgroup.c          39
-rw-r--r--  security/device_cgroup.c        65
24 files changed, 1751 insertions, 1611 deletions
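
Every hunk below follows the same conversion: subsystem callbacks and cftype handlers that took a struct cgroup * now take the struct cgroup_subsys_state * (css) directly, and the per-subsystem state is recovered with a single container_of() on the embedded css. A minimal userspace sketch of that embedding pattern (all names here are illustrative stand-ins, not the kernel definitions):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct cgroup_subsys_state { int dummy; };

struct blkcg {
	int weight;
	struct cgroup_subsys_state css;	/* embedded generic state */
};

static struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
{
	/* one pointer adjustment, no per-cgroup subsys[] lookup */
	return css ? container_of(css, struct blkcg, css) : NULL;
}

int main(void)
{
	struct blkcg bc = { .weight = 500 };
	struct cgroup_subsys_state *css = &bc.css;

	printf("%d\n", css_to_blkcg(css)->weight);	/* prints 500 */
	return 0;
}

The NULL pass-through mirrors the new css_to_blkcg() in block/blk-cgroup.h below, which lets callers hand a missing css straight through.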
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 290792a13e3c..e90c7c164c83 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -437,10 +437,10 @@ struct request_list *__blk_queue_next_rl(struct request_list *rl,
 	return &blkg->rl;
 }

-static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype,
-			     u64 val)
+static int blkcg_reset_stats(struct cgroup_subsys_state *css,
+			     struct cftype *cftype, u64 val)
 {
-	struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
+	struct blkcg *blkcg = css_to_blkcg(css);
 	struct blkcg_gq *blkg;
 	int i;

@@ -614,15 +614,13 @@ u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off)
 {
 	struct blkcg_policy *pol = blkcg_policy[pd->plid];
 	struct blkcg_gq *pos_blkg;
-	struct cgroup *pos_cgrp;
-	u64 sum;
+	struct cgroup_subsys_state *pos_css;
+	u64 sum = 0;

 	lockdep_assert_held(pd->blkg->q->queue_lock);

-	sum = blkg_stat_read((void *)pd + off);
-
 	rcu_read_lock();
-	blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) {
+	blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) {
 		struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
 		struct blkg_stat *stat = (void *)pos_pd + off;

@@ -649,16 +647,14 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
 {
 	struct blkcg_policy *pol = blkcg_policy[pd->plid];
 	struct blkcg_gq *pos_blkg;
-	struct cgroup *pos_cgrp;
-	struct blkg_rwstat sum;
+	struct cgroup_subsys_state *pos_css;
+	struct blkg_rwstat sum = { };
 	int i;

 	lockdep_assert_held(pd->blkg->q->queue_lock);

-	sum = blkg_rwstat_read((void *)pd + off);
-
 	rcu_read_lock();
-	blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) {
+	blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) {
 		struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
 		struct blkg_rwstat *rwstat = (void *)pos_pd + off;
 		struct blkg_rwstat tmp;
@@ -765,18 +761,18 @@ struct cftype blkcg_files[] = {

 /**
  * blkcg_css_offline - cgroup css_offline callback
- * @cgroup: cgroup of interest
+ * @css: css of interest
  *
- * This function is called when @cgroup is about to go away and responsible
- * for shooting down all blkgs associated with @cgroup. blkgs should be
+ * This function is called when @css is about to go away and responsible
+ * for shooting down all blkgs associated with @css. blkgs should be
  * removed while holding both q and blkcg locks. As blkcg lock is nested
  * inside q lock, this function performs reverse double lock dancing.
  *
  * This is the blkcg counterpart of ioc_release_fn().
  */
-static void blkcg_css_offline(struct cgroup *cgroup)
+static void blkcg_css_offline(struct cgroup_subsys_state *css)
 {
-	struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
+	struct blkcg *blkcg = css_to_blkcg(css);

 	spin_lock_irq(&blkcg->lock);

@@ -798,21 +794,21 @@ static void blkcg_css_offline(struct cgroup *cgroup)
 	spin_unlock_irq(&blkcg->lock);
 }

-static void blkcg_css_free(struct cgroup *cgroup)
+static void blkcg_css_free(struct cgroup_subsys_state *css)
 {
-	struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
+	struct blkcg *blkcg = css_to_blkcg(css);

 	if (blkcg != &blkcg_root)
 		kfree(blkcg);
 }

-static struct cgroup_subsys_state *blkcg_css_alloc(struct cgroup *cgroup)
+static struct cgroup_subsys_state *
+blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 {
 	static atomic64_t id_seq = ATOMIC64_INIT(0);
 	struct blkcg *blkcg;
-	struct cgroup *parent = cgroup->parent;

-	if (!parent) {
+	if (!parent_css) {
 		blkcg = &blkcg_root;
 		goto done;
 	}
@@ -883,14 +879,15 @@ void blkcg_exit_queue(struct request_queue *q)
  * of the main cic data structures. For now we allow a task to change
  * its cgroup only if it's the only owner of its ioc.
  */
-static int blkcg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
+static int blkcg_can_attach(struct cgroup_subsys_state *css,
+			    struct cgroup_taskset *tset)
 {
 	struct task_struct *task;
 	struct io_context *ioc;
 	int ret = 0;

 	/* task_lock() is needed to avoid races with exit_io_context() */
-	cgroup_taskset_for_each(task, cgrp, tset) {
+	cgroup_taskset_for_each(task, css, tset) {
 		task_lock(task);
 		ioc = task->io_context;
 		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
@@ -1127,7 +1124,7 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)

 	/* kill the intf files first */
 	if (pol->cftypes)
-		cgroup_rm_cftypes(&blkio_subsys, pol->cftypes);
+		cgroup_rm_cftypes(pol->cftypes);

 	/* unregister and update blkgs */
 	blkcg_policy[pol->plid] = NULL;
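
A side effect worth noting in the two recursive-sum hunks above: the pre-loop blkg_stat_read()/blkg_rwstat_read() seeding disappears because the css-based descendant walk now visits @p_blkg itself as the first node, so the accumulator can simply start from zero. A toy model of that invariant (hypothetical tree, not kernel code):

#include <stdio.h>

struct node {
	int stat;
	struct node *child[2];
};

static int sum_pre(struct node *n)
{
	int sum = 0;

	if (!n)
		return 0;
	sum += n->stat;			/* the root is counted by the walk itself */
	for (int i = 0; i < 2; i++)
		sum += sum_pre(n->child[i]);
	return sum;
}

int main(void)
{
	struct node a = { 1 }, b = { 2 }, root = { 4, { &a, &b } };

	printf("%d\n", sum_pre(&root));	/* 7: root included, no pre-seed */
	return 0;
}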
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 8056c03a3382..ae6969a7ffd4 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -179,22 +179,20 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 void blkg_conf_finish(struct blkg_conf_ctx *ctx);


-static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup)
+static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
 {
-	return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
-			    struct blkcg, css);
+	return css ? container_of(css, struct blkcg, css) : NULL;
 }

 static inline struct blkcg *task_blkcg(struct task_struct *tsk)
 {
-	return container_of(task_subsys_state(tsk, blkio_subsys_id),
-			    struct blkcg, css);
+	return css_to_blkcg(task_css(tsk, blkio_subsys_id));
 }

 static inline struct blkcg *bio_blkcg(struct bio *bio)
 {
 	if (bio && bio->bi_css)
-		return container_of(bio->bi_css, struct blkcg, css);
+		return css_to_blkcg(bio->bi_css);
 	return task_blkcg(current);
 }

@@ -206,9 +204,7 @@ static inline struct blkcg *bio_blkcg(struct bio *bio)
  */
 static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
 {
-	struct cgroup *pcg = blkcg->css.cgroup->parent;
-
-	return pcg ? cgroup_to_blkcg(pcg) : NULL;
+	return css_to_blkcg(css_parent(&blkcg->css));
 }

 /**
@@ -288,32 +284,33 @@ struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
 /**
  * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
  * @d_blkg: loop cursor pointing to the current descendant
- * @pos_cgrp: used for iteration
+ * @pos_css: used for iteration
  * @p_blkg: target blkg to walk descendants of
  *
  * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU
  * read locked. If called under either blkcg or queue lock, the iteration
  * is guaranteed to include all and only online blkgs. The caller may
- * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip
- * subtree.
+ * update @pos_css by calling css_rightmost_descendant() to skip subtree.
+ * @p_blkg is included in the iteration and the first node to be visited.
  */
-#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg)		\
-	cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
-		if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
+#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg)		\
+	css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css)	\
+		if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css),	\
 					      (p_blkg)->q, false)))

 /**
  * blkg_for_each_descendant_post - post-order walk of a blkg's descendants
  * @d_blkg: loop cursor pointing to the current descendant
- * @pos_cgrp: used for iteration
+ * @pos_css: used for iteration
  * @p_blkg: target blkg to walk descendants of
  *
  * Similar to blkg_for_each_descendant_pre() but performs post-order
- * traversal instead. Synchronization rules are the same.
+ * traversal instead. Synchronization rules are the same. @p_blkg is
+ * included in the iteration and the last node to be visited.
  */
-#define blkg_for_each_descendant_post(d_blkg, pos_cgrp, p_blkg)	\
-	cgroup_for_each_descendant_post((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
-		if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
+#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg)		\
+	css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css)	\
+		if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css),	\
 					      (p_blkg)->q, false)))

 /**
@@ -576,7 +573,6 @@ static inline int blkcg_activate_policy(struct request_queue *q,
 static inline void blkcg_deactivate_policy(struct request_queue *q,
 					   const struct blkcg_policy *pol) { }

-static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; }
 static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }

 static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
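
With the parent pointer now cached in the css itself, blkcg_parent() above no longer detours through blkcg->css.cgroup->parent. A small userspace model of the resulting ancestry walk (struct layout and names are illustrative, not the kernel types):

#include <stdio.h>

struct css {
	const char *name;
	struct css *parent;	/* cached directly, as in the new cgroup_subsys_state */
};

static struct css *css_parent(struct css *css)
{
	return css->parent;	/* NULL only at the root */
}

int main(void)
{
	struct css root = { "root", NULL };
	struct css child = { "child", &root };
	struct css *pos;

	for (pos = &child; pos; pos = css_parent(pos))
		printf("%s\n", pos->name);	/* child, then root */
	return 0;
}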
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 08a32dfd3844..8331aba9426f 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1293,10 +1293,10 @@ static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
 	return __blkg_prfill_rwstat(sf, pd, &rwstat);
 }

-static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
-			       struct seq_file *sf)
+static int tg_print_cpu_rwstat(struct cgroup_subsys_state *css,
+			       struct cftype *cft, struct seq_file *sf)
 {
-	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+	struct blkcg *blkcg = css_to_blkcg(css);

 	blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl,
 			  cft->private, true);
@@ -1325,31 +1325,31 @@ static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
 	return __blkg_prfill_u64(sf, pd, v);
 }

-static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft,
-			     struct seq_file *sf)
+static int tg_print_conf_u64(struct cgroup_subsys_state *css,
+			     struct cftype *cft, struct seq_file *sf)
 {
-	blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_u64,
+	blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_u64,
 			  &blkcg_policy_throtl, cft->private, false);
 	return 0;
 }

-static int tg_print_conf_uint(struct cgroup *cgrp, struct cftype *cft,
-			      struct seq_file *sf)
+static int tg_print_conf_uint(struct cgroup_subsys_state *css,
+			      struct cftype *cft, struct seq_file *sf)
 {
-	blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_uint,
+	blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_uint,
 			  &blkcg_policy_throtl, cft->private, false);
 	return 0;
 }

-static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
-		       bool is_u64)
+static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft,
+		       const char *buf, bool is_u64)
 {
-	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+	struct blkcg *blkcg = css_to_blkcg(css);
 	struct blkg_conf_ctx ctx;
 	struct throtl_grp *tg;
 	struct throtl_service_queue *sq;
 	struct blkcg_gq *blkg;
-	struct cgroup *pos_cgrp;
+	struct cgroup_subsys_state *pos_css;
 	int ret;

 	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
@@ -1379,8 +1379,7 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
 	 * restrictions in the whole hierarchy and allows them to bypass
 	 * blk-throttle.
 	 */
-	tg_update_has_rules(tg);
-	blkg_for_each_descendant_pre(blkg, pos_cgrp, ctx.blkg)
+	blkg_for_each_descendant_pre(blkg, pos_css, ctx.blkg)
 		tg_update_has_rules(blkg_to_tg(blkg));

 	/*
@@ -1403,16 +1402,16 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
 	return 0;
 }

-static int tg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft,
+static int tg_set_conf_u64(struct cgroup_subsys_state *css, struct cftype *cft,
 			   const char *buf)
 {
-	return tg_set_conf(cgrp, cft, buf, true);
+	return tg_set_conf(css, cft, buf, true);
 }

-static int tg_set_conf_uint(struct cgroup *cgrp, struct cftype *cft,
+static int tg_set_conf_uint(struct cgroup_subsys_state *css, struct cftype *cft,
 			    const char *buf)
 {
-	return tg_set_conf(cgrp, cft, buf, false);
+	return tg_set_conf(css, cft, buf, false);
 }

 static struct cftype throtl_files[] = {
@@ -1623,7 +1622,7 @@ void blk_throtl_drain(struct request_queue *q)
 {
 	struct throtl_data *td = q->td;
 	struct blkcg_gq *blkg;
-	struct cgroup *pos_cgrp;
+	struct cgroup_subsys_state *pos_css;
 	struct bio *bio;
 	int rw;

@@ -1636,11 +1635,9 @@ void blk_throtl_drain(struct request_queue *q)
 	 * better to walk service_queue tree directly but blkg walk is
 	 * easier.
 	 */
-	blkg_for_each_descendant_post(blkg, pos_cgrp, td->queue->root_blkg)
+	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg)
 		tg_drain_bios(&blkg_to_tg(blkg)->service_queue);

-	tg_drain_bios(&td_root_tg(td)->service_queue);
-
 	/* finally, transfer bios from top-level tg's into the td */
 	tg_drain_bios(&td->service_queue);

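
The drain path above loses its explicit tg_drain_bios(&td_root_tg(td)->service_queue) call for the same reason the stat helpers lost their pre-seed: the post-order walk visits every child before its parent and now ends at the root blkg itself. A toy post-order walk showing that ordering (tree shape illustrative, not kernel code):

#include <stdio.h>

struct node {
	const char *name;
	struct node *child[2];
};

static void drain_post(struct node *n)
{
	if (!n)
		return;
	for (int i = 0; i < 2; i++)
		drain_post(n->child[i]);
	printf("drain %s\n", n->name);	/* a parent drains after its children */
}

int main(void)
{
	struct node a = { "a" }, b = { "b" };
	struct node root = { "root", { &a, &b } };

	drain_post(&root);	/* a, b, root -- root last, no separate call */
	return 0;
}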
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index d5bbdcfd0dab..dabb9d02cf9a 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1607,12 +1607,11 @@ static u64 cfqg_prfill_weight_device(struct seq_file *sf,
 	return __blkg_prfill_u64(sf, pd, cfqg->dev_weight);
 }

-static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft,
-				    struct seq_file *sf)
+static int cfqg_print_weight_device(struct cgroup_subsys_state *css,
+				    struct cftype *cft, struct seq_file *sf)
 {
-	blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp),
-			  cfqg_prfill_weight_device, &blkcg_policy_cfq, 0,
-			  false);
+	blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_weight_device,
+			  &blkcg_policy_cfq, 0, false);
 	return 0;
 }

@@ -1626,35 +1625,34 @@ static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf,
 	return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight);
 }

-static int cfqg_print_leaf_weight_device(struct cgroup *cgrp,
+static int cfqg_print_leaf_weight_device(struct cgroup_subsys_state *css,
 					 struct cftype *cft,
 					 struct seq_file *sf)
 {
-	blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp),
-			  cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq, 0,
-			  false);
+	blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_leaf_weight_device,
+			  &blkcg_policy_cfq, 0, false);
 	return 0;
 }

-static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft,
+static int cfq_print_weight(struct cgroup_subsys_state *css, struct cftype *cft,
 			    struct seq_file *sf)
 {
-	seq_printf(sf, "%u\n", cgroup_to_blkcg(cgrp)->cfq_weight);
+	seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_weight);
 	return 0;
 }

-static int cfq_print_leaf_weight(struct cgroup *cgrp, struct cftype *cft,
-				 struct seq_file *sf)
+static int cfq_print_leaf_weight(struct cgroup_subsys_state *css,
+				 struct cftype *cft, struct seq_file *sf)
 {
-	seq_printf(sf, "%u\n",
-		   cgroup_to_blkcg(cgrp)->cfq_leaf_weight);
+	seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_leaf_weight);
 	return 0;
 }

-static int __cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
-				    const char *buf, bool is_leaf_weight)
+static int __cfqg_set_weight_device(struct cgroup_subsys_state *css,
+				    struct cftype *cft, const char *buf,
+				    bool is_leaf_weight)
 {
-	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+	struct blkcg *blkcg = css_to_blkcg(css);
 	struct blkg_conf_ctx ctx;
 	struct cfq_group *cfqg;
 	int ret;
@@ -1680,22 +1678,22 @@ static int __cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
 	return ret;
 }

-static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
-				  const char *buf)
+static int cfqg_set_weight_device(struct cgroup_subsys_state *css,
+				  struct cftype *cft, const char *buf)
 {
-	return __cfqg_set_weight_device(cgrp, cft, buf, false);
+	return __cfqg_set_weight_device(css, cft, buf, false);
 }

-static int cfqg_set_leaf_weight_device(struct cgroup *cgrp, struct cftype *cft,
-				       const char *buf)
+static int cfqg_set_leaf_weight_device(struct cgroup_subsys_state *css,
+				       struct cftype *cft, const char *buf)
 {
-	return __cfqg_set_weight_device(cgrp, cft, buf, true);
+	return __cfqg_set_weight_device(css, cft, buf, true);
 }

-static int __cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val,
-			    bool is_leaf_weight)
+static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
+			    u64 val, bool is_leaf_weight)
 {
-	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+	struct blkcg *blkcg = css_to_blkcg(css);
 	struct blkcg_gq *blkg;

 	if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
@@ -1727,30 +1725,32 @@ static int __cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val,
 	return 0;
 }

-static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
+static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
+			  u64 val)
 {
-	return __cfq_set_weight(cgrp, cft, val, false);
+	return __cfq_set_weight(css, cft, val, false);
 }

-static int cfq_set_leaf_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
+static int cfq_set_leaf_weight(struct cgroup_subsys_state *css,
+			       struct cftype *cft, u64 val)
 {
-	return __cfq_set_weight(cgrp, cft, val, true);
+	return __cfq_set_weight(css, cft, val, true);
 }

-static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft,
+static int cfqg_print_stat(struct cgroup_subsys_state *css, struct cftype *cft,
 			   struct seq_file *sf)
 {
-	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+	struct blkcg *blkcg = css_to_blkcg(css);

 	blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkcg_policy_cfq,
 			  cft->private, false);
 	return 0;
 }

-static int cfqg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
-			     struct seq_file *sf)
+static int cfqg_print_rwstat(struct cgroup_subsys_state *css,
+			     struct cftype *cft, struct seq_file *sf)
 {
-	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+	struct blkcg *blkcg = css_to_blkcg(css);

 	blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkcg_policy_cfq,
 			  cft->private, true);
@@ -1773,20 +1773,20 @@ static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
 	return __blkg_prfill_rwstat(sf, pd, &sum);
 }

-static int cfqg_print_stat_recursive(struct cgroup *cgrp, struct cftype *cft,
-				     struct seq_file *sf)
+static int cfqg_print_stat_recursive(struct cgroup_subsys_state *css,
+				     struct cftype *cft, struct seq_file *sf)
 {
-	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+	struct blkcg *blkcg = css_to_blkcg(css);

 	blkcg_print_blkgs(sf, blkcg, cfqg_prfill_stat_recursive,
 			  &blkcg_policy_cfq, cft->private, false);
 	return 0;
 }

-static int cfqg_print_rwstat_recursive(struct cgroup *cgrp, struct cftype *cft,
-				       struct seq_file *sf)
+static int cfqg_print_rwstat_recursive(struct cgroup_subsys_state *css,
+				       struct cftype *cft, struct seq_file *sf)
 {
-	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+	struct blkcg *blkcg = css_to_blkcg(css);

 	blkcg_print_blkgs(sf, blkcg, cfqg_prfill_rwstat_recursive,
 			  &blkcg_policy_cfq, cft->private, true);
@@ -1810,10 +1810,10 @@ static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
 }

 /* print avg_queue_size */
-static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft,
-				     struct seq_file *sf)
+static int cfqg_print_avg_queue_size(struct cgroup_subsys_state *css,
+				     struct cftype *cft, struct seq_file *sf)
 {
-	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+	struct blkcg *blkcg = css_to_blkcg(css);

 	blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size,
 			  &blkcg_policy_cfq, 0, false);
diff --git a/fs/bio.c b/fs/bio.c
index c5eae7251490..b3b20ed9510e 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1956,7 +1956,7 @@ int bio_associate_current(struct bio *bio)

 	/* associate blkcg if exists */
 	rcu_read_lock();
-	css = task_subsys_state(current, blkio_subsys_id);
+	css = task_css(current, blkio_subsys_id);
 	if (css && css_tryget(css))
 		bio->bi_css = css;
 	rcu_read_unlock();
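
task_css() here is a pure rename of task_subsys_state(); the surrounding pattern is unchanged: look the css up under rcu_read_lock(), then pin it with css_tryget() before publishing it in bio->bi_css. A simplified userspace sketch of the try-get idiom (a plain atomic counter standing in for the kernel's percpu_ref):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static bool tryget(atomic_int *refcnt)
{
	int old = atomic_load(refcnt);

	/* only take a reference if the object is still alive */
	while (old > 0)
		if (atomic_compare_exchange_weak(refcnt, &old, old + 1))
			return true;
	return false;
}

int main(void)
{
	atomic_int live = 1, dying = 0;

	printf("%d %d\n", tryget(&live), tryget(&dying));	/* 1 0 */
	return 0;
}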
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index e9ac882868c0..3561d305b1e0 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -66,22 +66,25 @@ enum cgroup_subsys_id {

 /* Per-subsystem/per-cgroup state maintained by the system. */
 struct cgroup_subsys_state {
-	/*
-	 * The cgroup that this subsystem is attached to. Useful
-	 * for subsystems that want to know about the cgroup
-	 * hierarchy structure
-	 */
+	/* the cgroup that this css is attached to */
 	struct cgroup *cgroup;

+	/* the cgroup subsystem that this css is attached to */
+	struct cgroup_subsys *ss;
+
 	/* reference count - access via css_[try]get() and css_put() */
 	struct percpu_ref refcnt;

+	/* the parent css */
+	struct cgroup_subsys_state *parent;
+
 	unsigned long flags;
 	/* ID for this css, if possible */
 	struct css_id __rcu *id;

-	/* Used to put @cgroup->dentry on the last css_put() */
-	struct work_struct dput_work;
+	/* percpu_ref killing and RCU release */
+	struct rcu_head rcu_head;
+	struct work_struct destroy_work;
 };

 /* bits in struct cgroup_subsys_state flags field */
@@ -161,7 +164,16 @@ struct cgroup_name {
 struct cgroup {
 	unsigned long flags;		/* "unsigned long" so bitops work */

-	int id;				/* ida allocated in-hierarchy ID */
+	/*
+	 * idr allocated in-hierarchy ID.
+	 *
+	 * The ID of the root cgroup is always 0, and a new cgroup
+	 * will be assigned with a smallest available ID.
+	 */
+	int id;
+
+	/* the number of attached css's */
+	int nr_css;

 	/*
 	 * We link our 'sibling' struct into our parent's 'children'.
@@ -196,7 +208,7 @@ struct cgroup {
 	struct cgroup_name __rcu *name;

 	/* Private pointers for each registered subsystem */
-	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
+	struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT];

 	struct cgroupfs_root *root;

@@ -220,10 +232,12 @@ struct cgroup {
 	struct list_head pidlists;
 	struct mutex pidlist_mutex;

+	/* dummy css with NULL ->ss, points back to this cgroup */
+	struct cgroup_subsys_state dummy_css;
+
 	/* For css percpu_ref killing and RCU-protected deletion */
 	struct rcu_head rcu_head;
 	struct work_struct destroy_work;
-	atomic_t css_kill_cnt;

 	/* List of events which userspace want to receive */
 	struct list_head event_list;
@@ -322,7 +336,7 @@ struct cgroupfs_root {
 	unsigned long flags;

 	/* IDs for cgroups in this hierarchy */
-	struct ida cgroup_ida;
+	struct idr cgroup_idr;

 	/* The path to use for release notifications. */
 	char release_agent_path[PATH_MAX];
@@ -394,9 +408,10 @@ struct cgroup_map_cb {

 /* cftype->flags */
 enum {
-	CFTYPE_ONLY_ON_ROOT	= (1 << 0),	/* only create on root cg */
-	CFTYPE_NOT_ON_ROOT	= (1 << 1),	/* don't create on root cg */
+	CFTYPE_ONLY_ON_ROOT	= (1 << 0),	/* only create on root cgrp */
+	CFTYPE_NOT_ON_ROOT	= (1 << 1),	/* don't create on root cgrp */
 	CFTYPE_INSANE		= (1 << 2),	/* don't create if sane_behavior */
+	CFTYPE_NO_PREFIX	= (1 << 3),	/* (DON'T USE FOR NEW FILES) no subsys prefix */
 };

 #define MAX_CFTYPE_NAME		64
@@ -424,35 +439,41 @@ struct cftype {
 	/* CFTYPE_* flags */
 	unsigned int flags;

+	/*
+	 * The subsys this file belongs to. Initialized automatically
+	 * during registration. NULL for cgroup core files.
+	 */
+	struct cgroup_subsys *ss;
+
 	int (*open)(struct inode *inode, struct file *file);
-	ssize_t (*read)(struct cgroup *cgrp, struct cftype *cft,
+	ssize_t (*read)(struct cgroup_subsys_state *css, struct cftype *cft,
 			struct file *file,
 			char __user *buf, size_t nbytes, loff_t *ppos);
 	/*
 	 * read_u64() is a shortcut for the common case of returning a
 	 * single integer. Use it in place of read()
 	 */
-	u64 (*read_u64)(struct cgroup *cgrp, struct cftype *cft);
+	u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft);
 	/*
 	 * read_s64() is a signed version of read_u64()
 	 */
-	s64 (*read_s64)(struct cgroup *cgrp, struct cftype *cft);
+	s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft);
 	/*
 	 * read_map() is used for defining a map of key/value
 	 * pairs. It should call cb->fill(cb, key, value) for each
 	 * entry. The key/value pairs (and their ordering) should not
 	 * change between reboots.
 	 */
-	int (*read_map)(struct cgroup *cgrp, struct cftype *cft,
+	int (*read_map)(struct cgroup_subsys_state *css, struct cftype *cft,
 			struct cgroup_map_cb *cb);
 	/*
 	 * read_seq_string() is used for outputting a simple sequence
 	 * using seqfile.
 	 */
-	int (*read_seq_string)(struct cgroup *cgrp, struct cftype *cft,
-			       struct seq_file *m);
+	int (*read_seq_string)(struct cgroup_subsys_state *css,
+			       struct cftype *cft, struct seq_file *m);

-	ssize_t (*write)(struct cgroup *cgrp, struct cftype *cft,
+	ssize_t (*write)(struct cgroup_subsys_state *css, struct cftype *cft,
 			 struct file *file,
 			 const char __user *buf, size_t nbytes, loff_t *ppos);

@@ -461,18 +482,20 @@ struct cftype {
 	 * a single integer (as parsed by simple_strtoull) from
 	 * userspace. Use in place of write(); return 0 or error.
 	 */
-	int (*write_u64)(struct cgroup *cgrp, struct cftype *cft, u64 val);
+	int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft,
+			 u64 val);
 	/*
 	 * write_s64() is a signed version of write_u64()
 	 */
-	int (*write_s64)(struct cgroup *cgrp, struct cftype *cft, s64 val);
+	int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft,
+			 s64 val);

 	/*
 	 * write_string() is passed a nul-terminated kernelspace
 	 * buffer of maximum length determined by max_write_len.
 	 * Returns 0 or -ve error code.
 	 */
-	int (*write_string)(struct cgroup *cgrp, struct cftype *cft,
+	int (*write_string)(struct cgroup_subsys_state *css, struct cftype *cft,
 			    const char *buffer);
 	/*
 	 * trigger() callback can be used to get some kick from the
@@ -480,7 +503,7 @@ struct cftype {
 	 * at all. The private field can be used to determine the
 	 * kick type for multiplexing.
 	 */
-	int (*trigger)(struct cgroup *cgrp, unsigned int event);
+	int (*trigger)(struct cgroup_subsys_state *css, unsigned int event);

 	int (*release)(struct inode *inode, struct file *file);

@@ -490,16 +513,18 @@ struct cftype {
 	 * you want to provide this functionality. Use eventfd_signal()
 	 * on eventfd to send notification to userspace.
 	 */
-	int (*register_event)(struct cgroup *cgrp, struct cftype *cft,
-			      struct eventfd_ctx *eventfd, const char *args);
+	int (*register_event)(struct cgroup_subsys_state *css,
+			      struct cftype *cft, struct eventfd_ctx *eventfd,
+			      const char *args);
 	/*
 	 * unregister_event() callback will be called when userspace
 	 * closes the eventfd or on cgroup removing.
 	 * This callback must be implemented, if you want provide
 	 * notification functionality.
 	 */
-	void (*unregister_event)(struct cgroup *cgrp, struct cftype *cft,
-				 struct eventfd_ctx *eventfd);
+	void (*unregister_event)(struct cgroup_subsys_state *css,
+				 struct cftype *cft,
+				 struct eventfd_ctx *eventfd);
 };

 /*
@@ -512,15 +537,6 @@ struct cftype_set {
 	struct cftype *cfts;
 };

-struct cgroup_scanner {
-	struct cgroup *cg;
-	int (*test_task)(struct task_struct *p, struct cgroup_scanner *scan);
-	void (*process_task)(struct task_struct *p,
-			struct cgroup_scanner *scan);
-	struct ptr_heap *heap;
-	void *data;
-};
-
 /*
  * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details. This
  * function can be called as long as @cgrp is accessible.
@@ -537,7 +553,7 @@ static inline const char *cgroup_name(const struct cgroup *cgrp)
 }

 int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
-int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
+int cgroup_rm_cftypes(struct cftype *cfts);

 bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor);

@@ -553,20 +569,22 @@ int cgroup_task_count(const struct cgroup *cgrp);
 struct cgroup_taskset;
 struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset);
 struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset);
-struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset);
+struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset,
+						   int subsys_id);
 int cgroup_taskset_size(struct cgroup_taskset *tset);

 /**
  * cgroup_taskset_for_each - iterate cgroup_taskset
  * @task: the loop cursor
- * @skip_cgrp: skip if task's cgroup matches this, %NULL to iterate through all
+ * @skip_css: skip if task's css matches this, %NULL to iterate through all
  * @tset: taskset to iterate
  */
-#define cgroup_taskset_for_each(task, skip_cgrp, tset)			\
+#define cgroup_taskset_for_each(task, skip_css, tset)			\
 	for ((task) = cgroup_taskset_first((tset)); (task);		\
 	     (task) = cgroup_taskset_next((tset)))			\
-		if (!(skip_cgrp) ||					\
-		    cgroup_taskset_cur_cgroup((tset)) != (skip_cgrp))
+		if (!(skip_css) ||					\
+		    cgroup_taskset_cur_css((tset),			\
+			(skip_css)->ss->subsys_id) != (skip_css))

 /*
  * Control Group subsystem type.
@@ -574,18 +592,22 @@ int cgroup_taskset_size(struct cgroup_taskset *tset);
  */

 struct cgroup_subsys {
-	struct cgroup_subsys_state *(*css_alloc)(struct cgroup *cgrp);
-	int (*css_online)(struct cgroup *cgrp);
-	void (*css_offline)(struct cgroup *cgrp);
-	void (*css_free)(struct cgroup *cgrp);
+	struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css);
+	int (*css_online)(struct cgroup_subsys_state *css);
+	void (*css_offline)(struct cgroup_subsys_state *css);
+	void (*css_free)(struct cgroup_subsys_state *css);

-	int (*can_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset);
-	void (*cancel_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset);
-	void (*attach)(struct cgroup *cgrp, struct cgroup_taskset *tset);
+	int (*can_attach)(struct cgroup_subsys_state *css,
+			  struct cgroup_taskset *tset);
+	void (*cancel_attach)(struct cgroup_subsys_state *css,
+			      struct cgroup_taskset *tset);
+	void (*attach)(struct cgroup_subsys_state *css,
+		       struct cgroup_taskset *tset);
 	void (*fork)(struct task_struct *task);
-	void (*exit)(struct cgroup *cgrp, struct cgroup *old_cgrp,
+	void (*exit)(struct cgroup_subsys_state *css,
+		     struct cgroup_subsys_state *old_css,
 		     struct task_struct *task);
-	void (*bind)(struct cgroup *root);
+	void (*bind)(struct cgroup_subsys_state *root_css);

 	int subsys_id;
 	int disabled;
@@ -641,10 +663,17 @@ struct cgroup_subsys {
 #undef IS_SUBSYS_ENABLED
 #undef SUBSYS

-static inline struct cgroup_subsys_state *cgroup_subsys_state(
-	struct cgroup *cgrp, int subsys_id)
+/**
+ * css_parent - find the parent css
+ * @css: the target cgroup_subsys_state
+ *
+ * Return the parent css of @css. This function is guaranteed to return
+ * non-NULL parent as long as @css isn't the root.
+ */
+static inline
+struct cgroup_subsys_state *css_parent(struct cgroup_subsys_state *css)
 {
-	return cgrp->subsys[subsys_id];
+	return css->parent;
 }

 /**
@@ -672,7 +701,7 @@ extern struct mutex cgroup_mutex;
 #endif

 /**
- * task_subsys_state_check - obtain css for (task, subsys) w/ extra access conds
+ * task_css_check - obtain css for (task, subsys) w/ extra access conds
  * @task: the target task
  * @subsys_id: the target subsystem ID
  * @__c: extra condition expression to be passed to rcu_dereference_check()
@@ -680,7 +709,7 @@ extern struct mutex cgroup_mutex;
  * Return the cgroup_subsys_state for the (@task, @subsys_id) pair. The
  * synchronization rules are the same as task_css_set_check().
  */
-#define task_subsys_state_check(task, subsys_id, __c)			\
+#define task_css_check(task, subsys_id, __c)				\
 	task_css_set_check((task), (__c))->subsys[(subsys_id)]

 /**
@@ -695,87 +724,92 @@ static inline struct css_set *task_css_set(struct task_struct *task)
 }

 /**
- * task_subsys_state - obtain css for (task, subsys)
+ * task_css - obtain css for (task, subsys)
  * @task: the target task
  * @subsys_id: the target subsystem ID
  *
- * See task_subsys_state_check().
+ * See task_css_check().
  */
-static inline struct cgroup_subsys_state *
-task_subsys_state(struct task_struct *task, int subsys_id)
+static inline struct cgroup_subsys_state *task_css(struct task_struct *task,
+						   int subsys_id)
 {
-	return task_subsys_state_check(task, subsys_id, false);
+	return task_css_check(task, subsys_id, false);
 }

-static inline struct cgroup* task_cgroup(struct task_struct *task,
+static inline struct cgroup *task_cgroup(struct task_struct *task,
 					 int subsys_id)
 {
-	return task_subsys_state(task, subsys_id)->cgroup;
+	return task_css(task, subsys_id)->cgroup;
 }

-struct cgroup *cgroup_next_sibling(struct cgroup *pos);
+struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
+					   struct cgroup_subsys_state *parent);
+
+struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss);

 /**
- * cgroup_for_each_child - iterate through children of a cgroup
- * @pos: the cgroup * to use as the loop cursor
- * @cgrp: cgroup whose children to walk
+ * css_for_each_child - iterate through children of a css
+ * @pos: the css * to use as the loop cursor
+ * @parent: css whose children to walk
  *
- * Walk @cgrp's children. Must be called under rcu_read_lock(). A child
- * cgroup which hasn't finished ->css_online() or already has finished
+ * Walk @parent's children. Must be called under rcu_read_lock(). A child
+ * css which hasn't finished ->css_online() or already has finished
  * ->css_offline() may show up during traversal and it's each subsystem's
  * responsibility to verify that each @pos is alive.
  *
  * If a subsystem synchronizes against the parent in its ->css_online() and
- * before starting iterating, a cgroup which finished ->css_online() is
+ * before starting iterating, a css which finished ->css_online() is
  * guaranteed to be visible in the future iterations.
  *
  * It is allowed to temporarily drop RCU read lock during iteration. The
  * caller is responsible for ensuring that @pos remains accessible until
  * the start of the next iteration by, for example, bumping the css refcnt.
  */
-#define cgroup_for_each_child(pos, cgrp)				\
-	for ((pos) = list_first_or_null_rcu(&(cgrp)->children,		\
-					    struct cgroup, sibling);	\
-	     (pos); (pos) = cgroup_next_sibling((pos)))
+#define css_for_each_child(pos, parent)					\
+	for ((pos) = css_next_child(NULL, (parent)); (pos);		\
+	     (pos) = css_next_child((pos), (parent)))
+
+struct cgroup_subsys_state *
+css_next_descendant_pre(struct cgroup_subsys_state *pos,
+			struct cgroup_subsys_state *css);

-struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
-					  struct cgroup *cgroup);
-struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos);
+struct cgroup_subsys_state *
+css_rightmost_descendant(struct cgroup_subsys_state *pos);

 /**
- * cgroup_for_each_descendant_pre - pre-order walk of a cgroup's descendants
- * @pos: the cgroup * to use as the loop cursor
- * @cgroup: cgroup whose descendants to walk
+ * css_for_each_descendant_pre - pre-order walk of a css's descendants
+ * @pos: the css * to use as the loop cursor
+ * @root: css whose descendants to walk
  *
- * Walk @cgroup's descendants. Must be called under rcu_read_lock(). A
- * descendant cgroup which hasn't finished ->css_online() or already has
+ * Walk @root's descendants. @root is included in the iteration and the
+ * first node to be visited. Must be called under rcu_read_lock(). A
+ * descendant css which hasn't finished ->css_online() or already has
  * finished ->css_offline() may show up during traversal and it's each
  * subsystem's responsibility to verify that each @pos is alive.
  *
  * If a subsystem synchronizes against the parent in its ->css_online() and
  * before starting iterating, and synchronizes against @pos on each
- * iteration, any descendant cgroup which finished ->css_online() is
+ * iteration, any descendant css which finished ->css_online() is
  * guaranteed to be visible in the future iterations.
  *
  * In other words, the following guarantees that a descendant can't escape
  * state updates of its ancestors.
  *
- * my_online(@cgrp)
+ * my_online(@css)
  * {
- *	Lock @cgrp->parent and @cgrp;
- *	Inherit state from @cgrp->parent;
+ *	Lock @css's parent and @css;
+ *	Inherit state from the parent;
  *	Unlock both.
  * }
  *
- * my_update_state(@cgrp)
+ * my_update_state(@css)
 * {
- *	Lock @cgrp;
- *	Update @cgrp's state;
- *	Unlock @cgrp;
- *
- *	cgroup_for_each_descendant_pre(@pos, @cgrp) {
+ *	css_for_each_descendant_pre(@pos, @css) {
  *		Lock @pos;
- *		Verify @pos is alive and inherit state from @pos->parent;
+ *		if (@pos == @css)
+ *			Update @css's state;
+ *		else
+ *			Verify @pos is alive and inherit state from its parent;
  *		Unlock @pos;
  *	}
  * }
@@ -786,8 +820,7 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos);
  * visible by walking order and, as long as inheriting operations to the
  * same @pos are atomic to each other, multiple updates racing each other
  * still result in the correct state. It's guaranateed that at least one
- * inheritance happens for any cgroup after the latest update to its
- * parent.
+ * inheritance happens for any css after the latest update to its parent.
  *
  * If checking parent's state requires locking the parent, each inheriting
  * iteration should lock and unlock both @pos->parent and @pos.
@@ -800,52 +833,45 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos);
  * caller is responsible for ensuring that @pos remains accessible until
  * the start of the next iteration by, for example, bumping the css refcnt.
  */
-#define cgroup_for_each_descendant_pre(pos, cgroup)			\
-	for (pos = cgroup_next_descendant_pre(NULL, (cgroup)); (pos);	\
-	     pos = cgroup_next_descendant_pre((pos), (cgroup)))
+#define css_for_each_descendant_pre(pos, css)				\
+	for ((pos) = css_next_descendant_pre(NULL, (css)); (pos);	\
+	     (pos) = css_next_descendant_pre((pos), (css)))

-struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
-					   struct cgroup *cgroup);
+struct cgroup_subsys_state *
+css_next_descendant_post(struct cgroup_subsys_state *pos,
+			 struct cgroup_subsys_state *css);

 /**
- * cgroup_for_each_descendant_post - post-order walk of a cgroup's descendants
- * @pos: the cgroup * to use as the loop cursor
- * @cgroup: cgroup whose descendants to walk
+ * css_for_each_descendant_post - post-order walk of a css's descendants
+ * @pos: the css * to use as the loop cursor
+ * @css: css whose descendants to walk
  *
- * Similar to cgroup_for_each_descendant_pre() but performs post-order
- * traversal instead. Note that the walk visibility guarantee described in
- * pre-order walk doesn't apply the same to post-order walks.
+ * Similar to css_for_each_descendant_pre() but performs post-order
+ * traversal instead. @root is included in the iteration and the last
+ * node to be visited. Note that the walk visibility guarantee described
+ * in pre-order walk doesn't apply the same to post-order walks.
  */
-#define cgroup_for_each_descendant_post(pos, cgroup)			\
-	for (pos = cgroup_next_descendant_post(NULL, (cgroup)); (pos);	\
-	     pos = cgroup_next_descendant_post((pos), (cgroup)))
+#define css_for_each_descendant_post(pos, css)				\
+	for ((pos) = css_next_descendant_post(NULL, (css)); (pos);	\
+	     (pos) = css_next_descendant_post((pos), (css)))

-/* A cgroup_iter should be treated as an opaque object */
-struct cgroup_iter {
-	struct list_head *cset_link;
-	struct list_head *task;
+/* A css_task_iter should be treated as an opaque object */
+struct css_task_iter {
+	struct cgroup_subsys_state *origin_css;
+	struct list_head *cset_link;
+	struct list_head *task;
 };

-/*
- * To iterate across the tasks in a cgroup:
- *
- * 1) call cgroup_iter_start to initialize an iterator
- *
- * 2) call cgroup_iter_next() to retrieve member tasks until it
- *    returns NULL or until you want to end the iteration
- *
- * 3) call cgroup_iter_end() to destroy the iterator.
- *
- * Or, call cgroup_scan_tasks() to iterate through every task in a
- * cgroup - cgroup_scan_tasks() holds the css_set_lock when calling
- * the test_task() callback, but not while calling the process_task()
- * callback.
- */
-void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it);
-struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
-				     struct cgroup_iter *it);
-void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it);
-int cgroup_scan_tasks(struct cgroup_scanner *scan);
+void css_task_iter_start(struct cgroup_subsys_state *css,
+			 struct css_task_iter *it);
+struct task_struct *css_task_iter_next(struct css_task_iter *it);
+void css_task_iter_end(struct css_task_iter *it);
+
+int css_scan_tasks(struct cgroup_subsys_state *css,
+		   bool (*test)(struct task_struct *, void *),
+		   void (*process)(struct task_struct *, void *),
+		   void *data, struct ptr_heap *heap);
+
 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);
 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);

@@ -878,7 +904,8 @@ bool css_is_ancestor(struct cgroup_subsys_state *cg,

 /* Get id and depth of css */
 unsigned short css_id(struct cgroup_subsys_state *css);
-struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id);
+struct cgroup_subsys_state *css_from_dir(struct dentry *dentry,
+					 struct cgroup_subsys *ss);

 #else /* !CONFIG_CGROUPS */

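
The documentation above promises that the new iterators include the root: first for css_for_each_descendant_pre(), last for css_for_each_descendant_post(). A compact userspace model of a pre-order "next" function with that property (tree plumbing is illustrative; only the visit order mirrors css_next_descendant_pre()):

#include <stddef.h>
#include <stdio.h>

struct css {
	const char *name;
	struct css *parent;
	struct css *child;	/* first child */
	struct css *sibling;	/* next sibling */
};

static struct css *next_pre(struct css *pos, struct css *root)
{
	if (!pos)
		return root;		/* the walk starts at @root itself */
	if (pos->child)
		return pos->child;	/* descend first */
	while (pos != root) {		/* otherwise climb toward a sibling */
		if (pos->sibling)
			return pos->sibling;
		pos = pos->parent;
	}
	return NULL;
}

int main(void)
{
	struct css root = { .name = "root" };
	struct css a = { .name = "a", .parent = &root };
	struct css b = { .name = "b", .parent = &root };
	struct css *pos = NULL;

	root.child = &a;
	a.sibling = &b;

	while ((pos = next_pre(pos, &root)))
		printf("%s\n", pos->name);	/* root, a, b */
	return 0;
}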
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 7b4d9d79570b..6c416092e324 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -85,7 +85,7 @@ extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
85extern struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm); 85extern struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm);
86 86
87extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg); 87extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
88extern struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont); 88extern struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css);
89 89
90static inline 90static inline
91bool mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *memcg) 91bool mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *memcg)
diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h
index 7dc17e2456de..3f3788d49362 100644
--- a/include/linux/vmpressure.h
+++ b/include/linux/vmpressure.h
@@ -34,10 +34,12 @@ extern void vmpressure_cleanup(struct vmpressure *vmpr);
34extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg); 34extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg);
35extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr); 35extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr);
36extern struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css); 36extern struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css);
37extern int vmpressure_register_event(struct cgroup *cg, struct cftype *cft, 37extern int vmpressure_register_event(struct cgroup_subsys_state *css,
38 struct cftype *cft,
38 struct eventfd_ctx *eventfd, 39 struct eventfd_ctx *eventfd,
39 const char *args); 40 const char *args);
40extern void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft, 41extern void vmpressure_unregister_event(struct cgroup_subsys_state *css,
42 struct cftype *cft,
41 struct eventfd_ctx *eventfd); 43 struct eventfd_ctx *eventfd);
42#else 44#else
43static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, 45static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h
index 0fee0617fb7d..52adaa75dac9 100644
--- a/include/net/cls_cgroup.h
+++ b/include/net/cls_cgroup.h
@@ -35,7 +35,7 @@ static inline u32 task_cls_classid(struct task_struct *p)
35 return 0; 35 return 0;
36 36
37 rcu_read_lock(); 37 rcu_read_lock();
38 classid = container_of(task_subsys_state(p, net_cls_subsys_id), 38 classid = container_of(task_css(p, net_cls_subsys_id),
39 struct cgroup_cls_state, css)->classid; 39 struct cgroup_cls_state, css)->classid;
40 rcu_read_unlock(); 40 rcu_read_unlock();
41 41
@@ -51,7 +51,7 @@ static inline u32 task_cls_classid(struct task_struct *p)
51 return 0; 51 return 0;
52 52
53 rcu_read_lock(); 53 rcu_read_lock();
54 css = task_subsys_state(p, net_cls_subsys_id); 54 css = task_css(p, net_cls_subsys_id);
55 if (css) 55 if (css)
56 classid = container_of(css, 56 classid = container_of(css,
57 struct cgroup_cls_state, css)->classid; 57 struct cgroup_cls_state, css)->classid;
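
The task_subsys_state() to task_css() conversion is purely mechanical; a hedged sketch of the common lookup pattern, assuming a controller-private struct my_state that embeds the css and a hypothetical my_subsys_id:

	struct cgroup_subsys_state *css;
	struct my_state *state = NULL;

	rcu_read_lock();
	css = task_css(p, my_subsys_id);
	if (css)
		state = container_of(css, struct my_state, css);
	rcu_read_unlock();
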
diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h
index 50ab8c26ab59..a24f8bb3ca47 100644
--- a/include/net/netprio_cgroup.h
+++ b/include/net/netprio_cgroup.h
@@ -25,10 +25,6 @@ struct netprio_map {
25 u32 priomap[]; 25 u32 priomap[];
26}; 26};
27 27
28struct cgroup_netprio_state {
29 struct cgroup_subsys_state css;
30};
31
32extern void sock_update_netprioidx(struct sock *sk); 28extern void sock_update_netprioidx(struct sock *sk);
33 29
34#if IS_BUILTIN(CONFIG_NETPRIO_CGROUP) 30#if IS_BUILTIN(CONFIG_NETPRIO_CGROUP)
@@ -39,7 +35,7 @@ static inline u32 task_netprioidx(struct task_struct *p)
39 u32 idx; 35 u32 idx;
40 36
41 rcu_read_lock(); 37 rcu_read_lock();
42 css = task_subsys_state(p, net_prio_subsys_id); 38 css = task_css(p, net_prio_subsys_id);
43 idx = css->cgroup->id; 39 idx = css->cgroup->id;
44 rcu_read_unlock(); 40 rcu_read_unlock();
45 return idx; 41 return idx;
@@ -53,7 +49,7 @@ static inline u32 task_netprioidx(struct task_struct *p)
53 u32 idx = 0; 49 u32 idx = 0;
54 50
55 rcu_read_lock(); 51 rcu_read_lock();
56 css = task_subsys_state(p, net_prio_subsys_id); 52 css = task_css(p, net_prio_subsys_id);
57 if (css) 53 if (css)
58 idx = css->cgroup->id; 54 idx = css->cgroup->id;
59 rcu_read_unlock(); 55 rcu_read_unlock();
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e91963302c0d..e0aeb32415ff 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -81,7 +81,7 @@
81 */ 81 */
82#ifdef CONFIG_PROVE_RCU 82#ifdef CONFIG_PROVE_RCU
83DEFINE_MUTEX(cgroup_mutex); 83DEFINE_MUTEX(cgroup_mutex);
84EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for task_subsys_state_check() */ 84EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */
85#else 85#else
86static DEFINE_MUTEX(cgroup_mutex); 86static DEFINE_MUTEX(cgroup_mutex);
87#endif 87#endif
@@ -117,6 +117,7 @@ struct cfent {
117 struct list_head node; 117 struct list_head node;
118 struct dentry *dentry; 118 struct dentry *dentry;
119 struct cftype *type; 119 struct cftype *type;
120 struct cgroup_subsys_state *css;
120 121
121 /* file xattrs */ 122 /* file xattrs */
122 struct simple_xattrs xattrs; 123 struct simple_xattrs xattrs;
@@ -159,9 +160,9 @@ struct css_id {
159 */ 160 */
160struct cgroup_event { 161struct cgroup_event {
161 /* 162 /*
162 * Cgroup which the event belongs to. 163 * css which the event belongs to.
163 */ 164 */
164 struct cgroup *cgrp; 165 struct cgroup_subsys_state *css;
165 /* 166 /*
 166 * Control file which the event is associated with. 167 * Control file which the event is associated with.
167 */ 168 */
@@ -215,10 +216,33 @@ static u64 cgroup_serial_nr_next = 1;
215 */ 216 */
216static int need_forkexit_callback __read_mostly; 217static int need_forkexit_callback __read_mostly;
217 218
218static void cgroup_offline_fn(struct work_struct *work); 219static struct cftype cgroup_base_files[];
220
221static void cgroup_destroy_css_killed(struct cgroup *cgrp);
219static int cgroup_destroy_locked(struct cgroup *cgrp); 222static int cgroup_destroy_locked(struct cgroup *cgrp);
220static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, 223static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
221 struct cftype cfts[], bool is_add); 224 bool is_add);
225
226/**
227 * cgroup_css - obtain a cgroup's css for the specified subsystem
228 * @cgrp: the cgroup of interest
229 * @ss: the subsystem of interest (%NULL returns the dummy_css)
230 *
231 * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
232 * function must be called either under cgroup_mutex or rcu_read_lock() and
233 * the caller is responsible for pinning the returned css if it wants to
234 * keep accessing it outside the said locks. This function may return
 235 * %NULL if @cgrp doesn't have @ss enabled.
236 */
237static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
238 struct cgroup_subsys *ss)
239{
240 if (ss)
241 return rcu_dereference_check(cgrp->subsys[ss->subsys_id],
242 lockdep_is_held(&cgroup_mutex));
243 else
244 return &cgrp->dummy_css;
245}
222 246
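
A minimal sketch of the locking rule the kerneldoc above spells out, assuming the caller wants to keep using the css after dropping rcu_read_lock() and therefore pins it with css_tryget():

	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = cgroup_css(cgrp, ss);
	if (css && !css_tryget(css))
		css = NULL;		/* raced with css destruction */
	rcu_read_unlock();

	if (css) {
		/* @css is now pinned and safe to use */
		css_put(css);
	}
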
223/* convenient tests for these bits */ 247/* convenient tests for these bits */
224static inline bool cgroup_is_dead(const struct cgroup *cgrp) 248static inline bool cgroup_is_dead(const struct cgroup *cgrp)
@@ -365,9 +389,11 @@ static struct cgrp_cset_link init_cgrp_cset_link;
365static int cgroup_init_idr(struct cgroup_subsys *ss, 389static int cgroup_init_idr(struct cgroup_subsys *ss,
366 struct cgroup_subsys_state *css); 390 struct cgroup_subsys_state *css);
367 391
368/* css_set_lock protects the list of css_set objects, and the 392/*
369 * chain of tasks off each css_set. Nests outside task->alloc_lock 393 * css_set_lock protects the list of css_set objects, and the chain of
370 * due to cgroup_iter_start() */ 394 * tasks off each css_set. Nests outside task->alloc_lock due to
395 * css_task_iter_start().
396 */
371static DEFINE_RWLOCK(css_set_lock); 397static DEFINE_RWLOCK(css_set_lock);
372static int css_set_count; 398static int css_set_count;
373 399
@@ -392,10 +418,12 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
392 return key; 418 return key;
393} 419}
394 420
395/* We don't maintain the lists running through each css_set to its 421/*
396 * task until after the first call to cgroup_iter_start(). This 422 * We don't maintain the lists running through each css_set to its task
397 * reduces the fork()/exit() overhead for people who have cgroups 423 * until after the first call to css_task_iter_start(). This reduces the
398 * compiled into their kernel but not actually in use */ 424 * fork()/exit() overhead for people who have cgroups compiled into their
425 * kernel but not actually in use.
426 */
399static int use_task_css_set_links __read_mostly; 427static int use_task_css_set_links __read_mostly;
400 428
401static void __put_css_set(struct css_set *cset, int taskexit) 429static void __put_css_set(struct css_set *cset, int taskexit)
@@ -464,7 +492,7 @@ static inline void put_css_set_taskexit(struct css_set *cset)
464 * @new_cgrp: cgroup that's being entered by the task 492 * @new_cgrp: cgroup that's being entered by the task
465 * @template: desired set of css pointers in css_set (pre-calculated) 493 * @template: desired set of css pointers in css_set (pre-calculated)
466 * 494 *
467 * Returns true if "cg" matches "old_cg" except for the hierarchy 495 * Returns true if "cset" matches "old_cset" except for the hierarchy
468 * which "new_cgrp" belongs to, for which it should match "new_cgrp". 496 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
469 */ 497 */
470static bool compare_css_sets(struct css_set *cset, 498static bool compare_css_sets(struct css_set *cset,
@@ -555,7 +583,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
555 /* Subsystem is in this hierarchy. So we want 583 /* Subsystem is in this hierarchy. So we want
556 * the subsystem state from the new 584 * the subsystem state from the new
557 * cgroup */ 585 * cgroup */
558 template[i] = cgrp->subsys[i]; 586 template[i] = cgroup_css(cgrp, ss);
559 } else { 587 } else {
560 /* Subsystem is not in this hierarchy, so we 588 /* Subsystem is not in this hierarchy, so we
561 * don't want to change the subsystem state */ 589 * don't want to change the subsystem state */
@@ -803,8 +831,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
803 831
804static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); 832static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
805static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 833static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
806static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, 834static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
807 unsigned long subsys_mask);
808static const struct inode_operations cgroup_dir_inode_operations; 835static const struct inode_operations cgroup_dir_inode_operations;
809static const struct file_operations proc_cgroupstats_operations; 836static const struct file_operations proc_cgroupstats_operations;
810 837
@@ -813,8 +840,7 @@ static struct backing_dev_info cgroup_backing_dev_info = {
813 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 840 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
814}; 841};
815 842
816static int alloc_css_id(struct cgroup_subsys *ss, 843static int alloc_css_id(struct cgroup_subsys_state *child_css);
817 struct cgroup *parent, struct cgroup *child);
818 844
819static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) 845static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
820{ 846{
@@ -845,15 +871,8 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
845static void cgroup_free_fn(struct work_struct *work) 871static void cgroup_free_fn(struct work_struct *work)
846{ 872{
847 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); 873 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
848 struct cgroup_subsys *ss;
849 874
850 mutex_lock(&cgroup_mutex); 875 mutex_lock(&cgroup_mutex);
851 /*
852 * Release the subsystem state objects.
853 */
854 for_each_root_subsys(cgrp->root, ss)
855 ss->css_free(cgrp);
856
857 cgrp->root->number_of_cgroups--; 876 cgrp->root->number_of_cgroups--;
858 mutex_unlock(&cgroup_mutex); 877 mutex_unlock(&cgroup_mutex);
859 878
@@ -864,8 +883,6 @@ static void cgroup_free_fn(struct work_struct *work)
864 */ 883 */
865 dput(cgrp->parent->dentry); 884 dput(cgrp->parent->dentry);
866 885
867 ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
868
869 /* 886 /*
870 * Drop the active superblock reference that we took when we 887 * Drop the active superblock reference that we took when we
871 * created the cgroup. This will free cgrp->root, if we are 888 * created the cgroup. This will free cgrp->root, if we are
@@ -956,27 +973,22 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
956} 973}
957 974
958/** 975/**
959 * cgroup_clear_directory - selective removal of base and subsystem files 976 * cgroup_clear_dir - remove subsys files in a cgroup directory
960 * @dir: directory containing the files 977 * @cgrp: target cgroup
961 * @base_files: true if the base files should be removed
962 * @subsys_mask: mask of the subsystem ids whose files should be removed 978 * @subsys_mask: mask of the subsystem ids whose files should be removed
963 */ 979 */
964static void cgroup_clear_directory(struct dentry *dir, bool base_files, 980static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
965 unsigned long subsys_mask)
966{ 981{
967 struct cgroup *cgrp = __d_cgrp(dir);
968 struct cgroup_subsys *ss; 982 struct cgroup_subsys *ss;
983 int i;
969 984
970 for_each_root_subsys(cgrp->root, ss) { 985 for_each_subsys(ss, i) {
971 struct cftype_set *set; 986 struct cftype_set *set;
972 if (!test_bit(ss->subsys_id, &subsys_mask)) 987
988 if (!test_bit(i, &subsys_mask))
973 continue; 989 continue;
974 list_for_each_entry(set, &ss->cftsets, node) 990 list_for_each_entry(set, &ss->cftsets, node)
975 cgroup_addrm_files(cgrp, NULL, set->cfts, false); 991 cgroup_addrm_files(cgrp, set->cfts, false);
976 }
977 if (base_files) {
978 while (!list_empty(&cgrp->files))
979 cgroup_rm_file(cgrp, NULL);
980 } 992 }
981} 993}
982 994
@@ -986,9 +998,6 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
986static void cgroup_d_remove_dir(struct dentry *dentry) 998static void cgroup_d_remove_dir(struct dentry *dentry)
987{ 999{
988 struct dentry *parent; 1000 struct dentry *parent;
989 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
990
991 cgroup_clear_directory(dentry, true, root->subsys_mask);
992 1001
993 parent = dentry->d_parent; 1002 parent = dentry->d_parent;
994 spin_lock(&parent->d_lock); 1003 spin_lock(&parent->d_lock);
@@ -1009,79 +1018,84 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1009{ 1018{
1010 struct cgroup *cgrp = &root->top_cgroup; 1019 struct cgroup *cgrp = &root->top_cgroup;
1011 struct cgroup_subsys *ss; 1020 struct cgroup_subsys *ss;
1012 int i; 1021 unsigned long pinned = 0;
1022 int i, ret;
1013 1023
1014 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 1024 BUG_ON(!mutex_is_locked(&cgroup_mutex));
1015 BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); 1025 BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
1016 1026
1017 /* Check that any added subsystems are currently free */ 1027 /* Check that any added subsystems are currently free */
1018 for_each_subsys(ss, i) { 1028 for_each_subsys(ss, i) {
1019 unsigned long bit = 1UL << i; 1029 if (!(added_mask & (1 << i)))
1020
1021 if (!(bit & added_mask))
1022 continue; 1030 continue;
1023 1031
1032 /* is the subsystem mounted elsewhere? */
1024 if (ss->root != &cgroup_dummy_root) { 1033 if (ss->root != &cgroup_dummy_root) {
1025 /* Subsystem isn't free */ 1034 ret = -EBUSY;
1026 return -EBUSY; 1035 goto out_put;
1036 }
1037
1038 /* pin the module */
1039 if (!try_module_get(ss->module)) {
1040 ret = -ENOENT;
1041 goto out_put;
1027 } 1042 }
1043 pinned |= 1 << i;
1028 } 1044 }
1029 1045
1030 /* Currently we don't handle adding/removing subsystems when 1046 /* subsys could be missing if unloaded between parsing and here */
1031 * any child cgroups exist. This is theoretically supportable 1047 if (added_mask != pinned) {
1032 * but involves complex error handling, so it's being left until 1048 ret = -ENOENT;
1033 * later */ 1049 goto out_put;
1034 if (root->number_of_cgroups > 1) 1050 }
1035 return -EBUSY; 1051
1052 ret = cgroup_populate_dir(cgrp, added_mask);
1053 if (ret)
1054 goto out_put;
1055
1056 /*
1057 * Nothing can fail from this point on. Remove files for the
1058 * removed subsystems and rebind each subsystem.
1059 */
1060 cgroup_clear_dir(cgrp, removed_mask);
1036 1061
1037 /* Process each subsystem */
1038 for_each_subsys(ss, i) { 1062 for_each_subsys(ss, i) {
1039 unsigned long bit = 1UL << i; 1063 unsigned long bit = 1UL << i;
1040 1064
1041 if (bit & added_mask) { 1065 if (bit & added_mask) {
1042 /* We're binding this subsystem to this hierarchy */ 1066 /* We're binding this subsystem to this hierarchy */
1043 BUG_ON(cgrp->subsys[i]); 1067 BUG_ON(cgroup_css(cgrp, ss));
1044 BUG_ON(!cgroup_dummy_top->subsys[i]); 1068 BUG_ON(!cgroup_css(cgroup_dummy_top, ss));
1045 BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top); 1069 BUG_ON(cgroup_css(cgroup_dummy_top, ss)->cgroup != cgroup_dummy_top);
1070
1071 rcu_assign_pointer(cgrp->subsys[i],
1072 cgroup_css(cgroup_dummy_top, ss));
1073 cgroup_css(cgrp, ss)->cgroup = cgrp;
1046 1074
1047 cgrp->subsys[i] = cgroup_dummy_top->subsys[i];
1048 cgrp->subsys[i]->cgroup = cgrp;
1049 list_move(&ss->sibling, &root->subsys_list); 1075 list_move(&ss->sibling, &root->subsys_list);
1050 ss->root = root; 1076 ss->root = root;
1051 if (ss->bind) 1077 if (ss->bind)
1052 ss->bind(cgrp); 1078 ss->bind(cgroup_css(cgrp, ss));
1053 1079
1054 /* refcount was already taken, and we're keeping it */ 1080 /* refcount was already taken, and we're keeping it */
1055 root->subsys_mask |= bit; 1081 root->subsys_mask |= bit;
1056 } else if (bit & removed_mask) { 1082 } else if (bit & removed_mask) {
1057 /* We're removing this subsystem */ 1083 /* We're removing this subsystem */
1058 BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]); 1084 BUG_ON(cgroup_css(cgrp, ss) != cgroup_css(cgroup_dummy_top, ss));
1059 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 1085 BUG_ON(cgroup_css(cgrp, ss)->cgroup != cgrp);
1060 1086
1061 if (ss->bind) 1087 if (ss->bind)
1062 ss->bind(cgroup_dummy_top); 1088 ss->bind(cgroup_css(cgroup_dummy_top, ss));
1063 cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top; 1089
1064 cgrp->subsys[i] = NULL; 1090 cgroup_css(cgroup_dummy_top, ss)->cgroup = cgroup_dummy_top;
1091 RCU_INIT_POINTER(cgrp->subsys[i], NULL);
1092
1065 cgroup_subsys[i]->root = &cgroup_dummy_root; 1093 cgroup_subsys[i]->root = &cgroup_dummy_root;
1066 list_move(&ss->sibling, &cgroup_dummy_root.subsys_list); 1094 list_move(&ss->sibling, &cgroup_dummy_root.subsys_list);
1067 1095
1068 /* subsystem is now free - drop reference on module */ 1096 /* subsystem is now free - drop reference on module */
1069 module_put(ss->module); 1097 module_put(ss->module);
1070 root->subsys_mask &= ~bit; 1098 root->subsys_mask &= ~bit;
1071 } else if (bit & root->subsys_mask) {
1072 /* Subsystem state should already exist */
1073 BUG_ON(!cgrp->subsys[i]);
1074 /*
1075 * a refcount was taken, but we already had one, so
1076 * drop the extra reference.
1077 */
1078 module_put(ss->module);
1079#ifdef CONFIG_MODULE_UNLOAD
1080 BUG_ON(ss->module && !module_refcount(ss->module));
1081#endif
1082 } else {
1083 /* Subsystem state shouldn't exist */
1084 BUG_ON(cgrp->subsys[i]);
1085 } 1099 }
1086 } 1100 }
1087 1101
@@ -1092,6 +1106,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1092 root->flags |= CGRP_ROOT_SUBSYS_BOUND; 1106 root->flags |= CGRP_ROOT_SUBSYS_BOUND;
1093 1107
1094 return 0; 1108 return 0;
1109
1110out_put:
1111 for_each_subsys(ss, i)
1112 if (pinned & (1 << i))
1113 module_put(ss->module);
1114 return ret;
1095} 1115}
1096 1116
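
The rewritten function folds module pinning into rebind_subsystems() itself: references are taken one by one into a "pinned" mask, and on failure only the references actually taken are dropped. Stripped to its core, the pattern looks roughly like this (a condensed sketch, not the verbatim function):

	unsigned long pinned = 0;
	int i, ret;

	for_each_subsys(ss, i) {
		if (!(added_mask & (1 << i)))
			continue;
		if (!try_module_get(ss->module)) {
			ret = -ENOENT;
			goto out_put;
		}
		pinned |= 1 << i;
	}
	return 0;

out_put:
	for_each_subsys(ss, i)
		if (pinned & (1 << i))
			module_put(ss->module);
	return ret;
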
1097static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) 1117static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
@@ -1142,7 +1162,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1142 char *token, *o = data; 1162 char *token, *o = data;
1143 bool all_ss = false, one_ss = false; 1163 bool all_ss = false, one_ss = false;
1144 unsigned long mask = (unsigned long)-1; 1164 unsigned long mask = (unsigned long)-1;
1145 bool module_pin_failed = false;
1146 struct cgroup_subsys *ss; 1165 struct cgroup_subsys *ss;
1147 int i; 1166 int i;
1148 1167
@@ -1285,52 +1304,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1285 if (!opts->subsys_mask && !opts->name) 1304 if (!opts->subsys_mask && !opts->name)
1286 return -EINVAL; 1305 return -EINVAL;
1287 1306
1288 /*
1289 * Grab references on all the modules we'll need, so the subsystems
1290 * don't dance around before rebind_subsystems attaches them. This may
1291 * take duplicate reference counts on a subsystem that's already used,
1292 * but rebind_subsystems handles this case.
1293 */
1294 for_each_subsys(ss, i) {
1295 if (!(opts->subsys_mask & (1UL << i)))
1296 continue;
1297 if (!try_module_get(cgroup_subsys[i]->module)) {
1298 module_pin_failed = true;
1299 break;
1300 }
1301 }
1302 if (module_pin_failed) {
1303 /*
1304 * oops, one of the modules was going away. this means that we
1305 * raced with a module_delete call, and to the user this is
1306 * essentially a "subsystem doesn't exist" case.
1307 */
1308 for (i--; i >= 0; i--) {
1309 /* drop refcounts only on the ones we took */
1310 unsigned long bit = 1UL << i;
1311
1312 if (!(bit & opts->subsys_mask))
1313 continue;
1314 module_put(cgroup_subsys[i]->module);
1315 }
1316 return -ENOENT;
1317 }
1318
1319 return 0; 1307 return 0;
1320} 1308}
1321 1309
1322static void drop_parsed_module_refcounts(unsigned long subsys_mask)
1323{
1324 struct cgroup_subsys *ss;
1325 int i;
1326
1327 mutex_lock(&cgroup_mutex);
1328 for_each_subsys(ss, i)
1329 if (subsys_mask & (1UL << i))
1330 module_put(cgroup_subsys[i]->module);
1331 mutex_unlock(&cgroup_mutex);
1332}
1333
1334static int cgroup_remount(struct super_block *sb, int *flags, char *data) 1310static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1335{ 1311{
1336 int ret = 0; 1312 int ret = 0;
@@ -1370,22 +1346,15 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1370 goto out_unlock; 1346 goto out_unlock;
1371 } 1347 }
1372 1348
1373 /* 1349 /* remounting is not allowed for populated hierarchies */
1374 * Clear out the files of subsystems that should be removed, do 1350 if (root->number_of_cgroups > 1) {
1375 * this before rebind_subsystems, since rebind_subsystems may 1351 ret = -EBUSY;
1376 * change this hierarchy's subsys_list.
1377 */
1378 cgroup_clear_directory(cgrp->dentry, false, removed_mask);
1379
1380 ret = rebind_subsystems(root, added_mask, removed_mask);
1381 if (ret) {
1382 /* rebind_subsystems failed, re-populate the removed files */
1383 cgroup_populate_dir(cgrp, false, removed_mask);
1384 goto out_unlock; 1352 goto out_unlock;
1385 } 1353 }
1386 1354
1387 /* re-populate subsystem files */ 1355 ret = rebind_subsystems(root, added_mask, removed_mask);
1388 cgroup_populate_dir(cgrp, false, added_mask); 1356 if (ret)
1357 goto out_unlock;
1389 1358
1390 if (opts.release_agent) 1359 if (opts.release_agent)
1391 strcpy(root->release_agent_path, opts.release_agent); 1360 strcpy(root->release_agent_path, opts.release_agent);
@@ -1395,8 +1364,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1395 mutex_unlock(&cgroup_root_mutex); 1364 mutex_unlock(&cgroup_root_mutex);
1396 mutex_unlock(&cgroup_mutex); 1365 mutex_unlock(&cgroup_mutex);
1397 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1366 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1398 if (ret)
1399 drop_parsed_module_refcounts(opts.subsys_mask);
1400 return ret; 1367 return ret;
1401} 1368}
1402 1369
@@ -1416,6 +1383,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1416 INIT_LIST_HEAD(&cgrp->release_list); 1383 INIT_LIST_HEAD(&cgrp->release_list);
1417 INIT_LIST_HEAD(&cgrp->pidlists); 1384 INIT_LIST_HEAD(&cgrp->pidlists);
1418 mutex_init(&cgrp->pidlist_mutex); 1385 mutex_init(&cgrp->pidlist_mutex);
1386 cgrp->dummy_css.cgroup = cgrp;
1419 INIT_LIST_HEAD(&cgrp->event_list); 1387 INIT_LIST_HEAD(&cgrp->event_list);
1420 spin_lock_init(&cgrp->event_list_lock); 1388 spin_lock_init(&cgrp->event_list_lock);
1421 simple_xattrs_init(&cgrp->xattrs); 1389 simple_xattrs_init(&cgrp->xattrs);
@@ -1431,6 +1399,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
1431 cgrp->root = root; 1399 cgrp->root = root;
1432 RCU_INIT_POINTER(cgrp->name, &root_cgroup_name); 1400 RCU_INIT_POINTER(cgrp->name, &root_cgroup_name);
1433 init_cgroup_housekeeping(cgrp); 1401 init_cgroup_housekeeping(cgrp);
1402 idr_init(&root->cgroup_idr);
1434} 1403}
1435 1404
1436static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) 1405static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
@@ -1503,7 +1472,6 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1503 */ 1472 */
1504 root->subsys_mask = opts->subsys_mask; 1473 root->subsys_mask = opts->subsys_mask;
1505 root->flags = opts->flags; 1474 root->flags = opts->flags;
1506 ida_init(&root->cgroup_ida);
1507 if (opts->release_agent) 1475 if (opts->release_agent)
1508 strcpy(root->release_agent_path, opts->release_agent); 1476 strcpy(root->release_agent_path, opts->release_agent);
1509 if (opts->name) 1477 if (opts->name)
@@ -1519,7 +1487,7 @@ static void cgroup_free_root(struct cgroupfs_root *root)
1519 /* hierarchy ID should already have been released */ 1487 /* hierarchy ID should already have been released */
1520 WARN_ON_ONCE(root->hierarchy_id); 1488 WARN_ON_ONCE(root->hierarchy_id);
1521 1489
1522 ida_destroy(&root->cgroup_ida); 1490 idr_destroy(&root->cgroup_idr);
1523 kfree(root); 1491 kfree(root);
1524 } 1492 }
1525} 1493}
@@ -1584,7 +1552,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1584 int ret = 0; 1552 int ret = 0;
1585 struct super_block *sb; 1553 struct super_block *sb;
1586 struct cgroupfs_root *new_root; 1554 struct cgroupfs_root *new_root;
1555 struct list_head tmp_links;
1587 struct inode *inode; 1556 struct inode *inode;
1557 const struct cred *cred;
1588 1558
1589 /* First find the desired set of subsystems */ 1559 /* First find the desired set of subsystems */
1590 mutex_lock(&cgroup_mutex); 1560 mutex_lock(&cgroup_mutex);
@@ -1600,7 +1570,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1600 new_root = cgroup_root_from_opts(&opts); 1570 new_root = cgroup_root_from_opts(&opts);
1601 if (IS_ERR(new_root)) { 1571 if (IS_ERR(new_root)) {
1602 ret = PTR_ERR(new_root); 1572 ret = PTR_ERR(new_root);
1603 goto drop_modules; 1573 goto out_err;
1604 } 1574 }
1605 opts.new_root = new_root; 1575 opts.new_root = new_root;
1606 1576
@@ -1609,17 +1579,15 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1609 if (IS_ERR(sb)) { 1579 if (IS_ERR(sb)) {
1610 ret = PTR_ERR(sb); 1580 ret = PTR_ERR(sb);
1611 cgroup_free_root(opts.new_root); 1581 cgroup_free_root(opts.new_root);
1612 goto drop_modules; 1582 goto out_err;
1613 } 1583 }
1614 1584
1615 root = sb->s_fs_info; 1585 root = sb->s_fs_info;
1616 BUG_ON(!root); 1586 BUG_ON(!root);
1617 if (root == opts.new_root) { 1587 if (root == opts.new_root) {
1618 /* We used the new root structure, so this is a new hierarchy */ 1588 /* We used the new root structure, so this is a new hierarchy */
1619 struct list_head tmp_links;
1620 struct cgroup *root_cgrp = &root->top_cgroup; 1589 struct cgroup *root_cgrp = &root->top_cgroup;
1621 struct cgroupfs_root *existing_root; 1590 struct cgroupfs_root *existing_root;
1622 const struct cred *cred;
1623 int i; 1591 int i;
1624 struct css_set *cset; 1592 struct css_set *cset;
1625 1593
@@ -1634,6 +1602,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1634 mutex_lock(&cgroup_mutex); 1602 mutex_lock(&cgroup_mutex);
1635 mutex_lock(&cgroup_root_mutex); 1603 mutex_lock(&cgroup_root_mutex);
1636 1604
1605 root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp,
1606 0, 1, GFP_KERNEL);
1607 if (root_cgrp->id < 0)
1608 goto unlock_drop;
1609
1637 /* Check for name clashes with existing mounts */ 1610 /* Check for name clashes with existing mounts */
1638 ret = -EBUSY; 1611 ret = -EBUSY;
1639 if (strlen(root->name)) 1612 if (strlen(root->name))
@@ -1657,26 +1630,37 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1657 if (ret) 1630 if (ret)
1658 goto unlock_drop; 1631 goto unlock_drop;
1659 1632
1633 sb->s_root->d_fsdata = root_cgrp;
1634 root_cgrp->dentry = sb->s_root;
1635
1636 /*
1637 * We're inside get_sb() and will call lookup_one_len() to
1638 * create the root files, which doesn't work if SELinux is
1639 * in use. The following cred dancing somehow works around
1640 * it. See 2ce9738ba ("cgroupfs: use init_cred when
1641 * populating new cgroupfs mount") for more details.
1642 */
1643 cred = override_creds(&init_cred);
1644
1645 ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
1646 if (ret)
1647 goto rm_base_files;
1648
1660 ret = rebind_subsystems(root, root->subsys_mask, 0); 1649 ret = rebind_subsystems(root, root->subsys_mask, 0);
1661 if (ret == -EBUSY) { 1650 if (ret)
1662 free_cgrp_cset_links(&tmp_links); 1651 goto rm_base_files;
1663 goto unlock_drop; 1652
1664 } 1653 revert_creds(cred);
1654
1665 /* 1655 /*
1666 * There must be no failure case after here, since rebinding 1656 * There must be no failure case after here, since rebinding
1667 * takes care of subsystems' refcounts, which are explicitly 1657 * takes care of subsystems' refcounts, which are explicitly
1668 * dropped in the failure exit path. 1658 * dropped in the failure exit path.
1669 */ 1659 */
1670 1660
1671 /* EBUSY should be the only error here */
1672 BUG_ON(ret);
1673
1674 list_add(&root->root_list, &cgroup_roots); 1661 list_add(&root->root_list, &cgroup_roots);
1675 cgroup_root_count++; 1662 cgroup_root_count++;
1676 1663
1677 sb->s_root->d_fsdata = root_cgrp;
1678 root->top_cgroup.dentry = sb->s_root;
1679
1680 /* Link the top cgroup in this hierarchy into all 1664 /* Link the top cgroup in this hierarchy into all
1681 * the css_set objects */ 1665 * the css_set objects */
1682 write_lock(&css_set_lock); 1666 write_lock(&css_set_lock);
@@ -1689,9 +1673,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1689 BUG_ON(!list_empty(&root_cgrp->children)); 1673 BUG_ON(!list_empty(&root_cgrp->children));
1690 BUG_ON(root->number_of_cgroups != 1); 1674 BUG_ON(root->number_of_cgroups != 1);
1691 1675
1692 cred = override_creds(&init_cred);
1693 cgroup_populate_dir(root_cgrp, true, root->subsys_mask);
1694 revert_creds(cred);
1695 mutex_unlock(&cgroup_root_mutex); 1676 mutex_unlock(&cgroup_root_mutex);
1696 mutex_unlock(&cgroup_mutex); 1677 mutex_unlock(&cgroup_mutex);
1697 mutex_unlock(&inode->i_mutex); 1678 mutex_unlock(&inode->i_mutex);
@@ -1711,15 +1692,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1711 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); 1692 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
1712 } 1693 }
1713 } 1694 }
1714
1715 /* no subsys rebinding, so refcounts don't change */
1716 drop_parsed_module_refcounts(opts.subsys_mask);
1717 } 1695 }
1718 1696
1719 kfree(opts.release_agent); 1697 kfree(opts.release_agent);
1720 kfree(opts.name); 1698 kfree(opts.name);
1721 return dget(sb->s_root); 1699 return dget(sb->s_root);
1722 1700
1701 rm_base_files:
1702 free_cgrp_cset_links(&tmp_links);
1703 cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false);
1704 revert_creds(cred);
1723 unlock_drop: 1705 unlock_drop:
1724 cgroup_exit_root_id(root); 1706 cgroup_exit_root_id(root);
1725 mutex_unlock(&cgroup_root_mutex); 1707 mutex_unlock(&cgroup_root_mutex);
@@ -1727,8 +1709,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1727 mutex_unlock(&inode->i_mutex); 1709 mutex_unlock(&inode->i_mutex);
1728 drop_new_super: 1710 drop_new_super:
1729 deactivate_locked_super(sb); 1711 deactivate_locked_super(sb);
1730 drop_modules:
1731 drop_parsed_module_refcounts(opts.subsys_mask);
1732 out_err: 1712 out_err:
1733 kfree(opts.release_agent); 1713 kfree(opts.release_agent);
1734 kfree(opts.name); 1714 kfree(opts.name);
@@ -1746,6 +1726,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1746 BUG_ON(root->number_of_cgroups != 1); 1726 BUG_ON(root->number_of_cgroups != 1);
1747 BUG_ON(!list_empty(&cgrp->children)); 1727 BUG_ON(!list_empty(&cgrp->children));
1748 1728
1729 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1749 mutex_lock(&cgroup_mutex); 1730 mutex_lock(&cgroup_mutex);
1750 mutex_lock(&cgroup_root_mutex); 1731 mutex_lock(&cgroup_root_mutex);
1751 1732
@@ -1778,6 +1759,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1778 1759
1779 mutex_unlock(&cgroup_root_mutex); 1760 mutex_unlock(&cgroup_root_mutex);
1780 mutex_unlock(&cgroup_mutex); 1761 mutex_unlock(&cgroup_mutex);
1762 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1781 1763
1782 simple_xattrs_free(&cgrp->xattrs); 1764 simple_xattrs_free(&cgrp->xattrs);
1783 1765
@@ -1889,7 +1871,7 @@ EXPORT_SYMBOL_GPL(task_cgroup_path);
1889struct task_and_cgroup { 1871struct task_and_cgroup {
1890 struct task_struct *task; 1872 struct task_struct *task;
1891 struct cgroup *cgrp; 1873 struct cgroup *cgrp;
1892 struct css_set *cg; 1874 struct css_set *cset;
1893}; 1875};
1894 1876
1895struct cgroup_taskset { 1877struct cgroup_taskset {
@@ -1939,18 +1921,20 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1939EXPORT_SYMBOL_GPL(cgroup_taskset_next); 1921EXPORT_SYMBOL_GPL(cgroup_taskset_next);
1940 1922
1941/** 1923/**
1942 * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task 1924 * cgroup_taskset_cur_css - return the matching css for the current task
1943 * @tset: taskset of interest 1925 * @tset: taskset of interest
1926 * @subsys_id: the ID of the target subsystem
1944 * 1927 *
1945 * Return the cgroup for the current (last returned) task of @tset. This 1928 * Return the css for the current (last returned) task of @tset for
1946 * function must be preceded by either cgroup_taskset_first() or 1929 * subsystem specified by @subsys_id. This function must be preceded by
1947 * cgroup_taskset_next(). 1930 * either cgroup_taskset_first() or cgroup_taskset_next().
1948 */ 1931 */
1949struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset) 1932struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset,
1933 int subsys_id)
1950{ 1934{
1951 return tset->cur_cgrp; 1935 return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]);
1952} 1936}
1953EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup); 1937EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css);
1954 1938
1955/** 1939/**
1956 * cgroup_taskset_size - return the number of tasks in taskset 1940 * cgroup_taskset_size - return the number of tasks in taskset
@@ -2089,8 +2073,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2089 * step 1: check that we can legitimately attach to the cgroup. 2073 * step 1: check that we can legitimately attach to the cgroup.
2090 */ 2074 */
2091 for_each_root_subsys(root, ss) { 2075 for_each_root_subsys(root, ss) {
2076 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
2077
2092 if (ss->can_attach) { 2078 if (ss->can_attach) {
2093 retval = ss->can_attach(cgrp, &tset); 2079 retval = ss->can_attach(css, &tset);
2094 if (retval) { 2080 if (retval) {
2095 failed_ss = ss; 2081 failed_ss = ss;
2096 goto out_cancel_attach; 2082 goto out_cancel_attach;
@@ -2107,8 +2093,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2107 2093
2108 tc = flex_array_get(group, i); 2094 tc = flex_array_get(group, i);
2109 old_cset = task_css_set(tc->task); 2095 old_cset = task_css_set(tc->task);
2110 tc->cg = find_css_set(old_cset, cgrp); 2096 tc->cset = find_css_set(old_cset, cgrp);
2111 if (!tc->cg) { 2097 if (!tc->cset) {
2112 retval = -ENOMEM; 2098 retval = -ENOMEM;
2113 goto out_put_css_set_refs; 2099 goto out_put_css_set_refs;
2114 } 2100 }
@@ -2121,7 +2107,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2121 */ 2107 */
2122 for (i = 0; i < group_size; i++) { 2108 for (i = 0; i < group_size; i++) {
2123 tc = flex_array_get(group, i); 2109 tc = flex_array_get(group, i);
2124 cgroup_task_migrate(tc->cgrp, tc->task, tc->cg); 2110 cgroup_task_migrate(tc->cgrp, tc->task, tc->cset);
2125 } 2111 }
2126 /* nothing is sensitive to fork() after this point. */ 2112 /* nothing is sensitive to fork() after this point. */
2127 2113
@@ -2129,8 +2115,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2129 * step 4: do subsystem attach callbacks. 2115 * step 4: do subsystem attach callbacks.
2130 */ 2116 */
2131 for_each_root_subsys(root, ss) { 2117 for_each_root_subsys(root, ss) {
2118 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
2119
2132 if (ss->attach) 2120 if (ss->attach)
2133 ss->attach(cgrp, &tset); 2121 ss->attach(css, &tset);
2134 } 2122 }
2135 2123
2136 /* 2124 /*
@@ -2141,18 +2129,20 @@ out_put_css_set_refs:
2141 if (retval) { 2129 if (retval) {
2142 for (i = 0; i < group_size; i++) { 2130 for (i = 0; i < group_size; i++) {
2143 tc = flex_array_get(group, i); 2131 tc = flex_array_get(group, i);
2144 if (!tc->cg) 2132 if (!tc->cset)
2145 break; 2133 break;
2146 put_css_set(tc->cg); 2134 put_css_set(tc->cset);
2147 } 2135 }
2148 } 2136 }
2149out_cancel_attach: 2137out_cancel_attach:
2150 if (retval) { 2138 if (retval) {
2151 for_each_root_subsys(root, ss) { 2139 for_each_root_subsys(root, ss) {
2140 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
2141
2152 if (ss == failed_ss) 2142 if (ss == failed_ss)
2153 break; 2143 break;
2154 if (ss->cancel_attach) 2144 if (ss->cancel_attach)
2155 ss->cancel_attach(cgrp, &tset); 2145 ss->cancel_attach(css, &tset);
2156 } 2146 }
2157 } 2147 }
2158out_free_group_list: 2148out_free_group_list:
@@ -2253,9 +2243,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2253 2243
2254 mutex_lock(&cgroup_mutex); 2244 mutex_lock(&cgroup_mutex);
2255 for_each_active_root(root) { 2245 for_each_active_root(root) {
2256 struct cgroup *from_cg = task_cgroup_from_root(from, root); 2246 struct cgroup *from_cgrp = task_cgroup_from_root(from, root);
2257 2247
2258 retval = cgroup_attach_task(from_cg, tsk, false); 2248 retval = cgroup_attach_task(from_cgrp, tsk, false);
2259 if (retval) 2249 if (retval)
2260 break; 2250 break;
2261 } 2251 }
@@ -2265,34 +2255,38 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2265} 2255}
2266EXPORT_SYMBOL_GPL(cgroup_attach_task_all); 2256EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
2267 2257
2268static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) 2258static int cgroup_tasks_write(struct cgroup_subsys_state *css,
2259 struct cftype *cft, u64 pid)
2269{ 2260{
2270 return attach_task_by_pid(cgrp, pid, false); 2261 return attach_task_by_pid(css->cgroup, pid, false);
2271} 2262}
2272 2263
2273static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) 2264static int cgroup_procs_write(struct cgroup_subsys_state *css,
2265 struct cftype *cft, u64 tgid)
2274{ 2266{
2275 return attach_task_by_pid(cgrp, tgid, true); 2267 return attach_task_by_pid(css->cgroup, tgid, true);
2276} 2268}
2277 2269
2278static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, 2270static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
2279 const char *buffer) 2271 struct cftype *cft, const char *buffer)
2280{ 2272{
2281 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); 2273 BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX);
2282 if (strlen(buffer) >= PATH_MAX) 2274 if (strlen(buffer) >= PATH_MAX)
2283 return -EINVAL; 2275 return -EINVAL;
2284 if (!cgroup_lock_live_group(cgrp)) 2276 if (!cgroup_lock_live_group(css->cgroup))
2285 return -ENODEV; 2277 return -ENODEV;
2286 mutex_lock(&cgroup_root_mutex); 2278 mutex_lock(&cgroup_root_mutex);
2287 strcpy(cgrp->root->release_agent_path, buffer); 2279 strcpy(css->cgroup->root->release_agent_path, buffer);
2288 mutex_unlock(&cgroup_root_mutex); 2280 mutex_unlock(&cgroup_root_mutex);
2289 mutex_unlock(&cgroup_mutex); 2281 mutex_unlock(&cgroup_mutex);
2290 return 0; 2282 return 0;
2291} 2283}
2292 2284
2293static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, 2285static int cgroup_release_agent_show(struct cgroup_subsys_state *css,
2294 struct seq_file *seq) 2286 struct cftype *cft, struct seq_file *seq)
2295{ 2287{
2288 struct cgroup *cgrp = css->cgroup;
2289
2296 if (!cgroup_lock_live_group(cgrp)) 2290 if (!cgroup_lock_live_group(cgrp))
2297 return -ENODEV; 2291 return -ENODEV;
2298 seq_puts(seq, cgrp->root->release_agent_path); 2292 seq_puts(seq, cgrp->root->release_agent_path);
@@ -2301,20 +2295,20 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
2301 return 0; 2295 return 0;
2302} 2296}
2303 2297
2304static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft, 2298static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css,
2305 struct seq_file *seq) 2299 struct cftype *cft, struct seq_file *seq)
2306{ 2300{
2307 seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); 2301 seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup));
2308 return 0; 2302 return 0;
2309} 2303}
2310 2304
2311/* A buffer size big enough for numbers or short strings */ 2305/* A buffer size big enough for numbers or short strings */
2312#define CGROUP_LOCAL_BUFFER_SIZE 64 2306#define CGROUP_LOCAL_BUFFER_SIZE 64
2313 2307
2314static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, 2308static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css,
2315 struct file *file, 2309 struct cftype *cft, struct file *file,
2316 const char __user *userbuf, 2310 const char __user *userbuf, size_t nbytes,
2317 size_t nbytes, loff_t *unused_ppos) 2311 loff_t *unused_ppos)
2318{ 2312{
2319 char buffer[CGROUP_LOCAL_BUFFER_SIZE]; 2313 char buffer[CGROUP_LOCAL_BUFFER_SIZE];
2320 int retval = 0; 2314 int retval = 0;
@@ -2332,22 +2326,22 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
2332 u64 val = simple_strtoull(strstrip(buffer), &end, 0); 2326 u64 val = simple_strtoull(strstrip(buffer), &end, 0);
2333 if (*end) 2327 if (*end)
2334 return -EINVAL; 2328 return -EINVAL;
2335 retval = cft->write_u64(cgrp, cft, val); 2329 retval = cft->write_u64(css, cft, val);
2336 } else { 2330 } else {
2337 s64 val = simple_strtoll(strstrip(buffer), &end, 0); 2331 s64 val = simple_strtoll(strstrip(buffer), &end, 0);
2338 if (*end) 2332 if (*end)
2339 return -EINVAL; 2333 return -EINVAL;
2340 retval = cft->write_s64(cgrp, cft, val); 2334 retval = cft->write_s64(css, cft, val);
2341 } 2335 }
2342 if (!retval) 2336 if (!retval)
2343 retval = nbytes; 2337 retval = nbytes;
2344 return retval; 2338 return retval;
2345} 2339}
2346 2340
2347static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, 2341static ssize_t cgroup_write_string(struct cgroup_subsys_state *css,
2348 struct file *file, 2342 struct cftype *cft, struct file *file,
2349 const char __user *userbuf, 2343 const char __user *userbuf, size_t nbytes,
2350 size_t nbytes, loff_t *unused_ppos) 2344 loff_t *unused_ppos)
2351{ 2345{
2352 char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; 2346 char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
2353 int retval = 0; 2347 int retval = 0;
@@ -2370,7 +2364,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
2370 } 2364 }
2371 2365
2372 buffer[nbytes] = 0; /* nul-terminate */ 2366 buffer[nbytes] = 0; /* nul-terminate */
2373 retval = cft->write_string(cgrp, cft, strstrip(buffer)); 2367 retval = cft->write_string(css, cft, strstrip(buffer));
2374 if (!retval) 2368 if (!retval)
2375 retval = nbytes; 2369 retval = nbytes;
2376out: 2370out:
@@ -2380,65 +2374,60 @@ out:
2380} 2374}
2381 2375
2382static ssize_t cgroup_file_write(struct file *file, const char __user *buf, 2376static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
2383 size_t nbytes, loff_t *ppos) 2377 size_t nbytes, loff_t *ppos)
2384{ 2378{
2379 struct cfent *cfe = __d_cfe(file->f_dentry);
2385 struct cftype *cft = __d_cft(file->f_dentry); 2380 struct cftype *cft = __d_cft(file->f_dentry);
2386 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2381 struct cgroup_subsys_state *css = cfe->css;
2387 2382
2388 if (cgroup_is_dead(cgrp))
2389 return -ENODEV;
2390 if (cft->write) 2383 if (cft->write)
2391 return cft->write(cgrp, cft, file, buf, nbytes, ppos); 2384 return cft->write(css, cft, file, buf, nbytes, ppos);
2392 if (cft->write_u64 || cft->write_s64) 2385 if (cft->write_u64 || cft->write_s64)
2393 return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos); 2386 return cgroup_write_X64(css, cft, file, buf, nbytes, ppos);
2394 if (cft->write_string) 2387 if (cft->write_string)
2395 return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos); 2388 return cgroup_write_string(css, cft, file, buf, nbytes, ppos);
2396 if (cft->trigger) { 2389 if (cft->trigger) {
2397 int ret = cft->trigger(cgrp, (unsigned int)cft->private); 2390 int ret = cft->trigger(css, (unsigned int)cft->private);
2398 return ret ? ret : nbytes; 2391 return ret ? ret : nbytes;
2399 } 2392 }
2400 return -EINVAL; 2393 return -EINVAL;
2401} 2394}
2402 2395
2403static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft, 2396static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css,
2404 struct file *file, 2397 struct cftype *cft, struct file *file,
2405 char __user *buf, size_t nbytes, 2398 char __user *buf, size_t nbytes, loff_t *ppos)
2406 loff_t *ppos)
2407{ 2399{
2408 char tmp[CGROUP_LOCAL_BUFFER_SIZE]; 2400 char tmp[CGROUP_LOCAL_BUFFER_SIZE];
2409 u64 val = cft->read_u64(cgrp, cft); 2401 u64 val = cft->read_u64(css, cft);
2410 int len = sprintf(tmp, "%llu\n", (unsigned long long) val); 2402 int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
2411 2403
2412 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 2404 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
2413} 2405}
2414 2406
2415static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft, 2407static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css,
2416 struct file *file, 2408 struct cftype *cft, struct file *file,
2417 char __user *buf, size_t nbytes, 2409 char __user *buf, size_t nbytes, loff_t *ppos)
2418 loff_t *ppos)
2419{ 2410{
2420 char tmp[CGROUP_LOCAL_BUFFER_SIZE]; 2411 char tmp[CGROUP_LOCAL_BUFFER_SIZE];
2421 s64 val = cft->read_s64(cgrp, cft); 2412 s64 val = cft->read_s64(css, cft);
2422 int len = sprintf(tmp, "%lld\n", (long long) val); 2413 int len = sprintf(tmp, "%lld\n", (long long) val);
2423 2414
2424 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 2415 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
2425} 2416}
2426 2417
2427static ssize_t cgroup_file_read(struct file *file, char __user *buf, 2418static ssize_t cgroup_file_read(struct file *file, char __user *buf,
2428 size_t nbytes, loff_t *ppos) 2419 size_t nbytes, loff_t *ppos)
2429{ 2420{
2421 struct cfent *cfe = __d_cfe(file->f_dentry);
2430 struct cftype *cft = __d_cft(file->f_dentry); 2422 struct cftype *cft = __d_cft(file->f_dentry);
2431 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2423 struct cgroup_subsys_state *css = cfe->css;
2432
2433 if (cgroup_is_dead(cgrp))
2434 return -ENODEV;
2435 2424
2436 if (cft->read) 2425 if (cft->read)
2437 return cft->read(cgrp, cft, file, buf, nbytes, ppos); 2426 return cft->read(css, cft, file, buf, nbytes, ppos);
2438 if (cft->read_u64) 2427 if (cft->read_u64)
2439 return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos); 2428 return cgroup_read_u64(css, cft, file, buf, nbytes, ppos);
2440 if (cft->read_s64) 2429 if (cft->read_s64)
2441 return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos); 2430 return cgroup_read_s64(css, cft, file, buf, nbytes, ppos);
2442 return -EINVAL; 2431 return -EINVAL;
2443} 2432}
2444 2433
@@ -2447,11 +2436,6 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
2447 * supports string->u64 maps, but can be extended in future. 2436 * supports string->u64 maps, but can be extended in future.
2448 */ 2437 */
2449 2438
2450struct cgroup_seqfile_state {
2451 struct cftype *cft;
2452 struct cgroup *cgroup;
2453};
2454
2455static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) 2439static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
2456{ 2440{
2457 struct seq_file *sf = cb->state; 2441 struct seq_file *sf = cb->state;
@@ -2460,69 +2444,86 @@ static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
2460 2444
2461static int cgroup_seqfile_show(struct seq_file *m, void *arg) 2445static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2462{ 2446{
2463 struct cgroup_seqfile_state *state = m->private; 2447 struct cfent *cfe = m->private;
2464 struct cftype *cft = state->cft; 2448 struct cftype *cft = cfe->type;
2449 struct cgroup_subsys_state *css = cfe->css;
2450
2465 if (cft->read_map) { 2451 if (cft->read_map) {
2466 struct cgroup_map_cb cb = { 2452 struct cgroup_map_cb cb = {
2467 .fill = cgroup_map_add, 2453 .fill = cgroup_map_add,
2468 .state = m, 2454 .state = m,
2469 }; 2455 };
2470 return cft->read_map(state->cgroup, cft, &cb); 2456 return cft->read_map(css, cft, &cb);
2471 } 2457 }
2472 return cft->read_seq_string(state->cgroup, cft, m); 2458 return cft->read_seq_string(css, cft, m);
2473}
2474
2475static int cgroup_seqfile_release(struct inode *inode, struct file *file)
2476{
2477 struct seq_file *seq = file->private_data;
2478 kfree(seq->private);
2479 return single_release(inode, file);
2480} 2459}
2481 2460
2482static const struct file_operations cgroup_seqfile_operations = { 2461static const struct file_operations cgroup_seqfile_operations = {
2483 .read = seq_read, 2462 .read = seq_read,
2484 .write = cgroup_file_write, 2463 .write = cgroup_file_write,
2485 .llseek = seq_lseek, 2464 .llseek = seq_lseek,
2486 .release = cgroup_seqfile_release, 2465 .release = single_release,
2487}; 2466};
2488 2467
2489static int cgroup_file_open(struct inode *inode, struct file *file) 2468static int cgroup_file_open(struct inode *inode, struct file *file)
2490{ 2469{
2470 struct cfent *cfe = __d_cfe(file->f_dentry);
2471 struct cftype *cft = __d_cft(file->f_dentry);
2472 struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent);
2473 struct cgroup_subsys_state *css;
2491 int err; 2474 int err;
2492 struct cftype *cft;
2493 2475
2494 err = generic_file_open(inode, file); 2476 err = generic_file_open(inode, file);
2495 if (err) 2477 if (err)
2496 return err; 2478 return err;
2497 cft = __d_cft(file->f_dentry);
2498 2479
2499 if (cft->read_map || cft->read_seq_string) { 2480 /*
2500 struct cgroup_seqfile_state *state; 2481 * If the file belongs to a subsystem, pin the css. Will be
2482 * unpinned either on open failure or release. This ensures that
2483 * @css stays alive for all file operations.
2484 */
2485 rcu_read_lock();
2486 css = cgroup_css(cgrp, cft->ss);
2487 if (cft->ss && !css_tryget(css))
2488 css = NULL;
2489 rcu_read_unlock();
2501 2490
2502 state = kzalloc(sizeof(*state), GFP_USER); 2491 if (!css)
2503 if (!state) 2492 return -ENODEV;
2504 return -ENOMEM; 2493
2494 /*
2495 * @cfe->css is used by read/write/close to determine the
2496 * associated css. @file->private_data would be a better place but
2497 * that's already used by seqfile. Multiple accessors may use it
2498 * simultaneously which is okay as the association never changes.
2499 */
2500 WARN_ON_ONCE(cfe->css && cfe->css != css);
2501 cfe->css = css;
2505 2502
2506 state->cft = cft; 2503 if (cft->read_map || cft->read_seq_string) {
2507 state->cgroup = __d_cgrp(file->f_dentry->d_parent);
2508 file->f_op = &cgroup_seqfile_operations; 2504 file->f_op = &cgroup_seqfile_operations;
2509 err = single_open(file, cgroup_seqfile_show, state); 2505 err = single_open(file, cgroup_seqfile_show, cfe);
2510 if (err < 0) 2506 } else if (cft->open) {
2511 kfree(state);
2512 } else if (cft->open)
2513 err = cft->open(inode, file); 2507 err = cft->open(inode, file);
2514 else 2508 }
2515 err = 0;
2516 2509
2510 if (css->ss && err)
2511 css_put(css);
2517 return err; 2512 return err;
2518} 2513}
2519 2514
2520static int cgroup_file_release(struct inode *inode, struct file *file) 2515static int cgroup_file_release(struct inode *inode, struct file *file)
2521{ 2516{
2517 struct cfent *cfe = __d_cfe(file->f_dentry);
2522 struct cftype *cft = __d_cft(file->f_dentry); 2518 struct cftype *cft = __d_cft(file->f_dentry);
2519 struct cgroup_subsys_state *css = cfe->css;
2520 int ret = 0;
2521
2523 if (cft->release) 2522 if (cft->release)
2524 return cft->release(inode, file); 2523 ret = cft->release(inode, file);
2525 return 0; 2524 if (css->ss)
2525 css_put(css);
2526 return ret;
2526} 2527}
2527 2528
2528/* 2529/*
@@ -2736,8 +2737,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
2736 return mode; 2737 return mode;
2737} 2738}
2738 2739
2739static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, 2740static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
2740 struct cftype *cft)
2741{ 2741{
2742 struct dentry *dir = cgrp->dentry; 2742 struct dentry *dir = cgrp->dentry;
2743 struct cgroup *parent = __d_cgrp(dir); 2743 struct cgroup *parent = __d_cgrp(dir);
@@ -2747,8 +2747,9 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2747 umode_t mode; 2747 umode_t mode;
2748 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; 2748 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2749 2749
2750 if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { 2750 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
2751 strcpy(name, subsys->name); 2751 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
2752 strcpy(name, cft->ss->name);
2752 strcat(name, "."); 2753 strcat(name, ".");
2753 } 2754 }
2754 strcat(name, cft->name); 2755 strcat(name, cft->name);
@@ -2782,11 +2783,25 @@ out:
2782 return error; 2783 return error;
2783} 2784}
2784 2785
2785static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, 2786/**
2786 struct cftype cfts[], bool is_add) 2787 * cgroup_addrm_files - add or remove files to a cgroup directory
2788 * @cgrp: the target cgroup
2789 * @cfts: array of cftypes to be added
2790 * @is_add: whether to add or remove
2791 *
2792 * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
2793 * For removals, this function never fails. If addition fails, this
2794 * function doesn't remove files already added. The caller is responsible
2795 * for cleaning up.
2796 */
2797static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2798 bool is_add)
2787{ 2799{
2788 struct cftype *cft; 2800 struct cftype *cft;
2789 int err, ret = 0; 2801 int ret;
2802
2803 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
2804 lockdep_assert_held(&cgroup_mutex);
2790 2805
2791 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2806 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2792 /* does cft->flags tell us to skip this file on @cgrp? */ 2807 /* does cft->flags tell us to skip this file on @cgrp? */
@@ -2798,16 +2813,17 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2798 continue; 2813 continue;
2799 2814
2800 if (is_add) { 2815 if (is_add) {
2801 err = cgroup_add_file(cgrp, subsys, cft); 2816 ret = cgroup_add_file(cgrp, cft);
2802 if (err) 2817 if (ret) {
2803 pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", 2818 pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
2804 cft->name, err); 2819 cft->name, ret);
2805 ret = err; 2820 return ret;
2821 }
2806 } else { 2822 } else {
2807 cgroup_rm_file(cgrp, cft); 2823 cgroup_rm_file(cgrp, cft);
2808 } 2824 }
2809 } 2825 }
2810 return ret; 2826 return 0;
2811} 2827}
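
Note the termination convention above: the loop walks @cfts until it reaches an entry whose name is empty, so every cftype table must end with a blank sentinel. A minimal sketch of such a table against the css-based handler signatures this series introduces (the demo_* names are illustrative, not from the patch):

        static u64 demo_read_u64(struct cgroup_subsys_state *css,
                                 struct cftype *cft)
        {
                /* derive a value from @css; constant here for brevity */
                return 0;
        }

        static struct cftype demo_files[] = {
                {
                        .name = "demo.value",
                        .read_u64 = demo_read_u64,
                },
                { }     /* sentinel: name[0] == '\0' ends the walk */
        };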
2812 2828
2813static void cgroup_cfts_prepare(void) 2829static void cgroup_cfts_prepare(void)
@@ -2816,28 +2832,30 @@ static void cgroup_cfts_prepare(void)
2816 /* 2832 /*
2817 * Thanks to the entanglement with vfs inode locking, we can't walk 2833 * Thanks to the entanglement with vfs inode locking, we can't walk
2818 * the existing cgroups under cgroup_mutex and create files. 2834 * the existing cgroups under cgroup_mutex and create files.
2819 * Instead, we use cgroup_for_each_descendant_pre() and drop RCU 2835 * Instead, we use css_for_each_descendant_pre() and drop RCU read
2820 * read lock before calling cgroup_addrm_files(). 2836 * lock before calling cgroup_addrm_files().
2821 */ 2837 */
2822 mutex_lock(&cgroup_mutex); 2838 mutex_lock(&cgroup_mutex);
2823} 2839}
2824 2840
2825static void cgroup_cfts_commit(struct cgroup_subsys *ss, 2841static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
2826 struct cftype *cfts, bool is_add)
2827 __releases(&cgroup_mutex) 2842 __releases(&cgroup_mutex)
2828{ 2843{
2829 LIST_HEAD(pending); 2844 LIST_HEAD(pending);
2830 struct cgroup *cgrp, *root = &ss->root->top_cgroup; 2845 struct cgroup_subsys *ss = cfts[0].ss;
2846 struct cgroup *root = &ss->root->top_cgroup;
2831 struct super_block *sb = ss->root->sb; 2847 struct super_block *sb = ss->root->sb;
2832 struct dentry *prev = NULL; 2848 struct dentry *prev = NULL;
2833 struct inode *inode; 2849 struct inode *inode;
2850 struct cgroup_subsys_state *css;
2834 u64 update_before; 2851 u64 update_before;
2852 int ret = 0;
2835 2853
2836 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ 2854 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
2837 if (!cfts || ss->root == &cgroup_dummy_root || 2855 if (!cfts || ss->root == &cgroup_dummy_root ||
2838 !atomic_inc_not_zero(&sb->s_active)) { 2856 !atomic_inc_not_zero(&sb->s_active)) {
2839 mutex_unlock(&cgroup_mutex); 2857 mutex_unlock(&cgroup_mutex);
2840 return; 2858 return 0;
2841 } 2859 }
2842 2860
2843 /* 2861 /*
@@ -2849,17 +2867,11 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2849 2867
2850 mutex_unlock(&cgroup_mutex); 2868 mutex_unlock(&cgroup_mutex);
2851 2869
2852 /* @root always needs to be updated */
2853 inode = root->dentry->d_inode;
2854 mutex_lock(&inode->i_mutex);
2855 mutex_lock(&cgroup_mutex);
2856 cgroup_addrm_files(root, ss, cfts, is_add);
2857 mutex_unlock(&cgroup_mutex);
2858 mutex_unlock(&inode->i_mutex);
2859
2860 /* add/rm files for all cgroups created before */ 2870 /* add/rm files for all cgroups created before */
2861 rcu_read_lock(); 2871 rcu_read_lock();
2862 cgroup_for_each_descendant_pre(cgrp, root) { 2872 css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
2873 struct cgroup *cgrp = css->cgroup;
2874
2863 if (cgroup_is_dead(cgrp)) 2875 if (cgroup_is_dead(cgrp))
2864 continue; 2876 continue;
2865 2877
@@ -2873,15 +2885,18 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2873 mutex_lock(&inode->i_mutex); 2885 mutex_lock(&inode->i_mutex);
2874 mutex_lock(&cgroup_mutex); 2886 mutex_lock(&cgroup_mutex);
2875 if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) 2887 if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
2876 cgroup_addrm_files(cgrp, ss, cfts, is_add); 2888 ret = cgroup_addrm_files(cgrp, cfts, is_add);
2877 mutex_unlock(&cgroup_mutex); 2889 mutex_unlock(&cgroup_mutex);
2878 mutex_unlock(&inode->i_mutex); 2890 mutex_unlock(&inode->i_mutex);
2879 2891
2880 rcu_read_lock(); 2892 rcu_read_lock();
2893 if (ret)
2894 break;
2881 } 2895 }
2882 rcu_read_unlock(); 2896 rcu_read_unlock();
2883 dput(prev); 2897 dput(prev);
2884 deactivate_super(sb); 2898 deactivate_super(sb);
2899 return ret;
2885} 2900}
2886 2901
2887/** 2902/**
@@ -2901,49 +2916,56 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2901int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 2916int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2902{ 2917{
2903 struct cftype_set *set; 2918 struct cftype_set *set;
2919 struct cftype *cft;
2920 int ret;
2904 2921
2905 set = kzalloc(sizeof(*set), GFP_KERNEL); 2922 set = kzalloc(sizeof(*set), GFP_KERNEL);
2906 if (!set) 2923 if (!set)
2907 return -ENOMEM; 2924 return -ENOMEM;
2908 2925
2926 for (cft = cfts; cft->name[0] != '\0'; cft++)
2927 cft->ss = ss;
2928
2909 cgroup_cfts_prepare(); 2929 cgroup_cfts_prepare();
2910 set->cfts = cfts; 2930 set->cfts = cfts;
2911 list_add_tail(&set->node, &ss->cftsets); 2931 list_add_tail(&set->node, &ss->cftsets);
2912 cgroup_cfts_commit(ss, cfts, true); 2932 ret = cgroup_cfts_commit(cfts, true);
2913 2933 if (ret)
2914 return 0; 2934 cgroup_rm_cftypes(cfts);
2935 return ret;
2915} 2936}
2916EXPORT_SYMBOL_GPL(cgroup_add_cftypes); 2937EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
2917 2938
2918/** 2939/**
2919 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem 2940 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
2920 * @ss: target cgroup subsystem
2921 * @cfts: zero-length name terminated array of cftypes 2941 * @cfts: zero-length name terminated array of cftypes
2922 * 2942 *
2923 * Unregister @cfts from @ss. Files described by @cfts are removed from 2943 * Unregister @cfts. Files described by @cfts are removed from all
2924 * all existing cgroups to which @ss is attached and all future cgroups 2944 * existing cgroups and all future cgroups won't have them either. This
2925 * won't have them either. This function can be called anytime whether @ss 2945 * function can be called anytime whether @cfts' subsys is attached or not.
2926 * is attached or not.
2927 * 2946 *
2928 * Returns 0 on successful unregistration, -ENOENT if @cfts is not 2947 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
2929 * registered with @ss. 2948 * registered.
2930 */ 2949 */
2931int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 2950int cgroup_rm_cftypes(struct cftype *cfts)
2932{ 2951{
2933 struct cftype_set *set; 2952 struct cftype_set *set;
2934 2953
2954 if (!cfts || !cfts[0].ss)
2955 return -ENOENT;
2956
2935 cgroup_cfts_prepare(); 2957 cgroup_cfts_prepare();
2936 2958
2937 list_for_each_entry(set, &ss->cftsets, node) { 2959 list_for_each_entry(set, &cfts[0].ss->cftsets, node) {
2938 if (set->cfts == cfts) { 2960 if (set->cfts == cfts) {
2939 list_del(&set->node); 2961 list_del(&set->node);
2940 kfree(set); 2962 kfree(set);
2941 cgroup_cfts_commit(ss, cfts, false); 2963 cgroup_cfts_commit(cfts, false);
2942 return 0; 2964 return 0;
2943 } 2965 }
2944 } 2966 }
2945 2967
2946 cgroup_cfts_commit(ss, NULL, false); 2968 cgroup_cfts_commit(NULL, false);
2947 return -ENOENT; 2969 return -ENOENT;
2948} 2970}
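
Together, cgroup_add_cftypes() and cgroup_rm_cftypes() now form a symmetric pair keyed only on the cftype array itself: the add path stamps each entry with cft->ss, and the remove path reads the subsystem back from cfts[0].ss. A hedged sketch of the calling pattern (demo_subsys and demo_files as in the earlier sketch, illustrative):

        /* subsystem init */
        ret = cgroup_add_cftypes(&demo_subsys, demo_files);
        if (ret)
                return ret;

        /* teardown; returns -ENOENT if @cfts was never registered */
        WARN_ON(cgroup_rm_cftypes(demo_files));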
2949 2971
@@ -2966,34 +2988,10 @@ int cgroup_task_count(const struct cgroup *cgrp)
2966} 2988}
2967 2989
2968/* 2990/*
2969 * Advance a list_head iterator. The iterator should be positioned at 2991 * To reduce the fork() overhead for systems that are not actually using
2970 * the start of a css_set 2992 * their cgroups capability, we don't maintain the lists running through
2971 */ 2993 * each css_set to its tasks until we see the list actually used - in other
2972static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it) 2994 * words after the first call to css_task_iter_start().
2973{
2974 struct list_head *l = it->cset_link;
2975 struct cgrp_cset_link *link;
2976 struct css_set *cset;
2977
2978 /* Advance to the next non-empty css_set */
2979 do {
2980 l = l->next;
2981 if (l == &cgrp->cset_links) {
2982 it->cset_link = NULL;
2983 return;
2984 }
2985 link = list_entry(l, struct cgrp_cset_link, cset_link);
2986 cset = link->cset;
2987 } while (list_empty(&cset->tasks));
2988 it->cset_link = l;
2989 it->task = cset->tasks.next;
2990}
2991
2992/*
2993 * To reduce the fork() overhead for systems that are not actually
2994 * using their cgroups capability, we don't maintain the lists running
2995 * through each css_set to its tasks until we see the list actually
2996 * used - in other words after the first call to cgroup_iter_start().
2997 */ 2995 */
2998static void cgroup_enable_task_cg_lists(void) 2996static void cgroup_enable_task_cg_lists(void)
2999{ 2997{
@@ -3024,16 +3022,21 @@ static void cgroup_enable_task_cg_lists(void)
3024} 3022}
3025 3023
3026/** 3024/**
3027 * cgroup_next_sibling - find the next sibling of a given cgroup 3025 * css_next_child - find the next child of a given css
3028 * @pos: the current cgroup 3026 * @pos_css: the current position (%NULL to initiate traversal)
3027 * @parent_css: css whose children to walk
3029 * 3028 *
3030 * This function returns the next sibling of @pos and should be called 3029 * This function returns the next child of @parent_css and should be called
3031 * under RCU read lock. The only requirement is that @pos is accessible. 3030 * under RCU read lock. The only requirement is that @parent_css and
3032 * The next sibling is guaranteed to be returned regardless of @pos's 3031 * @pos_css are accessible. The next sibling is guaranteed to be returned
3033 * state. 3032 * regardless of their states.
3034 */ 3033 */
3035struct cgroup *cgroup_next_sibling(struct cgroup *pos) 3034struct cgroup_subsys_state *
3035css_next_child(struct cgroup_subsys_state *pos_css,
3036 struct cgroup_subsys_state *parent_css)
3036{ 3037{
3038 struct cgroup *pos = pos_css ? pos_css->cgroup : NULL;
3039 struct cgroup *cgrp = parent_css->cgroup;
3037 struct cgroup *next; 3040 struct cgroup *next;
3038 3041
3039 WARN_ON_ONCE(!rcu_read_lock_held()); 3042 WARN_ON_ONCE(!rcu_read_lock_held());
@@ -3048,78 +3051,81 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos)
3048 * safe to dereference from this RCU critical section. If 3051 * safe to dereference from this RCU critical section. If
3049 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed 3052 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
3050 * to be visible as %true here. 3053 * to be visible as %true here.
3054 *
3055 * If @pos is dead, its next pointer can't be dereferenced;
3056 * however, as each cgroup is given a monotonically increasing
3057 * unique serial number and always appended to the sibling list,
3058 * the next one can be found by walking the parent's children until
3059 * we see a cgroup with higher serial number than @pos's. While
3060 * this path can be slower, it's taken only when either the current
3061 * cgroup is removed or iteration and removal race.
3051 */ 3062 */
3052 if (likely(!cgroup_is_dead(pos))) { 3063 if (!pos) {
3064 next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling);
3065 } else if (likely(!cgroup_is_dead(pos))) {
3053 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); 3066 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
3054 if (&next->sibling != &pos->parent->children) 3067 } else {
3055 return next; 3068 list_for_each_entry_rcu(next, &cgrp->children, sibling)
3056 return NULL; 3069 if (next->serial_nr > pos->serial_nr)
3070 break;
3057 } 3071 }
3058 3072
3059 /* 3073 if (&next->sibling == &cgrp->children)
3060 * Can't dereference the next pointer. Each cgroup is given a 3074 return NULL;
3061 * monotonically increasing unique serial number and always 3075
3062 * appended to the sibling list, so the next one can be found by 3076 return cgroup_css(next, parent_css->ss);
3063 * walking the parent's children until we see a cgroup with higher
3064 * serial number than @pos's.
3065 *
3066 * While this path can be slow, it's taken only when either the
3067 * current cgroup is removed or iteration and removal race.
3068 */
3069 list_for_each_entry_rcu(next, &pos->parent->children, sibling)
3070 if (next->serial_nr > pos->serial_nr)
3071 return next;
3072 return NULL;
3073} 3077}
3074EXPORT_SYMBOL_GPL(cgroup_next_sibling); 3078EXPORT_SYMBOL_GPL(css_next_child);
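
Passing %NULL @pos_css starts the walk at the first child, so all live children can be visited with one loop; the css_for_each_child() macro used further down wraps exactly this pattern. A sketch, assuming parent_css is a css the caller already holds:

        struct cgroup_subsys_state *child;

        rcu_read_lock();
        for (child = css_next_child(NULL, parent_css); child;
             child = css_next_child(child, parent_css)) {
                /* @child may already be dying; css_tryget() before real use */
        }
        rcu_read_unlock();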
3075 3079
3076/** 3080/**
3077 * cgroup_next_descendant_pre - find the next descendant for pre-order walk 3081 * css_next_descendant_pre - find the next descendant for pre-order walk
3078 * @pos: the current position (%NULL to initiate traversal) 3082 * @pos: the current position (%NULL to initiate traversal)
3079 * @cgroup: cgroup whose descendants to walk 3083 * @root: css whose descendants to walk
3080 * 3084 *
3081 * To be used by cgroup_for_each_descendant_pre(). Find the next 3085 * To be used by css_for_each_descendant_pre(). Find the next descendant
3082 * descendant to visit for pre-order traversal of @cgroup's descendants. 3086 * to visit for pre-order traversal of @root's descendants. @root is
3087 * included in the iteration and the first node to be visited.
3083 * 3088 *
3084 * While this function requires RCU read locking, it doesn't require the 3089 * While this function requires RCU read locking, it doesn't require the
3085 * whole traversal to be contained in a single RCU critical section. This 3090 * whole traversal to be contained in a single RCU critical section. This
3086 * function will return the correct next descendant as long as both @pos 3091 * function will return the correct next descendant as long as both @pos
3087 * and @cgroup are accessible and @pos is a descendant of @cgroup. 3092 * and @root are accessible and @pos is a descendant of @root.
3088 */ 3093 */
3089struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, 3094struct cgroup_subsys_state *
3090 struct cgroup *cgroup) 3095css_next_descendant_pre(struct cgroup_subsys_state *pos,
3096 struct cgroup_subsys_state *root)
3091{ 3097{
3092 struct cgroup *next; 3098 struct cgroup_subsys_state *next;
3093 3099
3094 WARN_ON_ONCE(!rcu_read_lock_held()); 3100 WARN_ON_ONCE(!rcu_read_lock_held());
3095 3101
3096 /* if first iteration, pretend we just visited @cgroup */ 3102 /* if first iteration, visit @root */
3097 if (!pos) 3103 if (!pos)
3098 pos = cgroup; 3104 return root;
3099 3105
3100 /* visit the first child if exists */ 3106 /* visit the first child if exists */
3101 next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); 3107 next = css_next_child(NULL, pos);
3102 if (next) 3108 if (next)
3103 return next; 3109 return next;
3104 3110
3105 /* no child, visit my or the closest ancestor's next sibling */ 3111 /* no child, visit my or the closest ancestor's next sibling */
3106 while (pos != cgroup) { 3112 while (pos != root) {
3107 next = cgroup_next_sibling(pos); 3113 next = css_next_child(pos, css_parent(pos));
3108 if (next) 3114 if (next)
3109 return next; 3115 return next;
3110 pos = pos->parent; 3116 pos = css_parent(pos);
3111 } 3117 }
3112 3118
3113 return NULL; 3119 return NULL;
3114} 3120}
3115EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); 3121EXPORT_SYMBOL_GPL(css_next_descendant_pre);
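
Callers consume this through css_for_each_descendant_pre(), as the converted blkcg and cfts-commit paths above show. One behavioral change is worth a sketch: @root itself is now the first position returned, where the old cgroup iterator started below it.

        struct cgroup_subsys_state *pos;

        rcu_read_lock();
        css_for_each_descendant_pre(pos, root) {
                /* first @pos is @root; descendants follow in pre-order */
        }
        rcu_read_unlock();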
3116 3122
3117/** 3123/**
3118 * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup 3124 * css_rightmost_descendant - return the rightmost descendant of a css
3119 * @pos: cgroup of interest 3125 * @pos: css of interest
3120 * 3126 *
3121 * Return the rightmost descendant of @pos. If there's no descendant, 3127 * Return the rightmost descendant of @pos. If there's no descendant, @pos
3122 * @pos is returned. This can be used during pre-order traversal to skip 3128 * is returned. This can be used during pre-order traversal to skip
3123 * subtree of @pos. 3129 * subtree of @pos.
3124 * 3130 *
3125 * While this function requires RCU read locking, it doesn't require the 3131 * While this function requires RCU read locking, it doesn't require the
@@ -3127,9 +3133,10 @@ EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
3127 * function will return the correct rightmost descendant as long as @pos is 3133 * function will return the correct rightmost descendant as long as @pos is
3128 * accessible. 3134 * accessible.
3129 */ 3135 */
3130struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) 3136struct cgroup_subsys_state *
3137css_rightmost_descendant(struct cgroup_subsys_state *pos)
3131{ 3138{
3132 struct cgroup *last, *tmp; 3139 struct cgroup_subsys_state *last, *tmp;
3133 3140
3134 WARN_ON_ONCE(!rcu_read_lock_held()); 3141 WARN_ON_ONCE(!rcu_read_lock_held());
3135 3142
@@ -3137,82 +3144,138 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
3137 last = pos; 3144 last = pos;
3138 /* ->prev isn't RCU safe, walk ->next till the end */ 3145 /* ->prev isn't RCU safe, walk ->next till the end */
3139 pos = NULL; 3146 pos = NULL;
3140 list_for_each_entry_rcu(tmp, &last->children, sibling) 3147 css_for_each_child(tmp, last)
3141 pos = tmp; 3148 pos = tmp;
3142 } while (pos); 3149 } while (pos);
3143 3150
3144 return last; 3151 return last;
3145} 3152}
3146EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant); 3153EXPORT_SYMBOL_GPL(css_rightmost_descendant);
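
The rightmost descendant is the hook for pruning: assigning it to the cursor of a pre-order walk makes the iterator step over everything below @pos. A sketch, where should_skip() is an illustrative predicate rather than anything in the patch:

        rcu_read_lock();
        css_for_each_descendant_pre(pos, root) {
                if (should_skip(pos)) {
                        /* jump past the whole subtree under @pos */
                        pos = css_rightmost_descendant(pos);
                        continue;
                }
                /* process @pos normally */
        }
        rcu_read_unlock();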
3147 3154
3148static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) 3155static struct cgroup_subsys_state *
3156css_leftmost_descendant(struct cgroup_subsys_state *pos)
3149{ 3157{
3150 struct cgroup *last; 3158 struct cgroup_subsys_state *last;
3151 3159
3152 do { 3160 do {
3153 last = pos; 3161 last = pos;
3154 pos = list_first_or_null_rcu(&pos->children, struct cgroup, 3162 pos = css_next_child(NULL, pos);
3155 sibling);
3156 } while (pos); 3163 } while (pos);
3157 3164
3158 return last; 3165 return last;
3159} 3166}
3160 3167
3161/** 3168/**
3162 * cgroup_next_descendant_post - find the next descendant for post-order walk 3169 * css_next_descendant_post - find the next descendant for post-order walk
3163 * @pos: the current position (%NULL to initiate traversal) 3170 * @pos: the current position (%NULL to initiate traversal)
3164 * @cgroup: cgroup whose descendants to walk 3171 * @root: css whose descendants to walk
3165 * 3172 *
3166 * To be used by cgroup_for_each_descendant_post(). Find the next 3173 * To be used by css_for_each_descendant_post(). Find the next descendant
3167 * descendant to visit for post-order traversal of @cgroup's descendants. 3174 * to visit for post-order traversal of @root's descendants. @root is
3175 * included in the iteration and the last node to be visited.
3168 * 3176 *
3169 * While this function requires RCU read locking, it doesn't require the 3177 * While this function requires RCU read locking, it doesn't require the
3170 * whole traversal to be contained in a single RCU critical section. This 3178 * whole traversal to be contained in a single RCU critical section. This
3171 * function will return the correct next descendant as long as both @pos 3179 * function will return the correct next descendant as long as both @pos
3172 * and @cgroup are accessible and @pos is a descendant of @cgroup. 3180 * and @root are accessible and @pos is a descendant of @root.
3173 */ 3181 */
3174struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, 3182struct cgroup_subsys_state *
3175 struct cgroup *cgroup) 3183css_next_descendant_post(struct cgroup_subsys_state *pos,
3184 struct cgroup_subsys_state *root)
3176{ 3185{
3177 struct cgroup *next; 3186 struct cgroup_subsys_state *next;
3178 3187
3179 WARN_ON_ONCE(!rcu_read_lock_held()); 3188 WARN_ON_ONCE(!rcu_read_lock_held());
3180 3189
3181 /* if first iteration, visit the leftmost descendant */ 3190 /* if first iteration, visit the leftmost descendant */
3182 if (!pos) { 3191 if (!pos) {
3183 next = cgroup_leftmost_descendant(cgroup); 3192 next = css_leftmost_descendant(root);
3184 return next != cgroup ? next : NULL; 3193 return next; /* may be @root itself, which is included */
3185 } 3194 }
3186 3195
3196 /* if we visited @root, we're done */
3197 if (pos == root)
3198 return NULL;
3199
3187 /* if there's an unvisited sibling, visit its leftmost descendant */ 3200 /* if there's an unvisited sibling, visit its leftmost descendant */
3188 next = cgroup_next_sibling(pos); 3201 next = css_next_child(pos, css_parent(pos));
3189 if (next) 3202 if (next)
3190 return cgroup_leftmost_descendant(next); 3203 return css_leftmost_descendant(next);
3191 3204
3192 /* no sibling left, visit parent */ 3205 /* no sibling left, visit parent */
3193 next = pos->parent; 3206 return css_parent(pos);
3194 return next != cgroup ? next : NULL;
3195} 3207}
3196EXPORT_SYMBOL_GPL(cgroup_next_descendant_post); 3208EXPORT_SYMBOL_GPL(css_next_descendant_post);
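
The post-order walk, reached via the css_for_each_descendant_post() named in the comment above, visits children before their parent and @root last, which is the natural shape for teardown work. A sketch:

        struct cgroup_subsys_state *pos;

        rcu_read_lock();
        css_for_each_descendant_post(pos, root) {
                /* every child of @pos was already visited; @root comes last */
        }
        rcu_read_unlock();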
3197 3209
3198void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) 3210/**
3211 * css_advance_task_iter - advance a task iterator to the next css_set
3212 * @it: the iterator to advance
3213 *
3214 * Advance @it to the next css_set to walk.
3215 */
3216static void css_advance_task_iter(struct css_task_iter *it)
3217{
3218 struct list_head *l = it->cset_link;
3219 struct cgrp_cset_link *link;
3220 struct css_set *cset;
3221
3222 /* Advance to the next non-empty css_set */
3223 do {
3224 l = l->next;
3225 if (l == &it->origin_css->cgroup->cset_links) {
3226 it->cset_link = NULL;
3227 return;
3228 }
3229 link = list_entry(l, struct cgrp_cset_link, cset_link);
3230 cset = link->cset;
3231 } while (list_empty(&cset->tasks));
3232 it->cset_link = l;
3233 it->task = cset->tasks.next;
3234}
3235
3236/**
3237 * css_task_iter_start - initiate task iteration
3238 * @css: the css to walk tasks of
3239 * @it: the task iterator to use
3240 *
3241 * Initiate iteration through the tasks of @css. The caller can call
3242 * css_task_iter_next() to walk through the tasks until the function
3243 * returns NULL. On completion of iteration, css_task_iter_end() must be
3244 * called.
3245 *
3246 * Note that this function acquires a lock which is released when the
3247 * iteration finishes. The caller can't sleep while iteration is in
3248 * progress.
3249 */
3250void css_task_iter_start(struct cgroup_subsys_state *css,
3251 struct css_task_iter *it)
3199 __acquires(css_set_lock) 3252 __acquires(css_set_lock)
3200{ 3253{
3201 /* 3254 /*
3202 * The first time anyone tries to iterate across a cgroup, 3255 * The first time anyone tries to iterate across a css, we need to
3203 * we need to enable the list linking each css_set to its 3256 * enable the list linking each css_set to its tasks, and fix up
3204 * tasks, and fix up all existing tasks. 3257 * all existing tasks.
3205 */ 3258 */
3206 if (!use_task_css_set_links) 3259 if (!use_task_css_set_links)
3207 cgroup_enable_task_cg_lists(); 3260 cgroup_enable_task_cg_lists();
3208 3261
3209 read_lock(&css_set_lock); 3262 read_lock(&css_set_lock);
3210 it->cset_link = &cgrp->cset_links; 3263
3211 cgroup_advance_iter(cgrp, it); 3264 it->origin_css = css;
3265 it->cset_link = &css->cgroup->cset_links;
3266
3267 css_advance_task_iter(it);
3212} 3268}
3213 3269
3214struct task_struct *cgroup_iter_next(struct cgroup *cgrp, 3270/**
3215 struct cgroup_iter *it) 3271 * css_task_iter_next - return the next task for the iterator
3272 * @it: the task iterator being iterated
3273 *
3274 * The "next" function for task iteration. @it should have been
3275 * initialized via css_task_iter_start(). Returns NULL when the iteration
3276 * reaches the end.
3277 */
3278struct task_struct *css_task_iter_next(struct css_task_iter *it)
3216{ 3279{
3217 struct task_struct *res; 3280 struct task_struct *res;
3218 struct list_head *l = it->task; 3281 struct list_head *l = it->task;
@@ -3226,16 +3289,24 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
3226 l = l->next; 3289 l = l->next;
3227 link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); 3290 link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link);
3228 if (l == &link->cset->tasks) { 3291 if (l == &link->cset->tasks) {
3229 /* We reached the end of this task list - move on to 3292 /*
3230 * the next cg_cgroup_link */ 3293 * We reached the end of this task list - move on to the
3231 cgroup_advance_iter(cgrp, it); 3294 * next cgrp_cset_link.
3295 */
3296 css_advance_task_iter(it);
3232 } else { 3297 } else {
3233 it->task = l; 3298 it->task = l;
3234 } 3299 }
3235 return res; 3300 return res;
3236} 3301}
3237 3302
3238void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) 3303/**
3304 * css_task_iter_end - finish task iteration
3305 * @it: the task iterator to finish
3306 *
3307 * Finish task iteration started by css_task_iter_start().
3308 */
3309void css_task_iter_end(struct css_task_iter *it)
3239 __releases(css_set_lock) 3310 __releases(css_set_lock)
3240{ 3311{
3241 read_unlock(&css_set_lock); 3312 read_unlock(&css_set_lock);
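
Every converted caller below (css_scan_tasks(), pidlist_array_load(), cgroupstats_build()) uses the same three-call shape; callers that still think in terms of a whole cgroup pass &cgrp->dummy_css. The idiom, as a sketch:

        struct css_task_iter it;
        struct task_struct *task;

        css_task_iter_start(css, &it);          /* read-locks css_set_lock */
        while ((task = css_task_iter_next(&it))) {
                /* look at @task; sleeping is forbidden here */
        }
        css_task_iter_end(&it);                 /* drops the lock */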
@@ -3276,46 +3347,49 @@ static inline int started_after(void *p1, void *p2)
3276} 3347}
3277 3348
3278/** 3349/**
3279 * cgroup_scan_tasks - iterate through all the tasks in a cgroup 3350 * css_scan_tasks - iterate through all the tasks in a css
3280 * @scan: struct cgroup_scanner containing arguments for the scan 3351 * @css: the css to iterate tasks of
3352 * @test: optional test callback
3353 * @process: process callback
3354 * @data: data passed to @test and @process
3355 * @heap: optional pre-allocated heap used for task iteration
3356 *
3357 * Iterate through all the tasks in @css, calling @test for each, and if it
3358 * returns %true, call @process for it also.
3359 *
3360 * @test may be NULL, meaning always true (select all tasks), which
3361 * effectively duplicates css_task_iter_{start,next,end}() but does not
3362 * lock css_set_lock for the call to @process.
3363 *
3364 * It is guaranteed that @process will act on every task that is a member
3365 * of @css for the duration of this call. This function may or may not
3366 * call @process for tasks that exit or move to a different css during the
3367 * call, or are forked or move into the css during the call.
3281 * 3368 *
3282 * Arguments include pointers to callback functions test_task() and 3369 * Note that @test may be called with locks held, and may in some
3283 * process_task(). 3370 * situations be called multiple times for the same task, so it should be
3284 * Iterate through all the tasks in a cgroup, calling test_task() for each, 3371 * cheap.
3285 * and if it returns true, call process_task() for it also.
3286 * The test_task pointer may be NULL, meaning always true (select all tasks).
3287 * Effectively duplicates cgroup_iter_{start,next,end}()
3288 * but does not lock css_set_lock for the call to process_task().
3289 * The struct cgroup_scanner may be embedded in any structure of the caller's
3290 * creation.
3291 * It is guaranteed that process_task() will act on every task that
3292 * is a member of the cgroup for the duration of this call. This
3293 * function may or may not call process_task() for tasks that exit
3294 * or move to a different cgroup during the call, or are forked or
3295 * move into the cgroup during the call.
3296 * 3372 *
3297 * Note that test_task() may be called with locks held, and may in some 3373 * If @heap is non-NULL, a heap has been pre-allocated and will be used for
3298 * situations be called multiple times for the same task, so it should 3374 * heap operations (and its "gt" member will be overwritten), else a
3299 * be cheap. 3375 * temporary heap will be used (allocation of which may cause this function
3300 * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been 3376 * to fail).
3301 * pre-allocated and will be used for heap operations (and its "gt" member will
3302 * be overwritten), else a temporary heap will be used (allocation of which
3303 * may cause this function to fail).
3304 */ 3377 */
3305int cgroup_scan_tasks(struct cgroup_scanner *scan) 3378int css_scan_tasks(struct cgroup_subsys_state *css,
3379 bool (*test)(struct task_struct *, void *),
3380 void (*process)(struct task_struct *, void *),
3381 void *data, struct ptr_heap *heap)
3306{ 3382{
3307 int retval, i; 3383 int retval, i;
3308 struct cgroup_iter it; 3384 struct css_task_iter it;
3309 struct task_struct *p, *dropped; 3385 struct task_struct *p, *dropped;
3310 /* Never dereference latest_task, since it's not refcounted */ 3386 /* Never dereference latest_task, since it's not refcounted */
3311 struct task_struct *latest_task = NULL; 3387 struct task_struct *latest_task = NULL;
3312 struct ptr_heap tmp_heap; 3388 struct ptr_heap tmp_heap;
3313 struct ptr_heap *heap;
3314 struct timespec latest_time = { 0, 0 }; 3389 struct timespec latest_time = { 0, 0 };
3315 3390
3316 if (scan->heap) { 3391 if (heap) {
3317 /* The caller supplied our heap and pre-allocated its memory */ 3392 /* The caller supplied our heap and pre-allocated its memory */
3318 heap = scan->heap;
3319 heap->gt = &started_after; 3393 heap->gt = &started_after;
3320 } else { 3394 } else {
3321 /* We need to allocate our own heap memory */ 3395 /* We need to allocate our own heap memory */
@@ -3328,25 +3402,24 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3328 3402
3329 again: 3403 again:
3330 /* 3404 /*
3331 * Scan tasks in the cgroup, using the scanner's "test_task" callback 3405 * Scan tasks in the css, using the @test callback to determine
3332 * to determine which are of interest, and using the scanner's 3406 * which are of interest, and invoking @process callback on the
3333 * "process_task" callback to process any of them that need an update. 3407 * ones which need an update. Since we don't want to hold any
3334 * Since we don't want to hold any locks during the task updates, 3408 * locks during the task updates, gather tasks to be processed in a
3335 * gather tasks to be processed in a heap structure. 3409 * heap structure. The heap is sorted by descending task start
3336 * The heap is sorted by descending task start time. 3410 * time. If the statically-sized heap fills up, we overflow tasks
3337 * If the statically-sized heap fills up, we overflow tasks that 3411 * that started later, and in future iterations only consider tasks
3338 * started later, and in future iterations only consider tasks that 3412 * that started after the latest task in the previous pass. This
3339 * started after the latest task in the previous pass. This
3340 * guarantees forward progress and that we don't miss any tasks. 3413 * guarantees forward progress and that we don't miss any tasks.
3341 */ 3414 */
3342 heap->size = 0; 3415 heap->size = 0;
3343 cgroup_iter_start(scan->cg, &it); 3416 css_task_iter_start(css, &it);
3344 while ((p = cgroup_iter_next(scan->cg, &it))) { 3417 while ((p = css_task_iter_next(&it))) {
3345 /* 3418 /*
3346 * Only affect tasks that qualify per the caller's callback, 3419 * Only affect tasks that qualify per the caller's callback,
3347 * if one was provided 3420 * if one was provided
3348 */ 3421 */
3349 if (scan->test_task && !scan->test_task(p, scan)) 3422 if (test && !test(p, data))
3350 continue; 3423 continue;
3351 /* 3424 /*
3352 * Only process tasks that started after the last task 3425 * Only process tasks that started after the last task
@@ -3374,7 +3447,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3374 * the heap and wasn't inserted 3447 * the heap and wasn't inserted
3375 */ 3448 */
3376 } 3449 }
3377 cgroup_iter_end(scan->cg, &it); 3450 css_task_iter_end(&it);
3378 3451
3379 if (heap->size) { 3452 if (heap->size) {
3380 for (i = 0; i < heap->size; i++) { 3453 for (i = 0; i < heap->size; i++) {
@@ -3384,7 +3457,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3384 latest_task = q; 3457 latest_task = q;
3385 } 3458 }
3386 /* Process the task per the caller's callback */ 3459 /* Process the task per the caller's callback */
3387 scan->process_task(q, scan); 3460 process(q, data);
3388 put_task_struct(q); 3461 put_task_struct(q);
3389 } 3462 }
3390 /* 3463 /*
@@ -3401,10 +3474,9 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3401 return 0; 3474 return 0;
3402} 3475}
3403 3476
3404static void cgroup_transfer_one_task(struct task_struct *task, 3477static void cgroup_transfer_one_task(struct task_struct *task, void *data)
3405 struct cgroup_scanner *scan)
3406{ 3478{
3407 struct cgroup *new_cgroup = scan->data; 3479 struct cgroup *new_cgroup = data;
3408 3480
3409 mutex_lock(&cgroup_mutex); 3481 mutex_lock(&cgroup_mutex);
3410 cgroup_attach_task(new_cgroup, task, false); 3482 cgroup_attach_task(new_cgroup, task, false);
@@ -3418,15 +3490,8 @@ static void cgroup_transfer_one_task(struct task_struct *task,
3418 */ 3490 */
3419int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) 3491int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3420{ 3492{
3421 struct cgroup_scanner scan; 3493 return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task,
3422 3494 to, NULL);
3423 scan.cg = from;
3424 scan.test_task = NULL; /* select all tasks in cgroup */
3425 scan.process_task = cgroup_transfer_one_task;
3426 scan.heap = NULL;
3427 scan.data = to;
3428
3429 return cgroup_scan_tasks(&scan);
3430} 3495}
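
cgroup_transfer_tasks() is the degenerate use: no @test and no caller-supplied heap. A filtering caller would pass both callbacks; a sketch with illustrative demo_* helpers (the void *data pointer is handed through to both unchanged):

        static bool demo_test(struct task_struct *task, void *data)
        {
                return task->mm != NULL;        /* e.g. skip kernel threads */
        }

        static void demo_process(struct task_struct *task, void *data)
        {
                /* called without css_set_lock held; may take other locks */
        }

        ret = css_scan_tasks(css, demo_test, demo_process, NULL, NULL);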
3431 3496
3432/* 3497/*
@@ -3468,7 +3533,7 @@ struct cgroup_pidlist {
3468 /* pointer to the cgroup we belong to, for list removal purposes */ 3533 /* pointer to the cgroup we belong to, for list removal purposes */
3469 struct cgroup *owner; 3534 struct cgroup *owner;
3470 /* protects the other fields */ 3535 /* protects the other fields */
3471 struct rw_semaphore mutex; 3536 struct rw_semaphore rwsem;
3472}; 3537};
3473 3538
3474/* 3539/*
@@ -3541,7 +3606,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3541 struct pid_namespace *ns = task_active_pid_ns(current); 3606 struct pid_namespace *ns = task_active_pid_ns(current);
3542 3607
3543 /* 3608 /*
3544 * We can't drop the pidlist_mutex before taking the l->mutex in case 3609 * We can't drop the pidlist_mutex before taking the l->rwsem in case
3545 * the last ref-holder is trying to remove l from the list at the same 3610 * the last ref-holder is trying to remove l from the list at the same
3546 * time. Holding the pidlist_mutex precludes somebody taking whichever 3611 * time. Holding the pidlist_mutex precludes somebody taking whichever
3547 * list we find out from under us - compare release_pid_array(). 3612 * list we find out from under us - compare release_pid_array().
@@ -3550,7 +3615,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3550 list_for_each_entry(l, &cgrp->pidlists, links) { 3615 list_for_each_entry(l, &cgrp->pidlists, links) {
3551 if (l->key.type == type && l->key.ns == ns) { 3616 if (l->key.type == type && l->key.ns == ns) {
3552 /* make sure l doesn't vanish out from under us */ 3617 /* make sure l doesn't vanish out from under us */
3553 down_write(&l->mutex); 3618 down_write(&l->rwsem);
3554 mutex_unlock(&cgrp->pidlist_mutex); 3619 mutex_unlock(&cgrp->pidlist_mutex);
3555 return l; 3620 return l;
3556 } 3621 }
@@ -3561,8 +3626,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3561 mutex_unlock(&cgrp->pidlist_mutex); 3626 mutex_unlock(&cgrp->pidlist_mutex);
3562 return l; 3627 return l;
3563 } 3628 }
3564 init_rwsem(&l->mutex); 3629 init_rwsem(&l->rwsem);
3565 down_write(&l->mutex); 3630 down_write(&l->rwsem);
3566 l->key.type = type; 3631 l->key.type = type;
3567 l->key.ns = get_pid_ns(ns); 3632 l->key.ns = get_pid_ns(ns);
3568 l->owner = cgrp; 3633 l->owner = cgrp;
@@ -3580,7 +3645,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3580 pid_t *array; 3645 pid_t *array;
3581 int length; 3646 int length;
3582 int pid, n = 0; /* used for populating the array */ 3647 int pid, n = 0; /* used for populating the array */
3583 struct cgroup_iter it; 3648 struct css_task_iter it;
3584 struct task_struct *tsk; 3649 struct task_struct *tsk;
3585 struct cgroup_pidlist *l; 3650 struct cgroup_pidlist *l;
3586 3651
@@ -3595,8 +3660,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3595 if (!array) 3660 if (!array)
3596 return -ENOMEM; 3661 return -ENOMEM;
3597 /* now, populate the array */ 3662 /* now, populate the array */
3598 cgroup_iter_start(cgrp, &it); 3663 css_task_iter_start(&cgrp->dummy_css, &it);
3599 while ((tsk = cgroup_iter_next(cgrp, &it))) { 3664 while ((tsk = css_task_iter_next(&it))) {
3600 if (unlikely(n == length)) 3665 if (unlikely(n == length))
3601 break; 3666 break;
3602 /* get tgid or pid for procs or tasks file respectively */ 3667 /* get tgid or pid for procs or tasks file respectively */
@@ -3607,7 +3672,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3607 if (pid > 0) /* make sure to only use valid results */ 3672 if (pid > 0) /* make sure to only use valid results */
3608 array[n++] = pid; 3673 array[n++] = pid;
3609 } 3674 }
3610 cgroup_iter_end(cgrp, &it); 3675 css_task_iter_end(&it);
3611 length = n; 3676 length = n;
3612 /* now sort & (if procs) strip out duplicates */ 3677 /* now sort & (if procs) strip out duplicates */
3613 sort(array, length, sizeof(pid_t), cmppid, NULL); 3678 sort(array, length, sizeof(pid_t), cmppid, NULL);
@@ -3623,7 +3688,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3623 l->list = array; 3688 l->list = array;
3624 l->length = length; 3689 l->length = length;
3625 l->use_count++; 3690 l->use_count++;
3626 up_write(&l->mutex); 3691 up_write(&l->rwsem);
3627 *lp = l; 3692 *lp = l;
3628 return 0; 3693 return 0;
3629} 3694}
@@ -3641,7 +3706,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3641{ 3706{
3642 int ret = -EINVAL; 3707 int ret = -EINVAL;
3643 struct cgroup *cgrp; 3708 struct cgroup *cgrp;
3644 struct cgroup_iter it; 3709 struct css_task_iter it;
3645 struct task_struct *tsk; 3710 struct task_struct *tsk;
3646 3711
3647 /* 3712 /*
@@ -3655,8 +3720,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3655 ret = 0; 3720 ret = 0;
3656 cgrp = dentry->d_fsdata; 3721 cgrp = dentry->d_fsdata;
3657 3722
3658 cgroup_iter_start(cgrp, &it); 3723 css_task_iter_start(&cgrp->dummy_css, &it);
3659 while ((tsk = cgroup_iter_next(cgrp, &it))) { 3724 while ((tsk = css_task_iter_next(&it))) {
3660 switch (tsk->state) { 3725 switch (tsk->state) {
3661 case TASK_RUNNING: 3726 case TASK_RUNNING:
3662 stats->nr_running++; 3727 stats->nr_running++;
@@ -3676,7 +3741,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3676 break; 3741 break;
3677 } 3742 }
3678 } 3743 }
3679 cgroup_iter_end(cgrp, &it); 3744 css_task_iter_end(&it);
3680 3745
3681err: 3746err:
3682 return ret; 3747 return ret;
@@ -3701,7 +3766,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3701 int index = 0, pid = *pos; 3766 int index = 0, pid = *pos;
3702 int *iter; 3767 int *iter;
3703 3768
3704 down_read(&l->mutex); 3769 down_read(&l->rwsem);
3705 if (pid) { 3770 if (pid) {
3706 int end = l->length; 3771 int end = l->length;
3707 3772
@@ -3728,7 +3793,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3728static void cgroup_pidlist_stop(struct seq_file *s, void *v) 3793static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3729{ 3794{
3730 struct cgroup_pidlist *l = s->private; 3795 struct cgroup_pidlist *l = s->private;
3731 up_read(&l->mutex); 3796 up_read(&l->rwsem);
3732} 3797}
3733 3798
3734static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) 3799static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
@@ -3774,7 +3839,7 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l)
3774 * pidlist_mutex, we have to take pidlist_mutex first. 3839 * pidlist_mutex, we have to take pidlist_mutex first.
3775 */ 3840 */
3776 mutex_lock(&l->owner->pidlist_mutex); 3841 mutex_lock(&l->owner->pidlist_mutex);
3777 down_write(&l->mutex); 3842 down_write(&l->rwsem);
3778 BUG_ON(!l->use_count); 3843 BUG_ON(!l->use_count);
3779 if (!--l->use_count) { 3844 if (!--l->use_count) {
3780 /* we're the last user if refcount is 0; remove and free */ 3845 /* we're the last user if refcount is 0; remove and free */
@@ -3782,12 +3847,12 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l)
3782 mutex_unlock(&l->owner->pidlist_mutex); 3847 mutex_unlock(&l->owner->pidlist_mutex);
3783 pidlist_free(l->list); 3848 pidlist_free(l->list);
3784 put_pid_ns(l->key.ns); 3849 put_pid_ns(l->key.ns);
3785 up_write(&l->mutex); 3850 up_write(&l->rwsem);
3786 kfree(l); 3851 kfree(l);
3787 return; 3852 return;
3788 } 3853 }
3789 mutex_unlock(&l->owner->pidlist_mutex); 3854 mutex_unlock(&l->owner->pidlist_mutex);
3790 up_write(&l->mutex); 3855 up_write(&l->rwsem);
3791} 3856}
3792 3857
3793static int cgroup_pidlist_release(struct inode *inode, struct file *file) 3858static int cgroup_pidlist_release(struct inode *inode, struct file *file)
@@ -3851,21 +3916,20 @@ static int cgroup_procs_open(struct inode *unused, struct file *file)
3851 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); 3916 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
3852} 3917}
3853 3918
3854static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, 3919static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
3855 struct cftype *cft) 3920 struct cftype *cft)
3856{ 3921{
3857 return notify_on_release(cgrp); 3922 return notify_on_release(css->cgroup);
3858} 3923}
3859 3924
3860static int cgroup_write_notify_on_release(struct cgroup *cgrp, 3925static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
3861 struct cftype *cft, 3926 struct cftype *cft, u64 val)
3862 u64 val)
3863{ 3927{
3864 clear_bit(CGRP_RELEASABLE, &cgrp->flags); 3928 clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
3865 if (val) 3929 if (val)
3866 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3930 set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3867 else 3931 else
3868 clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3932 clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3869 return 0; 3933 return 0;
3870} 3934}
3871 3935
@@ -3895,18 +3959,18 @@ static void cgroup_event_remove(struct work_struct *work)
3895{ 3959{
3896 struct cgroup_event *event = container_of(work, struct cgroup_event, 3960 struct cgroup_event *event = container_of(work, struct cgroup_event,
3897 remove); 3961 remove);
3898 struct cgroup *cgrp = event->cgrp; 3962 struct cgroup_subsys_state *css = event->css;
3899 3963
3900 remove_wait_queue(event->wqh, &event->wait); 3964 remove_wait_queue(event->wqh, &event->wait);
3901 3965
3902 event->cft->unregister_event(cgrp, event->cft, event->eventfd); 3966 event->cft->unregister_event(css, event->cft, event->eventfd);
3903 3967
3904 /* Notify userspace the event is going away. */ 3968 /* Notify userspace the event is going away. */
3905 eventfd_signal(event->eventfd, 1); 3969 eventfd_signal(event->eventfd, 1);
3906 3970
3907 eventfd_ctx_put(event->eventfd); 3971 eventfd_ctx_put(event->eventfd);
3908 kfree(event); 3972 kfree(event);
3909 cgroup_dput(cgrp); 3973 css_put(css);
3910} 3974}
3911 3975
3912/* 3976/*
@@ -3919,7 +3983,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3919{ 3983{
3920 struct cgroup_event *event = container_of(wait, 3984 struct cgroup_event *event = container_of(wait,
3921 struct cgroup_event, wait); 3985 struct cgroup_event, wait);
3922 struct cgroup *cgrp = event->cgrp; 3986 struct cgroup *cgrp = event->css->cgroup;
3923 unsigned long flags = (unsigned long)key; 3987 unsigned long flags = (unsigned long)key;
3924 3988
3925 if (flags & POLLHUP) { 3989 if (flags & POLLHUP) {
@@ -3963,14 +4027,15 @@ static void cgroup_event_ptable_queue_proc(struct file *file,
3963 * Input must be in format '<event_fd> <control_fd> <args>'. 4027 * Input must be in format '<event_fd> <control_fd> <args>'.
3964 * Interpretation of args is defined by control file implementation. 4028 * Interpretation of args is defined by control file implementation.
3965 */ 4029 */
3966static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, 4030static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
3967 const char *buffer) 4031 struct cftype *cft, const char *buffer)
3968{ 4032{
3969 struct cgroup_event *event = NULL; 4033 struct cgroup *cgrp = dummy_css->cgroup;
3970 struct cgroup *cgrp_cfile; 4034 struct cgroup_event *event;
4035 struct cgroup_subsys_state *cfile_css;
3971 unsigned int efd, cfd; 4036 unsigned int efd, cfd;
3972 struct file *efile = NULL; 4037 struct file *efile;
3973 struct file *cfile = NULL; 4038 struct file *cfile;
3974 char *endp; 4039 char *endp;
3975 int ret; 4040 int ret;
3976 4041
@@ -3987,7 +4052,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3987 event = kzalloc(sizeof(*event), GFP_KERNEL); 4052 event = kzalloc(sizeof(*event), GFP_KERNEL);
3988 if (!event) 4053 if (!event)
3989 return -ENOMEM; 4054 return -ENOMEM;
3990 event->cgrp = cgrp; 4055
3991 INIT_LIST_HEAD(&event->list); 4056 INIT_LIST_HEAD(&event->list);
3992 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); 4057 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
3993 init_waitqueue_func_entry(&event->wait, cgroup_event_wake); 4058 init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
@@ -3996,62 +4061,68 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3996 efile = eventfd_fget(efd); 4061 efile = eventfd_fget(efd);
3997 if (IS_ERR(efile)) { 4062 if (IS_ERR(efile)) {
3998 ret = PTR_ERR(efile); 4063 ret = PTR_ERR(efile);
3999 goto fail; 4064 goto out_kfree;
4000 } 4065 }
4001 4066
4002 event->eventfd = eventfd_ctx_fileget(efile); 4067 event->eventfd = eventfd_ctx_fileget(efile);
4003 if (IS_ERR(event->eventfd)) { 4068 if (IS_ERR(event->eventfd)) {
4004 ret = PTR_ERR(event->eventfd); 4069 ret = PTR_ERR(event->eventfd);
4005 goto fail; 4070 goto out_put_efile;
4006 } 4071 }
4007 4072
4008 cfile = fget(cfd); 4073 cfile = fget(cfd);
4009 if (!cfile) { 4074 if (!cfile) {
4010 ret = -EBADF; 4075 ret = -EBADF;
4011 goto fail; 4076 goto out_put_eventfd;
4012 } 4077 }
4013 4078
4014 /* the process needs read permission on the control file */ 4079 /* the process needs read permission on the control file */
4015 /* AV: shouldn't we check that it's been opened for read instead? */ 4080 /* AV: shouldn't we check that it's been opened for read instead? */
4016 ret = inode_permission(file_inode(cfile), MAY_READ); 4081 ret = inode_permission(file_inode(cfile), MAY_READ);
4017 if (ret < 0) 4082 if (ret < 0)
4018 goto fail; 4083 goto out_put_cfile;
4019 4084
4020 event->cft = __file_cft(cfile); 4085 event->cft = __file_cft(cfile);
4021 if (IS_ERR(event->cft)) { 4086 if (IS_ERR(event->cft)) {
4022 ret = PTR_ERR(event->cft); 4087 ret = PTR_ERR(event->cft);
4023 goto fail; 4088 goto out_put_cfile;
4089 }
4090
4091 if (!event->cft->ss) {
4092 ret = -EBADF;
4093 goto out_put_cfile;
4024 } 4094 }
4025 4095
4026 /* 4096 /*
4027 * The file to be monitored must be in the same cgroup as 4097 * Determine the css of @cfile, verify it belongs to the same
4028 * cgroup.event_control is. 4098 * cgroup as cgroup.event_control, and associate @event with it.
4099 * Remaining events are automatically removed on cgroup destruction
4100 * but the removal is asynchronous, so take an extra ref.
4029 */ 4101 */
4030 cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent); 4102 rcu_read_lock();
4031 if (cgrp_cfile != cgrp) { 4103
4032 ret = -EINVAL; 4104 ret = -EINVAL;
4033 goto fail; 4105 event->css = cgroup_css(cgrp, event->cft->ss);
4034 } 4106 cfile_css = css_from_dir(cfile->f_dentry->d_parent, event->cft->ss);
4107 if (event->css && event->css == cfile_css && css_tryget(event->css))
4108 ret = 0;
4109
4110 rcu_read_unlock();
4111 if (ret)
4112 goto out_put_cfile;
4035 4113
4036 if (!event->cft->register_event || !event->cft->unregister_event) { 4114 if (!event->cft->register_event || !event->cft->unregister_event) {
4037 ret = -EINVAL; 4115 ret = -EINVAL;
4038 goto fail; 4116 goto out_put_css;
4039 } 4117 }
4040 4118
4041 ret = event->cft->register_event(cgrp, event->cft, 4119 ret = event->cft->register_event(event->css, event->cft,
4042 event->eventfd, buffer); 4120 event->eventfd, buffer);
4043 if (ret) 4121 if (ret)
4044 goto fail; 4122 goto out_put_css;
4045 4123
4046 efile->f_op->poll(efile, &event->pt); 4124 efile->f_op->poll(efile, &event->pt);
4047 4125
4048 /*
4049 * Events should be removed after rmdir of cgroup directory, but before
4050 * destroying subsystem state objects. Let's take reference to cgroup
4051 * directory dentry to do that.
4052 */
4053 dget(cgrp->dentry);
4054
4055 spin_lock(&cgrp->event_list_lock); 4126 spin_lock(&cgrp->event_list_lock);
4056 list_add(&event->list, &cgrp->event_list); 4127 list_add(&event->list, &cgrp->event_list);
4057 spin_unlock(&cgrp->event_list_lock); 4128 spin_unlock(&cgrp->event_list_lock);
@@ -4061,35 +4132,33 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
4061 4132
4062 return 0; 4133 return 0;
4063 4134
4064fail: 4135out_put_css:
4065 if (cfile) 4136 css_put(event->css);
4066 fput(cfile); 4137out_put_cfile:
4067 4138 fput(cfile);
4068 if (event && event->eventfd && !IS_ERR(event->eventfd)) 4139out_put_eventfd:
4069 eventfd_ctx_put(event->eventfd); 4140 eventfd_ctx_put(event->eventfd);
4070 4141out_put_efile:
4071 if (!IS_ERR_OR_NULL(efile)) 4142 fput(efile);
4072 fput(efile); 4143out_kfree:
4073
4074 kfree(event); 4144 kfree(event);
4075 4145
4076 return ret; 4146 return ret;
4077} 4147}
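
For orientation, the userspace half of the '<event_fd> <control_fd> <args>' contract looks roughly like this; the mount paths and the monitored file are illustrative, and whether extra args are needed depends on the control file:

        int efd = eventfd(0, 0);
        int cfd = open("/sys/fs/cgroup/memory/grp/memory.oom_control",
                       O_RDONLY);
        int ecfd = open("/sys/fs/cgroup/memory/grp/cgroup.event_control",
                        O_WRONLY);
        char buf[32];
        uint64_t cnt;

        snprintf(buf, sizeof(buf), "%d %d", efd, cfd);
        write(ecfd, buf, strlen(buf));
        read(efd, &cnt, sizeof(cnt));   /* blocks until the event fires */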
4078 4148
4079static u64 cgroup_clone_children_read(struct cgroup *cgrp, 4149static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
4080 struct cftype *cft) 4150 struct cftype *cft)
4081{ 4151{
4082 return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4152 return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4083} 4153}
4084 4154
4085static int cgroup_clone_children_write(struct cgroup *cgrp, 4155static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
4086 struct cftype *cft, 4156 struct cftype *cft, u64 val)
4087 u64 val)
4088{ 4157{
4089 if (val) 4158 if (val)
4090 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4159 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4091 else 4160 else
4092 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4161 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4093 return 0; 4162 return 0;
4094} 4163}
4095 4164
@@ -4148,36 +4217,34 @@ static struct cftype cgroup_base_files[] = {
4148}; 4217};
4149 4218
4150/** 4219/**
4151 * cgroup_populate_dir - selective creation of files in a directory 4220 * cgroup_populate_dir - create subsys files in a cgroup directory
4152 * @cgrp: target cgroup 4221 * @cgrp: target cgroup
4153 * @base_files: true if the base files should be added
4154 * @subsys_mask: mask of the subsystem ids whose files should be added 4222 * @subsys_mask: mask of the subsystem ids whose files should be added
4223 *
4224 * On failure, no file is added.
4155 */ 4225 */
4156static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, 4226static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
4157 unsigned long subsys_mask)
4158{ 4227{
4159 int err;
4160 struct cgroup_subsys *ss; 4228 struct cgroup_subsys *ss;
4161 4229 int i, ret = 0;
4162 if (base_files) {
4163 err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true);
4164 if (err < 0)
4165 return err;
4166 }
4167 4230
4168 /* process cftsets of each subsystem */ 4231 /* process cftsets of each subsystem */
4169 for_each_root_subsys(cgrp->root, ss) { 4232 for_each_subsys(ss, i) {
4170 struct cftype_set *set; 4233 struct cftype_set *set;
4171 if (!test_bit(ss->subsys_id, &subsys_mask)) 4234
4235 if (!test_bit(i, &subsys_mask))
4172 continue; 4236 continue;
4173 4237
4174 list_for_each_entry(set, &ss->cftsets, node) 4238 list_for_each_entry(set, &ss->cftsets, node) {
4175 cgroup_addrm_files(cgrp, ss, set->cfts, true); 4239 ret = cgroup_addrm_files(cgrp, set->cfts, true);
4240 if (ret < 0)
4241 goto err;
4242 }
4176 } 4243 }
4177 4244
4178 /* This cgroup is ready now */ 4245 /* This cgroup is ready now */
4179 for_each_root_subsys(cgrp->root, ss) { 4246 for_each_root_subsys(cgrp->root, ss) {
4180 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4247 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
4181 struct css_id *id = rcu_dereference_protected(css->id, true); 4248 struct css_id *id = rcu_dereference_protected(css->id, true);
4182 4249
4183 /* 4250 /*
@@ -4190,14 +4257,57 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
4190 } 4257 }
4191 4258
4192 return 0; 4259 return 0;
4260err:
4261 cgroup_clear_dir(cgrp, subsys_mask);
4262 return ret;
4193} 4263}
4194 4264
4195static void css_dput_fn(struct work_struct *work) 4265/*
4266 * css destruction is a four-stage process.
4267 *
4268 * 1. Destruction starts. Killing of the percpu_ref is initiated.
4269 * Implemented in kill_css().
4270 *
4271 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
4272 * and thus css_tryget() is guaranteed to fail, the css can be offlined
4273 * by invoking offline_css(). After offlining, the base ref is put.
4274 * Implemented in css_killed_work_fn().
4275 *
4276 * 3. When the percpu_ref reaches zero, the only possible remaining
4277 * accessors are inside RCU read sections. css_release() schedules the
4278 * RCU callback.
4279 *
4280 * 4. After the grace period, the css can be freed. Implemented in
4281 * css_free_work_fn().
4282 *
4283 * It is actually hairier because steps 2 and 4 require process context
4284 * and thus involve punting to css->destroy_work, adding two additional
4285 * steps to the already complex sequence.
4286 */
4287static void css_free_work_fn(struct work_struct *work)
4196{ 4288{
4197 struct cgroup_subsys_state *css = 4289 struct cgroup_subsys_state *css =
4198 container_of(work, struct cgroup_subsys_state, dput_work); 4290 container_of(work, struct cgroup_subsys_state, destroy_work);
4291 struct cgroup *cgrp = css->cgroup;
4199 4292
4200 cgroup_dput(css->cgroup); 4293 if (css->parent)
4294 css_put(css->parent);
4295
4296 css->ss->css_free(css);
4297 cgroup_dput(cgrp);
4298}
4299
4300static void css_free_rcu_fn(struct rcu_head *rcu_head)
4301{
4302 struct cgroup_subsys_state *css =
4303 container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
4304
4305 /*
4306 * css holds an extra ref to @cgrp->dentry which is put on the last
4307 * css_put(). dput() requires process context which we don't have.
4308 */
4309 INIT_WORK(&css->destroy_work, css_free_work_fn);
4310 schedule_work(&css->destroy_work);
4201} 4311}
4202 4312
4203static void css_release(struct percpu_ref *ref) 4313static void css_release(struct percpu_ref *ref)
@@ -4205,49 +4315,47 @@ static void css_release(struct percpu_ref *ref)
4205 struct cgroup_subsys_state *css = 4315 struct cgroup_subsys_state *css =
4206 container_of(ref, struct cgroup_subsys_state, refcnt); 4316 container_of(ref, struct cgroup_subsys_state, refcnt);
4207 4317
4208 schedule_work(&css->dput_work); 4318 call_rcu(&css->rcu_head, css_free_rcu_fn);
4209} 4319}
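
Stitching the pieces shown here together, the four stages flow roughly as follows (an outline of the call chain, not code from the patch):

        /*
         * kill_css(css)                     stage 1: start killing the ref
         *   ...percpu_ref confirmed killed; css_tryget() now fails...
         * css_killed_work_fn()              stage 2: offline_css(css), then
         *                                   put the base reference
         * css_release()                     stage 3: last ref gone ->
         *   call_rcu(&css->rcu_head, css_free_rcu_fn)
         * css_free_rcu_fn()                 stage 4: after the grace period,
         *   schedule_work() -> css_free_work_fn():
         *     css_put(css->parent); ss->css_free(css); cgroup_dput(cgrp)
         */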
4210 4320
4211static void init_cgroup_css(struct cgroup_subsys_state *css, 4321static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss,
4212 struct cgroup_subsys *ss, 4322 struct cgroup *cgrp)
4213 struct cgroup *cgrp)
4214{ 4323{
4215 css->cgroup = cgrp; 4324 css->cgroup = cgrp;
4325 css->ss = ss;
4216 css->flags = 0; 4326 css->flags = 0;
4217 css->id = NULL; 4327 css->id = NULL;
4218 if (cgrp == cgroup_dummy_top) 4328
4329 if (cgrp->parent)
4330 css->parent = cgroup_css(cgrp->parent, ss);
4331 else
4219 css->flags |= CSS_ROOT; 4332 css->flags |= CSS_ROOT;
4220 BUG_ON(cgrp->subsys[ss->subsys_id]);
4221 cgrp->subsys[ss->subsys_id] = css;
4222 4333
4223 /* 4334 BUG_ON(cgroup_css(cgrp, ss));
4224 * css holds an extra ref to @cgrp->dentry which is put on the last
4225 * css_put(). dput() requires process context, which css_put() may
4226 * be called without. @css->dput_work will be used to invoke
4227 * dput() asynchronously from css_put().
4228 */
4229 INIT_WORK(&css->dput_work, css_dput_fn);
4230} 4335}
4231 4336
4232/* invoke ->post_create() on a new CSS and mark it online if successful */ 4337/* invoke ->css_online() on a new CSS and mark it online if successful */
4233static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) 4338static int online_css(struct cgroup_subsys_state *css)
4234{ 4339{
4340 struct cgroup_subsys *ss = css->ss;
4235 int ret = 0; 4341 int ret = 0;
4236 4342
4237 lockdep_assert_held(&cgroup_mutex); 4343 lockdep_assert_held(&cgroup_mutex);
4238 4344
4239 if (ss->css_online) 4345 if (ss->css_online)
4240 ret = ss->css_online(cgrp); 4346 ret = ss->css_online(css);
4241 if (!ret) 4347 if (!ret) {
4242 cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; 4348 css->flags |= CSS_ONLINE;
4349 css->cgroup->nr_css++;
4350 rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css);
4351 }
4243 return ret; 4352 return ret;
4244} 4353}
4245 4354
4246/* if the CSS is online, invoke ->pre_destroy() on it and mark it offline */ 4355/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
4247static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) 4356static void offline_css(struct cgroup_subsys_state *css)
4248 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4249{ 4357{
4250 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4358 struct cgroup_subsys *ss = css->ss;
4251 4359
4252 lockdep_assert_held(&cgroup_mutex); 4360 lockdep_assert_held(&cgroup_mutex);
4253 4361
@@ -4255,9 +4363,11 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4255 return; 4363 return;
4256 4364
4257 if (ss->css_offline) 4365 if (ss->css_offline)
4258 ss->css_offline(cgrp); 4366 ss->css_offline(css);
4259 4367
4260 cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; 4368 css->flags &= ~CSS_ONLINE;
4369 css->cgroup->nr_css--;
4370 RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css);
4261} 4371}
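
A hypothetical controller written against the reworked callbacks, to show the shapes the conversion settles on; the "demo" subsystem is illustrative only, and real conversions follow in the freezer and cpuset hunks below.

#include <linux/cgroup.h>
#include <linux/err.h>
#include <linux/slab.h>

struct demo_state {
	struct cgroup_subsys_state css;
};

static inline struct demo_state *css_demo(struct cgroup_subsys_state *css)
{
	return css ? container_of(css, struct demo_state, css) : NULL;
}

static struct cgroup_subsys_state *
demo_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct demo_state *ds = kzalloc(sizeof(*ds), GFP_KERNEL);

	if (!ds)
		return ERR_PTR(-ENOMEM);
	return &ds->css;	/* core runs init_css() and online_css() on it */
}

static int demo_css_online(struct cgroup_subsys_state *css)
{
	/* css->parent is valid by now; reach parent state via css_parent() */
	struct demo_state *parent = css_demo(css_parent(css));

	if (parent) {
		/* inherit state from the parent, freezer-style */
	}
	return 0;
}

static void demo_css_free(struct cgroup_subsys_state *css)
{
	kfree(css_demo(css));
}
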
4262 4372
4263/* 4373/*
@@ -4271,6 +4381,7 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4271static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 4381static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4272 umode_t mode) 4382 umode_t mode)
4273{ 4383{
4384 struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { };
4274 struct cgroup *cgrp; 4385 struct cgroup *cgrp;
4275 struct cgroup_name *name; 4386 struct cgroup_name *name;
4276 struct cgroupfs_root *root = parent->root; 4387 struct cgroupfs_root *root = parent->root;
@@ -4288,7 +4399,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4288 goto err_free_cgrp; 4399 goto err_free_cgrp;
4289 rcu_assign_pointer(cgrp->name, name); 4400 rcu_assign_pointer(cgrp->name, name);
4290 4401
4291 cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); 4402 /*
4403 * Temporarily set the pointer to NULL, so idr_find() won't return
4404 * a half-baked cgroup.
4405 */
4406 cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
4292 if (cgrp->id < 0) 4407 if (cgrp->id < 0)
4293 goto err_free_name; 4408 goto err_free_name;
4294 4409
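
The reserve-then-publish idiom used above, in isolation; names are illustrative, and serialization around the idr is assumed to be provided by the caller, as cgroup_mutex does here.

#include <linux/idr.h>
#include <linux/gfp.h>

static DEFINE_IDR(obj_idr);

struct obj {
	int id;
};

static int obj_install(struct obj *o)
{
	/* reserve an ID but publish NULL so concurrent idr_find() can't
	 * observe a half-initialized object */
	int id = idr_alloc(&obj_idr, NULL, 1, 0, GFP_KERNEL);

	if (id < 0)
		return id;
	o->id = id;
	/* ... finish setting @o up; lookups return NULL meanwhile ... */
	idr_replace(&obj_idr, o, o->id);	/* now visible to lookups */
	return 0;
}
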
@@ -4317,6 +4432,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4317 cgrp->dentry = dentry; 4432 cgrp->dentry = dentry;
4318 4433
4319 cgrp->parent = parent; 4434 cgrp->parent = parent;
4435 cgrp->dummy_css.parent = &parent->dummy_css;
4320 cgrp->root = parent->root; 4436 cgrp->root = parent->root;
4321 4437
4322 if (notify_on_release(parent)) 4438 if (notify_on_release(parent))
@@ -4328,22 +4444,21 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4328 for_each_root_subsys(root, ss) { 4444 for_each_root_subsys(root, ss) {
4329 struct cgroup_subsys_state *css; 4445 struct cgroup_subsys_state *css;
4330 4446
4331 css = ss->css_alloc(cgrp); 4447 css = ss->css_alloc(cgroup_css(parent, ss));
4332 if (IS_ERR(css)) { 4448 if (IS_ERR(css)) {
4333 err = PTR_ERR(css); 4449 err = PTR_ERR(css);
4334 goto err_free_all; 4450 goto err_free_all;
4335 } 4451 }
4452 css_ar[ss->subsys_id] = css;
4336 4453
4337 err = percpu_ref_init(&css->refcnt, css_release); 4454 err = percpu_ref_init(&css->refcnt, css_release);
4338 if (err) { 4455 if (err)
4339 ss->css_free(cgrp);
4340 goto err_free_all; 4456 goto err_free_all;
4341 }
4342 4457
4343 init_cgroup_css(css, ss, cgrp); 4458 init_css(css, ss, cgrp);
4344 4459
4345 if (ss->use_id) { 4460 if (ss->use_id) {
4346 err = alloc_css_id(ss, parent, cgrp); 4461 err = alloc_css_id(css);
4347 if (err) 4462 if (err)
4348 goto err_free_all; 4463 goto err_free_all;
4349 } 4464 }
@@ -4365,16 +4480,22 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4365 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 4480 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4366 root->number_of_cgroups++; 4481 root->number_of_cgroups++;
4367 4482
4368 /* each css holds a ref to the cgroup's dentry */ 4483 /* each css holds a ref to the cgroup's dentry and the parent css */
4369 for_each_root_subsys(root, ss) 4484 for_each_root_subsys(root, ss) {
4485 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4486
4370 dget(dentry); 4487 dget(dentry);
4488 css_get(css->parent);
4489 }
4371 4490
4372 /* hold a ref to the parent's dentry */ 4491 /* hold a ref to the parent's dentry */
4373 dget(parent->dentry); 4492 dget(parent->dentry);
4374 4493
4375 /* creation succeeded, notify subsystems */ 4494 /* creation succeeded, notify subsystems */
4376 for_each_root_subsys(root, ss) { 4495 for_each_root_subsys(root, ss) {
4377 err = online_css(ss, cgrp); 4496 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4497
4498 err = online_css(css);
4378 if (err) 4499 if (err)
4379 goto err_destroy; 4500 goto err_destroy;
4380 4501
@@ -4388,7 +4509,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4388 } 4509 }
4389 } 4510 }
4390 4511
4391 err = cgroup_populate_dir(cgrp, true, root->subsys_mask); 4512 idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
4513
4514 err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
4515 if (err)
4516 goto err_destroy;
4517
4518 err = cgroup_populate_dir(cgrp, root->subsys_mask);
4392 if (err) 4519 if (err)
4393 goto err_destroy; 4520 goto err_destroy;
4394 4521
@@ -4399,18 +4526,18 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4399 4526
4400err_free_all: 4527err_free_all:
4401 for_each_root_subsys(root, ss) { 4528 for_each_root_subsys(root, ss) {
4402 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4529 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4403 4530
4404 if (css) { 4531 if (css) {
4405 percpu_ref_cancel_init(&css->refcnt); 4532 percpu_ref_cancel_init(&css->refcnt);
4406 ss->css_free(cgrp); 4533 ss->css_free(css);
4407 } 4534 }
4408 } 4535 }
4409 mutex_unlock(&cgroup_mutex); 4536 mutex_unlock(&cgroup_mutex);
4410 /* Release the reference count that we took on the superblock */ 4537 /* Release the reference count that we took on the superblock */
4411 deactivate_super(sb); 4538 deactivate_super(sb);
4412err_free_id: 4539err_free_id:
4413 ida_simple_remove(&root->cgroup_ida, cgrp->id); 4540 idr_remove(&root->cgroup_idr, cgrp->id);
4414err_free_name: 4541err_free_name:
4415 kfree(rcu_dereference_raw(cgrp->name)); 4542 kfree(rcu_dereference_raw(cgrp->name));
4416err_free_cgrp: 4543err_free_cgrp:
@@ -4432,22 +4559,84 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
4432 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 4559 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
4433} 4560}
4434 4561
4435static void cgroup_css_killed(struct cgroup *cgrp) 4562/*
4563 * This is called when the refcnt of a css is confirmed to be killed.
4564 * css_tryget() is now guaranteed to fail.
4565 */
4566static void css_killed_work_fn(struct work_struct *work)
4436{ 4567{
4437 if (!atomic_dec_and_test(&cgrp->css_kill_cnt)) 4568 struct cgroup_subsys_state *css =
4438 return; 4569 container_of(work, struct cgroup_subsys_state, destroy_work);
4570 struct cgroup *cgrp = css->cgroup;
4439 4571
4440 /* percpu ref's of all css's are killed, kick off the next step */ 4572 mutex_lock(&cgroup_mutex);
4441 INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn); 4573
4442 schedule_work(&cgrp->destroy_work); 4574 /*
4575 * css_tryget() is guaranteed to fail now. Tell subsystems to
 4576 * initiate destruction.
4577 */
4578 offline_css(css);
4579
4580 /*
4581 * If @cgrp is marked dead, it's waiting for refs of all css's to
4582 * be disabled before proceeding to the second phase of cgroup
4583 * destruction. If we are the last one, kick it off.
4584 */
4585 if (!cgrp->nr_css && cgroup_is_dead(cgrp))
4586 cgroup_destroy_css_killed(cgrp);
4587
4588 mutex_unlock(&cgroup_mutex);
4589
4590 /*
4591 * Put the css refs from kill_css(). Each css holds an extra
4592 * reference to the cgroup's dentry and cgroup removal proceeds
4593 * regardless of css refs. On the last put of each css, whenever
4594 * that may be, the extra dentry ref is put so that dentry
4595 * destruction happens only after all css's are released.
4596 */
4597 css_put(css);
4443} 4598}
4444 4599
4445static void css_ref_killed_fn(struct percpu_ref *ref) 4600/* css kill confirmation processing requires process context, bounce */
4601static void css_killed_ref_fn(struct percpu_ref *ref)
4446{ 4602{
4447 struct cgroup_subsys_state *css = 4603 struct cgroup_subsys_state *css =
4448 container_of(ref, struct cgroup_subsys_state, refcnt); 4604 container_of(ref, struct cgroup_subsys_state, refcnt);
4449 4605
4450 cgroup_css_killed(css->cgroup); 4606 INIT_WORK(&css->destroy_work, css_killed_work_fn);
4607 schedule_work(&css->destroy_work);
4608}
4609
4610/**
4611 * kill_css - destroy a css
4612 * @css: css to destroy
4613 *
4614 * This function initiates destruction of @css by removing cgroup interface
4615 * files and putting its base reference. ->css_offline() will be invoked
4616 * asynchronously once css_tryget() is guaranteed to fail and when the
4617 * reference count reaches zero, @css will be released.
4618 */
4619static void kill_css(struct cgroup_subsys_state *css)
4620{
4621 cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id);
4622
4623 /*
4624 * Killing would put the base ref, but we need to keep it alive
4625 * until after ->css_offline().
4626 */
4627 css_get(css);
4628
4629 /*
4630 * cgroup core guarantees that, by the time ->css_offline() is
4631 * invoked, no new css reference will be given out via
4632 * css_tryget(). We can't simply call percpu_ref_kill() and
4633 * proceed to offlining css's because percpu_ref_kill() doesn't
4634 * guarantee that the ref is seen as killed on all CPUs on return.
4635 *
4636 * Use percpu_ref_kill_and_confirm() to get notifications as each
4637 * css is confirmed to be seen as killed on all CPUs.
4638 */
4639 percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
4451} 4640}
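
kill_css()'s confirmation dance in isolation: percpu_ref_kill_and_confirm() fires the callback only once every CPU is guaranteed to see the ref as killed, i.e. once tryget can no longer succeed, and since that callback runs in atomic context the teardown is again bounced to a workqueue. struct foo is the same stand-in type as in the earlier sketch, redeclared here so the fragment stands alone.

#include <linux/percpu-refcount.h>
#include <linux/workqueue.h>

struct foo {
	struct percpu_ref refcnt;
	struct work_struct destroy_work;
};

static void foo_killed_work_fn(struct work_struct *work)
{
	/* process context: safe to take mutexes, offline the object and
	 * drop the extra reference taken before the kill */
}

static void foo_killed_ref_fn(struct percpu_ref *ref)
{
	struct foo *f = container_of(ref, struct foo, refcnt);

	INIT_WORK(&f->destroy_work, foo_killed_work_fn);
	schedule_work(&f->destroy_work);
}

static void foo_kill(struct foo *f)
{
	/* plain percpu_ref_kill() would return before all CPUs see the
	 * ref as dead; the _and_confirm variant notifies us when they do */
	percpu_ref_kill_and_confirm(&f->refcnt, foo_killed_ref_fn);
}
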
4452 4641
4453/** 4642/**
@@ -4513,41 +4702,19 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4513 return -EBUSY; 4702 return -EBUSY;
4514 4703
4515 /* 4704 /*
4516 * Block new css_tryget() by killing css refcnts. cgroup core 4705 * Initiate massacre of all css's. cgroup_destroy_css_killed()
4517 * guarantees that, by the time ->css_offline() is invoked, no new 4706 * will be invoked to perform the rest of destruction once the
4518 * css reference will be given out via css_tryget(). We can't 4707 * percpu refs of all css's are confirmed to be killed.
4519 * simply call percpu_ref_kill() and proceed to offlining css's
4520 * because percpu_ref_kill() doesn't guarantee that the ref is seen
4521 * as killed on all CPUs on return.
4522 *
4523 * Use percpu_ref_kill_and_confirm() to get notifications as each
4524 * css is confirmed to be seen as killed on all CPUs. The
4525 * notification callback keeps track of the number of css's to be
4526 * killed and schedules cgroup_offline_fn() to perform the rest of
4527 * destruction once the percpu refs of all css's are confirmed to
4528 * be killed.
4529 */ 4708 */
4530 atomic_set(&cgrp->css_kill_cnt, 1); 4709 for_each_root_subsys(cgrp->root, ss)
4531 for_each_root_subsys(cgrp->root, ss) { 4710 kill_css(cgroup_css(cgrp, ss));
4532 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4533
4534 /*
4535 * Killing would put the base ref, but we need to keep it
4536 * alive until after ->css_offline.
4537 */
4538 percpu_ref_get(&css->refcnt);
4539
4540 atomic_inc(&cgrp->css_kill_cnt);
4541 percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn);
4542 }
4543 cgroup_css_killed(cgrp);
4544 4711
4545 /* 4712 /*
4546 * Mark @cgrp dead. This prevents further task migration and child 4713 * Mark @cgrp dead. This prevents further task migration and child
4547 * creation by disabling cgroup_lock_live_group(). Note that 4714 * creation by disabling cgroup_lock_live_group(). Note that
4548 * CGRP_DEAD assertion is depended upon by cgroup_next_sibling() to 4715 * CGRP_DEAD assertion is depended upon by css_next_child() to
4549 * resume iteration after dropping RCU read lock. See 4716 * resume iteration after dropping RCU read lock. See
4550 * cgroup_next_sibling() for details. 4717 * css_next_child() for details.
4551 */ 4718 */
4552 set_bit(CGRP_DEAD, &cgrp->flags); 4719 set_bit(CGRP_DEAD, &cgrp->flags);
4553 4720
@@ -4558,9 +4725,20 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4558 raw_spin_unlock(&release_list_lock); 4725 raw_spin_unlock(&release_list_lock);
4559 4726
4560 /* 4727 /*
4561 * Remove @cgrp directory. The removal puts the base ref but we 4728 * If @cgrp has css's attached, the second stage of cgroup
4562 * aren't quite done with @cgrp yet, so hold onto it. 4729 * destruction is kicked off from css_killed_work_fn() after the
4730 * refs of all attached css's are killed. If @cgrp doesn't have
4731 * any css, we kick it off here.
4732 */
4733 if (!cgrp->nr_css)
4734 cgroup_destroy_css_killed(cgrp);
4735
4736 /*
4737 * Clear the base files and remove @cgrp directory. The removal
4738 * puts the base ref but we aren't quite done with @cgrp yet, so
4739 * hold onto it.
4563 */ 4740 */
4741 cgroup_addrm_files(cgrp, cgroup_base_files, false);
4564 dget(d); 4742 dget(d);
4565 cgroup_d_remove_dir(d); 4743 cgroup_d_remove_dir(d);
4566 4744
@@ -4580,50 +4758,36 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4580}; 4758};
4581 4759
4582/** 4760/**
4583 * cgroup_offline_fn - the second step of cgroup destruction 4761 * cgroup_destroy_css_killed - the second step of cgroup destruction
 4584 * @work: cgroup->destroy_free_work 4762 * @cgrp: cgroup being destroyed
4585 * 4763 *
4586 * This function is invoked from a work item for a cgroup which is being 4764 * This function is invoked from a work item for a cgroup which is being
4587 * destroyed after the percpu refcnts of all css's are guaranteed to be 4765 * destroyed after all css's are offlined and performs the rest of
4588 * seen as killed on all CPUs, and performs the rest of destruction. This 4766 * destruction. This is the second step of destruction described in the
4589 * is the second step of destruction described in the comment above 4767 * comment above cgroup_destroy_locked().
4590 * cgroup_destroy_locked().
4591 */ 4768 */
4592static void cgroup_offline_fn(struct work_struct *work) 4769static void cgroup_destroy_css_killed(struct cgroup *cgrp)
4593{ 4770{
4594 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
4595 struct cgroup *parent = cgrp->parent; 4771 struct cgroup *parent = cgrp->parent;
4596 struct dentry *d = cgrp->dentry; 4772 struct dentry *d = cgrp->dentry;
4597 struct cgroup_subsys *ss;
4598 4773
4599 mutex_lock(&cgroup_mutex); 4774 lockdep_assert_held(&cgroup_mutex);
4600 4775
4601 /* 4776 /* delete this cgroup from parent->children */
4602 * css_tryget() is guaranteed to fail now. Tell subsystems to 4777 list_del_rcu(&cgrp->sibling);
 4603 * initiate destruction.
4604 */
4605 for_each_root_subsys(cgrp->root, ss)
4606 offline_css(ss, cgrp);
4607 4778
4608 /* 4779 /*
4609 * Put the css refs from cgroup_destroy_locked(). Each css holds 4780 * We should remove the cgroup object from idr before its grace
4610 * an extra reference to the cgroup's dentry and cgroup removal 4781 * period starts, so we won't be looking up a cgroup while the
4611 * proceeds regardless of css refs. On the last put of each css, 4782 * cgroup is being freed.
4612 * whenever that may be, the extra dentry ref is put so that dentry
4613 * destruction happens only after all css's are released.
4614 */ 4783 */
4615 for_each_root_subsys(cgrp->root, ss) 4784 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
4616 css_put(cgrp->subsys[ss->subsys_id]); 4785 cgrp->id = -1;
4617
4618 /* delete this cgroup from parent->children */
4619 list_del_rcu(&cgrp->sibling);
4620 4786
4621 dput(d); 4787 dput(d);
4622 4788
4623 set_bit(CGRP_RELEASABLE, &parent->flags); 4789 set_bit(CGRP_RELEASABLE, &parent->flags);
4624 check_for_release(parent); 4790 check_for_release(parent);
4625
4626 mutex_unlock(&cgroup_mutex);
4627} 4791}
4628 4792
4629static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) 4793static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
@@ -4646,6 +4810,11 @@ static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
4646 * deregistration. 4810 * deregistration.
4647 */ 4811 */
4648 if (ss->base_cftypes) { 4812 if (ss->base_cftypes) {
4813 struct cftype *cft;
4814
4815 for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++)
4816 cft->ss = ss;
4817
4649 ss->base_cftset.cfts = ss->base_cftypes; 4818 ss->base_cftset.cfts = ss->base_cftypes;
4650 list_add_tail(&ss->base_cftset.node, &ss->cftsets); 4819 list_add_tail(&ss->base_cftset.node, &ss->cftsets);
4651 } 4820 }
@@ -4665,10 +4834,10 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4665 /* Create the top cgroup state for this subsystem */ 4834 /* Create the top cgroup state for this subsystem */
4666 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); 4835 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
4667 ss->root = &cgroup_dummy_root; 4836 ss->root = &cgroup_dummy_root;
4668 css = ss->css_alloc(cgroup_dummy_top); 4837 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
4669 /* We don't handle early failures gracefully */ 4838 /* We don't handle early failures gracefully */
4670 BUG_ON(IS_ERR(css)); 4839 BUG_ON(IS_ERR(css));
4671 init_cgroup_css(css, ss, cgroup_dummy_top); 4840 init_css(css, ss, cgroup_dummy_top);
4672 4841
4673 /* Update the init_css_set to contain a subsys 4842 /* Update the init_css_set to contain a subsys
4674 * pointer to this state - since the subsystem is 4843 * pointer to this state - since the subsystem is
@@ -4683,7 +4852,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4683 * need to invoke fork callbacks here. */ 4852 * need to invoke fork callbacks here. */
4684 BUG_ON(!list_empty(&init_task.tasks)); 4853 BUG_ON(!list_empty(&init_task.tasks));
4685 4854
4686 BUG_ON(online_css(ss, cgroup_dummy_top)); 4855 BUG_ON(online_css(css));
4687 4856
4688 mutex_unlock(&cgroup_mutex); 4857 mutex_unlock(&cgroup_mutex);
4689 4858
@@ -4744,7 +4913,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4744 * struct, so this can happen first (i.e. before the dummy root 4913 * struct, so this can happen first (i.e. before the dummy root
4745 * attachment). 4914 * attachment).
4746 */ 4915 */
4747 css = ss->css_alloc(cgroup_dummy_top); 4916 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
4748 if (IS_ERR(css)) { 4917 if (IS_ERR(css)) {
4749 /* failure case - need to deassign the cgroup_subsys[] slot. */ 4918 /* failure case - need to deassign the cgroup_subsys[] slot. */
4750 cgroup_subsys[ss->subsys_id] = NULL; 4919 cgroup_subsys[ss->subsys_id] = NULL;
@@ -4756,8 +4925,8 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4756 ss->root = &cgroup_dummy_root; 4925 ss->root = &cgroup_dummy_root;
4757 4926
4758 /* our new subsystem will be attached to the dummy hierarchy. */ 4927 /* our new subsystem will be attached to the dummy hierarchy. */
4759 init_cgroup_css(css, ss, cgroup_dummy_top); 4928 init_css(css, ss, cgroup_dummy_top);
4760 /* init_idr must be after init_cgroup_css because it sets css->id. */ 4929 /* init_idr must be after init_css() because it sets css->id. */
4761 if (ss->use_id) { 4930 if (ss->use_id) {
4762 ret = cgroup_init_idr(ss, css); 4931 ret = cgroup_init_idr(ss, css);
4763 if (ret) 4932 if (ret)
@@ -4787,7 +4956,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4787 } 4956 }
4788 write_unlock(&css_set_lock); 4957 write_unlock(&css_set_lock);
4789 4958
4790 ret = online_css(ss, cgroup_dummy_top); 4959 ret = online_css(css);
4791 if (ret) 4960 if (ret)
4792 goto err_unload; 4961 goto err_unload;
4793 4962
@@ -4819,14 +4988,14 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4819 4988
4820 /* 4989 /*
4821 * we shouldn't be called if the subsystem is in use, and the use of 4990 * we shouldn't be called if the subsystem is in use, and the use of
4822 * try_module_get in parse_cgroupfs_options should ensure that it 4991 * try_module_get() in rebind_subsystems() should ensure that it
4823 * doesn't start being used while we're killing it off. 4992 * doesn't start being used while we're killing it off.
4824 */ 4993 */
4825 BUG_ON(ss->root != &cgroup_dummy_root); 4994 BUG_ON(ss->root != &cgroup_dummy_root);
4826 4995
4827 mutex_lock(&cgroup_mutex); 4996 mutex_lock(&cgroup_mutex);
4828 4997
4829 offline_css(ss, cgroup_dummy_top); 4998 offline_css(cgroup_css(cgroup_dummy_top, ss));
4830 4999
4831 if (ss->use_id) 5000 if (ss->use_id)
4832 idr_destroy(&ss->idr); 5001 idr_destroy(&ss->idr);
@@ -4860,8 +5029,8 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4860 * the cgrp->subsys pointer to find their state. note that this 5029 * the cgrp->subsys pointer to find their state. note that this
4861 * also takes care of freeing the css_id. 5030 * also takes care of freeing the css_id.
4862 */ 5031 */
4863 ss->css_free(cgroup_dummy_top); 5032 ss->css_free(cgroup_css(cgroup_dummy_top, ss));
4864 cgroup_dummy_top->subsys[ss->subsys_id] = NULL; 5033 RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
4865 5034
4866 mutex_unlock(&cgroup_mutex); 5035 mutex_unlock(&cgroup_mutex);
4867} 5036}
@@ -4943,6 +5112,10 @@ int __init cgroup_init(void)
4943 5112
4944 BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1)); 5113 BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1));
4945 5114
5115 err = idr_alloc(&cgroup_dummy_root.cgroup_idr, cgroup_dummy_top,
5116 0, 1, GFP_KERNEL);
5117 BUG_ON(err < 0);
5118
4946 mutex_unlock(&cgroup_root_mutex); 5119 mutex_unlock(&cgroup_root_mutex);
4947 mutex_unlock(&cgroup_mutex); 5120 mutex_unlock(&cgroup_mutex);
4948 5121
@@ -5099,7 +5272,7 @@ void cgroup_fork(struct task_struct *child)
5099 * Adds the task to the list running through its css_set if necessary and 5272 * Adds the task to the list running through its css_set if necessary and
5100 * call the subsystem fork() callbacks. Has to be after the task is 5273 * call the subsystem fork() callbacks. Has to be after the task is
5101 * visible on the task list in case we race with the first call to 5274 * visible on the task list in case we race with the first call to
5102 * cgroup_iter_start() - to guarantee that the new task ends up on its 5275 * cgroup_task_iter_start() - to guarantee that the new task ends up on its
5103 * list. 5276 * list.
5104 */ 5277 */
5105void cgroup_post_fork(struct task_struct *child) 5278void cgroup_post_fork(struct task_struct *child)
@@ -5212,10 +5385,10 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
5212 */ 5385 */
5213 for_each_builtin_subsys(ss, i) { 5386 for_each_builtin_subsys(ss, i) {
5214 if (ss->exit) { 5387 if (ss->exit) {
5215 struct cgroup *old_cgrp = cset->subsys[i]->cgroup; 5388 struct cgroup_subsys_state *old_css = cset->subsys[i];
5216 struct cgroup *cgrp = task_cgroup(tsk, i); 5389 struct cgroup_subsys_state *css = task_css(tsk, i);
5217 5390
5218 ss->exit(cgrp, old_cgrp, tsk); 5391 ss->exit(css, old_css, tsk);
5219 } 5392 }
5220 } 5393 }
5221 } 5394 }
@@ -5474,20 +5647,16 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
5474 return 0; 5647 return 0;
5475} 5648}
5476 5649
5477static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, 5650static int alloc_css_id(struct cgroup_subsys_state *child_css)
5478 struct cgroup *child)
5479{ 5651{
5480 int subsys_id, i, depth = 0; 5652 struct cgroup_subsys_state *parent_css = css_parent(child_css);
5481 struct cgroup_subsys_state *parent_css, *child_css;
5482 struct css_id *child_id, *parent_id; 5653 struct css_id *child_id, *parent_id;
5654 int i, depth;
5483 5655
5484 subsys_id = ss->subsys_id;
5485 parent_css = parent->subsys[subsys_id];
5486 child_css = child->subsys[subsys_id];
5487 parent_id = rcu_dereference_protected(parent_css->id, true); 5656 parent_id = rcu_dereference_protected(parent_css->id, true);
5488 depth = parent_id->depth + 1; 5657 depth = parent_id->depth + 1;
5489 5658
5490 child_id = get_new_cssid(ss, depth); 5659 child_id = get_new_cssid(child_css->ss, depth);
5491 if (IS_ERR(child_id)) 5660 if (IS_ERR(child_id))
5492 return PTR_ERR(child_id); 5661 return PTR_ERR(child_id);
5493 5662
@@ -5525,31 +5694,56 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
5525} 5694}
5526EXPORT_SYMBOL_GPL(css_lookup); 5695EXPORT_SYMBOL_GPL(css_lookup);
5527 5696
5528/* 5697/**
5529 * get corresponding css from file open on cgroupfs directory 5698 * css_from_dir - get corresponding css from the dentry of a cgroup dir
5699 * @dentry: directory dentry of interest
5700 * @ss: subsystem of interest
5701 *
5702 * Must be called under RCU read lock. The caller is responsible for
5703 * pinning the returned css if it needs to be accessed outside the RCU
5704 * critical section.
5530 */ 5705 */
5531struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) 5706struct cgroup_subsys_state *css_from_dir(struct dentry *dentry,
5707 struct cgroup_subsys *ss)
5532{ 5708{
5533 struct cgroup *cgrp; 5709 struct cgroup *cgrp;
5534 struct inode *inode;
5535 struct cgroup_subsys_state *css;
5536 5710
5537 inode = file_inode(f); 5711 WARN_ON_ONCE(!rcu_read_lock_held());
5538 /* check in cgroup filesystem dir */ 5712
5539 if (inode->i_op != &cgroup_dir_inode_operations) 5713 /* is @dentry a cgroup dir? */
5714 if (!dentry->d_inode ||
5715 dentry->d_inode->i_op != &cgroup_dir_inode_operations)
5540 return ERR_PTR(-EBADF); 5716 return ERR_PTR(-EBADF);
5541 5717
5542 if (id < 0 || id >= CGROUP_SUBSYS_COUNT) 5718 cgrp = __d_cgrp(dentry);
5543 return ERR_PTR(-EINVAL); 5719 return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT);
5720}
5544 5721
5545 /* get cgroup */ 5722/**
5546 cgrp = __d_cgrp(f->f_dentry); 5723 * css_from_id - lookup css by id
5547 css = cgrp->subsys[id]; 5724 * @id: the cgroup id
5548 return css ? css : ERR_PTR(-ENOENT); 5725 * @ss: cgroup subsys to be looked into
5726 *
5727 * Returns the css if there's valid one with @id, otherwise returns NULL.
5728 * Should be called under rcu_read_lock().
5729 */
5730struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
5731{
5732 struct cgroup *cgrp;
5733
5734 rcu_lockdep_assert(rcu_read_lock_held() ||
5735 lockdep_is_held(&cgroup_mutex),
5736 "css_from_id() needs proper protection");
5737
5738 cgrp = idr_find(&ss->root->cgroup_idr, id);
5739 if (cgrp)
5740 return cgroup_css(cgrp, ss);
5741 return NULL;
5549} 5742}
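
An illustrative caller of the new lookup helper (the helper name here is hypothetical): the returned css is only RCU-protected, so it must be pinned with css_tryget() before the RCU critical section ends.

#include <linux/cgroup.h>
#include <linux/rcupdate.h>

static struct cgroup_subsys_state *pin_css_by_id(int id,
						 struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = css_from_id(id, ss);
	if (css && !css_tryget(css))
		css = NULL;		/* lost the race with destruction */
	rcu_read_unlock();

	return css;			/* caller drops with css_put() */
}
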
5550 5743
5551#ifdef CONFIG_CGROUP_DEBUG 5744#ifdef CONFIG_CGROUP_DEBUG
5552static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp) 5745static struct cgroup_subsys_state *
5746debug_css_alloc(struct cgroup_subsys_state *parent_css)
5553{ 5747{
5554 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 5748 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5555 5749
@@ -5559,22 +5753,24 @@ static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp)
5559 return css; 5753 return css;
5560} 5754}
5561 5755
5562static void debug_css_free(struct cgroup *cgrp) 5756static void debug_css_free(struct cgroup_subsys_state *css)
5563{ 5757{
5564 kfree(cgrp->subsys[debug_subsys_id]); 5758 kfree(css);
5565} 5759}
5566 5760
5567static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft) 5761static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
5762 struct cftype *cft)
5568{ 5763{
5569 return cgroup_task_count(cgrp); 5764 return cgroup_task_count(css->cgroup);
5570} 5765}
5571 5766
5572static u64 current_css_set_read(struct cgroup *cgrp, struct cftype *cft) 5767static u64 current_css_set_read(struct cgroup_subsys_state *css,
5768 struct cftype *cft)
5573{ 5769{
5574 return (u64)(unsigned long)current->cgroups; 5770 return (u64)(unsigned long)current->cgroups;
5575} 5771}
5576 5772
5577static u64 current_css_set_refcount_read(struct cgroup *cgrp, 5773static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
5578 struct cftype *cft) 5774 struct cftype *cft)
5579{ 5775{
5580 u64 count; 5776 u64 count;
@@ -5585,7 +5781,7 @@ static u64 current_css_set_refcount_read(struct cgroup *cgrp,
5585 return count; 5781 return count;
5586} 5782}
5587 5783
5588static int current_css_set_cg_links_read(struct cgroup *cgrp, 5784static int current_css_set_cg_links_read(struct cgroup_subsys_state *css,
5589 struct cftype *cft, 5785 struct cftype *cft,
5590 struct seq_file *seq) 5786 struct seq_file *seq)
5591{ 5787{
@@ -5612,14 +5808,13 @@ static int current_css_set_cg_links_read(struct cgroup *cgrp,
5612} 5808}
5613 5809
5614#define MAX_TASKS_SHOWN_PER_CSS 25 5810#define MAX_TASKS_SHOWN_PER_CSS 25
5615static int cgroup_css_links_read(struct cgroup *cgrp, 5811static int cgroup_css_links_read(struct cgroup_subsys_state *css,
5616 struct cftype *cft, 5812 struct cftype *cft, struct seq_file *seq)
5617 struct seq_file *seq)
5618{ 5813{
5619 struct cgrp_cset_link *link; 5814 struct cgrp_cset_link *link;
5620 5815
5621 read_lock(&css_set_lock); 5816 read_lock(&css_set_lock);
5622 list_for_each_entry(link, &cgrp->cset_links, cset_link) { 5817 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
5623 struct css_set *cset = link->cset; 5818 struct css_set *cset = link->cset;
5624 struct task_struct *task; 5819 struct task_struct *task;
5625 int count = 0; 5820 int count = 0;
@@ -5638,9 +5833,9 @@ static int cgroup_css_links_read(struct cgroup *cgrp,
5638 return 0; 5833 return 0;
5639} 5834}
5640 5835
5641static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) 5836static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
5642{ 5837{
5643 return test_bit(CGRP_RELEASABLE, &cgrp->flags); 5838 return test_bit(CGRP_RELEASABLE, &css->cgroup->flags);
5644} 5839}
5645 5840
5646static struct cftype debug_files[] = { 5841static struct cftype debug_files[] = {
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 75dda1ea5026..f0ff64d0ebaa 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -45,25 +45,19 @@ struct freezer {
45 spinlock_t lock; 45 spinlock_t lock;
46}; 46};
47 47
48static inline struct freezer *cgroup_freezer(struct cgroup *cgroup) 48static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
49{ 49{
50 return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id), 50 return css ? container_of(css, struct freezer, css) : NULL;
51 struct freezer, css);
52} 51}
53 52
54static inline struct freezer *task_freezer(struct task_struct *task) 53static inline struct freezer *task_freezer(struct task_struct *task)
55{ 54{
56 return container_of(task_subsys_state(task, freezer_subsys_id), 55 return css_freezer(task_css(task, freezer_subsys_id));
57 struct freezer, css);
58} 56}
59 57
60static struct freezer *parent_freezer(struct freezer *freezer) 58static struct freezer *parent_freezer(struct freezer *freezer)
61{ 59{
62 struct cgroup *pcg = freezer->css.cgroup->parent; 60 return css_freezer(css_parent(&freezer->css));
63
64 if (pcg)
65 return cgroup_freezer(pcg);
66 return NULL;
67} 61}
68 62
69bool cgroup_freezing(struct task_struct *task) 63bool cgroup_freezing(struct task_struct *task)
@@ -92,7 +86,8 @@ static const char *freezer_state_strs(unsigned int state)
92 86
93struct cgroup_subsys freezer_subsys; 87struct cgroup_subsys freezer_subsys;
94 88
95static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup) 89static struct cgroup_subsys_state *
90freezer_css_alloc(struct cgroup_subsys_state *parent_css)
96{ 91{
97 struct freezer *freezer; 92 struct freezer *freezer;
98 93
@@ -105,22 +100,22 @@ static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup)
105} 100}
106 101
107/** 102/**
108 * freezer_css_online - commit creation of a freezer cgroup 103 * freezer_css_online - commit creation of a freezer css
109 * @cgroup: cgroup being created 104 * @css: css being created
110 * 105 *
111 * We're committing to creation of @cgroup. Mark it online and inherit 106 * We're committing to creation of @css. Mark it online and inherit
112 * parent's freezing state while holding both parent's and our 107 * parent's freezing state while holding both parent's and our
113 * freezer->lock. 108 * freezer->lock.
114 */ 109 */
115static int freezer_css_online(struct cgroup *cgroup) 110static int freezer_css_online(struct cgroup_subsys_state *css)
116{ 111{
117 struct freezer *freezer = cgroup_freezer(cgroup); 112 struct freezer *freezer = css_freezer(css);
118 struct freezer *parent = parent_freezer(freezer); 113 struct freezer *parent = parent_freezer(freezer);
119 114
120 /* 115 /*
121 * The following double locking and freezing state inheritance 116 * The following double locking and freezing state inheritance
122 * guarantee that @cgroup can never escape ancestors' freezing 117 * guarantee that @cgroup can never escape ancestors' freezing
123 * states. See cgroup_for_each_descendant_pre() for details. 118 * states. See css_for_each_descendant_pre() for details.
124 */ 119 */
125 if (parent) 120 if (parent)
126 spin_lock_irq(&parent->lock); 121 spin_lock_irq(&parent->lock);
@@ -141,15 +136,15 @@ static int freezer_css_online(struct cgroup *cgroup)
141} 136}
142 137
143/** 138/**
144 * freezer_css_offline - initiate destruction of @cgroup 139 * freezer_css_offline - initiate destruction of a freezer css
145 * @cgroup: cgroup being destroyed 140 * @css: css being destroyed
146 * 141 *
147 * @cgroup is going away. Mark it dead and decrement system_freezing_count 142 * @css is going away. Mark it dead and decrement system_freezing_count if
148 * if it was holding one. 143 * it was holding one.
149 */ 144 */
150static void freezer_css_offline(struct cgroup *cgroup) 145static void freezer_css_offline(struct cgroup_subsys_state *css)
151{ 146{
152 struct freezer *freezer = cgroup_freezer(cgroup); 147 struct freezer *freezer = css_freezer(css);
153 148
154 spin_lock_irq(&freezer->lock); 149 spin_lock_irq(&freezer->lock);
155 150
@@ -161,9 +156,9 @@ static void freezer_css_offline(struct cgroup *cgroup)
161 spin_unlock_irq(&freezer->lock); 156 spin_unlock_irq(&freezer->lock);
162} 157}
163 158
164static void freezer_css_free(struct cgroup *cgroup) 159static void freezer_css_free(struct cgroup_subsys_state *css)
165{ 160{
166 kfree(cgroup_freezer(cgroup)); 161 kfree(css_freezer(css));
167} 162}
168 163
169/* 164/*
@@ -175,25 +170,26 @@ static void freezer_css_free(struct cgroup *cgroup)
175 * @freezer->lock. freezer_attach() makes the new tasks conform to the 170 * @freezer->lock. freezer_attach() makes the new tasks conform to the
176 * current state and all following state changes can see the new tasks. 171 * current state and all following state changes can see the new tasks.
177 */ 172 */
178static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset) 173static void freezer_attach(struct cgroup_subsys_state *new_css,
174 struct cgroup_taskset *tset)
179{ 175{
180 struct freezer *freezer = cgroup_freezer(new_cgrp); 176 struct freezer *freezer = css_freezer(new_css);
181 struct task_struct *task; 177 struct task_struct *task;
182 bool clear_frozen = false; 178 bool clear_frozen = false;
183 179
184 spin_lock_irq(&freezer->lock); 180 spin_lock_irq(&freezer->lock);
185 181
186 /* 182 /*
187 * Make the new tasks conform to the current state of @new_cgrp. 183 * Make the new tasks conform to the current state of @new_css.
188 * For simplicity, when migrating any task to a FROZEN cgroup, we 184 * For simplicity, when migrating any task to a FROZEN cgroup, we
189 * revert it to FREEZING and let update_if_frozen() determine the 185 * revert it to FREEZING and let update_if_frozen() determine the
190 * correct state later. 186 * correct state later.
191 * 187 *
192 * Tasks in @tset are on @new_cgrp but may not conform to its 188 * Tasks in @tset are on @new_css but may not conform to its
193 * current state before executing the following - !frozen tasks may 189 * current state before executing the following - !frozen tasks may
194 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. 190 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
195 */ 191 */
196 cgroup_taskset_for_each(task, new_cgrp, tset) { 192 cgroup_taskset_for_each(task, new_css, tset) {
197 if (!(freezer->state & CGROUP_FREEZING)) { 193 if (!(freezer->state & CGROUP_FREEZING)) {
198 __thaw_task(task); 194 __thaw_task(task);
199 } else { 195 } else {
@@ -231,7 +227,7 @@ static void freezer_fork(struct task_struct *task)
231 * The root cgroup is non-freezable, so we can skip the 227 * The root cgroup is non-freezable, so we can skip the
232 * following check. 228 * following check.
233 */ 229 */
234 if (!freezer->css.cgroup->parent) 230 if (!parent_freezer(freezer))
235 goto out; 231 goto out;
236 232
237 spin_lock_irq(&freezer->lock); 233 spin_lock_irq(&freezer->lock);
@@ -244,7 +240,7 @@ out:
244 240
245/** 241/**
246 * update_if_frozen - update whether a cgroup finished freezing 242 * update_if_frozen - update whether a cgroup finished freezing
247 * @cgroup: cgroup of interest 243 * @css: css of interest
248 * 244 *
249 * Once FREEZING is initiated, transition to FROZEN is lazily updated by 245 * Once FREEZING is initiated, transition to FROZEN is lazily updated by
250 * calling this function. If the current state is FREEZING but not FROZEN, 246 * calling this function. If the current state is FREEZING but not FROZEN,
@@ -255,14 +251,14 @@ out:
255 * update_if_frozen() on all descendants prior to invoking this function. 251 * update_if_frozen() on all descendants prior to invoking this function.
256 * 252 *
257 * Task states and freezer state might disagree while tasks are being 253 * Task states and freezer state might disagree while tasks are being
258 * migrated into or out of @cgroup, so we can't verify task states against 254 * migrated into or out of @css, so we can't verify task states against
259 * @freezer state here. See freezer_attach() for details. 255 * @freezer state here. See freezer_attach() for details.
260 */ 256 */
261static void update_if_frozen(struct cgroup *cgroup) 257static void update_if_frozen(struct cgroup_subsys_state *css)
262{ 258{
263 struct freezer *freezer = cgroup_freezer(cgroup); 259 struct freezer *freezer = css_freezer(css);
264 struct cgroup *pos; 260 struct cgroup_subsys_state *pos;
265 struct cgroup_iter it; 261 struct css_task_iter it;
266 struct task_struct *task; 262 struct task_struct *task;
267 263
268 WARN_ON_ONCE(!rcu_read_lock_held()); 264 WARN_ON_ONCE(!rcu_read_lock_held());
@@ -274,8 +270,8 @@ static void update_if_frozen(struct cgroup *cgroup)
274 goto out_unlock; 270 goto out_unlock;
275 271
276 /* are all (live) children frozen? */ 272 /* are all (live) children frozen? */
277 cgroup_for_each_child(pos, cgroup) { 273 css_for_each_child(pos, css) {
278 struct freezer *child = cgroup_freezer(pos); 274 struct freezer *child = css_freezer(pos);
279 275
280 if ((child->state & CGROUP_FREEZER_ONLINE) && 276 if ((child->state & CGROUP_FREEZER_ONLINE) &&
281 !(child->state & CGROUP_FROZEN)) 277 !(child->state & CGROUP_FROZEN))
@@ -283,9 +279,9 @@ static void update_if_frozen(struct cgroup *cgroup)
283 } 279 }
284 280
285 /* are all tasks frozen? */ 281 /* are all tasks frozen? */
286 cgroup_iter_start(cgroup, &it); 282 css_task_iter_start(css, &it);
287 283
288 while ((task = cgroup_iter_next(cgroup, &it))) { 284 while ((task = css_task_iter_next(&it))) {
289 if (freezing(task)) { 285 if (freezing(task)) {
290 /* 286 /*
291 * freezer_should_skip() indicates that the task 287 * freezer_should_skip() indicates that the task
@@ -300,52 +296,49 @@ static void update_if_frozen(struct cgroup *cgroup)
300 296
301 freezer->state |= CGROUP_FROZEN; 297 freezer->state |= CGROUP_FROZEN;
302out_iter_end: 298out_iter_end:
303 cgroup_iter_end(cgroup, &it); 299 css_task_iter_end(&it);
304out_unlock: 300out_unlock:
305 spin_unlock_irq(&freezer->lock); 301 spin_unlock_irq(&freezer->lock);
306} 302}
307 303
308static int freezer_read(struct cgroup *cgroup, struct cftype *cft, 304static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft,
309 struct seq_file *m) 305 struct seq_file *m)
310{ 306{
311 struct cgroup *pos; 307 struct cgroup_subsys_state *pos;
312 308
313 rcu_read_lock(); 309 rcu_read_lock();
314 310
315 /* update states bottom-up */ 311 /* update states bottom-up */
316 cgroup_for_each_descendant_post(pos, cgroup) 312 css_for_each_descendant_post(pos, css)
317 update_if_frozen(pos); 313 update_if_frozen(pos);
318 update_if_frozen(cgroup);
319 314
320 rcu_read_unlock(); 315 rcu_read_unlock();
321 316
322 seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state)); 317 seq_puts(m, freezer_state_strs(css_freezer(css)->state));
323 seq_putc(m, '\n'); 318 seq_putc(m, '\n');
324 return 0; 319 return 0;
325} 320}
326 321
327static void freeze_cgroup(struct freezer *freezer) 322static void freeze_cgroup(struct freezer *freezer)
328{ 323{
329 struct cgroup *cgroup = freezer->css.cgroup; 324 struct css_task_iter it;
330 struct cgroup_iter it;
331 struct task_struct *task; 325 struct task_struct *task;
332 326
333 cgroup_iter_start(cgroup, &it); 327 css_task_iter_start(&freezer->css, &it);
334 while ((task = cgroup_iter_next(cgroup, &it))) 328 while ((task = css_task_iter_next(&it)))
335 freeze_task(task); 329 freeze_task(task);
336 cgroup_iter_end(cgroup, &it); 330 css_task_iter_end(&it);
337} 331}
338 332
339static void unfreeze_cgroup(struct freezer *freezer) 333static void unfreeze_cgroup(struct freezer *freezer)
340{ 334{
341 struct cgroup *cgroup = freezer->css.cgroup; 335 struct css_task_iter it;
342 struct cgroup_iter it;
343 struct task_struct *task; 336 struct task_struct *task;
344 337
345 cgroup_iter_start(cgroup, &it); 338 css_task_iter_start(&freezer->css, &it);
346 while ((task = cgroup_iter_next(cgroup, &it))) 339 while ((task = css_task_iter_next(&it)))
347 __thaw_task(task); 340 __thaw_task(task);
348 cgroup_iter_end(cgroup, &it); 341 css_task_iter_end(&it);
349} 342}
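
The iterator protocol used by both functions above, reduced to a minimal hypothetical helper: css_task_iter_start() must always be paired with css_task_iter_end(), even on early exit, because the css_set lock is held in between (so the body must not sleep).

#include <linux/cgroup.h>
#include <linux/sched.h>

static int count_css_tasks(struct cgroup_subsys_state *css)
{
	struct css_task_iter it;
	struct task_struct *task;
	int n = 0;

	css_task_iter_start(css, &it);
	while ((task = css_task_iter_next(&it)))
		n++;
	css_task_iter_end(&it);	/* mandatory: releases the css_set lock */

	return n;
}
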
350 343
351/** 344/**
@@ -395,12 +388,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
395 */ 388 */
396static void freezer_change_state(struct freezer *freezer, bool freeze) 389static void freezer_change_state(struct freezer *freezer, bool freeze)
397{ 390{
398 struct cgroup *pos; 391 struct cgroup_subsys_state *pos;
399
400 /* update @freezer */
401 spin_lock_irq(&freezer->lock);
402 freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF);
403 spin_unlock_irq(&freezer->lock);
404 392
405 /* 393 /*
406 * Update all its descendants in pre-order traversal. Each 394 * Update all its descendants in pre-order traversal. Each
@@ -408,24 +396,33 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
408 * CGROUP_FREEZING_PARENT. 396 * CGROUP_FREEZING_PARENT.
409 */ 397 */
410 rcu_read_lock(); 398 rcu_read_lock();
411 cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) { 399 css_for_each_descendant_pre(pos, &freezer->css) {
412 struct freezer *pos_f = cgroup_freezer(pos); 400 struct freezer *pos_f = css_freezer(pos);
413 struct freezer *parent = parent_freezer(pos_f); 401 struct freezer *parent = parent_freezer(pos_f);
414 402
415 /*
416 * Our update to @parent->state is already visible which is
417 * all we need. No need to lock @parent. For more info on
418 * synchronization, see freezer_post_create().
419 */
420 spin_lock_irq(&pos_f->lock); 403 spin_lock_irq(&pos_f->lock);
421 freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING, 404
422 CGROUP_FREEZING_PARENT); 405 if (pos_f == freezer) {
406 freezer_apply_state(pos_f, freeze,
407 CGROUP_FREEZING_SELF);
408 } else {
409 /*
410 * Our update to @parent->state is already visible
411 * which is all we need. No need to lock @parent.
412 * For more info on synchronization, see
413 * freezer_post_create().
414 */
415 freezer_apply_state(pos_f,
416 parent->state & CGROUP_FREEZING,
417 CGROUP_FREEZING_PARENT);
418 }
419
423 spin_unlock_irq(&pos_f->lock); 420 spin_unlock_irq(&pos_f->lock);
424 } 421 }
425 rcu_read_unlock(); 422 rcu_read_unlock();
426} 423}
427 424
428static int freezer_write(struct cgroup *cgroup, struct cftype *cft, 425static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft,
429 const char *buffer) 426 const char *buffer)
430{ 427{
431 bool freeze; 428 bool freeze;
@@ -437,20 +434,22 @@ static int freezer_write(struct cgroup *cgroup, struct cftype *cft,
437 else 434 else
438 return -EINVAL; 435 return -EINVAL;
439 436
440 freezer_change_state(cgroup_freezer(cgroup), freeze); 437 freezer_change_state(css_freezer(css), freeze);
441 return 0; 438 return 0;
442} 439}
443 440
444static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft) 441static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css,
442 struct cftype *cft)
445{ 443{
446 struct freezer *freezer = cgroup_freezer(cgroup); 444 struct freezer *freezer = css_freezer(css);
447 445
448 return (bool)(freezer->state & CGROUP_FREEZING_SELF); 446 return (bool)(freezer->state & CGROUP_FREEZING_SELF);
449} 447}
450 448
451static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft) 449static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css,
450 struct cftype *cft)
452{ 451{
453 struct freezer *freezer = cgroup_freezer(cgroup); 452 struct freezer *freezer = css_freezer(css);
454 453
455 return (bool)(freezer->state & CGROUP_FREEZING_PARENT); 454 return (bool)(freezer->state & CGROUP_FREEZING_PARENT);
456} 455}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ea1966db34f2..6bf981e13c43 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -68,10 +68,6 @@
68 */ 68 */
69int number_of_cpusets __read_mostly; 69int number_of_cpusets __read_mostly;
70 70
71/* Forward declare cgroup structures */
72struct cgroup_subsys cpuset_subsys;
73struct cpuset;
74
75/* See "Frequency meter" comments, below. */ 71/* See "Frequency meter" comments, below. */
76 72
77struct fmeter { 73struct fmeter {
@@ -115,27 +111,20 @@ struct cpuset {
115 int relax_domain_level; 111 int relax_domain_level;
116}; 112};
117 113
118/* Retrieve the cpuset for a cgroup */ 114static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
119static inline struct cpuset *cgroup_cs(struct cgroup *cgrp)
120{ 115{
121 return container_of(cgroup_subsys_state(cgrp, cpuset_subsys_id), 116 return css ? container_of(css, struct cpuset, css) : NULL;
122 struct cpuset, css);
123} 117}
124 118
125/* Retrieve the cpuset for a task */ 119/* Retrieve the cpuset for a task */
126static inline struct cpuset *task_cs(struct task_struct *task) 120static inline struct cpuset *task_cs(struct task_struct *task)
127{ 121{
128 return container_of(task_subsys_state(task, cpuset_subsys_id), 122 return css_cs(task_css(task, cpuset_subsys_id));
129 struct cpuset, css);
130} 123}
131 124
132static inline struct cpuset *parent_cs(const struct cpuset *cs) 125static inline struct cpuset *parent_cs(struct cpuset *cs)
133{ 126{
134 struct cgroup *pcgrp = cs->css.cgroup->parent; 127 return css_cs(css_parent(&cs->css));
135
136 if (pcgrp)
137 return cgroup_cs(pcgrp);
138 return NULL;
139} 128}
140 129
141#ifdef CONFIG_NUMA 130#ifdef CONFIG_NUMA
@@ -212,29 +201,30 @@ static struct cpuset top_cpuset = {
212/** 201/**
213 * cpuset_for_each_child - traverse online children of a cpuset 202 * cpuset_for_each_child - traverse online children of a cpuset
214 * @child_cs: loop cursor pointing to the current child 203 * @child_cs: loop cursor pointing to the current child
215 * @pos_cgrp: used for iteration 204 * @pos_css: used for iteration
216 * @parent_cs: target cpuset to walk children of 205 * @parent_cs: target cpuset to walk children of
217 * 206 *
218 * Walk @child_cs through the online children of @parent_cs. Must be used 207 * Walk @child_cs through the online children of @parent_cs. Must be used
219 * with RCU read locked. 208 * with RCU read locked.
220 */ 209 */
221#define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs) \ 210#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \
222 cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \ 211 css_for_each_child((pos_css), &(parent_cs)->css) \
223 if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp))))) 212 if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
224 213
225/** 214/**
226 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants 215 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
227 * @des_cs: loop cursor pointing to the current descendant 216 * @des_cs: loop cursor pointing to the current descendant
228 * @pos_cgrp: used for iteration 217 * @pos_css: used for iteration
229 * @root_cs: target cpuset to walk descendants of 218 * @root_cs: target cpuset to walk descendants of
230 * 219 *
231 * Walk @des_cs through the online descendants of @root_cs. Must be used 220 * Walk @des_cs through the online descendants of @root_cs. Must be used
232 * with RCU read locked. The caller may modify @pos_cgrp by calling 221 * with RCU read locked. The caller may modify @pos_css by calling
233 * cgroup_rightmost_descendant() to skip subtree. 222 * css_rightmost_descendant() to skip subtree. @root_cs is included in the
223 * iteration and is the first node to be visited.
234 */ 224 */
235#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \ 225#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \
236 cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \ 226 css_for_each_descendant_pre((pos_css), &(root_cs)->css) \
237 if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp))))) 227 if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
238 228
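
Since the converted walk now yields @root_cs itself as the first node, a caller that wants only strict descendants filters it out, which is exactly what the hunks below add. A minimal hypothetical fragment in cpuset.c's context:

static void visit_strict_descendants(struct cpuset *root_cs)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
		if (cp == root_cs)	/* self comes first; skip it */
			continue;
		/* ... per-descendant work; a whole subtree can be skipped
		 * with pos_css = css_rightmost_descendant(pos_css); ... */
	}
	rcu_read_unlock();
}
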
239/* 229/*
240 * There are two global mutexes guarding cpuset structures - cpuset_mutex 230 * There are two global mutexes guarding cpuset structures - cpuset_mutex
@@ -320,8 +310,7 @@ static struct file_system_type cpuset_fs_type = {
320 * 310 *
321 * Call with callback_mutex held. 311 * Call with callback_mutex held.
322 */ 312 */
323static void guarantee_online_cpus(const struct cpuset *cs, 313static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
324 struct cpumask *pmask)
325{ 314{
326 while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) 315 while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
327 cs = parent_cs(cs); 316 cs = parent_cs(cs);
@@ -339,7 +328,7 @@ static void guarantee_online_cpus(const struct cpuset *cs,
339 * 328 *
340 * Call with callback_mutex held. 329 * Call with callback_mutex held.
341 */ 330 */
342static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) 331static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
343{ 332{
344 while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) 333 while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY]))
345 cs = parent_cs(cs); 334 cs = parent_cs(cs);
@@ -384,7 +373,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
384 * alloc_trial_cpuset - allocate a trial cpuset 373 * alloc_trial_cpuset - allocate a trial cpuset
385 * @cs: the cpuset that the trial cpuset duplicates 374 * @cs: the cpuset that the trial cpuset duplicates
386 */ 375 */
387static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs) 376static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
388{ 377{
389 struct cpuset *trial; 378 struct cpuset *trial;
390 379
@@ -431,9 +420,9 @@ static void free_trial_cpuset(struct cpuset *trial)
431 * Return 0 if valid, -errno if not. 420 * Return 0 if valid, -errno if not.
432 */ 421 */
433 422
434static int validate_change(const struct cpuset *cur, const struct cpuset *trial) 423static int validate_change(struct cpuset *cur, struct cpuset *trial)
435{ 424{
436 struct cgroup *cgrp; 425 struct cgroup_subsys_state *css;
437 struct cpuset *c, *par; 426 struct cpuset *c, *par;
438 int ret; 427 int ret;
439 428
@@ -441,7 +430,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
441 430
442 /* Each of our child cpusets must be a subset of us */ 431 /* Each of our child cpusets must be a subset of us */
443 ret = -EBUSY; 432 ret = -EBUSY;
444 cpuset_for_each_child(c, cgrp, cur) 433 cpuset_for_each_child(c, css, cur)
445 if (!is_cpuset_subset(c, trial)) 434 if (!is_cpuset_subset(c, trial))
446 goto out; 435 goto out;
447 436
@@ -462,7 +451,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
462 * overlap 451 * overlap
463 */ 452 */
464 ret = -EINVAL; 453 ret = -EINVAL;
465 cpuset_for_each_child(c, cgrp, par) { 454 cpuset_for_each_child(c, css, par) {
466 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && 455 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
467 c != cur && 456 c != cur &&
468 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) 457 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
@@ -515,13 +504,16 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
515 struct cpuset *root_cs) 504 struct cpuset *root_cs)
516{ 505{
517 struct cpuset *cp; 506 struct cpuset *cp;
518 struct cgroup *pos_cgrp; 507 struct cgroup_subsys_state *pos_css;
519 508
520 rcu_read_lock(); 509 rcu_read_lock();
521 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { 510 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
511 if (cp == root_cs)
512 continue;
513
522 /* skip the whole subtree if @cp doesn't have any CPU */ 514 /* skip the whole subtree if @cp doesn't have any CPU */
523 if (cpumask_empty(cp->cpus_allowed)) { 515 if (cpumask_empty(cp->cpus_allowed)) {
524 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); 516 pos_css = css_rightmost_descendant(pos_css);
525 continue; 517 continue;
526 } 518 }
527 519
@@ -596,7 +588,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
596 struct sched_domain_attr *dattr; /* attributes for custom domains */ 588 struct sched_domain_attr *dattr; /* attributes for custom domains */
597 int ndoms = 0; /* number of sched domains in result */ 589 int ndoms = 0; /* number of sched domains in result */
598 int nslot; /* next empty doms[] struct cpumask slot */ 590 int nslot; /* next empty doms[] struct cpumask slot */
599 struct cgroup *pos_cgrp; 591 struct cgroup_subsys_state *pos_css;
600 592
601 doms = NULL; 593 doms = NULL;
602 dattr = NULL; 594 dattr = NULL;
@@ -625,7 +617,9 @@ static int generate_sched_domains(cpumask_var_t **domains,
625 csn = 0; 617 csn = 0;
626 618
627 rcu_read_lock(); 619 rcu_read_lock();
628 cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) { 620 cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
621 if (cp == &top_cpuset)
622 continue;
629 /* 623 /*
630 * Continue traversing beyond @cp iff @cp has some CPUs and 624 * Continue traversing beyond @cp iff @cp has some CPUs and
631 * isn't load balancing. The former is obvious. The 625 * isn't load balancing. The former is obvious. The
@@ -642,7 +636,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
642 csa[csn++] = cp; 636 csa[csn++] = cp;
643 637
644 /* skip @cp's subtree */ 638 /* skip @cp's subtree */
645 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); 639 pos_css = css_rightmost_descendant(pos_css);
646 } 640 }
647 rcu_read_unlock(); 641 rcu_read_unlock();
648 642
@@ -837,52 +831,45 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
837/** 831/**
838 * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's 832 * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
839 * @tsk: task to test 833 * @tsk: task to test
 840 * @scan: struct cgroup_scanner containing the cgroup of the task 834 * @data: cpuset @tsk belongs to
841 * 835 *
842 * Called by cgroup_scan_tasks() for each task in a cgroup whose 836 * Called by css_scan_tasks() for each task in a cgroup whose cpus_allowed
843 * cpus_allowed mask needs to be changed. 837 * mask needs to be changed.
844 * 838 *
845 * We don't need to re-check for the cgroup/cpuset membership, since we're 839 * We don't need to re-check for the cgroup/cpuset membership, since we're
846 * holding cpuset_mutex at this point. 840 * holding cpuset_mutex at this point.
847 */ 841 */
848static void cpuset_change_cpumask(struct task_struct *tsk, 842static void cpuset_change_cpumask(struct task_struct *tsk, void *data)
849 struct cgroup_scanner *scan)
850{ 843{
851 struct cpuset *cpus_cs; 844 struct cpuset *cs = data;
845 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
852 846
853 cpus_cs = effective_cpumask_cpuset(cgroup_cs(scan->cg));
854 set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed); 847 set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed);
855} 848}
856 849
857/** 850/**
858 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. 851 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
859 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed 852 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
860 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 853 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
861 * 854 *
862 * Called with cpuset_mutex held 855 * Called with cpuset_mutex held
863 * 856 *
864 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 857 * The css_scan_tasks() function will scan all the tasks in a cgroup,
865 * calling callback functions for each. 858 * calling callback functions for each.
866 * 859 *
867 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 860 * No return value. It's guaranteed that css_scan_tasks() always returns 0
868 * if @heap != NULL. 861 * if @heap != NULL.
869 */ 862 */
870static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) 863static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
871{ 864{
872 struct cgroup_scanner scan; 865 css_scan_tasks(&cs->css, NULL, cpuset_change_cpumask, cs, heap);
873
874 scan.cg = cs->css.cgroup;
875 scan.test_task = NULL;
876 scan.process_task = cpuset_change_cpumask;
877 scan.heap = heap;
878 cgroup_scan_tasks(&scan);
879} 866}
880 867
881/* 868/*
882 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. 869 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
883 * @root_cs: the root cpuset of the hierarchy 870 * @root_cs: the root cpuset of the hierarchy
884 * @update_root: update root cpuset or not? 871 * @update_root: update root cpuset or not?
885 * @heap: the heap used by cgroup_scan_tasks() 872 * @heap: the heap used by css_scan_tasks()
886 * 873 *
887 * This will update cpumasks of tasks in @root_cs and all other empty cpusets 874 * This will update cpumasks of tasks in @root_cs and all other empty cpusets
888 * which take on cpumask of @root_cs. 875 * which take on cpumask of @root_cs.
@@ -893,17 +880,19 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs,
893 bool update_root, struct ptr_heap *heap) 880 bool update_root, struct ptr_heap *heap)
894{ 881{
895 struct cpuset *cp; 882 struct cpuset *cp;
896 struct cgroup *pos_cgrp; 883 struct cgroup_subsys_state *pos_css;
897
898 if (update_root)
899 update_tasks_cpumask(root_cs, heap);
900 884
901 rcu_read_lock(); 885 rcu_read_lock();
902 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { 886 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
903 /* skip the whole subtree if @cp has some CPUs */ 887 if (cp == root_cs) {
904 if (!cpumask_empty(cp->cpus_allowed)) { 888 if (!update_root)
905 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); 889 continue;
906 continue; 890 } else {
891 /* skip the whole subtree if @cp has some CPUs */
892 if (!cpumask_empty(cp->cpus_allowed)) {
893 pos_css = css_rightmost_descendant(pos_css);
894 continue;
895 }
907 } 896 }
908 if (!css_tryget(&cp->css)) 897 if (!css_tryget(&cp->css))
909 continue; 898 continue;
@@ -1059,20 +1048,24 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
1059 task_unlock(tsk); 1048 task_unlock(tsk);
1060} 1049}
1061 1050
1051struct cpuset_change_nodemask_arg {
1052 struct cpuset *cs;
1053 nodemask_t *newmems;
1054};
1055
1062/* 1056/*
1063 * Update the task's mems_allowed, rebind its mempolicy and its vmas' 1057 * Update the task's mems_allowed, rebind its mempolicy and its vmas'
1064 * mempolicies to the cpuset's new mems_allowed, and migrate pages to new 1058 * mempolicies to the cpuset's new mems_allowed, and migrate pages to new
1065 * nodes if the memory_migrate flag is set. Called with cpuset_mutex held. 1059 * nodes if the memory_migrate flag is set. Called with cpuset_mutex held.
1066 */ 1060 */
1067static void cpuset_change_nodemask(struct task_struct *p, 1061static void cpuset_change_nodemask(struct task_struct *p, void *data)
1068 struct cgroup_scanner *scan)
1069{ 1062{
1070 struct cpuset *cs = cgroup_cs(scan->cg); 1063 struct cpuset_change_nodemask_arg *arg = data;
1064 struct cpuset *cs = arg->cs;
1071 struct mm_struct *mm; 1065 struct mm_struct *mm;
1072 int migrate; 1066 int migrate;
1073 nodemask_t *newmems = scan->data;
1074 1067
1075 cpuset_change_task_nodemask(p, newmems); 1068 cpuset_change_task_nodemask(p, arg->newmems);
1076 1069
1077 mm = get_task_mm(p); 1070 mm = get_task_mm(p);
1078 if (!mm) 1071 if (!mm)
@@ -1082,7 +1075,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
1082 1075
1083 mpol_rebind_mm(mm, &cs->mems_allowed); 1076 mpol_rebind_mm(mm, &cs->mems_allowed);
1084 if (migrate) 1077 if (migrate)
1085 cpuset_migrate_mm(mm, &cs->old_mems_allowed, newmems); 1078 cpuset_migrate_mm(mm, &cs->old_mems_allowed, arg->newmems);
1086 mmput(mm); 1079 mmput(mm);
1087} 1080}
1088 1081
@@ -1091,28 +1084,22 @@ static void *cpuset_being_rebound;
1091/** 1084/**
1092 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. 1085 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
1093 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed 1086 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
1094 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 1087 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
1095 * 1088 *
1096 * Called with cpuset_mutex held 1089 * Called with cpuset_mutex held. No return value. It's guaranteed that
1097 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 1090 * css_scan_tasks() always returns 0 if @heap != NULL.
1098 * if @heap != NULL.
1099 */ 1091 */
1100static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) 1092static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1101{ 1093{
1102 static nodemask_t newmems; /* protected by cpuset_mutex */ 1094 static nodemask_t newmems; /* protected by cpuset_mutex */
1103 struct cgroup_scanner scan;
1104 struct cpuset *mems_cs = effective_nodemask_cpuset(cs); 1095 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1096 struct cpuset_change_nodemask_arg arg = { .cs = cs,
1097 .newmems = &newmems };
1105 1098
1106 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1099 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
1107 1100
1108 guarantee_online_mems(mems_cs, &newmems); 1101 guarantee_online_mems(mems_cs, &newmems);
1109 1102
1110 scan.cg = cs->css.cgroup;
1111 scan.test_task = NULL;
1112 scan.process_task = cpuset_change_nodemask;
1113 scan.heap = heap;
1114 scan.data = &newmems;
1115
1116 /* 1103 /*
1117 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't 1104 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
1118 * take while holding tasklist_lock. Forks can happen - the 1105 * take while holding tasklist_lock. Forks can happen - the
@@ -1123,7 +1110,7 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1123 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 1110 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
1124 * is idempotent. Also migrate pages in each mm to new nodes. 1111 * is idempotent. Also migrate pages in each mm to new nodes.
1125 */ 1112 */
1126 cgroup_scan_tasks(&scan); 1113 css_scan_tasks(&cs->css, NULL, cpuset_change_nodemask, &arg, heap);
1127 1114
1128 /* 1115 /*
1129 * All the tasks' nodemasks have been updated, update 1116 * All the tasks' nodemasks have been updated, update
@@ -1139,7 +1126,7 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1139 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. 1126 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
1140 * @cs: the root cpuset of the hierarchy 1127 * @cs: the root cpuset of the hierarchy
1141 * @update_root: update the root cpuset or not? 1128 * @update_root: update the root cpuset or not?
1142 * @heap: the heap used by cgroup_scan_tasks() 1129 * @heap: the heap used by css_scan_tasks()
1143 * 1130 *
1144 * This will update nodemasks of tasks in @root_cs and all other empty cpusets 1131 * This will update nodemasks of tasks in @root_cs and all other empty cpusets
1145 * which take on nodemask of @root_cs. 1132 * which take on nodemask of @root_cs.
@@ -1150,17 +1137,19 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs,
1150 bool update_root, struct ptr_heap *heap) 1137 bool update_root, struct ptr_heap *heap)
1151{ 1138{
1152 struct cpuset *cp; 1139 struct cpuset *cp;
1153 struct cgroup *pos_cgrp; 1140 struct cgroup_subsys_state *pos_css;
1154
1155 if (update_root)
1156 update_tasks_nodemask(root_cs, heap);
1157 1141
1158 rcu_read_lock(); 1142 rcu_read_lock();
1159 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { 1143 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
1160 /* skip the whole subtree if @cp has some memory nodes */ 1144 if (cp == root_cs) {
1161 if (!nodes_empty(cp->mems_allowed)) { 1145 if (!update_root)
1162 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); 1146 continue;
1163 continue; 1147 } else {
1148 /* skip the whole subtree if @cp has some memory nodes */
1149 if (!nodes_empty(cp->mems_allowed)) {
1150 pos_css = css_rightmost_descendant(pos_css);
1151 continue;
1152 }
1164 } 1153 }
1165 if (!css_tryget(&cp->css)) 1154 if (!css_tryget(&cp->css))
1166 continue; 1155 continue;
@@ -1267,44 +1256,39 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1267 return 0; 1256 return 0;
1268} 1257}
1269 1258
1270/* 1259/**
1271 * cpuset_change_flag - make a task's spread flags the same as its cpuset's 1260 * cpuset_change_flag - make a task's spread flags the same as its cpuset's
1272 * @tsk: task to be updated 1261 * @tsk: task to be updated
1273 * @scan: struct cgroup_scanner containing the cgroup of the task 1262 * @data: the cpuset @tsk belongs to
1274 * 1263 *
1275 * Called by cgroup_scan_tasks() for each task in a cgroup. 1264 * Called by css_scan_tasks() for each task in a cgroup.
1276 * 1265 *
1277 * We don't need to re-check for the cgroup/cpuset membership, since we're 1266 * We don't need to re-check for the cgroup/cpuset membership, since we're
1278 * holding cpuset_mutex at this point. 1267 * holding cpuset_mutex at this point.
1279 */ 1268 */
1280static void cpuset_change_flag(struct task_struct *tsk, 1269static void cpuset_change_flag(struct task_struct *tsk, void *data)
1281 struct cgroup_scanner *scan)
1282{ 1270{
1283 cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk); 1271 struct cpuset *cs = data;
1272
1273 cpuset_update_task_spread_flag(cs, tsk);
1284} 1274}
1285 1275
1286/* 1276/**
1287 * update_tasks_flags - update the spread flags of tasks in the cpuset. 1277 * update_tasks_flags - update the spread flags of tasks in the cpuset.
1288 * @cs: the cpuset in which each task's spread flags need to be changed 1278 * @cs: the cpuset in which each task's spread flags need to be changed
1289 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 1279 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
1290 * 1280 *
1291 * Called with cpuset_mutex held 1281 * Called with cpuset_mutex held
1292 * 1282 *
1293 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 1283 * The css_scan_tasks() function will scan all the tasks in a cgroup,
1294 * calling callback functions for each. 1284 * calling callback functions for each.
1295 * 1285 *
1296 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 1286 * No return value. It's guaranteed that css_scan_tasks() always returns 0
1297 * if @heap != NULL. 1287 * if @heap != NULL.
1298 */ 1288 */
1299static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) 1289static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
1300{ 1290{
1301 struct cgroup_scanner scan; 1291 css_scan_tasks(&cs->css, NULL, cpuset_change_flag, cs, heap);
1302
1303 scan.cg = cs->css.cgroup;
1304 scan.test_task = NULL;
1305 scan.process_task = cpuset_change_flag;
1306 scan.heap = heap;
1307 cgroup_scan_tasks(&scan);
1308} 1292}
1309 1293
1310/* 1294/*
@@ -1462,9 +1446,10 @@ static int fmeter_getrate(struct fmeter *fmp)
1462} 1446}
1463 1447
1464/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ 1448/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
1465static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1449static int cpuset_can_attach(struct cgroup_subsys_state *css,
1450 struct cgroup_taskset *tset)
1466{ 1451{
1467 struct cpuset *cs = cgroup_cs(cgrp); 1452 struct cpuset *cs = css_cs(css);
1468 struct task_struct *task; 1453 struct task_struct *task;
1469 int ret; 1454 int ret;
1470 1455
@@ -1475,11 +1460,11 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1475 * flag is set. 1460 * flag is set.
1476 */ 1461 */
1477 ret = -ENOSPC; 1462 ret = -ENOSPC;
1478 if (!cgroup_sane_behavior(cgrp) && 1463 if (!cgroup_sane_behavior(css->cgroup) &&
1479 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) 1464 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
1480 goto out_unlock; 1465 goto out_unlock;
1481 1466
1482 cgroup_taskset_for_each(task, cgrp, tset) { 1467 cgroup_taskset_for_each(task, css, tset) {
1483 /* 1468 /*
1484 * Kthreads which disallow setaffinity shouldn't be moved 1469 * Kthreads which disallow setaffinity shouldn't be moved
1485 * to a new cpuset; we don't want to change their cpu 1470 * to a new cpuset; we don't want to change their cpu
@@ -1508,11 +1493,11 @@ out_unlock:
1508 return ret; 1493 return ret;
1509} 1494}
1510 1495
1511static void cpuset_cancel_attach(struct cgroup *cgrp, 1496static void cpuset_cancel_attach(struct cgroup_subsys_state *css,
1512 struct cgroup_taskset *tset) 1497 struct cgroup_taskset *tset)
1513{ 1498{
1514 mutex_lock(&cpuset_mutex); 1499 mutex_lock(&cpuset_mutex);
1515 cgroup_cs(cgrp)->attach_in_progress--; 1500 css_cs(css)->attach_in_progress--;
1516 mutex_unlock(&cpuset_mutex); 1501 mutex_unlock(&cpuset_mutex);
1517} 1502}
1518 1503
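
The can_attach/cancel_attach pair above follows cgroup's two-phase attach contract: the prepare step may fail and records a reservation by bumping attach_in_progress, cancel_attach undoes exactly that reservation (it runs when a later step of the migration fails after this controller already said yes), and the commit step consumes it and is not allowed to fail. A toy sketch of the contract, with every name invented:

#include <stdio.h>

struct group {
	int attach_in_progress;
	int ntasks;
};

static int can_attach(struct group *g, int task_movable)
{
	if (!task_movable)
		return -1;		/* e.g. a pinned kthread: refuse */
	g->attach_in_progress++;	/* reserve; commit or cancel follows */
	return 0;
}

static void cancel_attach(struct group *g)
{
	g->attach_in_progress--;	/* roll back the reservation */
}

static void attach(struct group *g)
{
	g->ntasks++;			/* must not fail at this point */
	g->attach_in_progress--;	/* commit consumes the reservation */
}

int main(void)
{
	struct group g = { 0, 0 };

	if (can_attach(&g, 0) < 0)
		printf("prepare refused, nothing to roll back\n");

	if (can_attach(&g, 1) == 0)
		attach(&g);	/* cancel_attach(&g) would run instead on failure */

	printf("tasks=%d in_progress=%d\n", g.ntasks, g.attach_in_progress);
	return 0;
}
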
@@ -1523,16 +1508,18 @@ static void cpuset_cancel_attach(struct cgroup *cgrp,
1523 */ 1508 */
1524static cpumask_var_t cpus_attach; 1509static cpumask_var_t cpus_attach;
1525 1510
1526static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1511static void cpuset_attach(struct cgroup_subsys_state *css,
1512 struct cgroup_taskset *tset)
1527{ 1513{
1528 /* static buf protected by cpuset_mutex */ 1514 /* static buf protected by cpuset_mutex */
1529 static nodemask_t cpuset_attach_nodemask_to; 1515 static nodemask_t cpuset_attach_nodemask_to;
1530 struct mm_struct *mm; 1516 struct mm_struct *mm;
1531 struct task_struct *task; 1517 struct task_struct *task;
1532 struct task_struct *leader = cgroup_taskset_first(tset); 1518 struct task_struct *leader = cgroup_taskset_first(tset);
1533 struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); 1519 struct cgroup_subsys_state *oldcss = cgroup_taskset_cur_css(tset,
1534 struct cpuset *cs = cgroup_cs(cgrp); 1520 cpuset_subsys_id);
1535 struct cpuset *oldcs = cgroup_cs(oldcgrp); 1521 struct cpuset *cs = css_cs(css);
1522 struct cpuset *oldcs = css_cs(oldcss);
1536 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); 1523 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
1537 struct cpuset *mems_cs = effective_nodemask_cpuset(cs); 1524 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1538 1525
@@ -1546,7 +1533,7 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1546 1533
1547 guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); 1534 guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);
1548 1535
1549 cgroup_taskset_for_each(task, cgrp, tset) { 1536 cgroup_taskset_for_each(task, css, tset) {
1550 /* 1537 /*
1551 * can_attach beforehand should guarantee that this doesn't 1538 * can_attach beforehand should guarantee that this doesn't
1552 * fail. TODO: have a better way to handle failure here 1539 * fail. TODO: have a better way to handle failure here
@@ -1608,9 +1595,10 @@ typedef enum {
1608 FILE_SPREAD_SLAB, 1595 FILE_SPREAD_SLAB,
1609} cpuset_filetype_t; 1596} cpuset_filetype_t;
1610 1597
1611static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) 1598static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
1599 u64 val)
1612{ 1600{
1613 struct cpuset *cs = cgroup_cs(cgrp); 1601 struct cpuset *cs = css_cs(css);
1614 cpuset_filetype_t type = cft->private; 1602 cpuset_filetype_t type = cft->private;
1615 int retval = 0; 1603 int retval = 0;
1616 1604
@@ -1657,9 +1645,10 @@ out_unlock:
1657 return retval; 1645 return retval;
1658} 1646}
1659 1647
1660static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) 1648static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
1649 s64 val)
1661{ 1650{
1662 struct cpuset *cs = cgroup_cs(cgrp); 1651 struct cpuset *cs = css_cs(css);
1663 cpuset_filetype_t type = cft->private; 1652 cpuset_filetype_t type = cft->private;
1664 int retval = -ENODEV; 1653 int retval = -ENODEV;
1665 1654
@@ -1683,10 +1672,10 @@ out_unlock:
1683/* 1672/*
1684 * Common handling for a write to a "cpus" or "mems" file. 1673 * Common handling for a write to a "cpus" or "mems" file.
1685 */ 1674 */
1686static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, 1675static int cpuset_write_resmask(struct cgroup_subsys_state *css,
1687 const char *buf) 1676 struct cftype *cft, const char *buf)
1688{ 1677{
1689 struct cpuset *cs = cgroup_cs(cgrp); 1678 struct cpuset *cs = css_cs(css);
1690 struct cpuset *trialcs; 1679 struct cpuset *trialcs;
1691 int retval = -ENODEV; 1680 int retval = -ENODEV;
1692 1681
@@ -1765,13 +1754,12 @@ static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1765 return count; 1754 return count;
1766} 1755}
1767 1756
1768static ssize_t cpuset_common_file_read(struct cgroup *cgrp, 1757static ssize_t cpuset_common_file_read(struct cgroup_subsys_state *css,
1769 struct cftype *cft, 1758 struct cftype *cft, struct file *file,
1770 struct file *file, 1759 char __user *buf, size_t nbytes,
1771 char __user *buf, 1760 loff_t *ppos)
1772 size_t nbytes, loff_t *ppos)
1773{ 1761{
1774 struct cpuset *cs = cgroup_cs(cgrp); 1762 struct cpuset *cs = css_cs(css);
1775 cpuset_filetype_t type = cft->private; 1763 cpuset_filetype_t type = cft->private;
1776 char *page; 1764 char *page;
1777 ssize_t retval = 0; 1765 ssize_t retval = 0;
@@ -1801,9 +1789,9 @@ out:
1801 return retval; 1789 return retval;
1802} 1790}
1803 1791
1804static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft) 1792static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
1805{ 1793{
1806 struct cpuset *cs = cgroup_cs(cgrp); 1794 struct cpuset *cs = css_cs(css);
1807 cpuset_filetype_t type = cft->private; 1795 cpuset_filetype_t type = cft->private;
1808 switch (type) { 1796 switch (type) {
1809 case FILE_CPU_EXCLUSIVE: 1797 case FILE_CPU_EXCLUSIVE:
@@ -1832,9 +1820,9 @@ static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft)
1832 return 0; 1820 return 0;
1833} 1821}
1834 1822
1835static s64 cpuset_read_s64(struct cgroup *cgrp, struct cftype *cft) 1823static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
1836{ 1824{
1837 struct cpuset *cs = cgroup_cs(cgrp); 1825 struct cpuset *cs = css_cs(css);
1838 cpuset_filetype_t type = cft->private; 1826 cpuset_filetype_t type = cft->private;
1839 switch (type) { 1827 switch (type) {
1840 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 1828 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1949,11 +1937,12 @@ static struct cftype files[] = {
1949 * cgrp: control group that the new cpuset will be part of 1937 * cgrp: control group that the new cpuset will be part of
1950 */ 1938 */
1951 1939
1952static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp) 1940static struct cgroup_subsys_state *
1941cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
1953{ 1942{
1954 struct cpuset *cs; 1943 struct cpuset *cs;
1955 1944
1956 if (!cgrp->parent) 1945 if (!parent_css)
1957 return &top_cpuset.css; 1946 return &top_cpuset.css;
1958 1947
1959 cs = kzalloc(sizeof(*cs), GFP_KERNEL); 1948 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
@@ -1973,12 +1962,12 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp)
1973 return &cs->css; 1962 return &cs->css;
1974} 1963}
1975 1964
1976static int cpuset_css_online(struct cgroup *cgrp) 1965static int cpuset_css_online(struct cgroup_subsys_state *css)
1977{ 1966{
1978 struct cpuset *cs = cgroup_cs(cgrp); 1967 struct cpuset *cs = css_cs(css);
1979 struct cpuset *parent = parent_cs(cs); 1968 struct cpuset *parent = parent_cs(cs);
1980 struct cpuset *tmp_cs; 1969 struct cpuset *tmp_cs;
1981 struct cgroup *pos_cg; 1970 struct cgroup_subsys_state *pos_css;
1982 1971
1983 if (!parent) 1972 if (!parent)
1984 return 0; 1973 return 0;
@@ -1993,7 +1982,7 @@ static int cpuset_css_online(struct cgroup *cgrp)
1993 1982
1994 number_of_cpusets++; 1983 number_of_cpusets++;
1995 1984
1996 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags)) 1985 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
1997 goto out_unlock; 1986 goto out_unlock;
1998 1987
1999 /* 1988 /*
@@ -2010,7 +1999,7 @@ static int cpuset_css_online(struct cgroup *cgrp)
2010 * (and likewise for mems) to the new cgroup. 1999 * (and likewise for mems) to the new cgroup.
2011 */ 2000 */
2012 rcu_read_lock(); 2001 rcu_read_lock();
2013 cpuset_for_each_child(tmp_cs, pos_cg, parent) { 2002 cpuset_for_each_child(tmp_cs, pos_css, parent) {
2014 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { 2003 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
2015 rcu_read_unlock(); 2004 rcu_read_unlock();
2016 goto out_unlock; 2005 goto out_unlock;
@@ -2027,9 +2016,15 @@ out_unlock:
2027 return 0; 2016 return 0;
2028} 2017}
2029 2018
2030static void cpuset_css_offline(struct cgroup *cgrp) 2019/*
2020 * If the cpuset being removed has its flag 'sched_load_balance'
2021 * enabled, then simulate turning sched_load_balance off, which
2022 * will call rebuild_sched_domains_locked().
2023 */
2024
2025static void cpuset_css_offline(struct cgroup_subsys_state *css)
2031{ 2026{
2032 struct cpuset *cs = cgroup_cs(cgrp); 2027 struct cpuset *cs = css_cs(css);
2033 2028
2034 mutex_lock(&cpuset_mutex); 2029 mutex_lock(&cpuset_mutex);
2035 2030
@@ -2042,15 +2037,9 @@ static void cpuset_css_offline(struct cgroup *cgrp)
2042 mutex_unlock(&cpuset_mutex); 2037 mutex_unlock(&cpuset_mutex);
2043} 2038}
2044 2039
2045/* 2040static void cpuset_css_free(struct cgroup_subsys_state *css)
2046 * If the cpuset being removed has its flag 'sched_load_balance'
2047 * enabled, then simulate turning sched_load_balance off, which
2048 * will call rebuild_sched_domains_locked().
2049 */
2050
2051static void cpuset_css_free(struct cgroup *cgrp)
2052{ 2041{
2053 struct cpuset *cs = cgroup_cs(cgrp); 2042 struct cpuset *cs = css_cs(css);
2054 2043
2055 free_cpumask_var(cs->cpus_allowed); 2044 free_cpumask_var(cs->cpus_allowed);
2056 kfree(cs); 2045 kfree(cs);
@@ -2257,11 +2246,11 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2257 /* if cpus or mems changed, we need to propagate to descendants */ 2246 /* if cpus or mems changed, we need to propagate to descendants */
2258 if (cpus_updated || mems_updated) { 2247 if (cpus_updated || mems_updated) {
2259 struct cpuset *cs; 2248 struct cpuset *cs;
2260 struct cgroup *pos_cgrp; 2249 struct cgroup_subsys_state *pos_css;
2261 2250
2262 rcu_read_lock(); 2251 rcu_read_lock();
2263 cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) { 2252 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
2264 if (!css_tryget(&cs->css)) 2253 if (cs == &top_cpuset || !css_tryget(&cs->css))
2265 continue; 2254 continue;
2266 rcu_read_unlock(); 2255 rcu_read_unlock();
2267 2256
@@ -2350,7 +2339,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2350 2339
2351void cpuset_cpus_allowed_fallback(struct task_struct *tsk) 2340void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2352{ 2341{
2353 const struct cpuset *cpus_cs; 2342 struct cpuset *cpus_cs;
2354 2343
2355 rcu_read_lock(); 2344 rcu_read_lock();
2356 cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); 2345 cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
@@ -2423,7 +2412,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
2423 * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall 2412 * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall
2424 * (an unusual configuration), then returns the root cpuset. 2413 * (an unusual configuration), then returns the root cpuset.
2425 */ 2414 */
2426static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) 2415static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
2427{ 2416{
2428 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) 2417 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
2429 cs = parent_cs(cs); 2418 cs = parent_cs(cs);
@@ -2493,7 +2482,7 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
2493 */ 2482 */
2494int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) 2483int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
2495{ 2484{
2496 const struct cpuset *cs; /* current cpuset ancestors */ 2485 struct cpuset *cs; /* current cpuset ancestors */
2497 int allowed; /* is allocation in zone z allowed? */ 2486 int allowed; /* is allocation in zone z allowed? */
2498 2487
2499 if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) 2488 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
@@ -2731,7 +2720,7 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v)
2731 goto out_free; 2720 goto out_free;
2732 2721
2733 rcu_read_lock(); 2722 rcu_read_lock();
2734 css = task_subsys_state(tsk, cpuset_subsys_id); 2723 css = task_css(tsk, cpuset_subsys_id);
2735 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); 2724 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
2736 rcu_read_unlock(); 2725 rcu_read_unlock();
2737 if (retval < 0) 2726 if (retval < 0)
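
Taken together, the cpuset hunks replace the open-coded cgroup_scanner setup with a single css_scan_tasks() call that threads one void *data pointer through to the per-task callback; where a callback needs two values, they are bundled into a small argument struct, as cpuset_change_nodemask_arg does above. A self-contained userspace sketch of that calling convention follows; every name in it is invented.

#include <stdio.h>

struct task { const char *name; };

/* stand-in for css_scan_tasks(): apply @fn to each task with @data */
static void scan_tasks(struct task *tasks, int n,
		       void (*fn)(struct task *, void *), void *data)
{
	for (int i = 0; i < n; i++)
		fn(&tasks[i], data);
}

struct change_arg {		/* bundle two values behind one pointer */
	const char *setting;
	int value;
};

static void change_one(struct task *t, void *data)
{
	struct change_arg *arg = data;

	printf("%s: set %s=%d\n", t->name, arg->setting, arg->value);
}

int main(void)
{
	struct task tasks[] = { { "init" }, { "worker" } };
	struct change_arg arg = { .setting = "mems", .value = 3 };

	scan_tasks(tasks, 2, change_one, &arg);
	return 0;
}
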
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f86599e8c123..9300f5226077 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -340,8 +340,8 @@ struct perf_cgroup {
340static inline struct perf_cgroup * 340static inline struct perf_cgroup *
341perf_cgroup_from_task(struct task_struct *task) 341perf_cgroup_from_task(struct task_struct *task)
342{ 342{
343 return container_of(task_subsys_state(task, perf_subsys_id), 343 return container_of(task_css(task, perf_subsys_id),
344 struct perf_cgroup, css); 344 struct perf_cgroup, css);
345} 345}
346 346
347static inline bool 347static inline bool
@@ -591,7 +591,9 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
591 if (!f.file) 591 if (!f.file)
592 return -EBADF; 592 return -EBADF;
593 593
594 css = cgroup_css_from_dir(f.file, perf_subsys_id); 594 rcu_read_lock();
595
596 css = css_from_dir(f.file->f_dentry, &perf_subsys);
595 if (IS_ERR(css)) { 597 if (IS_ERR(css)) {
596 ret = PTR_ERR(css); 598 ret = PTR_ERR(css);
597 goto out; 599 goto out;
@@ -617,6 +619,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
617 ret = -EINVAL; 619 ret = -EINVAL;
618 } 620 }
619out: 621out:
622 rcu_read_unlock();
620 fdput(f); 623 fdput(f);
621 return ret; 624 return ret;
622} 625}
@@ -7798,7 +7801,8 @@ unlock:
7798device_initcall(perf_event_sysfs_init); 7801device_initcall(perf_event_sysfs_init);
7799 7802
7800#ifdef CONFIG_CGROUP_PERF 7803#ifdef CONFIG_CGROUP_PERF
7801static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) 7804static struct cgroup_subsys_state *
7805perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
7802{ 7806{
7803 struct perf_cgroup *jc; 7807 struct perf_cgroup *jc;
7804 7808
@@ -7815,11 +7819,10 @@ static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont)
7815 return &jc->css; 7819 return &jc->css;
7816} 7820}
7817 7821
7818static void perf_cgroup_css_free(struct cgroup *cont) 7822static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
7819{ 7823{
7820 struct perf_cgroup *jc; 7824 struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
7821 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), 7825
7822 struct perf_cgroup, css);
7823 free_percpu(jc->info); 7826 free_percpu(jc->info);
7824 kfree(jc); 7827 kfree(jc);
7825} 7828}
@@ -7831,15 +7834,17 @@ static int __perf_cgroup_move(void *info)
7831 return 0; 7834 return 0;
7832} 7835}
7833 7836
7834static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 7837static void perf_cgroup_attach(struct cgroup_subsys_state *css,
7838 struct cgroup_taskset *tset)
7835{ 7839{
7836 struct task_struct *task; 7840 struct task_struct *task;
7837 7841
7838 cgroup_taskset_for_each(task, cgrp, tset) 7842 cgroup_taskset_for_each(task, css, tset)
7839 task_function_call(task, __perf_cgroup_move, task); 7843 task_function_call(task, __perf_cgroup_move, task);
7840} 7844}
7841 7845
7842static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, 7846static void perf_cgroup_exit(struct cgroup_subsys_state *css,
7847 struct cgroup_subsys_state *old_css,
7843 struct task_struct *task) 7848 struct task_struct *task)
7844{ 7849{
7845 /* 7850 /*
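
The perf hunk wraps the css_from_dir() lookup in rcu_read_lock()/rcu_read_unlock() because the returned css is only guaranteed to stay alive inside the RCU read-side section; whatever the caller wants to keep past that point has to be pinned first. As a rough userspace analogy, with a plain mutex standing in for RCU and all names invented:

#include <pthread.h>
#include <stdio.h>

struct obj {
	int refcnt;
	int id;
};

static pthread_mutex_t registry_lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj the_obj = { .refcnt = 1, .id = 42 };
static struct obj *registry = &the_obj;	/* may be replaced concurrently */

static struct obj *lookup_and_pin(void)
{
	struct obj *o;

	pthread_mutex_lock(&registry_lock);
	o = registry;		/* valid only while the lock is held ... */
	if (o)
		o->refcnt++;	/* ... so take a reference before unlocking */
	pthread_mutex_unlock(&registry_lock);
	return o;
}

int main(void)
{
	struct obj *o = lookup_and_pin();

	if (o)
		printf("pinned obj %d, refcnt=%d\n", o->id, o->refcnt);
	return 0;
}
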
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 05c39f030314..e53bda3ff2f1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6815,7 +6815,7 @@ void sched_move_task(struct task_struct *tsk)
6815 if (unlikely(running)) 6815 if (unlikely(running))
6816 tsk->sched_class->put_prev_task(rq, tsk); 6816 tsk->sched_class->put_prev_task(rq, tsk);
6817 6817
6818 tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id, 6818 tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id,
6819 lockdep_is_held(&tsk->sighand->siglock)), 6819 lockdep_is_held(&tsk->sighand->siglock)),
6820 struct task_group, css); 6820 struct task_group, css);
6821 tg = autogroup_task_group(tsk, tg); 6821 tg = autogroup_task_group(tsk, tg);
@@ -7137,23 +7137,22 @@ int sched_rt_handler(struct ctl_table *table, int write,
7137 7137
7138#ifdef CONFIG_CGROUP_SCHED 7138#ifdef CONFIG_CGROUP_SCHED
7139 7139
7140/* return corresponding task_group object of a cgroup */ 7140static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
7141static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7142{ 7141{
7143 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), 7142 return css ? container_of(css, struct task_group, css) : NULL;
7144 struct task_group, css);
7145} 7143}
7146 7144
7147static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) 7145static struct cgroup_subsys_state *
7146cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
7148{ 7147{
7149 struct task_group *tg, *parent; 7148 struct task_group *parent = css_tg(parent_css);
7149 struct task_group *tg;
7150 7150
7151 if (!cgrp->parent) { 7151 if (!parent) {
7152 /* This is early initialization for the top cgroup */ 7152 /* This is early initialization for the top cgroup */
7153 return &root_task_group.css; 7153 return &root_task_group.css;
7154 } 7154 }
7155 7155
7156 parent = cgroup_tg(cgrp->parent);
7157 tg = sched_create_group(parent); 7156 tg = sched_create_group(parent);
7158 if (IS_ERR(tg)) 7157 if (IS_ERR(tg))
7159 return ERR_PTR(-ENOMEM); 7158 return ERR_PTR(-ENOMEM);
@@ -7161,41 +7160,38 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
7161 return &tg->css; 7160 return &tg->css;
7162} 7161}
7163 7162
7164static int cpu_cgroup_css_online(struct cgroup *cgrp) 7163static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
7165{ 7164{
7166 struct task_group *tg = cgroup_tg(cgrp); 7165 struct task_group *tg = css_tg(css);
7167 struct task_group *parent; 7166 struct task_group *parent = css_tg(css_parent(css));
7168
7169 if (!cgrp->parent)
7170 return 0;
7171 7167
7172 parent = cgroup_tg(cgrp->parent); 7168 if (parent)
7173 sched_online_group(tg, parent); 7169 sched_online_group(tg, parent);
7174 return 0; 7170 return 0;
7175} 7171}
7176 7172
7177static void cpu_cgroup_css_free(struct cgroup *cgrp) 7173static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
7178{ 7174{
7179 struct task_group *tg = cgroup_tg(cgrp); 7175 struct task_group *tg = css_tg(css);
7180 7176
7181 sched_destroy_group(tg); 7177 sched_destroy_group(tg);
7182} 7178}
7183 7179
7184static void cpu_cgroup_css_offline(struct cgroup *cgrp) 7180static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
7185{ 7181{
7186 struct task_group *tg = cgroup_tg(cgrp); 7182 struct task_group *tg = css_tg(css);
7187 7183
7188 sched_offline_group(tg); 7184 sched_offline_group(tg);
7189} 7185}
7190 7186
7191static int cpu_cgroup_can_attach(struct cgroup *cgrp, 7187static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
7192 struct cgroup_taskset *tset) 7188 struct cgroup_taskset *tset)
7193{ 7189{
7194 struct task_struct *task; 7190 struct task_struct *task;
7195 7191
7196 cgroup_taskset_for_each(task, cgrp, tset) { 7192 cgroup_taskset_for_each(task, css, tset) {
7197#ifdef CONFIG_RT_GROUP_SCHED 7193#ifdef CONFIG_RT_GROUP_SCHED
7198 if (!sched_rt_can_attach(cgroup_tg(cgrp), task)) 7194 if (!sched_rt_can_attach(css_tg(css), task))
7199 return -EINVAL; 7195 return -EINVAL;
7200#else 7196#else
7201 /* We don't support RT-tasks being in separate groups */ 7197 /* We don't support RT-tasks being in separate groups */
@@ -7206,18 +7202,18 @@ static int cpu_cgroup_can_attach(struct cgroup *cgrp,
7206 return 0; 7202 return 0;
7207} 7203}
7208 7204
7209static void cpu_cgroup_attach(struct cgroup *cgrp, 7205static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
7210 struct cgroup_taskset *tset) 7206 struct cgroup_taskset *tset)
7211{ 7207{
7212 struct task_struct *task; 7208 struct task_struct *task;
7213 7209
7214 cgroup_taskset_for_each(task, cgrp, tset) 7210 cgroup_taskset_for_each(task, css, tset)
7215 sched_move_task(task); 7211 sched_move_task(task);
7216} 7212}
7217 7213
7218static void 7214static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
7219cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, 7215 struct cgroup_subsys_state *old_css,
7220 struct task_struct *task) 7216 struct task_struct *task)
7221{ 7217{
7222 /* 7218 /*
7223 * cgroup_exit() is called in the copy_process() failure path. 7219 * cgroup_exit() is called in the copy_process() failure path.
@@ -7231,15 +7227,16 @@ cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
7231} 7227}
7232 7228
7233#ifdef CONFIG_FAIR_GROUP_SCHED 7229#ifdef CONFIG_FAIR_GROUP_SCHED
7234static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 7230static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
7235 u64 shareval) 7231 struct cftype *cftype, u64 shareval)
7236{ 7232{
7237 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); 7233 return sched_group_set_shares(css_tg(css), scale_load(shareval));
7238} 7234}
7239 7235
7240static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 7236static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
7237 struct cftype *cft)
7241{ 7238{
7242 struct task_group *tg = cgroup_tg(cgrp); 7239 struct task_group *tg = css_tg(css);
7243 7240
7244 return (u64) scale_load_down(tg->shares); 7241 return (u64) scale_load_down(tg->shares);
7245} 7242}
@@ -7361,26 +7358,28 @@ long tg_get_cfs_period(struct task_group *tg)
7361 return cfs_period_us; 7358 return cfs_period_us;
7362} 7359}
7363 7360
7364static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) 7361static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
7362 struct cftype *cft)
7365{ 7363{
7366 return tg_get_cfs_quota(cgroup_tg(cgrp)); 7364 return tg_get_cfs_quota(css_tg(css));
7367} 7365}
7368 7366
7369static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, 7367static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
7370 s64 cfs_quota_us) 7368 struct cftype *cftype, s64 cfs_quota_us)
7371{ 7369{
7372 return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); 7370 return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
7373} 7371}
7374 7372
7375static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) 7373static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
7374 struct cftype *cft)
7376{ 7375{
7377 return tg_get_cfs_period(cgroup_tg(cgrp)); 7376 return tg_get_cfs_period(css_tg(css));
7378} 7377}
7379 7378
7380static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, 7379static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
7381 u64 cfs_period_us) 7380 struct cftype *cftype, u64 cfs_period_us)
7382{ 7381{
7383 return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); 7382 return tg_set_cfs_period(css_tg(css), cfs_period_us);
7384} 7383}
7385 7384
7386struct cfs_schedulable_data { 7385struct cfs_schedulable_data {
@@ -7461,10 +7460,10 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
7461 return ret; 7460 return ret;
7462} 7461}
7463 7462
7464static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, 7463static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft,
7465 struct cgroup_map_cb *cb) 7464 struct cgroup_map_cb *cb)
7466{ 7465{
7467 struct task_group *tg = cgroup_tg(cgrp); 7466 struct task_group *tg = css_tg(css);
7468 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7467 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7469 7468
7470 cb->fill(cb, "nr_periods", cfs_b->nr_periods); 7469 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
@@ -7477,26 +7476,28 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
7477#endif /* CONFIG_FAIR_GROUP_SCHED */ 7476#endif /* CONFIG_FAIR_GROUP_SCHED */
7478 7477
7479#ifdef CONFIG_RT_GROUP_SCHED 7478#ifdef CONFIG_RT_GROUP_SCHED
7480static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 7479static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
7481 s64 val) 7480 struct cftype *cft, s64 val)
7482{ 7481{
7483 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); 7482 return sched_group_set_rt_runtime(css_tg(css), val);
7484} 7483}
7485 7484
7486static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) 7485static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
7486 struct cftype *cft)
7487{ 7487{
7488 return sched_group_rt_runtime(cgroup_tg(cgrp)); 7488 return sched_group_rt_runtime(css_tg(css));
7489} 7489}
7490 7490
7491static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, 7491static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
7492 u64 rt_period_us) 7492 struct cftype *cftype, u64 rt_period_us)
7493{ 7493{
7494 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); 7494 return sched_group_set_rt_period(css_tg(css), rt_period_us);
7495} 7495}
7496 7496
7497static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) 7497static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
7498 struct cftype *cft)
7498{ 7499{
7499 return sched_group_rt_period(cgroup_tg(cgrp)); 7500 return sched_group_rt_period(css_tg(css));
7500} 7501}
7501#endif /* CONFIG_RT_GROUP_SCHED */ 7502#endif /* CONFIG_RT_GROUP_SCHED */
7502 7503
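
css_tg() above is the accessor pattern this series applies across every controller: a container_of() wrapper that maps a NULL css to a NULL controller state, so the root case falls out of a plain "if (parent)" test instead of an explicit "!cgrp->parent" branch. A self-contained sketch of the idiom, with invented types:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct css { struct css *parent; };

struct task_group {
	int shares;
	struct css css;		/* embedded member, as in the kernel */
};

static struct task_group *css_tg(struct css *css)
{
	return css ? container_of(css, struct task_group, css) : NULL;
}

int main(void)
{
	struct task_group root = { .shares = 1024 };
	struct task_group child = { .shares = 512, .css.parent = &root.css };

	struct task_group *parent = css_tg(child.css.parent);
	if (parent)		/* NULL here would mean child is the root */
		printf("parent shares=%d\n", parent->shares);

	if (!css_tg(root.css.parent))	/* NULL propagates safely */
		printf("root has no parent\n");
	return 0;
}
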
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index dbb7e2cd95eb..f64722ff0299 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -33,30 +33,20 @@ struct cpuacct {
33 struct kernel_cpustat __percpu *cpustat; 33 struct kernel_cpustat __percpu *cpustat;
34}; 34};
35 35
36/* return cpu accounting group corresponding to this container */ 36static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
37static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
38{ 37{
39 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), 38 return css ? container_of(css, struct cpuacct, css) : NULL;
40 struct cpuacct, css);
41} 39}
42 40
43/* return cpu accounting group to which this task belongs */ 41/* return cpu accounting group to which this task belongs */
44static inline struct cpuacct *task_ca(struct task_struct *tsk) 42static inline struct cpuacct *task_ca(struct task_struct *tsk)
45{ 43{
46 return container_of(task_subsys_state(tsk, cpuacct_subsys_id), 44 return css_ca(task_css(tsk, cpuacct_subsys_id));
47 struct cpuacct, css);
48}
49
50static inline struct cpuacct *__parent_ca(struct cpuacct *ca)
51{
52 return cgroup_ca(ca->css.cgroup->parent);
53} 45}
54 46
55static inline struct cpuacct *parent_ca(struct cpuacct *ca) 47static inline struct cpuacct *parent_ca(struct cpuacct *ca)
56{ 48{
57 if (!ca->css.cgroup->parent) 49 return css_ca(css_parent(&ca->css));
58 return NULL;
59 return cgroup_ca(ca->css.cgroup->parent);
60} 50}
61 51
62static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); 52static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
@@ -66,11 +56,12 @@ static struct cpuacct root_cpuacct = {
66}; 56};
67 57
68/* create a new cpu accounting group */ 58/* create a new cpu accounting group */
69static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) 59static struct cgroup_subsys_state *
60cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
70{ 61{
71 struct cpuacct *ca; 62 struct cpuacct *ca;
72 63
73 if (!cgrp->parent) 64 if (!parent_css)
74 return &root_cpuacct.css; 65 return &root_cpuacct.css;
75 66
76 ca = kzalloc(sizeof(*ca), GFP_KERNEL); 67 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
@@ -96,9 +87,9 @@ out:
96} 87}
97 88
98/* destroy an existing cpu accounting group */ 89/* destroy an existing cpu accounting group */
99static void cpuacct_css_free(struct cgroup *cgrp) 90static void cpuacct_css_free(struct cgroup_subsys_state *css)
100{ 91{
101 struct cpuacct *ca = cgroup_ca(cgrp); 92 struct cpuacct *ca = css_ca(css);
102 93
103 free_percpu(ca->cpustat); 94 free_percpu(ca->cpustat);
104 free_percpu(ca->cpuusage); 95 free_percpu(ca->cpuusage);
@@ -141,9 +132,9 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
141} 132}
142 133
143/* return total cpu usage (in nanoseconds) of a group */ 134/* return total cpu usage (in nanoseconds) of a group */
144static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) 135static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
145{ 136{
146 struct cpuacct *ca = cgroup_ca(cgrp); 137 struct cpuacct *ca = css_ca(css);
147 u64 totalcpuusage = 0; 138 u64 totalcpuusage = 0;
148 int i; 139 int i;
149 140
@@ -153,10 +144,10 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
153 return totalcpuusage; 144 return totalcpuusage;
154} 145}
155 146
156static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, 147static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
157 u64 reset) 148 u64 reset)
158{ 149{
159 struct cpuacct *ca = cgroup_ca(cgrp); 150 struct cpuacct *ca = css_ca(css);
160 int err = 0; 151 int err = 0;
161 int i; 152 int i;
162 153
@@ -172,10 +163,10 @@ out:
172 return err; 163 return err;
173} 164}
174 165
175static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, 166static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css,
176 struct seq_file *m) 167 struct cftype *cft, struct seq_file *m)
177{ 168{
178 struct cpuacct *ca = cgroup_ca(cgroup); 169 struct cpuacct *ca = css_ca(css);
179 u64 percpu; 170 u64 percpu;
180 int i; 171 int i;
181 172
@@ -192,10 +183,10 @@ static const char * const cpuacct_stat_desc[] = {
192 [CPUACCT_STAT_SYSTEM] = "system", 183 [CPUACCT_STAT_SYSTEM] = "system",
193}; 184};
194 185
195static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, 186static int cpuacct_stats_show(struct cgroup_subsys_state *css,
196 struct cgroup_map_cb *cb) 187 struct cftype *cft, struct cgroup_map_cb *cb)
197{ 188{
198 struct cpuacct *ca = cgroup_ca(cgrp); 189 struct cpuacct *ca = css_ca(css);
199 int cpu; 190 int cpu;
200 s64 val = 0; 191 s64 val = 0;
201 192
@@ -281,7 +272,7 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val)
281 while (ca != &root_cpuacct) { 272 while (ca != &root_cpuacct) {
282 kcpustat = this_cpu_ptr(ca->cpustat); 273 kcpustat = this_cpu_ptr(ca->cpustat);
283 kcpustat->cpustat[index] += val; 274 kcpustat->cpustat[index] += val;
284 ca = __parent_ca(ca); 275 ca = parent_ca(ca);
285 } 276 }
286 rcu_read_unlock(); 277 rcu_read_unlock();
287} 278}
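
With __parent_ca() gone, cpuacct_account_field() climbs the hierarchy through parent_ca(), which now returns NULL at the root; the kernel loop stops before charging the root group, whose statistics live in the global kernel_cpustat. A toy version of that upward charging walk, names invented (unlike the kernel, this sketch charges every level including the root):

#include <stdio.h>

struct acct {
	const char *name;
	long usage;
	struct acct *parent;	/* NULL at the root */
};

static void account(struct acct *a, long val)
{
	for (; a; a = a->parent)	/* charge the group and its ancestors */
		a->usage += val;
}

int main(void)
{
	struct acct root = { "root", 0, NULL };
	struct acct child = { "child", 0, &root };

	account(&child, 100);
	printf("%s=%ld %s=%ld\n", child.name, child.usage,
	       root.name, root.usage);	/* both read 100 */
	return 0;
}
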
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ef0a7b2439dd..471a56db05ea 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -665,9 +665,9 @@ extern int group_balance_cpu(struct sched_group *sg);
665/* 665/*
666 * Return the group to which this task belongs. 666 * Return the group to which this task belongs.
667 * 667 *
668 * We cannot use task_subsys_state() and friends because the cgroup 668 * We cannot use task_css() and friends because the cgroup subsystem
669 * subsystem changes that value before the cgroup_subsys::attach() method 669 * changes that value before the cgroup_subsys::attach() method is called,
670 * is called, therefore we cannot pin it and might observe the wrong value. 670 * therefore we cannot pin it and might observe the wrong value.
671 * 671 *
672 * The same is true for autogroup's p->signal->autogroup->tg, the autogroup 672 * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
673 * core changes this before calling sched_move_task(). 673 * core changes this before calling sched_move_task().
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 9cea7de22ffb..bda8e44f6fde 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -36,21 +36,13 @@ static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
36static inline 36static inline
37struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) 37struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
38{ 38{
39 return container_of(s, struct hugetlb_cgroup, css); 39 return s ? container_of(s, struct hugetlb_cgroup, css) : NULL;
40}
41
42static inline
43struct hugetlb_cgroup *hugetlb_cgroup_from_cgroup(struct cgroup *cgroup)
44{
45 return hugetlb_cgroup_from_css(cgroup_subsys_state(cgroup,
46 hugetlb_subsys_id));
47} 40}
48 41
49static inline 42static inline
50struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task) 43struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
51{ 44{
52 return hugetlb_cgroup_from_css(task_subsys_state(task, 45 return hugetlb_cgroup_from_css(task_css(task, hugetlb_subsys_id));
53 hugetlb_subsys_id));
54} 46}
55 47
56static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) 48static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
@@ -58,17 +50,15 @@ static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
58 return (h_cg == root_h_cgroup); 50 return (h_cg == root_h_cgroup);
59} 51}
60 52
61static inline struct hugetlb_cgroup *parent_hugetlb_cgroup(struct cgroup *cg) 53static inline struct hugetlb_cgroup *
54parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg)
62{ 55{
63 if (!cg->parent) 56 return hugetlb_cgroup_from_css(css_parent(&h_cg->css));
64 return NULL;
65 return hugetlb_cgroup_from_cgroup(cg->parent);
66} 57}
67 58
68static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg) 59static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
69{ 60{
70 int idx; 61 int idx;
71 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cg);
72 62
73 for (idx = 0; idx < hugetlb_max_hstate; idx++) { 63 for (idx = 0; idx < hugetlb_max_hstate; idx++) {
74 if ((res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE)) > 0) 64 if ((res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE)) > 0)
@@ -77,19 +67,18 @@ static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg)
77 return false; 67 return false;
78} 68}
79 69
80static struct cgroup_subsys_state *hugetlb_cgroup_css_alloc(struct cgroup *cgroup) 70static struct cgroup_subsys_state *
71hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
81{ 72{
73 struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
74 struct hugetlb_cgroup *h_cgroup;
82 int idx; 75 int idx;
83 struct cgroup *parent_cgroup;
84 struct hugetlb_cgroup *h_cgroup, *parent_h_cgroup;
85 76
86 h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL); 77 h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
87 if (!h_cgroup) 78 if (!h_cgroup)
88 return ERR_PTR(-ENOMEM); 79 return ERR_PTR(-ENOMEM);
89 80
90 parent_cgroup = cgroup->parent; 81 if (parent_h_cgroup) {
91 if (parent_cgroup) {
92 parent_h_cgroup = hugetlb_cgroup_from_cgroup(parent_cgroup);
93 for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) 82 for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
94 res_counter_init(&h_cgroup->hugepage[idx], 83 res_counter_init(&h_cgroup->hugepage[idx],
95 &parent_h_cgroup->hugepage[idx]); 84 &parent_h_cgroup->hugepage[idx]);
@@ -101,11 +90,11 @@ static struct cgroup_subsys_state *hugetlb_cgroup_css_alloc(struct cgroup *cgrou
101 return &h_cgroup->css; 90 return &h_cgroup->css;
102} 91}
103 92
104static void hugetlb_cgroup_css_free(struct cgroup *cgroup) 93static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
105{ 94{
106 struct hugetlb_cgroup *h_cgroup; 95 struct hugetlb_cgroup *h_cgroup;
107 96
108 h_cgroup = hugetlb_cgroup_from_cgroup(cgroup); 97 h_cgroup = hugetlb_cgroup_from_css(css);
109 kfree(h_cgroup); 98 kfree(h_cgroup);
110} 99}
111 100
@@ -117,15 +106,14 @@ static void hugetlb_cgroup_css_free(struct cgroup *cgroup)
117 * page reference and test for page active here. This function 106 * page reference and test for page active here. This function
118 * cannot fail. 107 * cannot fail.
119 */ 108 */
120static void hugetlb_cgroup_move_parent(int idx, struct cgroup *cgroup, 109static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
121 struct page *page) 110 struct page *page)
122{ 111{
123 int csize; 112 int csize;
124 struct res_counter *counter; 113 struct res_counter *counter;
125 struct res_counter *fail_res; 114 struct res_counter *fail_res;
126 struct hugetlb_cgroup *page_hcg; 115 struct hugetlb_cgroup *page_hcg;
127 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); 116 struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
128 struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(cgroup);
129 117
130 page_hcg = hugetlb_cgroup_from_page(page); 118 page_hcg = hugetlb_cgroup_from_page(page);
131 /* 119 /*
@@ -155,8 +143,9 @@ out:
155 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to 143 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
156 * the parent cgroup. 144 * the parent cgroup.
157 */ 145 */
158static void hugetlb_cgroup_css_offline(struct cgroup *cgroup) 146static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
159{ 147{
148 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
160 struct hstate *h; 149 struct hstate *h;
161 struct page *page; 150 struct page *page;
162 int idx = 0; 151 int idx = 0;
@@ -165,13 +154,13 @@ static void hugetlb_cgroup_css_offline(struct cgroup *cgroup)
165 for_each_hstate(h) { 154 for_each_hstate(h) {
166 spin_lock(&hugetlb_lock); 155 spin_lock(&hugetlb_lock);
167 list_for_each_entry(page, &h->hugepage_activelist, lru) 156 list_for_each_entry(page, &h->hugepage_activelist, lru)
168 hugetlb_cgroup_move_parent(idx, cgroup, page); 157 hugetlb_cgroup_move_parent(idx, h_cg, page);
169 158
170 spin_unlock(&hugetlb_lock); 159 spin_unlock(&hugetlb_lock);
171 idx++; 160 idx++;
172 } 161 }
173 cond_resched(); 162 cond_resched();
174 } while (hugetlb_cgroup_have_usage(cgroup)); 163 } while (hugetlb_cgroup_have_usage(h_cg));
175} 164}
176 165
177int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, 166int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
@@ -253,14 +242,15 @@ void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
253 return; 242 return;
254} 243}
255 244
256static ssize_t hugetlb_cgroup_read(struct cgroup *cgroup, struct cftype *cft, 245static ssize_t hugetlb_cgroup_read(struct cgroup_subsys_state *css,
257 struct file *file, char __user *buf, 246 struct cftype *cft, struct file *file,
258 size_t nbytes, loff_t *ppos) 247 char __user *buf, size_t nbytes,
248 loff_t *ppos)
259{ 249{
260 u64 val; 250 u64 val;
261 char str[64]; 251 char str[64];
262 int idx, name, len; 252 int idx, name, len;
263 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); 253 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
264 254
265 idx = MEMFILE_IDX(cft->private); 255 idx = MEMFILE_IDX(cft->private);
266 name = MEMFILE_ATTR(cft->private); 256 name = MEMFILE_ATTR(cft->private);
@@ -270,12 +260,12 @@ static ssize_t hugetlb_cgroup_read(struct cgroup *cgroup, struct cftype *cft,
270 return simple_read_from_buffer(buf, nbytes, ppos, str, len); 260 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
271} 261}
272 262
273static int hugetlb_cgroup_write(struct cgroup *cgroup, struct cftype *cft, 263static int hugetlb_cgroup_write(struct cgroup_subsys_state *css,
274 const char *buffer) 264 struct cftype *cft, const char *buffer)
275{ 265{
276 int idx, name, ret; 266 int idx, name, ret;
277 unsigned long long val; 267 unsigned long long val;
278 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); 268 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
279 269
280 idx = MEMFILE_IDX(cft->private); 270 idx = MEMFILE_IDX(cft->private);
281 name = MEMFILE_ATTR(cft->private); 271 name = MEMFILE_ATTR(cft->private);
@@ -300,10 +290,11 @@ static int hugetlb_cgroup_write(struct cgroup *cgroup, struct cftype *cft,
300 return ret; 290 return ret;
301} 291}
302 292
303static int hugetlb_cgroup_reset(struct cgroup *cgroup, unsigned int event) 293static int hugetlb_cgroup_reset(struct cgroup_subsys_state *css,
294 unsigned int event)
304{ 295{
305 int idx, name, ret = 0; 296 int idx, name, ret = 0;
306 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); 297 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
307 298
308 idx = MEMFILE_IDX(event); 299 idx = MEMFILE_IDX(event);
309 name = MEMFILE_ATTR(event); 300 name = MEMFILE_ATTR(event);
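
hugetlb_cgroup_css_offline() above drains a dying group by moving each active page's charge up to the parent and looping until hugetlb_cgroup_have_usage() reports the group empty. A toy model of that reparenting loop, with plain counters standing in for res_counter charges and all names invented:

#include <stdio.h>

struct hgroup {
	long usage;
	struct hgroup *parent;
};

static void move_one_to_parent(struct hgroup *g)
{
	if (g->usage > 0) {
		g->usage--;
		if (g->parent)		/* root charges are simply dropped */
			g->parent->usage++;
	}
}

static void offline(struct hgroup *g)
{
	while (g->usage > 0)		/* loop until fully drained */
		move_one_to_parent(g);
}

int main(void)
{
	struct hgroup root = { 0, NULL };
	struct hgroup dying = { 3, &root };

	offline(&dying);
	printf("dying=%ld root=%ld\n", dying.usage, root.usage); /* 0 and 3 */
	return 0;
}
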
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0878ff7c26a9..3b83957b6439 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -483,10 +483,9 @@ enum res_type {
483 */ 483 */
484static DEFINE_MUTEX(memcg_create_mutex); 484static DEFINE_MUTEX(memcg_create_mutex);
485 485
486static inline
487struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) 486struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
488{ 487{
489 return container_of(s, struct mem_cgroup, css); 488 return s ? container_of(s, struct mem_cgroup, css) : NULL;
490} 489}
491 490
492/* Some nice accessors for the vmpressure. */ 491/* Some nice accessors for the vmpressure. */
@@ -1035,12 +1034,6 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
1035 preempt_enable(); 1034 preempt_enable();
1036} 1035}
1037 1036
1038struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
1039{
1040 return mem_cgroup_from_css(
1041 cgroup_subsys_state(cont, mem_cgroup_subsys_id));
1042}
1043
1044struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 1037struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
1045{ 1038{
1046 /* 1039 /*
@@ -1051,7 +1044,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
1051 if (unlikely(!p)) 1044 if (unlikely(!p))
1052 return NULL; 1045 return NULL;
1053 1046
1054 return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id)); 1047 return mem_cgroup_from_css(task_css(p, mem_cgroup_subsys_id));
1055} 1048}
1056 1049
1057struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 1050struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
@@ -1084,20 +1077,11 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
1084static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, 1077static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
1085 struct mem_cgroup *last_visited) 1078 struct mem_cgroup *last_visited)
1086{ 1079{
1087 struct cgroup *prev_cgroup, *next_cgroup; 1080 struct cgroup_subsys_state *prev_css, *next_css;
1088 1081
1089 /* 1082 prev_css = last_visited ? &last_visited->css : NULL;
1090 * Root is not visited by cgroup iterators so it needs an
1091 * explicit visit.
1092 */
1093 if (!last_visited)
1094 return root;
1095
1096 prev_cgroup = (last_visited == root) ? NULL
1097 : last_visited->css.cgroup;
1098skip_node: 1083skip_node:
1099 next_cgroup = cgroup_next_descendant_pre( 1084 next_css = css_next_descendant_pre(prev_css, &root->css);
1100 prev_cgroup, root->css.cgroup);
1101 1085
1102 /* 1086 /*
1103 * Even if we found a group we have to make sure it is 1087 * Even if we found a group we have to make sure it is
@@ -1106,13 +1090,13 @@ skip_node:
1106 * last_visited css is safe to use because it is 1090 * last_visited css is safe to use because it is
1107 * protected by css_get and the tree walk is rcu safe. 1091 * protected by css_get and the tree walk is rcu safe.
1108 */ 1092 */
1109 if (next_cgroup) { 1093 if (next_css) {
1110 struct mem_cgroup *mem = mem_cgroup_from_cont( 1094 struct mem_cgroup *mem = mem_cgroup_from_css(next_css);
1111 next_cgroup); 1095
1112 if (css_tryget(&mem->css)) 1096 if (css_tryget(&mem->css))
1113 return mem; 1097 return mem;
1114 else { 1098 else {
1115 prev_cgroup = next_cgroup; 1099 prev_css = next_css;
1116 goto skip_node; 1100 goto skip_node;
1117 } 1101 }
1118 } 1102 }
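
Note why the explicit root visit disappears here: css_next_descendant_pre(NULL, @root) yields @root itself as the first position, so the walk no longer needs the hand-rolled special case the old cgroup iterator forced on callers. A sketch of the resulting loop, assuming kernel context and the NULL-safe mem_cgroup_from_css() above:

    /* Illustrative pre-order walk; css_tryget() skips dying groups and
     * the caller must css_put() whatever it keeps. */
    struct cgroup_subsys_state *pos = NULL;
    struct mem_cgroup *found = NULL;

    rcu_read_lock();
    while ((pos = css_next_descendant_pre(pos, &root->css))) {
            struct mem_cgroup *mem = mem_cgroup_from_css(pos);

            if (css_tryget(&mem->css)) {
                    found = mem;
                    break;
            }
    }
    rcu_read_unlock();
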
@@ -1525,10 +1509,8 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1525 1509
1526int mem_cgroup_swappiness(struct mem_cgroup *memcg) 1510int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1527{ 1511{
1528 struct cgroup *cgrp = memcg->css.cgroup;
1529
1530 /* root ? */ 1512 /* root ? */
1531 if (cgrp->parent == NULL) 1513 if (!css_parent(&memcg->css))
1532 return vm_swappiness; 1514 return vm_swappiness;
1533 1515
1534 return memcg->swappiness; 1516 return memcg->swappiness;
@@ -1805,12 +1787,11 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1805 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); 1787 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
1806 totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; 1788 totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
1807 for_each_mem_cgroup_tree(iter, memcg) { 1789 for_each_mem_cgroup_tree(iter, memcg) {
1808 struct cgroup *cgroup = iter->css.cgroup; 1790 struct css_task_iter it;
1809 struct cgroup_iter it;
1810 struct task_struct *task; 1791 struct task_struct *task;
1811 1792
1812 cgroup_iter_start(cgroup, &it); 1793 css_task_iter_start(&iter->css, &it);
1813 while ((task = cgroup_iter_next(cgroup, &it))) { 1794 while ((task = css_task_iter_next(&it))) {
1814 switch (oom_scan_process_thread(task, totalpages, NULL, 1795 switch (oom_scan_process_thread(task, totalpages, NULL,
1815 false)) { 1796 false)) {
1816 case OOM_SCAN_SELECT: 1797 case OOM_SCAN_SELECT:
@@ -1823,7 +1804,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1823 case OOM_SCAN_CONTINUE: 1804 case OOM_SCAN_CONTINUE:
1824 continue; 1805 continue;
1825 case OOM_SCAN_ABORT: 1806 case OOM_SCAN_ABORT:
1826 cgroup_iter_end(cgroup, &it); 1807 css_task_iter_end(&it);
1827 mem_cgroup_iter_break(memcg, iter); 1808 mem_cgroup_iter_break(memcg, iter);
1828 if (chosen) 1809 if (chosen)
1829 put_task_struct(chosen); 1810 put_task_struct(chosen);
@@ -1840,7 +1821,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1840 get_task_struct(chosen); 1821 get_task_struct(chosen);
1841 } 1822 }
1842 } 1823 }
1843 cgroup_iter_end(cgroup, &it); 1824 css_task_iter_end(&it);
1844 } 1825 }
1845 1826
1846 if (!chosen) 1827 if (!chosen)
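
Task iteration follows the same shift: css_task_iter_start() keys off the css, so the OOM scan above never touches a struct cgroup at all. The usage pattern, sketched for kernel context:

    /* Illustrative: tasks returned by the iterator are only guaranteed
     * to stay valid between _start() and _end(). */
    struct css_task_iter it;
    struct task_struct *task;

    css_task_iter_start(&memcg->css, &it);
    while ((task = css_task_iter_next(&it))) {
            /* examine @task, e.g. feed it to oom_scan_process_thread() */
    }
    css_task_iter_end(&it);
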
@@ -2954,10 +2935,10 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
2954} 2935}
2955 2936
2956#ifdef CONFIG_SLABINFO 2937#ifdef CONFIG_SLABINFO
2957static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft, 2938static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css,
2958 struct seq_file *m) 2939 struct cftype *cft, struct seq_file *m)
2959{ 2940{
2960 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 2941 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
2961 struct memcg_cache_params *params; 2942 struct memcg_cache_params *params;
2962 2943
2963 if (!memcg_can_account_kmem(memcg)) 2944 if (!memcg_can_account_kmem(memcg))
@@ -4943,10 +4924,10 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
4943 */ 4924 */
4944static inline bool __memcg_has_children(struct mem_cgroup *memcg) 4925static inline bool __memcg_has_children(struct mem_cgroup *memcg)
4945{ 4926{
4946 struct cgroup *pos; 4927 struct cgroup_subsys_state *pos;
4947 4928
4948 /* bounce at first found */ 4929 /* bounce at first found */
4949 cgroup_for_each_child(pos, memcg->css.cgroup) 4930 css_for_each_child(pos, &memcg->css)
4950 return true; 4931 return true;
4951 return false; 4932 return false;
4952} 4933}
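
css_for_each_child() is used here purely as an emptiness probe: entering the loop body at all proves a child exists. Restated generically (helper name hypothetical):

    /* Illustrative: bounce out at the first child found. The iterator's
     * usual locking rules (RCU or the cgroup mutex) still apply. */
    static bool example_has_children(struct cgroup_subsys_state *css)
    {
            struct cgroup_subsys_state *pos;

            css_for_each_child(pos, css)
                    return true;
            return false;
    }
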
@@ -5002,9 +4983,10 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
5002 return 0; 4983 return 0;
5003} 4984}
5004 4985
5005static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 4986static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css,
4987 unsigned int event)
5006{ 4988{
5007 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4989 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5008 int ret; 4990 int ret;
5009 4991
5010 if (mem_cgroup_is_root(memcg)) 4992 if (mem_cgroup_is_root(memcg))
@@ -5017,21 +4999,18 @@ static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
5017} 4999}
5018 5000
5019 5001
5020static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft) 5002static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
5003 struct cftype *cft)
5021{ 5004{
5022 return mem_cgroup_from_cont(cont)->use_hierarchy; 5005 return mem_cgroup_from_css(css)->use_hierarchy;
5023} 5006}
5024 5007
5025static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, 5008static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
5026 u64 val) 5009 struct cftype *cft, u64 val)
5027{ 5010{
5028 int retval = 0; 5011 int retval = 0;
5029 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5012 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5030 struct cgroup *parent = cont->parent; 5013 struct mem_cgroup *parent_memcg = mem_cgroup_from_css(css_parent(&memcg->css));
5031 struct mem_cgroup *parent_memcg = NULL;
5032
5033 if (parent)
5034 parent_memcg = mem_cgroup_from_cont(parent);
5035 5014
5036 mutex_lock(&memcg_create_mutex); 5015 mutex_lock(&memcg_create_mutex);
5037 5016
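
This hunk is the payoff of the NULL-safe accessor: css_parent() returns NULL at the root, the NULL passes straight through mem_cgroup_from_css(), and the old declare/check/assign dance collapses to one line. The same collapse recurs in the swappiness and oom-control writers below, and again in device_cgroup. As a sketch (helper name hypothetical):

    /* Illustrative branch-free parent lookup; NULL means "is root". */
    static struct mem_cgroup *example_parent_memcg(struct mem_cgroup *memcg)
    {
            return mem_cgroup_from_css(css_parent(&memcg->css));
    }
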
@@ -5101,11 +5080,11 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
5101 return val << PAGE_SHIFT; 5080 return val << PAGE_SHIFT;
5102} 5081}
5103 5082
5104static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, 5083static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css,
5105 struct file *file, char __user *buf, 5084 struct cftype *cft, struct file *file,
5106 size_t nbytes, loff_t *ppos) 5085 char __user *buf, size_t nbytes, loff_t *ppos)
5107{ 5086{
5108 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5087 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5109 char str[64]; 5088 char str[64];
5110 u64 val; 5089 u64 val;
5111 int name, len; 5090 int name, len;
@@ -5138,11 +5117,11 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
5138 return simple_read_from_buffer(buf, nbytes, ppos, str, len); 5117 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
5139} 5118}
5140 5119
5141static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) 5120static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
5142{ 5121{
5143 int ret = -EINVAL; 5122 int ret = -EINVAL;
5144#ifdef CONFIG_MEMCG_KMEM 5123#ifdef CONFIG_MEMCG_KMEM
5145 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5124 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5146 /* 5125 /*
5147 * For simplicity, we won't allow this to be disabled. It also can't 5126 * For simplicity, we won't allow this to be disabled. It also can't
5148 * be changed if the cgroup has children already, or if tasks had 5127 * be changed if the cgroup has children already, or if tasks had
@@ -5158,7 +5137,7 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
5158 mutex_lock(&memcg_create_mutex); 5137 mutex_lock(&memcg_create_mutex);
5159 mutex_lock(&set_limit_mutex); 5138 mutex_lock(&set_limit_mutex);
5160 if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { 5139 if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
5161 if (cgroup_task_count(cont) || memcg_has_children(memcg)) { 5140 if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) {
5162 ret = -EBUSY; 5141 ret = -EBUSY;
5163 goto out; 5142 goto out;
5164 } 5143 }
@@ -5228,10 +5207,10 @@ out:
5228 * The user of this function is... 5207 * The user of this function is...
5229 * RES_LIMIT. 5208 * RES_LIMIT.
5230 */ 5209 */
5231static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, 5210static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
5232 const char *buffer) 5211 const char *buffer)
5233{ 5212{
5234 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5213 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5235 enum res_type type; 5214 enum res_type type;
5236 int name; 5215 int name;
5237 unsigned long long val; 5216 unsigned long long val;
@@ -5255,7 +5234,7 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
5255 else if (type == _MEMSWAP) 5234 else if (type == _MEMSWAP)
5256 ret = mem_cgroup_resize_memsw_limit(memcg, val); 5235 ret = mem_cgroup_resize_memsw_limit(memcg, val);
5257 else if (type == _KMEM) 5236 else if (type == _KMEM)
5258 ret = memcg_update_kmem_limit(cont, val); 5237 ret = memcg_update_kmem_limit(css, val);
5259 else 5238 else
5260 return -EINVAL; 5239 return -EINVAL;
5261 break; 5240 break;
@@ -5283,18 +5262,15 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
5283static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, 5262static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
5284 unsigned long long *mem_limit, unsigned long long *memsw_limit) 5263 unsigned long long *mem_limit, unsigned long long *memsw_limit)
5285{ 5264{
5286 struct cgroup *cgroup;
5287 unsigned long long min_limit, min_memsw_limit, tmp; 5265 unsigned long long min_limit, min_memsw_limit, tmp;
5288 5266
5289 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 5267 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
5290 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 5268 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
5291 cgroup = memcg->css.cgroup;
5292 if (!memcg->use_hierarchy) 5269 if (!memcg->use_hierarchy)
5293 goto out; 5270 goto out;
5294 5271
5295 while (cgroup->parent) { 5272 while (css_parent(&memcg->css)) {
5296 cgroup = cgroup->parent; 5273 memcg = mem_cgroup_from_css(css_parent(&memcg->css));
5297 memcg = mem_cgroup_from_cont(cgroup);
5298 if (!memcg->use_hierarchy) 5274 if (!memcg->use_hierarchy)
5299 break; 5275 break;
5300 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); 5276 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
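
The upward walk likewise climbs with css_parent() instead of chasing cgroup->parent pointers, re-deriving the memcg at each step. An equivalent restatement with one parent call per iteration:

    /* Illustrative climb toward the root; ends at the root (NULL parent)
     * or where the hierarchy flag stops. */
    struct mem_cgroup *pos = memcg;

    while ((pos = mem_cgroup_from_css(css_parent(&pos->css)))) {
            if (!pos->use_hierarchy)
                    break;
            /* fold pos's RES_LIMIT values into the running minimums */
    }
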
@@ -5307,9 +5283,9 @@ out:
5307 *memsw_limit = min_memsw_limit; 5283 *memsw_limit = min_memsw_limit;
5308} 5284}
5309 5285
5310static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 5286static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event)
5311{ 5287{
5312 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5288 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5313 int name; 5289 int name;
5314 enum res_type type; 5290 enum res_type type;
5315 5291
@@ -5342,17 +5318,17 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
5342 return 0; 5318 return 0;
5343} 5319}
5344 5320
5345static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, 5321static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
5346 struct cftype *cft) 5322 struct cftype *cft)
5347{ 5323{
5348 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; 5324 return mem_cgroup_from_css(css)->move_charge_at_immigrate;
5349} 5325}
5350 5326
5351#ifdef CONFIG_MMU 5327#ifdef CONFIG_MMU
5352static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 5328static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
5353 struct cftype *cft, u64 val) 5329 struct cftype *cft, u64 val)
5354{ 5330{
5355 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5331 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5356 5332
5357 if (val >= (1 << NR_MOVE_TYPE)) 5333 if (val >= (1 << NR_MOVE_TYPE))
5358 return -EINVAL; 5334 return -EINVAL;
@@ -5367,7 +5343,7 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
5367 return 0; 5343 return 0;
5368} 5344}
5369#else 5345#else
5370static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 5346static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
5371 struct cftype *cft, u64 val) 5347 struct cftype *cft, u64 val)
5372{ 5348{
5373 return -ENOSYS; 5349 return -ENOSYS;
@@ -5375,13 +5351,13 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
5375#endif 5351#endif
5376 5352
5377#ifdef CONFIG_NUMA 5353#ifdef CONFIG_NUMA
5378static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, 5354static int memcg_numa_stat_show(struct cgroup_subsys_state *css,
5379 struct seq_file *m) 5355 struct cftype *cft, struct seq_file *m)
5380{ 5356{
5381 int nid; 5357 int nid;
5382 unsigned long total_nr, file_nr, anon_nr, unevictable_nr; 5358 unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
5383 unsigned long node_nr; 5359 unsigned long node_nr;
5384 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5360 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5385 5361
5386 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); 5362 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
5387 seq_printf(m, "total=%lu", total_nr); 5363 seq_printf(m, "total=%lu", total_nr);
@@ -5426,10 +5402,10 @@ static inline void mem_cgroup_lru_names_not_uptodate(void)
5426 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 5402 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
5427} 5403}
5428 5404
5429static int memcg_stat_show(struct cgroup *cont, struct cftype *cft, 5405static int memcg_stat_show(struct cgroup_subsys_state *css, struct cftype *cft,
5430 struct seq_file *m) 5406 struct seq_file *m)
5431{ 5407{
5432 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5408 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5433 struct mem_cgroup *mi; 5409 struct mem_cgroup *mi;
5434 unsigned int i; 5410 unsigned int i;
5435 5411
@@ -5513,27 +5489,23 @@ static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,
5513 return 0; 5489 return 0;
5514} 5490}
5515 5491
5516static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) 5492static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
5493 struct cftype *cft)
5517{ 5494{
5518 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5495 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5519 5496
5520 return mem_cgroup_swappiness(memcg); 5497 return mem_cgroup_swappiness(memcg);
5521} 5498}
5522 5499
5523static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, 5500static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
5524 u64 val) 5501 struct cftype *cft, u64 val)
5525{ 5502{
5526 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5503 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5527 struct mem_cgroup *parent; 5504 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css));
5528
5529 if (val > 100)
5530 return -EINVAL;
5531 5505
5532 if (cgrp->parent == NULL) 5506 if (val > 100 || !parent)
5533 return -EINVAL; 5507 return -EINVAL;
5534 5508
5535 parent = mem_cgroup_from_cont(cgrp->parent);
5536
5537 mutex_lock(&memcg_create_mutex); 5509 mutex_lock(&memcg_create_mutex);
5538 5510
5539 /* If under hierarchy, only empty-root can set this value */ 5511 /* If under hierarchy, only empty-root can set this value */
@@ -5636,10 +5608,10 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
5636 mem_cgroup_oom_notify_cb(iter); 5608 mem_cgroup_oom_notify_cb(iter);
5637} 5609}
5638 5610
5639static int mem_cgroup_usage_register_event(struct cgroup *cgrp, 5611static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css,
5640 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 5612 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
5641{ 5613{
5642 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5614 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5643 struct mem_cgroup_thresholds *thresholds; 5615 struct mem_cgroup_thresholds *thresholds;
5644 struct mem_cgroup_threshold_ary *new; 5616 struct mem_cgroup_threshold_ary *new;
5645 enum res_type type = MEMFILE_TYPE(cft->private); 5617 enum res_type type = MEMFILE_TYPE(cft->private);
@@ -5719,10 +5691,10 @@ unlock:
5719 return ret; 5691 return ret;
5720} 5692}
5721 5693
5722static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, 5694static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css,
5723 struct cftype *cft, struct eventfd_ctx *eventfd) 5695 struct cftype *cft, struct eventfd_ctx *eventfd)
5724{ 5696{
5725 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5697 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5726 struct mem_cgroup_thresholds *thresholds; 5698 struct mem_cgroup_thresholds *thresholds;
5727 struct mem_cgroup_threshold_ary *new; 5699 struct mem_cgroup_threshold_ary *new;
5728 enum res_type type = MEMFILE_TYPE(cft->private); 5700 enum res_type type = MEMFILE_TYPE(cft->private);
@@ -5798,10 +5770,10 @@ unlock:
5798 mutex_unlock(&memcg->thresholds_lock); 5770 mutex_unlock(&memcg->thresholds_lock);
5799} 5771}
5800 5772
5801static int mem_cgroup_oom_register_event(struct cgroup *cgrp, 5773static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css,
5802 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 5774 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
5803{ 5775{
5804 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5776 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5805 struct mem_cgroup_eventfd_list *event; 5777 struct mem_cgroup_eventfd_list *event;
5806 enum res_type type = MEMFILE_TYPE(cft->private); 5778 enum res_type type = MEMFILE_TYPE(cft->private);
5807 5779
@@ -5823,10 +5795,10 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
5823 return 0; 5795 return 0;
5824} 5796}
5825 5797
5826static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, 5798static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css,
5827 struct cftype *cft, struct eventfd_ctx *eventfd) 5799 struct cftype *cft, struct eventfd_ctx *eventfd)
5828{ 5800{
5829 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5801 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5830 struct mem_cgroup_eventfd_list *ev, *tmp; 5802 struct mem_cgroup_eventfd_list *ev, *tmp;
5831 enum res_type type = MEMFILE_TYPE(cft->private); 5803 enum res_type type = MEMFILE_TYPE(cft->private);
5832 5804
@@ -5844,10 +5816,10 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
5844 spin_unlock(&memcg_oom_lock); 5816 spin_unlock(&memcg_oom_lock);
5845} 5817}
5846 5818
5847static int mem_cgroup_oom_control_read(struct cgroup *cgrp, 5819static int mem_cgroup_oom_control_read(struct cgroup_subsys_state *css,
5848 struct cftype *cft, struct cgroup_map_cb *cb) 5820 struct cftype *cft, struct cgroup_map_cb *cb)
5849{ 5821{
5850 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5822 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5851 5823
5852 cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable); 5824 cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);
5853 5825
@@ -5858,18 +5830,16 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
5858 return 0; 5830 return 0;
5859} 5831}
5860 5832
5861static int mem_cgroup_oom_control_write(struct cgroup *cgrp, 5833static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
5862 struct cftype *cft, u64 val) 5834 struct cftype *cft, u64 val)
5863{ 5835{
5864 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 5836 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5865 struct mem_cgroup *parent; 5837 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css));
5866 5838
5867 /* cannot set to root cgroup and only 0 and 1 are allowed */ 5839 /* cannot set to root cgroup and only 0 and 1 are allowed */
5868 if (!cgrp->parent || !((val == 0) || (val == 1))) 5840 if (!parent || !((val == 0) || (val == 1)))
5869 return -EINVAL; 5841 return -EINVAL;
5870 5842
5871 parent = mem_cgroup_from_cont(cgrp->parent);
5872
5873 mutex_lock(&memcg_create_mutex); 5843 mutex_lock(&memcg_create_mutex);
5874 /* oom-kill-disable is a flag for subhierarchy. */ 5844 /* oom-kill-disable is a flag for subhierarchy. */
5875 if ((parent->use_hierarchy) || memcg_has_children(memcg)) { 5845 if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
@@ -6228,7 +6198,7 @@ static void __init mem_cgroup_soft_limit_tree_init(void)
6228} 6198}
6229 6199
6230static struct cgroup_subsys_state * __ref 6200static struct cgroup_subsys_state * __ref
6231mem_cgroup_css_alloc(struct cgroup *cont) 6201mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
6232{ 6202{
6233 struct mem_cgroup *memcg; 6203 struct mem_cgroup *memcg;
6234 long error = -ENOMEM; 6204 long error = -ENOMEM;
@@ -6243,7 +6213,7 @@ mem_cgroup_css_alloc(struct cgroup *cont)
6243 goto free_out; 6213 goto free_out;
6244 6214
6245 /* root ? */ 6215 /* root ? */
6246 if (cont->parent == NULL) { 6216 if (parent_css == NULL) {
6247 root_mem_cgroup = memcg; 6217 root_mem_cgroup = memcg;
6248 res_counter_init(&memcg->res, NULL); 6218 res_counter_init(&memcg->res, NULL);
6249 res_counter_init(&memcg->memsw, NULL); 6219 res_counter_init(&memcg->memsw, NULL);
@@ -6265,17 +6235,16 @@ free_out:
6265} 6235}
6266 6236
6267static int 6237static int
6268mem_cgroup_css_online(struct cgroup *cont) 6238mem_cgroup_css_online(struct cgroup_subsys_state *css)
6269{ 6239{
6270 struct mem_cgroup *memcg, *parent; 6240 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6241 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css));
6271 int error = 0; 6242 int error = 0;
6272 6243
6273 if (!cont->parent) 6244 if (!parent)
6274 return 0; 6245 return 0;
6275 6246
6276 mutex_lock(&memcg_create_mutex); 6247 mutex_lock(&memcg_create_mutex);
6277 memcg = mem_cgroup_from_cont(cont);
6278 parent = mem_cgroup_from_cont(cont->parent);
6279 6248
6280 memcg->use_hierarchy = parent->use_hierarchy; 6249 memcg->use_hierarchy = parent->use_hierarchy;
6281 memcg->oom_kill_disable = parent->oom_kill_disable; 6250 memcg->oom_kill_disable = parent->oom_kill_disable;
@@ -6326,9 +6295,9 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
6326 mem_cgroup_iter_invalidate(root_mem_cgroup); 6295 mem_cgroup_iter_invalidate(root_mem_cgroup);
6327} 6296}
6328 6297
6329static void mem_cgroup_css_offline(struct cgroup *cont) 6298static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
6330{ 6299{
6331 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 6300 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6332 6301
6333 kmem_cgroup_css_offline(memcg); 6302 kmem_cgroup_css_offline(memcg);
6334 6303
@@ -6338,9 +6307,9 @@ static void mem_cgroup_css_offline(struct cgroup *cont)
6338 vmpressure_cleanup(&memcg->vmpressure); 6307 vmpressure_cleanup(&memcg->vmpressure);
6339} 6308}
6340 6309
6341static void mem_cgroup_css_free(struct cgroup *cont) 6310static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
6342{ 6311{
6343 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 6312 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6344 6313
6345 memcg_destroy_kmem(memcg); 6314 memcg_destroy_kmem(memcg);
6346 __mem_cgroup_free(memcg); 6315 __mem_cgroup_free(memcg);
@@ -6710,12 +6679,12 @@ static void mem_cgroup_clear_mc(void)
6710 mem_cgroup_end_move(from); 6679 mem_cgroup_end_move(from);
6711} 6680}
6712 6681
6713static int mem_cgroup_can_attach(struct cgroup *cgroup, 6682static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
6714 struct cgroup_taskset *tset) 6683 struct cgroup_taskset *tset)
6715{ 6684{
6716 struct task_struct *p = cgroup_taskset_first(tset); 6685 struct task_struct *p = cgroup_taskset_first(tset);
6717 int ret = 0; 6686 int ret = 0;
6718 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup); 6687 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6719 unsigned long move_charge_at_immigrate; 6688 unsigned long move_charge_at_immigrate;
6720 6689
6721 /* 6690 /*
@@ -6757,7 +6726,7 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup,
6757 return ret; 6726 return ret;
6758} 6727}
6759 6728
6760static void mem_cgroup_cancel_attach(struct cgroup *cgroup, 6729static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
6761 struct cgroup_taskset *tset) 6730 struct cgroup_taskset *tset)
6762{ 6731{
6763 mem_cgroup_clear_mc(); 6732 mem_cgroup_clear_mc();
@@ -6905,7 +6874,7 @@ retry:
6905 up_read(&mm->mmap_sem); 6874 up_read(&mm->mmap_sem);
6906} 6875}
6907 6876
6908static void mem_cgroup_move_task(struct cgroup *cont, 6877static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
6909 struct cgroup_taskset *tset) 6878 struct cgroup_taskset *tset)
6910{ 6879{
6911 struct task_struct *p = cgroup_taskset_first(tset); 6880 struct task_struct *p = cgroup_taskset_first(tset);
@@ -6920,16 +6889,16 @@ static void mem_cgroup_move_task(struct cgroup *cont,
6920 mem_cgroup_clear_mc(); 6889 mem_cgroup_clear_mc();
6921} 6890}
6922#else /* !CONFIG_MMU */ 6891#else /* !CONFIG_MMU */
6923static int mem_cgroup_can_attach(struct cgroup *cgroup, 6892static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
6924 struct cgroup_taskset *tset) 6893 struct cgroup_taskset *tset)
6925{ 6894{
6926 return 0; 6895 return 0;
6927} 6896}
6928static void mem_cgroup_cancel_attach(struct cgroup *cgroup, 6897static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
6929 struct cgroup_taskset *tset) 6898 struct cgroup_taskset *tset)
6930{ 6899{
6931} 6900}
6932static void mem_cgroup_move_task(struct cgroup *cont, 6901static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
6933 struct cgroup_taskset *tset) 6902 struct cgroup_taskset *tset)
6934{ 6903{
6935} 6904}
@@ -6939,15 +6908,15 @@ static void mem_cgroup_move_task(struct cgroup *cont,
6939 * Cgroup retains root cgroups across [un]mount cycles making it necessary 6908 * Cgroup retains root cgroups across [un]mount cycles making it necessary
6940 * to verify sane_behavior flag on each mount attempt. 6909 * to verify sane_behavior flag on each mount attempt.
6941 */ 6910 */
6942static void mem_cgroup_bind(struct cgroup *root) 6911static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
6943{ 6912{
6944 /* 6913 /*
6945 * use_hierarchy is forced with sane_behavior. cgroup core 6914 * use_hierarchy is forced with sane_behavior. cgroup core
6946 * guarantees that @root doesn't have any children, so turning it 6915 * guarantees that @root doesn't have any children, so turning it
6947 * on for the root memcg is enough. 6916 * on for the root memcg is enough.
6948 */ 6917 */
6949 if (cgroup_sane_behavior(root)) 6918 if (cgroup_sane_behavior(root_css->cgroup))
6950 mem_cgroup_from_cont(root)->use_hierarchy = true; 6919 mem_cgroup_from_css(root_css)->use_hierarchy = true;
6951} 6920}
6952 6921
6953struct cgroup_subsys mem_cgroup_subsys = { 6922struct cgroup_subsys mem_cgroup_subsys = {
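
Taken together, memcg's lifecycle callbacks spell out the whole new contract: css_alloc() receives the parent's css (the new one does not exist yet), while online/offline/free receive the css being acted on. A skeleton of a converted controller, with a hypothetical example_cgroup and error paths elided:

    struct example_cgroup {
            struct cgroup_subsys_state css;
            /* per-group state ... */
    };

    static struct cgroup_subsys_state *
    example_css_alloc(struct cgroup_subsys_state *parent_css)
    {
            struct example_cgroup *eg = kzalloc(sizeof(*eg), GFP_KERNEL);

            if (!eg)
                    return ERR_PTR(-ENOMEM);
            /* !parent_css means this allocation is for the root group */
            return &eg->css;
    }

    static int example_css_online(struct cgroup_subsys_state *css)
    {
            /* inherit tunables from css_parent(css), as memcg does above */
            return 0;
    }

    static void example_css_free(struct cgroup_subsys_state *css)
    {
            kfree(container_of(css, struct example_cgroup, css));
    }
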
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index 0c1e37d829fa..e0f62837c3f4 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -74,15 +74,10 @@ static struct vmpressure *work_to_vmpressure(struct work_struct *work)
74 return container_of(work, struct vmpressure, work); 74 return container_of(work, struct vmpressure, work);
75} 75}
76 76
77static struct vmpressure *cg_to_vmpressure(struct cgroup *cg)
78{
79 return css_to_vmpressure(cgroup_subsys_state(cg, mem_cgroup_subsys_id));
80}
81
82static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr) 77static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
83{ 78{
84 struct cgroup *cg = vmpressure_to_css(vmpr)->cgroup; 79 struct cgroup_subsys_state *css = vmpressure_to_css(vmpr);
85 struct mem_cgroup *memcg = mem_cgroup_from_cont(cg); 80 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
86 81
87 memcg = parent_mem_cgroup(memcg); 82 memcg = parent_mem_cgroup(memcg);
88 if (!memcg) 83 if (!memcg)
@@ -283,7 +278,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
283 278
284/** 279/**
285 * vmpressure_register_event() - Bind vmpressure notifications to an eventfd 280 * vmpressure_register_event() - Bind vmpressure notifications to an eventfd
286 * @cg: cgroup that is interested in vmpressure notifications 281 * @css: css that is interested in vmpressure notifications
287 * @cft: cgroup control files handle 282 * @cft: cgroup control files handle
288 * @eventfd: eventfd context to link notifications with 283 * @eventfd: eventfd context to link notifications with
289 * @args: event arguments (used to set up a pressure level threshold) 284 * @args: event arguments (used to set up a pressure level threshold)
@@ -298,10 +293,11 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
298 * cftype).register_event, and then cgroup core will handle everything by 293 * cftype).register_event, and then cgroup core will handle everything by
299 * itself. 294 * itself.
300 */ 295 */
301int vmpressure_register_event(struct cgroup *cg, struct cftype *cft, 296int vmpressure_register_event(struct cgroup_subsys_state *css,
302 struct eventfd_ctx *eventfd, const char *args) 297 struct cftype *cft, struct eventfd_ctx *eventfd,
298 const char *args)
303{ 299{
304 struct vmpressure *vmpr = cg_to_vmpressure(cg); 300 struct vmpressure *vmpr = css_to_vmpressure(css);
305 struct vmpressure_event *ev; 301 struct vmpressure_event *ev;
306 int level; 302 int level;
307 303
@@ -329,7 +325,7 @@ int vmpressure_register_event(struct cgroup *cg, struct cftype *cft,
329 325
330/** 326/**
331 * vmpressure_unregister_event() - Unbind eventfd from vmpressure 327 * vmpressure_unregister_event() - Unbind eventfd from vmpressure
332 * @cg: cgroup handle 328 * @css: css handle
333 * @cft: cgroup control files handle 329 * @cft: cgroup control files handle
334 * @eventfd: eventfd context that was used to link vmpressure with the @cg 330 * @eventfd: eventfd context that was used to link vmpressure with the @cg
335 * 331 *
@@ -341,10 +337,11 @@ int vmpressure_register_event(struct cgroup *cg, struct cftype *cft,
341 * cftype).unregister_event, and then cgroup core will handle everything 337 * cftype).unregister_event, and then cgroup core will handle everything
342 * by itself. 338 * by itself.
343 */ 339 */
344void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft, 340void vmpressure_unregister_event(struct cgroup_subsys_state *css,
341 struct cftype *cft,
345 struct eventfd_ctx *eventfd) 342 struct eventfd_ctx *eventfd)
346{ 343{
347 struct vmpressure *vmpr = cg_to_vmpressure(cg); 344 struct vmpressure *vmpr = css_to_vmpressure(css);
348 struct vmpressure_event *ev; 345 struct vmpressure_event *ev;
349 346
350 mutex_lock(&vmpr->events_lock); 347 mutex_lock(&vmpr->events_lock);
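
With the css handed in directly, vmpressure no longer needs its private cgroup-to-vmpressure bridge; css_to_vmpressure() is one container_of away, and the parent hop goes css to memcg to parent memcg and back to its vmpressure (via the memcg_to_vmpressure() accessor, assumed here). Restating the converted vmpressure_parent() chain as a sketch:

    /* Illustrative; mirrors the converted vmpressure_parent() above. */
    static struct vmpressure *example_vmpressure_parent(struct vmpressure *vmpr)
    {
            struct cgroup_subsys_state *css = vmpressure_to_css(vmpr);
            struct mem_cgroup *memcg = mem_cgroup_from_css(css);

            memcg = parent_mem_cgroup(memcg);
            return memcg ? memcg_to_vmpressure(memcg) : NULL;
    }
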
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index e533259dce3c..d9cd627e6a16 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -29,12 +29,6 @@
29 29
30#define PRIOMAP_MIN_SZ 128 30#define PRIOMAP_MIN_SZ 128
31 31
32static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgrp)
33{
34 return container_of(cgroup_subsys_state(cgrp, net_prio_subsys_id),
35 struct cgroup_netprio_state, css);
36}
37
38/* 32/*
39 * Extend @dev->priomap so that it's large enough to accommodate 33 * Extend @dev->priomap so that it's large enough to accommodate
40 * @target_idx. @dev->priomap.priomap_len > @target_idx after successful 34 * @target_idx. @dev->priomap.priomap_len > @target_idx after successful
@@ -87,67 +81,70 @@ static int extend_netdev_table(struct net_device *dev, u32 target_idx)
87 81
88/** 82/**
89 * netprio_prio - return the effective netprio of a cgroup-net_device pair 83 * netprio_prio - return the effective netprio of a cgroup-net_device pair
90 * @cgrp: cgroup part of the target pair 84 * @css: css part of the target pair
91 * @dev: net_device part of the target pair 85 * @dev: net_device part of the target pair
92 * 86 *
93 * Should be called under RCU read or rtnl lock. 87 * Should be called under RCU read or rtnl lock.
94 */ 88 */
95static u32 netprio_prio(struct cgroup *cgrp, struct net_device *dev) 89static u32 netprio_prio(struct cgroup_subsys_state *css, struct net_device *dev)
96{ 90{
97 struct netprio_map *map = rcu_dereference_rtnl(dev->priomap); 91 struct netprio_map *map = rcu_dereference_rtnl(dev->priomap);
92 int id = css->cgroup->id;
98 93
99 if (map && cgrp->id < map->priomap_len) 94 if (map && id < map->priomap_len)
100 return map->priomap[cgrp->id]; 95 return map->priomap[id];
101 return 0; 96 return 0;
102} 97}
103 98
104/** 99/**
105 * netprio_set_prio - set netprio on a cgroup-net_device pair 100 * netprio_set_prio - set netprio on a cgroup-net_device pair
106 * @cgrp: cgroup part of the target pair 101 * @css: css part of the target pair
107 * @dev: net_device part of the target pair 102 * @dev: net_device part of the target pair
108 * @prio: prio to set 103 * @prio: prio to set
109 * 104 *
110 * Set netprio to @prio on @cgrp-@dev pair. Should be called under rtnl 105 * Set netprio to @prio on @css-@dev pair. Should be called under rtnl
111 * lock and may fail under memory pressure for non-zero @prio. 106 * lock and may fail under memory pressure for non-zero @prio.
112 */ 107 */
113static int netprio_set_prio(struct cgroup *cgrp, struct net_device *dev, 108static int netprio_set_prio(struct cgroup_subsys_state *css,
114 u32 prio) 109 struct net_device *dev, u32 prio)
115{ 110{
116 struct netprio_map *map; 111 struct netprio_map *map;
112 int id = css->cgroup->id;
117 int ret; 113 int ret;
118 114
119 /* avoid extending priomap for zero writes */ 115 /* avoid extending priomap for zero writes */
120 map = rtnl_dereference(dev->priomap); 116 map = rtnl_dereference(dev->priomap);
121 if (!prio && (!map || map->priomap_len <= cgrp->id)) 117 if (!prio && (!map || map->priomap_len <= id))
122 return 0; 118 return 0;
123 119
124 ret = extend_netdev_table(dev, cgrp->id); 120 ret = extend_netdev_table(dev, id);
125 if (ret) 121 if (ret)
126 return ret; 122 return ret;
127 123
128 map = rtnl_dereference(dev->priomap); 124 map = rtnl_dereference(dev->priomap);
129 map->priomap[cgrp->id] = prio; 125 map->priomap[id] = prio;
130 return 0; 126 return 0;
131} 127}
132 128
133static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp) 129static struct cgroup_subsys_state *
130cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
134{ 131{
135 struct cgroup_netprio_state *cs; 132 struct cgroup_subsys_state *css;
136 133
137 cs = kzalloc(sizeof(*cs), GFP_KERNEL); 134 css = kzalloc(sizeof(*css), GFP_KERNEL);
138 if (!cs) 135 if (!css)
139 return ERR_PTR(-ENOMEM); 136 return ERR_PTR(-ENOMEM);
140 137
141 return &cs->css; 138 return css;
142} 139}
143 140
144static int cgrp_css_online(struct cgroup *cgrp) 141static int cgrp_css_online(struct cgroup_subsys_state *css)
145{ 142{
146 struct cgroup *parent = cgrp->parent; 143 struct cgroup_subsys_state *parent_css = css_parent(css);
147 struct net_device *dev; 144 struct net_device *dev;
148 int ret = 0; 145 int ret = 0;
149 146
150 if (!parent) 147 if (!parent_css)
151 return 0; 148 return 0;
152 149
153 rtnl_lock(); 150 rtnl_lock();
@@ -156,9 +153,9 @@ static int cgrp_css_online(struct cgroup *cgrp)
156 * onlining, there is no need to clear them on offline. 153 * onlining, there is no need to clear them on offline.
157 */ 154 */
158 for_each_netdev(&init_net, dev) { 155 for_each_netdev(&init_net, dev) {
159 u32 prio = netprio_prio(parent, dev); 156 u32 prio = netprio_prio(parent_css, dev);
160 157
161 ret = netprio_set_prio(cgrp, dev, prio); 158 ret = netprio_set_prio(css, dev, prio);
162 if (ret) 159 if (ret)
163 break; 160 break;
164 } 161 }
@@ -166,29 +163,29 @@ static int cgrp_css_online(struct cgroup *cgrp)
166 return ret; 163 return ret;
167} 164}
168 165
169static void cgrp_css_free(struct cgroup *cgrp) 166static void cgrp_css_free(struct cgroup_subsys_state *css)
170{ 167{
171 kfree(cgrp_netprio_state(cgrp)); 168 kfree(css);
172} 169}
173 170
174static u64 read_prioidx(struct cgroup *cgrp, struct cftype *cft) 171static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft)
175{ 172{
176 return cgrp->id; 173 return css->cgroup->id;
177} 174}
178 175
179static int read_priomap(struct cgroup *cont, struct cftype *cft, 176static int read_priomap(struct cgroup_subsys_state *css, struct cftype *cft,
180 struct cgroup_map_cb *cb) 177 struct cgroup_map_cb *cb)
181{ 178{
182 struct net_device *dev; 179 struct net_device *dev;
183 180
184 rcu_read_lock(); 181 rcu_read_lock();
185 for_each_netdev_rcu(&init_net, dev) 182 for_each_netdev_rcu(&init_net, dev)
186 cb->fill(cb, dev->name, netprio_prio(cont, dev)); 183 cb->fill(cb, dev->name, netprio_prio(css, dev));
187 rcu_read_unlock(); 184 rcu_read_unlock();
188 return 0; 185 return 0;
189} 186}
190 187
191static int write_priomap(struct cgroup *cgrp, struct cftype *cft, 188static int write_priomap(struct cgroup_subsys_state *css, struct cftype *cft,
192 const char *buffer) 189 const char *buffer)
193{ 190{
194 char devname[IFNAMSIZ + 1]; 191 char devname[IFNAMSIZ + 1];
@@ -205,7 +202,7 @@ static int write_priomap(struct cgroup *cgrp, struct cftype *cft,
205 202
206 rtnl_lock(); 203 rtnl_lock();
207 204
208 ret = netprio_set_prio(cgrp, dev, prio); 205 ret = netprio_set_prio(css, dev, prio);
209 206
210 rtnl_unlock(); 207 rtnl_unlock();
211 dev_put(dev); 208 dev_put(dev);
@@ -221,12 +218,13 @@ static int update_netprio(const void *v, struct file *file, unsigned n)
221 return 0; 218 return 0;
222} 219}
223 220
224static void net_prio_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 221static void net_prio_attach(struct cgroup_subsys_state *css,
222 struct cgroup_taskset *tset)
225{ 223{
226 struct task_struct *p; 224 struct task_struct *p;
227 void *v; 225 void *v;
228 226
229 cgroup_taskset_for_each(p, cgrp, tset) { 227 cgroup_taskset_for_each(p, css, tset) {
230 task_lock(p); 228 task_lock(p);
231 v = (void *)(unsigned long)task_netprioidx(p); 229 v = (void *)(unsigned long)task_netprioidx(p);
232 iterate_fd(p->files, 0, update_netprio, v); 230 iterate_fd(p->files, 0, update_netprio, v);
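
Two details stand out in the netprio conversion: cgroup_netprio_state held nothing beyond its css, so css_alloc()/css_free() now traffic in a bare kzalloc'd css; and because a css has no id of its own at this point in the series, priomap indexing reaches through css->cgroup->id. The attach path keeps its shape with the taskset iterator now taking a css; sketched with a hypothetical retag_fd() callback of the same shape as update_netprio():

    /* Illustrative attach: retag each migrating task's open files. */
    static void example_attach(struct cgroup_subsys_state *css,
                               struct cgroup_taskset *tset)
    {
            struct task_struct *p;
            void *v = (void *)(unsigned long)css->cgroup->id;

            cgroup_taskset_for_each(p, css, tset) {
                    task_lock(p);
                    iterate_fd(p->files, 0, retag_fd, v);
                    task_unlock(p);
            }
    }
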
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index da14436c1735..8a57d79b0b16 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -132,10 +132,10 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
132 return 0; 132 return 0;
133} 133}
134 134
135static int tcp_cgroup_write(struct cgroup *cont, struct cftype *cft, 135static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
136 const char *buffer) 136 const char *buffer)
137{ 137{
138 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 138 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
139 unsigned long long val; 139 unsigned long long val;
140 int ret = 0; 140 int ret = 0;
141 141
@@ -180,9 +180,9 @@ static u64 tcp_read_usage(struct mem_cgroup *memcg)
180 return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE); 180 return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE);
181} 181}
182 182
183static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft) 183static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft)
184{ 184{
185 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 185 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
186 u64 val; 186 u64 val;
187 187
188 switch (cft->private) { 188 switch (cft->private) {
@@ -202,13 +202,13 @@ static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft)
202 return val; 202 return val;
203} 203}
204 204
205static int tcp_cgroup_reset(struct cgroup *cont, unsigned int event) 205static int tcp_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event)
206{ 206{
207 struct mem_cgroup *memcg; 207 struct mem_cgroup *memcg;
208 struct tcp_memcontrol *tcp; 208 struct tcp_memcontrol *tcp;
209 struct cg_proto *cg_proto; 209 struct cg_proto *cg_proto;
210 210
211 memcg = mem_cgroup_from_cont(cont); 211 memcg = mem_cgroup_from_css(css);
212 cg_proto = tcp_prot.proto_cgroup(memcg); 212 cg_proto = tcp_prot.proto_cgroup(memcg);
213 if (!cg_proto) 213 if (!cg_proto)
214 return 0; 214 return 0;
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 3a294eb98d61..867b4a3e3980 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -23,19 +23,18 @@
23#include <net/sock.h> 23#include <net/sock.h>
24#include <net/cls_cgroup.h> 24#include <net/cls_cgroup.h>
25 25
26static inline struct cgroup_cls_state *cgrp_cls_state(struct cgroup *cgrp) 26static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state *css)
27{ 27{
28 return container_of(cgroup_subsys_state(cgrp, net_cls_subsys_id), 28 return css ? container_of(css, struct cgroup_cls_state, css) : NULL;
29 struct cgroup_cls_state, css);
30} 29}
31 30
32static inline struct cgroup_cls_state *task_cls_state(struct task_struct *p) 31static inline struct cgroup_cls_state *task_cls_state(struct task_struct *p)
33{ 32{
34 return container_of(task_subsys_state(p, net_cls_subsys_id), 33 return css_cls_state(task_css(p, net_cls_subsys_id));
35 struct cgroup_cls_state, css);
36} 34}
37 35
38static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp) 36static struct cgroup_subsys_state *
37cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
39{ 38{
40 struct cgroup_cls_state *cs; 39 struct cgroup_cls_state *cs;
41 40
@@ -45,17 +44,19 @@ static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp)
45 return &cs->css; 44 return &cs->css;
46} 45}
47 46
48static int cgrp_css_online(struct cgroup *cgrp) 47static int cgrp_css_online(struct cgroup_subsys_state *css)
49{ 48{
50 if (cgrp->parent) 49 struct cgroup_cls_state *cs = css_cls_state(css);
51 cgrp_cls_state(cgrp)->classid = 50 struct cgroup_cls_state *parent = css_cls_state(css_parent(css));
52 cgrp_cls_state(cgrp->parent)->classid; 51
52 if (parent)
53 cs->classid = parent->classid;
53 return 0; 54 return 0;
54} 55}
55 56
56static void cgrp_css_free(struct cgroup *cgrp) 57static void cgrp_css_free(struct cgroup_subsys_state *css)
57{ 58{
58 kfree(cgrp_cls_state(cgrp)); 59 kfree(css_cls_state(css));
59} 60}
60 61
61static int update_classid(const void *v, struct file *file, unsigned n) 62static int update_classid(const void *v, struct file *file, unsigned n)
@@ -67,12 +68,13 @@ static int update_classid(const void *v, struct file *file, unsigned n)
67 return 0; 68 return 0;
68} 69}
69 70
70static void cgrp_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 71static void cgrp_attach(struct cgroup_subsys_state *css,
72 struct cgroup_taskset *tset)
71{ 73{
72 struct task_struct *p; 74 struct task_struct *p;
73 void *v; 75 void *v;
74 76
75 cgroup_taskset_for_each(p, cgrp, tset) { 77 cgroup_taskset_for_each(p, css, tset) {
76 task_lock(p); 78 task_lock(p);
77 v = (void *)(unsigned long)task_cls_classid(p); 79 v = (void *)(unsigned long)task_cls_classid(p);
78 iterate_fd(p->files, 0, update_classid, v); 80 iterate_fd(p->files, 0, update_classid, v);
@@ -80,14 +82,15 @@ static void cgrp_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
80 } 82 }
81} 83}
82 84
83static u64 read_classid(struct cgroup *cgrp, struct cftype *cft) 85static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft)
84{ 86{
85 return cgrp_cls_state(cgrp)->classid; 87 return css_cls_state(css)->classid;
86} 88}
87 89
88static int write_classid(struct cgroup *cgrp, struct cftype *cft, u64 value) 90static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
91 u64 value)
89{ 92{
90 cgrp_cls_state(cgrp)->classid = (u32) value; 93 css_cls_state(css)->classid = (u32) value;
91 return 0; 94 return 0;
92} 95}
93 96
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index e8aad69f0d69..c123628d3f84 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -53,22 +53,17 @@ struct dev_cgroup {
53 53
54static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s) 54static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s)
55{ 55{
56 return container_of(s, struct dev_cgroup, css); 56 return s ? container_of(s, struct dev_cgroup, css) : NULL;
57}
58
59static inline struct dev_cgroup *cgroup_to_devcgroup(struct cgroup *cgroup)
60{
61 return css_to_devcgroup(cgroup_subsys_state(cgroup, devices_subsys_id));
62} 57}
63 58
64static inline struct dev_cgroup *task_devcgroup(struct task_struct *task) 59static inline struct dev_cgroup *task_devcgroup(struct task_struct *task)
65{ 60{
66 return css_to_devcgroup(task_subsys_state(task, devices_subsys_id)); 61 return css_to_devcgroup(task_css(task, devices_subsys_id));
67} 62}
68 63
69struct cgroup_subsys devices_subsys; 64struct cgroup_subsys devices_subsys;
70 65
71static int devcgroup_can_attach(struct cgroup *new_cgrp, 66static int devcgroup_can_attach(struct cgroup_subsys_state *new_css,
72 struct cgroup_taskset *set) 67 struct cgroup_taskset *set)
73{ 68{
74 struct task_struct *task = cgroup_taskset_first(set); 69 struct task_struct *task = cgroup_taskset_first(set);
@@ -193,18 +188,16 @@ static inline bool is_devcg_online(const struct dev_cgroup *devcg)
193/** 188/**
194 * devcgroup_online - initializes devcgroup's behavior and exceptions based on 189 * devcgroup_online - initializes devcgroup's behavior and exceptions based on
195 * parent's 190 * parent's
196 * @cgroup: cgroup getting online 191 * @css: css getting online
197 * returns 0 in case of success, error code otherwise 192 * returns 0 in case of success, error code otherwise
198 */ 193 */
199static int devcgroup_online(struct cgroup *cgroup) 194static int devcgroup_online(struct cgroup_subsys_state *css)
200{ 195{
201 struct dev_cgroup *dev_cgroup, *parent_dev_cgroup = NULL; 196 struct dev_cgroup *dev_cgroup = css_to_devcgroup(css);
197 struct dev_cgroup *parent_dev_cgroup = css_to_devcgroup(css_parent(css));
202 int ret = 0; 198 int ret = 0;
203 199
204 mutex_lock(&devcgroup_mutex); 200 mutex_lock(&devcgroup_mutex);
205 dev_cgroup = cgroup_to_devcgroup(cgroup);
206 if (cgroup->parent)
207 parent_dev_cgroup = cgroup_to_devcgroup(cgroup->parent);
208 201
209 if (parent_dev_cgroup == NULL) 202 if (parent_dev_cgroup == NULL)
210 dev_cgroup->behavior = DEVCG_DEFAULT_ALLOW; 203 dev_cgroup->behavior = DEVCG_DEFAULT_ALLOW;
@@ -219,9 +212,9 @@ static int devcgroup_online(struct cgroup *cgroup)
219 return ret; 212 return ret;
220} 213}
221 214
222static void devcgroup_offline(struct cgroup *cgroup) 215static void devcgroup_offline(struct cgroup_subsys_state *css)
223{ 216{
224 struct dev_cgroup *dev_cgroup = cgroup_to_devcgroup(cgroup); 217 struct dev_cgroup *dev_cgroup = css_to_devcgroup(css);
225 218
226 mutex_lock(&devcgroup_mutex); 219 mutex_lock(&devcgroup_mutex);
227 dev_cgroup->behavior = DEVCG_DEFAULT_NONE; 220 dev_cgroup->behavior = DEVCG_DEFAULT_NONE;
@@ -231,7 +224,8 @@ static void devcgroup_offline(struct cgroup *cgroup)
231/* 224/*
232 * called from kernel/cgroup.c with cgroup_lock() held. 225 * called from kernel/cgroup.c with cgroup_lock() held.
233 */ 226 */
234static struct cgroup_subsys_state *devcgroup_css_alloc(struct cgroup *cgroup) 227static struct cgroup_subsys_state *
228devcgroup_css_alloc(struct cgroup_subsys_state *parent_css)
235{ 229{
236 struct dev_cgroup *dev_cgroup; 230 struct dev_cgroup *dev_cgroup;
237 231
@@ -244,11 +238,10 @@ static struct cgroup_subsys_state *devcgroup_css_alloc(struct cgroup *cgroup)
244 return &dev_cgroup->css; 238 return &dev_cgroup->css;
245} 239}
246 240
247static void devcgroup_css_free(struct cgroup *cgroup) 241static void devcgroup_css_free(struct cgroup_subsys_state *css)
248{ 242{
249 struct dev_cgroup *dev_cgroup; 243 struct dev_cgroup *dev_cgroup = css_to_devcgroup(css);
250 244
251 dev_cgroup = cgroup_to_devcgroup(cgroup);
252 __dev_exception_clean(dev_cgroup); 245 __dev_exception_clean(dev_cgroup);
253 kfree(dev_cgroup); 246 kfree(dev_cgroup);
254} 247}
@@ -291,10 +284,10 @@ static void set_majmin(char *str, unsigned m)
291 sprintf(str, "%u", m); 284 sprintf(str, "%u", m);
292} 285}
293 286
294static int devcgroup_seq_read(struct cgroup *cgroup, struct cftype *cft, 287static int devcgroup_seq_read(struct cgroup_subsys_state *css,
295 struct seq_file *m) 288 struct cftype *cft, struct seq_file *m)
296{ 289{
297 struct dev_cgroup *devcgroup = cgroup_to_devcgroup(cgroup); 290 struct dev_cgroup *devcgroup = css_to_devcgroup(css);
298 struct dev_exception_item *ex; 291 struct dev_exception_item *ex;
299 char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN]; 292 char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN];
300 293
@@ -394,12 +387,10 @@ static bool may_access(struct dev_cgroup *dev_cgroup,
394static int parent_has_perm(struct dev_cgroup *childcg, 387static int parent_has_perm(struct dev_cgroup *childcg,
395 struct dev_exception_item *ex) 388 struct dev_exception_item *ex)
396{ 389{
397 struct cgroup *pcg = childcg->css.cgroup->parent; 390 struct dev_cgroup *parent = css_to_devcgroup(css_parent(&childcg->css));
398 struct dev_cgroup *parent;
399 391
400 if (!pcg) 392 if (!parent)
401 return 1; 393 return 1;
402 parent = cgroup_to_devcgroup(pcg);
403 return may_access(parent, ex, childcg->behavior); 394 return may_access(parent, ex, childcg->behavior);
404} 395}
405 396
@@ -451,13 +442,13 @@ static void revalidate_active_exceptions(struct dev_cgroup *devcg)
451static int propagate_exception(struct dev_cgroup *devcg_root, 442static int propagate_exception(struct dev_cgroup *devcg_root,
452 struct dev_exception_item *ex) 443 struct dev_exception_item *ex)
453{ 444{
454 struct cgroup *root = devcg_root->css.cgroup, *pos; 445 struct cgroup_subsys_state *pos;
455 int rc = 0; 446 int rc = 0;
456 447
457 rcu_read_lock(); 448 rcu_read_lock();
458 449
459 cgroup_for_each_descendant_pre(pos, root) { 450 css_for_each_descendant_pre(pos, &devcg_root->css) {
460 struct dev_cgroup *devcg = cgroup_to_devcgroup(pos); 451 struct dev_cgroup *devcg = css_to_devcgroup(pos);
461 452
462 /* 453 /*
463 * Because devcgroup_mutex is held, no devcg will become 454 * Because devcgroup_mutex is held, no devcg will become
@@ -465,7 +456,7 @@ static int propagate_exception(struct dev_cgroup *devcg_root,
465 * methods), and online ones are safe to access outside RCU 456 * methods), and online ones are safe to access outside RCU
466 * read lock without bumping refcnt. 457 * read lock without bumping refcnt.
467 */ 458 */
468 if (!is_devcg_online(devcg)) 459 if (pos == &devcg_root->css || !is_devcg_online(devcg))
469 continue; 460 continue;
470 461
471 rcu_read_unlock(); 462 rcu_read_unlock();
@@ -524,15 +515,11 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
524 char temp[12]; /* 11 + 1 characters needed for a u32 */ 515 char temp[12]; /* 11 + 1 characters needed for a u32 */
525 int count, rc = 0; 516 int count, rc = 0;
526 struct dev_exception_item ex; 517 struct dev_exception_item ex;
527 struct cgroup *p = devcgroup->css.cgroup; 518 struct dev_cgroup *parent = css_to_devcgroup(css_parent(&devcgroup->css));
528 struct dev_cgroup *parent = NULL;
529 519
530 if (!capable(CAP_SYS_ADMIN)) 520 if (!capable(CAP_SYS_ADMIN))
531 return -EPERM; 521 return -EPERM;
532 522
533 if (p->parent)
534 parent = cgroup_to_devcgroup(p->parent);
535
536 memset(&ex, 0, sizeof(ex)); 523 memset(&ex, 0, sizeof(ex));
537 b = buffer; 524 b = buffer;
538 525
@@ -677,13 +664,13 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
677 return rc; 664 return rc;
678} 665}
679 666
680static int devcgroup_access_write(struct cgroup *cgrp, struct cftype *cft, 667static int devcgroup_access_write(struct cgroup_subsys_state *css,
681 const char *buffer) 668 struct cftype *cft, const char *buffer)
682{ 669{
683 int retval; 670 int retval;
684 671
685 mutex_lock(&devcgroup_mutex); 672 mutex_lock(&devcgroup_mutex);
686 retval = devcgroup_update_access(cgroup_to_devcgroup(cgrp), 673 retval = devcgroup_update_access(css_to_devcgroup(css),
687 cft->private, buffer); 674 cft->private, buffer);
688 mutex_unlock(&devcgroup_mutex); 675 mutex_unlock(&devcgroup_mutex);
689 return retval; 676 return retval;
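
The device_cgroup hunk above also records the one real semantic change in the new descendant iterators: css_for_each_descendant_pre() visits the origin css too, which is why propagate_exception() gains the explicit pos == &devcg_root->css test where the old cgroup walk started below the root. The idiom, sketched for kernel context:

    /* Illustrative: skip the origin by hand when only true descendants
     * should be touched. */
    struct cgroup_subsys_state *pos;

    rcu_read_lock();
    css_for_each_descendant_pre(pos, &devcg_root->css) {
            if (pos == &devcg_root->css)
                    continue;
            /* apply the change to css_to_devcgroup(pos) */
    }
    rcu_read_unlock();
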