aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/cgroup.c
diff options
context:
space:
mode:
authorKAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>2009-04-02 19:57:25 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2009-04-02 22:04:53 -0400
commit38460b48d06440de46b34cb778bd6c4855030754 (patch)
tree8f3362a446b5b03879f715c3f7279e70842bcca9 /kernel/cgroup.c
parent313e924c0852943e67335fad9d2608701f0dfe8e (diff)
cgroup: CSS ID support
Patch for Per-CSS(Cgroup Subsys State) ID and private hierarchy code. This patch attaches unique ID to each css and provides following. - css_lookup(subsys, id) returns pointer to struct cgroup_subsys_state of id. - css_get_next(subsys, id, rootid, depth, foundid) returns the next css under "root" by scanning When cgroup_subsys->use_id is set, an id for css is maintained. The cgroup framework only prepares - css_id of root css for subsys - id is automatically attached at creation of css. - id is *not* freed automatically. Because the cgroup framework doesn't know lifetime of cgroup_subsys_state. free_css_id() function is provided. This must be called by subsys. There are several reasons to develop this. - Saving space .... For example, memcg's swap_cgroup is array of pointers to cgroup. But it is not necessary to be very fast. By replacing pointers(8bytes per ent) with ID (2bytes per ent), we can reduce much amount of memory usage. - Scanning without lock. CSS_ID provides "scan id under this ROOT" function. By this, scanning css under root can be written without locks. ex) do { rcu_read_lock(); next = cgroup_get_next(subsys, id, root, &found); /* check sanity of next here */ css_tryget(); rcu_read_unlock(); id = found + 1 } while(...) Characteristics: - Each css has unique ID under subsys. - Lifetime of ID is controlled by subsys. - css ID contains "ID" and "Depth in hierarchy" and stack of hierarchy - Allowed ID is 1-65535, ID 0 is UNUSED ID. Design Choices: - scan-by-ID v.s. scan-by-tree-walk. As /proc's pid scan does, scan-by-ID is robust when scanning is done by following kind of routine. scan -> rest a while(release a lock) -> continue from interrupted memcg's hierarchical reclaim does this. - When subsys->use_id is set, # of css in the system is limited to 65535. 
[bharata@linux.vnet.ibm.com: remove rcu_read_lock() from css_get_next()] Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Acked-by: Paul Menage <menage@google.com> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--kernel/cgroup.c286
1 files changed, 285 insertions, 1 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 27792bcb0758..d3c521137425 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -94,7 +94,6 @@ struct cgroupfs_root {
94 char release_agent_path[PATH_MAX]; 94 char release_agent_path[PATH_MAX];
95}; 95};
96 96
97
98/* 97/*
99 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the 98 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
100 * subsystems that are otherwise unattached - it never has more than a 99 * subsystems that are otherwise unattached - it never has more than a
@@ -102,6 +101,39 @@ struct cgroupfs_root {
102 */ 101 */
103static struct cgroupfs_root rootnode; 102static struct cgroupfs_root rootnode;
104 103
/*
 * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
 * cgroup_subsys->use_id != 0.
 */
#define CSS_ID_MAX	(65535)
struct css_id {
	/*
	 * The css to which this ID points. This pointer is set to valid value
	 * after cgroup is populated. If cgroup is removed, this will be NULL.
	 * This pointer is expected to be RCU-safe because destroy()
	 * is called after synchronize_rcu(). But for safe use, css_is_removed()
	 * css_tryget() should be used for avoiding race.
	 */
	struct cgroup_subsys_state *css;
	/*
	 * ID of this css. Unique within the subsys; valid IDs are 1-65535,
	 * 0 means "no ID".
	 */
	unsigned short id;
	/*
	 * Depth in hierarchy which this ID belongs to.
	 */
	unsigned short depth;
	/*
	 * ID is freed by RCU. (and lookup routine is RCU safe.)
	 */
	struct rcu_head rcu_head;
	/*
	 * Hierarchy of CSS ID belongs to: stack[0..depth] holds the IDs of
	 * this css's ancestors starting at the root, with this css's own id
	 * at stack[depth] (see alloc_css_id()).
	 */
	unsigned short stack[0]; /* Array of Length (depth+1) */
};
135
136
105/* The list of hierarchy roots */ 137/* The list of hierarchy roots */
106 138
107static LIST_HEAD(roots); 139static LIST_HEAD(roots);
@@ -185,6 +217,8 @@ struct cg_cgroup_link {
185static struct css_set init_css_set; 217static struct css_set init_css_set;
186static struct cg_cgroup_link init_css_set_link; 218static struct cg_cgroup_link init_css_set_link;
187 219
220static int cgroup_subsys_init_idr(struct cgroup_subsys *ss);
221
188/* css_set_lock protects the list of css_set objects, and the 222/* css_set_lock protects the list of css_set objects, and the
189 * chain of tasks off each css_set. Nests outside task->alloc_lock 223 * chain of tasks off each css_set. Nests outside task->alloc_lock
190 * due to cgroup_iter_start() */ 224 * due to cgroup_iter_start() */
@@ -567,6 +601,9 @@ static struct backing_dev_info cgroup_backing_dev_info = {
567 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 601 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
568}; 602};
569 603
604static int alloc_css_id(struct cgroup_subsys *ss,
605 struct cgroup *parent, struct cgroup *child);
606
570static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) 607static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
571{ 608{
572 struct inode *inode = new_inode(sb); 609 struct inode *inode = new_inode(sb);
@@ -2327,6 +2364,17 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
2327 if (ss->populate && (err = ss->populate(ss, cgrp)) < 0) 2364 if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
2328 return err; 2365 return err;
2329 } 2366 }
2367 /* This cgroup is ready now */
2368 for_each_subsys(cgrp->root, ss) {
2369 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
2370 /*
2371 * Update id->css pointer and make this css visible from
2372 * CSS ID functions. This pointer will be dereferened
2373 * from RCU-read-side without locks.
2374 */
2375 if (css->id)
2376 rcu_assign_pointer(css->id->css, css);
2377 }
2330 2378
2331 return 0; 2379 return 0;
2332} 2380}
@@ -2338,6 +2386,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
2338 css->cgroup = cgrp; 2386 css->cgroup = cgrp;
2339 atomic_set(&css->refcnt, 1); 2387 atomic_set(&css->refcnt, 1);
2340 css->flags = 0; 2388 css->flags = 0;
2389 css->id = NULL;
2341 if (cgrp == dummytop) 2390 if (cgrp == dummytop)
2342 set_bit(CSS_ROOT, &css->flags); 2391 set_bit(CSS_ROOT, &css->flags);
2343 BUG_ON(cgrp->subsys[ss->subsys_id]); 2392 BUG_ON(cgrp->subsys[ss->subsys_id]);
@@ -2413,6 +2462,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2413 goto err_destroy; 2462 goto err_destroy;
2414 } 2463 }
2415 init_cgroup_css(css, ss, cgrp); 2464 init_cgroup_css(css, ss, cgrp);
2465 if (ss->use_id)
2466 if (alloc_css_id(ss, parent, cgrp))
2467 goto err_destroy;
2468 /* At error, ->destroy() callback has to free assigned ID. */
2416 } 2469 }
2417 2470
2418 cgroup_lock_hierarchy(root); 2471 cgroup_lock_hierarchy(root);
@@ -2708,6 +2761,8 @@ int __init cgroup_init(void)
2708 struct cgroup_subsys *ss = subsys[i]; 2761 struct cgroup_subsys *ss = subsys[i];
2709 if (!ss->early_init) 2762 if (!ss->early_init)
2710 cgroup_init_subsys(ss); 2763 cgroup_init_subsys(ss);
2764 if (ss->use_id)
2765 cgroup_subsys_init_idr(ss);
2711 } 2766 }
2712 2767
2713 /* Add init_css_set to the hash table */ 2768 /* Add init_css_set to the hash table */
@@ -3242,3 +3297,232 @@ static int __init cgroup_disable(char *str)
3242 return 1; 3297 return 1;
3243} 3298}
3244__setup("cgroup_disable=", cgroup_disable); 3299__setup("cgroup_disable=", cgroup_disable);
3300
/*
 * Functions for CSS ID.
 */

3305/*
3306 *To get ID other than 0, this should be called when !cgroup_is_removed().
3307 */
3308unsigned short css_id(struct cgroup_subsys_state *css)
3309{
3310 struct css_id *cssid = rcu_dereference(css->id);
3311
3312 if (cssid)
3313 return cssid->id;
3314 return 0;
3315}
3316
3317unsigned short css_depth(struct cgroup_subsys_state *css)
3318{
3319 struct css_id *cssid = rcu_dereference(css->id);
3320
3321 if (cssid)
3322 return cssid->depth;
3323 return 0;
3324}
3325
3326bool css_is_ancestor(struct cgroup_subsys_state *child,
3327 struct cgroup_subsys_state *root)
3328{
3329 struct css_id *child_id = rcu_dereference(child->id);
3330 struct css_id *root_id = rcu_dereference(root->id);
3331
3332 if (!child_id || !root_id || (child_id->depth < root_id->depth))
3333 return false;
3334 return child_id->stack[root_id->depth] == root_id->id;
3335}
3336
3337static void __free_css_id_cb(struct rcu_head *head)
3338{
3339 struct css_id *id;
3340
3341 id = container_of(head, struct css_id, rcu_head);
3342 kfree(id);
3343}
3344
/*
 * Detach and free the css_id attached to @css. Must be called by the
 * subsys itself (the cgroup core never frees IDs automatically).
 */
void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
{
	struct css_id *id = css->id;
	/* When this is called before css_id initialization, id can be NULL */
	if (!id)
		return;

	BUG_ON(!ss->use_id);

	/*
	 * Unpublish both directions first so RCU readers (css_lookup,
	 * css_get_next) observe NULL rather than a soon-to-be-freed id.
	 */
	rcu_assign_pointer(id->css, NULL);
	rcu_assign_pointer(css->id, NULL);
	/* id_lock serializes idr mutation against get_new_cssid() */
	spin_lock(&ss->id_lock);
	idr_remove(&ss->idr, id->id);
	spin_unlock(&ss->id_lock);
	/* defer kfree until readers that may hold a reference are done */
	call_rcu(&id->rcu_head, __free_css_id_cb);
}
3361
/*
 * This is called by init or create(). Then, calls to this function are
 * always serialized (By cgroup_mutex() at create()).
 */

/*
 * Allocate a new css_id with room for a (depth+1)-entry ancestor stack
 * and register it in @ss->idr. Returns the new id or an ERR_PTR
 * (-ENOMEM on allocation failure, -ENOSPC when no ID in 1..CSS_ID_MAX
 * is available). The caller fills in ->stack[] and ->css.
 */
static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
{
	struct css_id *newid;
	int myid, error, size;

	BUG_ON(!ss->use_id);

	/* trailing stack[] needs depth+1 entries (ancestors plus self) */
	size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1);
	newid = kzalloc(size, GFP_KERNEL);
	if (!newid)
		return ERR_PTR(-ENOMEM);
	/* get id: preload outside the spinlock, GFP_KERNEL may sleep */
	if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) {
		error = -ENOMEM;
		goto err_out;
	}
	spin_lock(&ss->id_lock);
	/* Don't use 0. allocates an ID of 1-65535 */
	error = idr_get_new_above(&ss->idr, newid, 1, &myid);
	spin_unlock(&ss->id_lock);

	/* Returns error when there are no free spaces for new ID.*/
	if (error) {
		error = -ENOSPC;
		goto err_out;
	}
	/* idr can hand out ids beyond our 16-bit limit; undo if so */
	if (myid > CSS_ID_MAX)
		goto remove_idr;

	newid->id = myid;
	newid->depth = depth;
	return newid;
remove_idr:
	error = -ENOSPC;
	spin_lock(&ss->id_lock);
	idr_remove(&ss->idr, myid);
	spin_unlock(&ss->id_lock);
err_out:
	kfree(newid);
	return ERR_PTR(error);

}
3409
3410static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss)
3411{
3412 struct css_id *newid;
3413 struct cgroup_subsys_state *rootcss;
3414
3415 spin_lock_init(&ss->id_lock);
3416 idr_init(&ss->idr);
3417
3418 rootcss = init_css_set.subsys[ss->subsys_id];
3419 newid = get_new_cssid(ss, 0);
3420 if (IS_ERR(newid))
3421 return PTR_ERR(newid);
3422
3423 newid->stack[0] = newid->id;
3424 newid->css = rootcss;
3425 rootcss->id = newid;
3426 return 0;
3427}
3428
3429static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
3430 struct cgroup *child)
3431{
3432 int subsys_id, i, depth = 0;
3433 struct cgroup_subsys_state *parent_css, *child_css;
3434 struct css_id *child_id, *parent_id = NULL;
3435
3436 subsys_id = ss->subsys_id;
3437 parent_css = parent->subsys[subsys_id];
3438 child_css = child->subsys[subsys_id];
3439 depth = css_depth(parent_css) + 1;
3440 parent_id = parent_css->id;
3441
3442 child_id = get_new_cssid(ss, depth);
3443 if (IS_ERR(child_id))
3444 return PTR_ERR(child_id);
3445
3446 for (i = 0; i < depth; i++)
3447 child_id->stack[i] = parent_id->stack[i];
3448 child_id->stack[depth] = child_id->id;
3449 /*
3450 * child_id->css pointer will be set after this cgroup is available
3451 * see cgroup_populate_dir()
3452 */
3453 rcu_assign_pointer(child_css->id, child_id);
3454
3455 return 0;
3456}
3457
3458/**
3459 * css_lookup - lookup css by id
3460 * @ss: cgroup subsys to be looked into.
3461 * @id: the id
3462 *
3463 * Returns pointer to cgroup_subsys_state if there is valid one with id.
3464 * NULL if not. Should be called under rcu_read_lock()
3465 */
3466struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
3467{
3468 struct css_id *cssid = NULL;
3469
3470 BUG_ON(!ss->use_id);
3471 cssid = idr_find(&ss->idr, id);
3472
3473 if (unlikely(!cssid))
3474 return NULL;
3475
3476 return rcu_dereference(cssid->css);
3477}
3478
/**
 * css_get_next - lookup next cgroup under specified hierarchy.
 * @ss: pointer to subsystem
 * @id: current position of iteration.
 * @root: pointer to css. search tree under this.
 * @foundid: position of found object.
 *
 * Search next css under the specified hierarchy of rootid. Calling under
 * rcu_read_lock() is necessary. Returns NULL if it reaches the end.
 */
struct cgroup_subsys_state *
css_get_next(struct cgroup_subsys *ss, int id,
	     struct cgroup_subsys_state *root, int *foundid)
{
	struct cgroup_subsys_state *ret = NULL;
	struct css_id *tmp;
	int tmpid;
	int rootid = css_id(root);
	int depth = css_depth(root);

	/* root without an ID has no scannable subtree */
	if (!rootid)
		return NULL;

	BUG_ON(!ss->use_id);
	/* fill start point for scan */
	tmpid = id;
	while (1) {
		/*
		 * scan next entry from bitmap(tree), tmpid is updated after
		 * idr_get_next().
		 */
		spin_lock(&ss->id_lock);
		tmp = idr_get_next(&ss->idr, &tmpid);
		spin_unlock(&ss->id_lock);

		if (!tmp)
			break;
		/*
		 * Membership test: tmp is under @root iff @root's id appears
		 * at @root's depth in tmp's ancestor stack.
		 */
		if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
			ret = rcu_dereference(tmp->css);
			/* skip ids whose css was already unpublished */
			if (ret) {
				*foundid = tmpid;
				break;
			}
		}
		/* continue to scan from next id */
		tmpid = tmpid + 1;
	}
	return ret;
}
3528