diff options
author | KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> | 2009-04-02 19:57:25 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-04-02 22:04:53 -0400 |
commit | 38460b48d06440de46b34cb778bd6c4855030754 (patch) | |
tree | 8f3362a446b5b03879f715c3f7279e70842bcca9 /kernel | |
parent | 313e924c0852943e67335fad9d2608701f0dfe8e (diff) |
cgroup: CSS ID support
Patch for per-CSS (Cgroup Subsys State) ID and private hierarchy code.
This patch attaches a unique ID to each css and provides the following:
- css_lookup(subsys, id)
returns a pointer to the struct cgroup_subsys_state with the given id.
- css_get_next(subsys, id, root, foundid)
returns the next css under "root" by scanning IDs upward from id.
When cgroup_subsys->use_id is set, an id for css is maintained.
The cgroup framework only prepares
- css_id of root css for subsys
- id is automatically attached at creation of css.
- id is *not* freed automatically, because the cgroup framework
doesn't know the lifetime of cgroup_subsys_state.
free_css_id() function is provided. This must be called by subsys.
There are several reasons to develop this.
- Saving space .... For example, memcg's swap_cgroup is an array of
pointers to cgroup. But it does not need to be very fast.
By replacing pointers (8 bytes per entry) with IDs (2 bytes per entry), we can
greatly reduce memory usage.
- Scanning without lock.
CSS_ID provides "scan id under this ROOT" function. By this, scanning
css under root can be written without locks.
ex)
do {
rcu_read_lock();
next = css_get_next(subsys, id, root, &found);
/* check sanity of next here */
css_tryget();
rcu_read_unlock();
id = found + 1
} while(...)
Characteristics:
- Each css has unique ID under subsys.
- Lifetime of ID is controlled by subsys.
- css ID contains "ID" and "Depth in hierarchy" and stack of hierarchy
- Allowed ID is 1-65535, ID 0 is UNUSED ID.
Design Choices:
- scan-by-ID v.s. scan-by-tree-walk.
As /proc's pid scan does, scan-by-ID is robust when scanning is done
by the following kind of routine:
scan -> rest a while (release a lock) -> continue from where it was interrupted
memcg's hierarchical reclaim does this.
- When subsys->use_id is set, # of css in the system is limited to
65535.
[bharata@linux.vnet.ibm.com: remove rcu_read_lock() from css_get_next()]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cgroup.c | 286 |
1 files changed, 285 insertions, 1 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 27792bcb0758..d3c521137425 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -94,7 +94,6 @@ struct cgroupfs_root { | |||
94 | char release_agent_path[PATH_MAX]; | 94 | char release_agent_path[PATH_MAX]; |
95 | }; | 95 | }; |
96 | 96 | ||
97 | |||
98 | /* | 97 | /* |
99 | * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the | 98 | * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the |
100 | * subsystems that are otherwise unattached - it never has more than a | 99 | * subsystems that are otherwise unattached - it never has more than a |
@@ -102,6 +101,39 @@ struct cgroupfs_root { | |||
102 | */ | 101 | */ |
103 | static struct cgroupfs_root rootnode; | 102 | static struct cgroupfs_root rootnode; |
104 | 103 | ||
104 | /* | ||
105 | * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when | ||
106 | * cgroup_subsys->use_id != 0. | ||
107 | */ | ||
108 | #define CSS_ID_MAX (65535) | ||
109 | struct css_id { | ||
110 | /* | ||
111 | * The css to which this ID points. This pointer is set to valid value | ||
112 | * after cgroup is populated. If cgroup is removed, this will be NULL. | ||
113 | * This pointer is expected to be RCU-safe because destroy() | ||
114 | * is called after synchronize_rcu(). But for safe use, css_is_removed() | ||
115 | * css_tryget() should be used for avoiding race. | ||
116 | */ | ||
117 | struct cgroup_subsys_state *css; | ||
118 | /* | ||
119 | * ID of this css. | ||
120 | */ | ||
121 | unsigned short id; | ||
122 | /* | ||
123 | * Depth in hierarchy which this ID belongs to. | ||
124 | */ | ||
125 | unsigned short depth; | ||
126 | /* | ||
127 | * ID is freed by RCU. (and lookup routine is RCU safe.) | ||
128 | */ | ||
129 | struct rcu_head rcu_head; | ||
130 | /* | ||
131 | * Hierarchy of CSS ID belongs to. | ||
132 | */ | ||
133 | unsigned short stack[0]; /* Array of Length (depth+1) */ | ||
134 | }; | ||
135 | |||
136 | |||
105 | /* The list of hierarchy roots */ | 137 | /* The list of hierarchy roots */ |
106 | 138 | ||
107 | static LIST_HEAD(roots); | 139 | static LIST_HEAD(roots); |
@@ -185,6 +217,8 @@ struct cg_cgroup_link { | |||
185 | static struct css_set init_css_set; | 217 | static struct css_set init_css_set; |
186 | static struct cg_cgroup_link init_css_set_link; | 218 | static struct cg_cgroup_link init_css_set_link; |
187 | 219 | ||
220 | static int cgroup_subsys_init_idr(struct cgroup_subsys *ss); | ||
221 | |||
188 | /* css_set_lock protects the list of css_set objects, and the | 222 | /* css_set_lock protects the list of css_set objects, and the |
189 | * chain of tasks off each css_set. Nests outside task->alloc_lock | 223 | * chain of tasks off each css_set. Nests outside task->alloc_lock |
190 | * due to cgroup_iter_start() */ | 224 | * due to cgroup_iter_start() */ |
@@ -567,6 +601,9 @@ static struct backing_dev_info cgroup_backing_dev_info = { | |||
567 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, | 601 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, |
568 | }; | 602 | }; |
569 | 603 | ||
604 | static int alloc_css_id(struct cgroup_subsys *ss, | ||
605 | struct cgroup *parent, struct cgroup *child); | ||
606 | |||
570 | static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) | 607 | static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) |
571 | { | 608 | { |
572 | struct inode *inode = new_inode(sb); | 609 | struct inode *inode = new_inode(sb); |
@@ -2327,6 +2364,17 @@ static int cgroup_populate_dir(struct cgroup *cgrp) | |||
2327 | if (ss->populate && (err = ss->populate(ss, cgrp)) < 0) | 2364 | if (ss->populate && (err = ss->populate(ss, cgrp)) < 0) |
2328 | return err; | 2365 | return err; |
2329 | } | 2366 | } |
2367 | /* This cgroup is ready now */ | ||
2368 | for_each_subsys(cgrp->root, ss) { | ||
2369 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | ||
2370 | /* | ||
2371 | * Update id->css pointer and make this css visible from | ||
2372 | * CSS ID functions. This pointer will be dereferened | ||
2373 | * from RCU-read-side without locks. | ||
2374 | */ | ||
2375 | if (css->id) | ||
2376 | rcu_assign_pointer(css->id->css, css); | ||
2377 | } | ||
2330 | 2378 | ||
2331 | return 0; | 2379 | return 0; |
2332 | } | 2380 | } |
@@ -2338,6 +2386,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, | |||
2338 | css->cgroup = cgrp; | 2386 | css->cgroup = cgrp; |
2339 | atomic_set(&css->refcnt, 1); | 2387 | atomic_set(&css->refcnt, 1); |
2340 | css->flags = 0; | 2388 | css->flags = 0; |
2389 | css->id = NULL; | ||
2341 | if (cgrp == dummytop) | 2390 | if (cgrp == dummytop) |
2342 | set_bit(CSS_ROOT, &css->flags); | 2391 | set_bit(CSS_ROOT, &css->flags); |
2343 | BUG_ON(cgrp->subsys[ss->subsys_id]); | 2392 | BUG_ON(cgrp->subsys[ss->subsys_id]); |
@@ -2413,6 +2462,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
2413 | goto err_destroy; | 2462 | goto err_destroy; |
2414 | } | 2463 | } |
2415 | init_cgroup_css(css, ss, cgrp); | 2464 | init_cgroup_css(css, ss, cgrp); |
2465 | if (ss->use_id) | ||
2466 | if (alloc_css_id(ss, parent, cgrp)) | ||
2467 | goto err_destroy; | ||
2468 | /* At error, ->destroy() callback has to free assigned ID. */ | ||
2416 | } | 2469 | } |
2417 | 2470 | ||
2418 | cgroup_lock_hierarchy(root); | 2471 | cgroup_lock_hierarchy(root); |
@@ -2708,6 +2761,8 @@ int __init cgroup_init(void) | |||
2708 | struct cgroup_subsys *ss = subsys[i]; | 2761 | struct cgroup_subsys *ss = subsys[i]; |
2709 | if (!ss->early_init) | 2762 | if (!ss->early_init) |
2710 | cgroup_init_subsys(ss); | 2763 | cgroup_init_subsys(ss); |
2764 | if (ss->use_id) | ||
2765 | cgroup_subsys_init_idr(ss); | ||
2711 | } | 2766 | } |
2712 | 2767 | ||
2713 | /* Add init_css_set to the hash table */ | 2768 | /* Add init_css_set to the hash table */ |
@@ -3242,3 +3297,232 @@ static int __init cgroup_disable(char *str) | |||
3242 | return 1; | 3297 | return 1; |
3243 | } | 3298 | } |
3244 | __setup("cgroup_disable=", cgroup_disable); | 3299 | __setup("cgroup_disable=", cgroup_disable); |
3300 | |||
3301 | /* | ||
3302 | * Functions for CSS ID. | ||
3303 | */ | ||
3304 | |||
3305 | /* | ||
3306 | *To get ID other than 0, this should be called when !cgroup_is_removed(). | ||
3307 | */ | ||
3308 | unsigned short css_id(struct cgroup_subsys_state *css) | ||
3309 | { | ||
3310 | struct css_id *cssid = rcu_dereference(css->id); | ||
3311 | |||
3312 | if (cssid) | ||
3313 | return cssid->id; | ||
3314 | return 0; | ||
3315 | } | ||
3316 | |||
3317 | unsigned short css_depth(struct cgroup_subsys_state *css) | ||
3318 | { | ||
3319 | struct css_id *cssid = rcu_dereference(css->id); | ||
3320 | |||
3321 | if (cssid) | ||
3322 | return cssid->depth; | ||
3323 | return 0; | ||
3324 | } | ||
3325 | |||
3326 | bool css_is_ancestor(struct cgroup_subsys_state *child, | ||
3327 | struct cgroup_subsys_state *root) | ||
3328 | { | ||
3329 | struct css_id *child_id = rcu_dereference(child->id); | ||
3330 | struct css_id *root_id = rcu_dereference(root->id); | ||
3331 | |||
3332 | if (!child_id || !root_id || (child_id->depth < root_id->depth)) | ||
3333 | return false; | ||
3334 | return child_id->stack[root_id->depth] == root_id->id; | ||
3335 | } | ||
3336 | |||
3337 | static void __free_css_id_cb(struct rcu_head *head) | ||
3338 | { | ||
3339 | struct css_id *id; | ||
3340 | |||
3341 | id = container_of(head, struct css_id, rcu_head); | ||
3342 | kfree(id); | ||
3343 | } | ||
3344 | |||
3345 | void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) | ||
3346 | { | ||
3347 | struct css_id *id = css->id; | ||
3348 | /* When this is called before css_id initialization, id can be NULL */ | ||
3349 | if (!id) | ||
3350 | return; | ||
3351 | |||
3352 | BUG_ON(!ss->use_id); | ||
3353 | |||
3354 | rcu_assign_pointer(id->css, NULL); | ||
3355 | rcu_assign_pointer(css->id, NULL); | ||
3356 | spin_lock(&ss->id_lock); | ||
3357 | idr_remove(&ss->idr, id->id); | ||
3358 | spin_unlock(&ss->id_lock); | ||
3359 | call_rcu(&id->rcu_head, __free_css_id_cb); | ||
3360 | } | ||
3361 | |||
3362 | /* | ||
3363 | * This is called by init or create(). Then, calls to this function are | ||
3364 | * always serialized (By cgroup_mutex() at create()). | ||
3365 | */ | ||
3366 | |||
3367 | static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) | ||
3368 | { | ||
3369 | struct css_id *newid; | ||
3370 | int myid, error, size; | ||
3371 | |||
3372 | BUG_ON(!ss->use_id); | ||
3373 | |||
3374 | size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1); | ||
3375 | newid = kzalloc(size, GFP_KERNEL); | ||
3376 | if (!newid) | ||
3377 | return ERR_PTR(-ENOMEM); | ||
3378 | /* get id */ | ||
3379 | if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) { | ||
3380 | error = -ENOMEM; | ||
3381 | goto err_out; | ||
3382 | } | ||
3383 | spin_lock(&ss->id_lock); | ||
3384 | /* Don't use 0. allocates an ID of 1-65535 */ | ||
3385 | error = idr_get_new_above(&ss->idr, newid, 1, &myid); | ||
3386 | spin_unlock(&ss->id_lock); | ||
3387 | |||
3388 | /* Returns error when there are no free spaces for new ID.*/ | ||
3389 | if (error) { | ||
3390 | error = -ENOSPC; | ||
3391 | goto err_out; | ||
3392 | } | ||
3393 | if (myid > CSS_ID_MAX) | ||
3394 | goto remove_idr; | ||
3395 | |||
3396 | newid->id = myid; | ||
3397 | newid->depth = depth; | ||
3398 | return newid; | ||
3399 | remove_idr: | ||
3400 | error = -ENOSPC; | ||
3401 | spin_lock(&ss->id_lock); | ||
3402 | idr_remove(&ss->idr, myid); | ||
3403 | spin_unlock(&ss->id_lock); | ||
3404 | err_out: | ||
3405 | kfree(newid); | ||
3406 | return ERR_PTR(error); | ||
3407 | |||
3408 | } | ||
3409 | |||
3410 | static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss) | ||
3411 | { | ||
3412 | struct css_id *newid; | ||
3413 | struct cgroup_subsys_state *rootcss; | ||
3414 | |||
3415 | spin_lock_init(&ss->id_lock); | ||
3416 | idr_init(&ss->idr); | ||
3417 | |||
3418 | rootcss = init_css_set.subsys[ss->subsys_id]; | ||
3419 | newid = get_new_cssid(ss, 0); | ||
3420 | if (IS_ERR(newid)) | ||
3421 | return PTR_ERR(newid); | ||
3422 | |||
3423 | newid->stack[0] = newid->id; | ||
3424 | newid->css = rootcss; | ||
3425 | rootcss->id = newid; | ||
3426 | return 0; | ||
3427 | } | ||
3428 | |||
3429 | static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, | ||
3430 | struct cgroup *child) | ||
3431 | { | ||
3432 | int subsys_id, i, depth = 0; | ||
3433 | struct cgroup_subsys_state *parent_css, *child_css; | ||
3434 | struct css_id *child_id, *parent_id = NULL; | ||
3435 | |||
3436 | subsys_id = ss->subsys_id; | ||
3437 | parent_css = parent->subsys[subsys_id]; | ||
3438 | child_css = child->subsys[subsys_id]; | ||
3439 | depth = css_depth(parent_css) + 1; | ||
3440 | parent_id = parent_css->id; | ||
3441 | |||
3442 | child_id = get_new_cssid(ss, depth); | ||
3443 | if (IS_ERR(child_id)) | ||
3444 | return PTR_ERR(child_id); | ||
3445 | |||
3446 | for (i = 0; i < depth; i++) | ||
3447 | child_id->stack[i] = parent_id->stack[i]; | ||
3448 | child_id->stack[depth] = child_id->id; | ||
3449 | /* | ||
3450 | * child_id->css pointer will be set after this cgroup is available | ||
3451 | * see cgroup_populate_dir() | ||
3452 | */ | ||
3453 | rcu_assign_pointer(child_css->id, child_id); | ||
3454 | |||
3455 | return 0; | ||
3456 | } | ||
3457 | |||
3458 | /** | ||
3459 | * css_lookup - lookup css by id | ||
3460 | * @ss: cgroup subsys to be looked into. | ||
3461 | * @id: the id | ||
3462 | * | ||
3463 | * Returns pointer to cgroup_subsys_state if there is valid one with id. | ||
3464 | * NULL if not. Should be called under rcu_read_lock() | ||
3465 | */ | ||
3466 | struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) | ||
3467 | { | ||
3468 | struct css_id *cssid = NULL; | ||
3469 | |||
3470 | BUG_ON(!ss->use_id); | ||
3471 | cssid = idr_find(&ss->idr, id); | ||
3472 | |||
3473 | if (unlikely(!cssid)) | ||
3474 | return NULL; | ||
3475 | |||
3476 | return rcu_dereference(cssid->css); | ||
3477 | } | ||
3478 | |||
3479 | /** | ||
3480 | * css_get_next - lookup next cgroup under specified hierarchy. | ||
3481 | * @ss: pointer to subsystem | ||
3482 | * @id: current position of iteration. | ||
3483 | * @root: pointer to css. search tree under this. | ||
3484 | * @foundid: position of found object. | ||
3485 | * | ||
3486 | * Search next css under the specified hierarchy of rootid. Calling under | ||
3487 | * rcu_read_lock() is necessary. Returns NULL if it reaches the end. | ||
3488 | */ | ||
3489 | struct cgroup_subsys_state * | ||
3490 | css_get_next(struct cgroup_subsys *ss, int id, | ||
3491 | struct cgroup_subsys_state *root, int *foundid) | ||
3492 | { | ||
3493 | struct cgroup_subsys_state *ret = NULL; | ||
3494 | struct css_id *tmp; | ||
3495 | int tmpid; | ||
3496 | int rootid = css_id(root); | ||
3497 | int depth = css_depth(root); | ||
3498 | |||
3499 | if (!rootid) | ||
3500 | return NULL; | ||
3501 | |||
3502 | BUG_ON(!ss->use_id); | ||
3503 | /* fill start point for scan */ | ||
3504 | tmpid = id; | ||
3505 | while (1) { | ||
3506 | /* | ||
3507 | * scan next entry from bitmap(tree), tmpid is updated after | ||
3508 | * idr_get_next(). | ||
3509 | */ | ||
3510 | spin_lock(&ss->id_lock); | ||
3511 | tmp = idr_get_next(&ss->idr, &tmpid); | ||
3512 | spin_unlock(&ss->id_lock); | ||
3513 | |||
3514 | if (!tmp) | ||
3515 | break; | ||
3516 | if (tmp->depth >= depth && tmp->stack[depth] == rootid) { | ||
3517 | ret = rcu_dereference(tmp->css); | ||
3518 | if (ret) { | ||
3519 | *foundid = tmpid; | ||
3520 | break; | ||
3521 | } | ||
3522 | } | ||
3523 | /* continue to scan from next id */ | ||
3524 | tmpid = tmpid + 1; | ||
3525 | } | ||
3526 | return ret; | ||
3527 | } | ||
3528 | |||