path: root/kernel/cgroup.c
author     Tejun Heo <tj@kernel.org>    2013-05-23 21:55:38 -0400
committer  Tejun Heo <tj@kernel.org>    2013-05-23 21:55:38 -0400
commit     53fa5261747a90746531e8a1c81eeb78fedc2f71
tree       b615f4ac453b9f40d412b5ba22498b827902f438 /kernel/cgroup.c
parent     bdc7119f1bdd0632d42f435941dc290216a436e7
cgroup: add cgroup->serial_nr and implement cgroup_next_sibling()
Currently, there's no easy way to find out the next sibling cgroup unless it's known that the current cgroup is accessed from the parent's children list in a single RCU critical section. This in turn forces all iterators to require the whole iteration to be enclosed in a single RCU critical section, which sometimes is too restrictive. This patch implements cgroup_next_sibling(), which can reliably determine the next sibling regardless of the state of the current cgroup as long as it's accessible.

It currently is impossible to determine the next sibling after dropping the RCU read lock because the cgroup being iterated could be removed at any time, and if the RCU read lock is dropped, nothing guarantees that its ->sibling.next pointer is accessible. A removed cgroup would continue to point to its next sibling for RCU accesses but stop receiving updates from that sibling. IOW, the next sibling could be removed and then complete its grace period while the RCU read lock is dropped, making it unsafe to dereference ->sibling.next after dropping and re-acquiring the RCU read lock.

This can be solved by adding a way to traverse to the next sibling without dereferencing ->sibling.next. This patch adds a monotonically increasing cgroup serial number, cgroup->serial_nr, which guarantees that all cgroup->children lists are kept in increasing serial_nr order. A new function, cgroup_next_sibling(), is implemented: if CGRP_REMOVED is not set on the current cgroup, it follows ->sibling.next; otherwise, it walks the parent's ->children list until it sees a sibling with a higher ->serial_nr. This allows the function to always return the next sibling regardless of the state of the current cgroup without adding overhead in the fast path.

Further patches will update the iterators to use cgroup_next_sibling() so that they allow dropping the RCU read lock and blocking while iteration is in progress, which in turn will be used to simplify controllers.

v2: Typo fix as per Serge.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Serge E. Hallyn <serge.hallyn@ubuntu.com>
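As a rough sketch of the iteration pattern this is meant to enable (not part of this patch), the example below walks a parent's ->children while dropping the RCU read lock between steps. The helper walk_children_example() is hypothetical, and whatever keeps @pos accessible across the unlocked window (e.g. a reference held by the caller) is assumed rather than shown:

/*
 * Illustrative sketch only; walk_children_example() is not a kernel
 * function.  It assumes something keeps @pos accessible while the RCU
 * read lock is dropped (e.g. a reference held by the caller).
 */
static void walk_children_example(struct cgroup *parent)
{
	struct cgroup *pos;

	rcu_read_lock();
	/* first child, or NULL if @parent has none */
	pos = list_entry_rcu(parent->children.next, struct cgroup, sibling);
	if (&pos->sibling == &parent->children)
		pos = NULL;

	while (pos) {
		/* ... operate on @pos ... */

		rcu_read_unlock();
		/* may block here; @pos may even be removed meanwhile */
		rcu_read_lock();

		/*
		 * cgroup_next_sibling() still finds the successor: via
		 * ->sibling.next on the fast path, or via the parent's
		 * ->children list and ->serial_nr if @pos has been removed.
		 */
		pos = cgroup_next_sibling(pos);
	}
	rcu_read_unlock();
}

The fast path remains a single pointer chase; the walk over the parent's ->children with the ->serial_nr comparison is taken only when @pos has been removed or iteration and removal race.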
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--   kernel/cgroup.c   62
1 file changed, 62 insertions(+), 0 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 501974823b33..b87c7a5a5497 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2976,6 +2976,55 @@ static void cgroup_enable_task_cg_lists(void)
 }
 
 /**
+ * cgroup_next_sibling - find the next sibling of a given cgroup
+ * @pos: the current cgroup
+ *
+ * This function returns the next sibling of @pos and should be called
+ * under RCU read lock.  The only requirement is that @pos is accessible.
+ * The next sibling is guaranteed to be returned regardless of @pos's
+ * state.
+ */
+struct cgroup *cgroup_next_sibling(struct cgroup *pos)
+{
+	struct cgroup *next;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	/*
+	 * @pos could already have been removed.  Once a cgroup is removed,
+	 * its ->sibling.next is no longer updated when its next sibling
+	 * changes.  As CGRP_REMOVED is set on removal which is fully
+	 * serialized, if we see it unasserted, it's guaranteed that the
+	 * next sibling hasn't finished its grace period even if it's
+	 * already removed, and thus safe to dereference from this RCU
+	 * critical section.  If ->sibling.next is inaccessible,
+	 * cgroup_is_removed() is guaranteed to be visible as %true here.
+	 */
+	if (likely(!cgroup_is_removed(pos))) {
+		next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
+		if (&next->sibling != &pos->parent->children)
+			return next;
+		return NULL;
+	}
+
+	/*
+	 * Can't dereference the next pointer.  Each cgroup is given a
+	 * monotonically increasing unique serial number and always
+	 * appended to the sibling list, so the next one can be found by
+	 * walking the parent's children until we see a cgroup with higher
+	 * serial number than @pos's.
+	 *
+	 * While this path can be slow, it's taken only when either the
+	 * current cgroup is removed or iteration and removal race.
+	 */
+	list_for_each_entry_rcu(next, &pos->parent->children, sibling)
+		if (next->serial_nr > pos->serial_nr)
+			return next;
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(cgroup_next_sibling);
+
+/**
  * cgroup_next_descendant_pre - find the next descendant for pre-order walk
  * @pos: the current position (%NULL to initiate traversal)
  * @cgroup: cgroup whose descendants to walk
@@ -4137,6 +4186,7 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
 static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 			     umode_t mode)
 {
+	static atomic64_t serial_nr_cursor = ATOMIC64_INIT(0);
 	struct cgroup *cgrp;
 	struct cgroup_name *name;
 	struct cgroupfs_root *root = parent->root;
@@ -4217,6 +4267,14 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 		goto err_free_all;
 	lockdep_assert_held(&dentry->d_inode->i_mutex);
 
+	/*
+	 * Assign a monotonically increasing serial number.  With the list
+	 * appending below, it guarantees that sibling cgroups are always
+	 * sorted in the ascending serial number order on the parent's
+	 * ->children.
+	 */
+	cgrp->serial_nr = atomic64_inc_return(&serial_nr_cursor);
+
 	/* allocation complete, commit to creation */
 	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
 	list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
@@ -4304,6 +4362,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	 * removed.  This makes future css_tryget() and child creation
 	 * attempts fail thus maintaining the removal conditions verified
 	 * above.
+	 *
+	 * Note that CGRP_REMOVED clearing is depended upon by
+	 * cgroup_next_sibling() to resume iteration after dropping RCU
+	 * read lock.  See cgroup_next_sibling() for details.
 	 */
 	for_each_subsys(cgrp->root, ss) {
 		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];