aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Tejun Heo <tj@kernel.org> 2014-05-16 13:22:49 -0400
committer Tejun Heo <tj@kernel.org> 2014-05-16 13:22:49 -0400
commit de3f034182ecbf0efbcef7ab8b253c6c3049a592 (patch)
tree ff49be5f6dbccf13c40d9ee1eff41cd8a0ab3cd8
parent 0cb51d71c1fa9234afe4213089844be76ec1765a (diff)
cgroup: introduce CSS_RELEASED and reduce css iteration fallback window
css iterations allow the caller to drop RCU read lock. As long as the caller keeps the current position accessible, it can simply re-grab RCU read lock later and continue iteration. This is achieved by using CGRP_DEAD to detect whether the current position's next pointer is safe to dereference and if not re-iterate from the beginning to the next position using ->serial_nr. CGRP_DEAD is used as the marker to invalidate the next pointer and the only requirement is that the marker is set before the next sibling starts its RCU grace period. Because CGRP_DEAD is set at the end of cgroup_destroy_locked() but the cgroup is unlinked when the reference count reaches zero, we currently have a rather large window where this fallback re-iteration logic can be triggered. This patch introduces CSS_RELEASED which is set when a css is unlinked from its sibling list. This still keeps the re-iteration logic working while drastically reducing the window of its activation. While at it, rewrite the comment in css_next_child() to reflect the new flag and better explain the synchronization. This will also enable iterating csses directly instead of through cgroups. v2: CSS_RELEASED now assigned to 1 << 2 as 1 << 0 is used by CSS_NO_REF. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Li Zefan <lizefan@huawei.com>
-rw-r--r-- include/linux/cgroup.h 1
-rw-r--r-- kernel/cgroup.c 41
2 files changed, 21 insertions, 21 deletions
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index ebe7ce49f4b7..5375582ea5f6 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -97,6 +97,7 @@ struct cgroup_subsys_state {
97enum { 97enum {
98 CSS_NO_REF = (1 << 0), /* no reference counting for this css */ 98 CSS_NO_REF = (1 << 0), /* no reference counting for this css */
99 CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */ 99 CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */
100 CSS_RELEASED = (1 << 2), /* refcnt reached zero, released */
100}; 101};
101 102
102/** 103/**
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d5af128ec1ec..5544e685f2da 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3108,27 +3108,28 @@ css_next_child(struct cgroup_subsys_state *pos_css,
3108 cgroup_assert_mutex_or_rcu_locked(); 3108 cgroup_assert_mutex_or_rcu_locked();
3109 3109
3110 /* 3110 /*
3111 * @pos could already have been removed. Once a cgroup is removed, 3111 * @pos could already have been unlinked from the sibling list.
3112 * its ->sibling.next is no longer updated when its next sibling 3112 * Once a cgroup is removed, its ->sibling.next is no longer
3113 * changes. As CGRP_DEAD assertion is serialized and happens 3113 * updated when its next sibling changes. CSS_RELEASED is set when
3114 * before the cgroup is taken off the ->sibling list, if we see it 3114 * @pos is taken off list, at which time its next pointer is valid,
3115 * unasserted, it's guaranteed that the next sibling hasn't 3115 * and, as releases are serialized, the one pointed to by the next
3116 * finished its grace period even if it's already removed, and thus 3116 * pointer is guaranteed to not have started release yet. This
3117 * safe to dereference from this RCU critical section. If 3117 * implies that if we observe !CSS_RELEASED on @pos in this RCU
3118 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed 3118 * critical section, the one pointed to by its next pointer is
3119 * to be visible as %true here. 3119 * guaranteed to not have finished its RCU grace period even if we
3120 * have dropped rcu_read_lock() inbetween iterations.
3120 * 3121 *
3121 * If @pos is dead, its next pointer can't be dereferenced; 3122 * If @pos has CSS_RELEASED set, its next pointer can't be
3122 * however, as each cgroup is given a monotonically increasing 3123 * dereferenced; however, as each css is given a monotonically
3123 * unique serial number and always appended to the sibling list, 3124 * increasing unique serial number and always appended to the
3124 * the next one can be found by walking the parent's children until 3125 * sibling list, the next one can be found by walking the parent's
3125 * we see a cgroup with higher serial number than @pos's. While 3126 * children until the first css with higher serial number than
3126 * this path can be slower, it's taken only when either the current 3127 * @pos's. While this path can be slower, it happens iff iteration
3127 * cgroup is removed or iteration and removal race. 3128 * races against release and the race window is very small.
3128 */ 3129 */
3129 if (!pos) { 3130 if (!pos) {
3130 next = list_entry_rcu(cgrp->self.children.next, struct cgroup, self.sibling); 3131 next = list_entry_rcu(cgrp->self.children.next, struct cgroup, self.sibling);
3131 } else if (likely(!cgroup_is_dead(pos))) { 3132 } else if (likely(!(pos->self.flags & CSS_RELEASED))) {
3132 next = list_entry_rcu(pos->self.sibling.next, struct cgroup, self.sibling); 3133 next = list_entry_rcu(pos->self.sibling.next, struct cgroup, self.sibling);
3133 } else { 3134 } else {
3134 list_for_each_entry_rcu(next, &cgrp->self.children, self.sibling) 3135 list_for_each_entry_rcu(next, &cgrp->self.children, self.sibling)
@@ -4139,6 +4140,7 @@ static void css_release_work_fn(struct work_struct *work)
4139 4140
4140 mutex_lock(&cgroup_mutex); 4141 mutex_lock(&cgroup_mutex);
4141 4142
4143 css->flags |= CSS_RELEASED;
4142 list_del_rcu(&css->sibling); 4144 list_del_rcu(&css->sibling);
4143 4145
4144 if (ss) { 4146 if (ss) {
@@ -4525,10 +4527,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4525 4527
4526 /* 4528 /*
4527 * Mark @cgrp dead. This prevents further task migration and child 4529 * Mark @cgrp dead. This prevents further task migration and child
4528 * creation by disabling cgroup_lock_live_group(). Note that 4530 * creation by disabling cgroup_lock_live_group().
4529 * CGRP_DEAD assertion is depended upon by css_next_child() to
4530 * resume iteration after dropping RCU read lock. See
4531 * css_next_child() for details.
4532 */ 4531 */
4533 set_bit(CGRP_DEAD, &cgrp->flags); 4532 set_bit(CGRP_DEAD, &cgrp->flags);
4534 4533