author     Peter Zijlstra <a.p.zijlstra@chello.nl>  2010-06-08 05:40:42 -0400
committer  Ingo Molnar <mingo@elte.hu>              2010-06-08 12:44:04 -0400
commit     dc61b1d65e353d638b2445f71fb8e5b5630f2415
tree       07d79b2d385a380207cd889ac764b57190421fd1
parent     3975d16760d4be7402d1067c548c30c427971331
sched: Fix PROVE_RCU vs cpu_cgroup
PROVE_RCU has a few issues with the cpu_cgroup because the scheduler
typically holds rq->lock around the css rcu derefs but the generic
cgroup code doesn't (and can't) know about that lock.

Provide means to add extra checks to the css dereference and use that
in the scheduler to annotate its users.

The addition of rq->lock to these checks is correct because the
cgroup_subsys::attach() method takes the rq->lock for each task it
moves, therefore by holding that lock, we ensure the task is pinned to
the current cgroup and the RCU dereference is valid.

That leaves one genuine race in __sched_setscheduler() where we used
task_group() without holding any of the required locks and thus raced
with the cgroup code. Solve this by moving the check under the
appropriate lock.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
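To make the annotation concrete, here is a minimal, hypothetical sketch (not part of this patch) of the pattern the patch introduces with task_subsys_state_check(): an RCU accessor whose lockdep check can be extended with a caller-supplied condition. Every identifier below (my_obj, my_container, my_obj_deref_check, my_obj_deref_locked) is invented for illustration, and the snippet assumes a kernel build with CONFIG_PROVE_RCU:

/* Illustrative only -- every identifier below is hypothetical. */
#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct my_obj {
	int val;
};

struct my_container {
	struct my_obj *obj;	/* RCU-protected, set via rcu_assign_pointer() */
	spinlock_t lock;	/* held by the update side */
};

/*
 * Like task_subsys_state_check(): the dereference is considered safe
 * under rcu_read_lock() *or* under any extra condition the caller
 * supplies, which PROVE_RCU verifies via lockdep.
 */
#define my_obj_deref_check(c, extra)				\
	rcu_dereference_check((c)->obj,				\
			      rcu_read_lock_held() || (extra))

/*
 * Like the scheduler's new task_group(): a caller that holds c->lock
 * pins the pointer and may skip rcu_read_lock() without triggering a
 * PROVE_RCU splat.
 */
static inline struct my_obj *my_obj_deref_locked(struct my_container *c)
{
	return my_obj_deref_check(c, lockdep_is_held(&c->lock));
}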
 include/linux/cgroup.h |  20
 kernel/sched.c         | 115
 2 files changed, 73 insertions(+), 62 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 0c621604baa1..e3d00fdb858d 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -525,13 +525,21 @@ static inline struct cgroup_subsys_state *cgroup_subsys_state(
 	return cgrp->subsys[subsys_id];
 }
 
-static inline struct cgroup_subsys_state *task_subsys_state(
-	struct task_struct *task, int subsys_id)
+/*
+ * function to get the cgroup_subsys_state which allows for extra
+ * rcu_dereference_check() conditions, such as locks used during the
+ * cgroup_subsys::attach() methods.
+ */
+#define task_subsys_state_check(task, subsys_id, __c)			\
+	rcu_dereference_check(task->cgroups->subsys[subsys_id],	\
+			      rcu_read_lock_held() ||			\
+			      lockdep_is_held(&task->alloc_lock) ||	\
+			      cgroup_lock_is_held() || (__c))
+
+static inline struct cgroup_subsys_state *
+task_subsys_state(struct task_struct *task, int subsys_id)
 {
-	return rcu_dereference_check(task->cgroups->subsys[subsys_id],
-				     rcu_read_lock_held() ||
-				     lockdep_is_held(&task->alloc_lock) ||
-				     cgroup_lock_is_held());
+	return task_subsys_state_check(task, subsys_id, false);
 }
 
 static inline struct cgroup* task_cgroup(struct task_struct *task,
diff --git a/kernel/sched.c b/kernel/sched.c
index f8b8996228dd..2aaceebd484c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -306,52 +306,6 @@ static int init_task_group_load = INIT_TASK_GROUP_LOAD;
  */
 struct task_group init_task_group;
 
-/* return group to which a task belongs */
-static inline struct task_group *task_group(struct task_struct *p)
-{
-	struct task_group *tg;
-
-#ifdef CONFIG_CGROUP_SCHED
-	tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
-			  struct task_group, css);
-#else
-	tg = &init_task_group;
-#endif
-	return tg;
-}
-
-/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
-static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
-{
-	/*
-	 * Strictly speaking this rcu_read_lock() is not needed since the
-	 * task_group is tied to the cgroup, which in turn can never go away
-	 * as long as there are tasks attached to it.
-	 *
-	 * However since task_group() uses task_subsys_state() which is an
-	 * rcu_dereference() user, this quiets CONFIG_PROVE_RCU.
-	 */
-	rcu_read_lock();
-#ifdef CONFIG_FAIR_GROUP_SCHED
-	p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
-	p->se.parent = task_group(p)->se[cpu];
-#endif
-
-#ifdef CONFIG_RT_GROUP_SCHED
-	p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
-	p->rt.parent = task_group(p)->rt_se[cpu];
-#endif
-	rcu_read_unlock();
-}
-
-#else
-
-static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
-static inline struct task_group *task_group(struct task_struct *p)
-{
-	return NULL;
-}
-
 #endif /* CONFIG_CGROUP_SCHED */
 
 /* CFS-related fields in a runqueue */
@@ -644,6 +598,49 @@ static inline int cpu_of(struct rq *rq)
 #define cpu_curr(cpu)	(cpu_rq(cpu)->curr)
 #define raw_rq()	(&__raw_get_cpu_var(runqueues))
 
+#ifdef CONFIG_CGROUP_SCHED
+
+/*
+ * Return the group to which this tasks belongs.
+ *
+ * We use task_subsys_state_check() and extend the RCU verification
+ * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach()
+ * holds that lock for each task it moves into the cgroup. Therefore
+ * by holding that lock, we pin the task to the current cgroup.
+ */
+static inline struct task_group *task_group(struct task_struct *p)
+{
+	struct cgroup_subsys_state *css;
+
+	css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
+			lockdep_is_held(&task_rq(p)->lock));
+	return container_of(css, struct task_group, css);
+}
+
+/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
+static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
+	p->se.parent = task_group(p)->se[cpu];
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+	p->rt.rt_rq = task_group(p)->rt_rq[cpu];
+	p->rt.parent = task_group(p)->rt_se[cpu];
+#endif
+}
+
+#else /* CONFIG_CGROUP_SCHED */
+
+static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
+static inline struct task_group *task_group(struct task_struct *p)
+{
+	return NULL;
+}
+
+#endif /* CONFIG_CGROUP_SCHED */
+
 inline void update_rq_clock(struct rq *rq)
 {
 	if (!rq->skip_clock_update)
@@ -4465,16 +4462,6 @@ recheck:
 	}
 
 	if (user) {
-#ifdef CONFIG_RT_GROUP_SCHED
-		/*
-		 * Do not allow realtime tasks into groups that have no runtime
-		 * assigned.
-		 */
-		if (rt_bandwidth_enabled() && rt_policy(policy) &&
-				task_group(p)->rt_bandwidth.rt_runtime == 0)
-			return -EPERM;
-#endif
-
 		retval = security_task_setscheduler(p, policy, param);
 		if (retval)
 			return retval;
@@ -4490,6 +4477,22 @@ recheck:
 	 * runqueue lock must be held.
 	 */
 	rq = __task_rq_lock(p);
+
+#ifdef CONFIG_RT_GROUP_SCHED
+	if (user) {
+		/*
+		 * Do not allow realtime tasks into groups that have no runtime
+		 * assigned.
+		 */
+		if (rt_bandwidth_enabled() && rt_policy(policy) &&
+				task_group(p)->rt_bandwidth.rt_runtime == 0) {
+			__task_rq_unlock(rq);
+			raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+			return -EPERM;
+		}
+	}
+#endif
+
 	/* recheck policy now with rq lock held */
 	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
 		policy = oldpolicy = -1;
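As a closing illustration of the __sched_setscheduler() part of the fix: the rt_runtime check is only trustworthy while the task's runqueue lock is held, because cpu_cgroup_attach() takes that lock when it moves a task between groups. Below is a hedged sketch of that rule, written as if it lived next to __sched_setscheduler() in kernel/sched.c and assuming CONFIG_RT_GROUP_SCHED as in the hunk above; the helper name rt_group_has_runtime is invented, while task_rq_lock(), rt_bandwidth_enabled(), rt_policy() and task_group() are the sched.c internals used in the patch:

/* Hypothetical helper, not part of this patch. */
static int rt_group_has_runtime(struct task_struct *p, int policy)
{
	unsigned long flags;
	struct rq *rq;
	int ret = 1;

	/*
	 * Holding rq->lock pins p to its current task_group, so the
	 * rt_bandwidth value read below belongs to the group p is
	 * really in; without the lock the task could concurrently be
	 * moved to another cgroup.
	 */
	rq = task_rq_lock(p, &flags);
	if (rt_bandwidth_enabled() && rt_policy(policy) &&
	    task_group(p)->rt_bandwidth.rt_runtime == 0)
		ret = 0;
	task_rq_unlock(rq, &flags);

	return ret;
}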