From cfb5285660aad4931b2ebbfa902ea48a37dfffa1 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Wed, 14 Nov 2007 16:59:45 -0800
Subject: revert "Task Control Groups: example CPU accounting subsystem"

Revert 62d0df64065e7c135d0002f069444fbdfc64768f.

This was originally intended as a simple initial example of how to create a
control groups subsystem; it wasn't intended for mainline, but I didn't make
this clear enough to Andrew.

The CFS cgroup subsystem now has better functionality for the per-cgroup usage
accounting (based directly on CFS stats) than the "usage" status file in this
patch, and the "load" status file is rather simplistic - although having a
per-cgroup load average report would be a useful feature, I don't believe this
patch actually provides it.  If it gets into the final 2.6.24 we'd probably
have to support this interface forever.

Cc: Paul Menage
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/sched.c | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index b18f231a4875..4fb3532dd7e8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -52,7 +52,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -3338,13 +3337,9 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
 {
         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
         cputime64_t tmp;
-        struct rq *rq = this_rq();
 
         p->utime = cputime_add(p->utime, cputime);
 
-        if (p != rq->idle)
-                cpuacct_charge(p, cputime);
-
         /* Add user time to cpustat. */
         tmp = cputime_to_cputime64(cputime);
         if (TASK_NICE(p) > 0)
@@ -3408,10 +3403,9 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
                 cpustat->irq = cputime64_add(cpustat->irq, tmp);
         else if (softirq_count())
                 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
-        else if (p != rq->idle) {
+        else if (p != rq->idle)
                 cpustat->system = cputime64_add(cpustat->system, tmp);
-                cpuacct_charge(p, cputime);
-        } else if (atomic_read(&rq->nr_iowait) > 0)
+        else if (atomic_read(&rq->nr_iowait) > 0)
                 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
         else
                 cpustat->idle = cputime64_add(cpustat->idle, tmp);
@@ -3447,10 +3441,8 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
                         cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
                 else
                         cpustat->idle = cputime64_add(cpustat->idle, tmp);
-        } else {
+        } else
                 cpustat->steal = cputime64_add(cpustat->steal, tmp);
-                cpuacct_charge(p, -tmp);
-        }
 }
 
 /*
-- 
cgit v1.2.2

From 9778385db35a799d410039be123044a0d3e917a2 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger
Date: Thu, 15 Nov 2007 20:57:39 +0100
Subject: sched: fix accounting of interrupts during guest execution on s390

Currently the scheduler checks for PF_VCPU to decide if this timeslice has to
be accounted as guest time.  On s390, host interrupts are not disabled during
guest execution.  This causes these interrupts to be accounted as guest time
if CONFIG_VIRT_CPU_ACCOUNTING is set.  The solution is to check whether an
interrupt triggered account_system_time().  As the tick is timer interrupt
based, we have to subtract hardirq_offset.

I tested the patch on s390 with CONFIG_VIRT_CPU_ACCOUNTING and on x86_64.
Seems to work.
CC: Avi Kivity
CC: Laurent Vivier
Signed-off-by: Christian Borntraeger
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 4fb3532dd7e8..908335e1781a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3390,10 +3390,8 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
         struct rq *rq = this_rq();
         cputime64_t tmp;
 
-        if (p->flags & PF_VCPU) {
-                account_guest_time(p, cputime);
-                return;
-        }
+        if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0))
+                return account_guest_time(p, cputime);
 
         p->stime = cputime_add(p->stime, cputime);
 
-- 
cgit v1.2.2

From dae51f56204d33444f61d9e7af3ee70aef55daa4 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Thu, 15 Nov 2007 20:57:40 +0100
Subject: sched: fix SCHED_FIFO tasks & FAIR_GROUP_SCHED

Suppose that the SCHED_FIFO task does

	switch_uid(new_user);

Now, p->se.cfs_rq and p->se.parent both point into the old user_struct->tg
because sched_move_task() doesn't call set_task_cfs_rq() for the
!fair_sched_class case.

Suppose that the old user_struct/task_group is freed/reused, and the task
does

	sched_setscheduler(SCHED_NORMAL);

__setscheduler() sets fair_sched_class, but doesn't update
->se.cfs_rq/parent, which point to the freed memory.

This means that check_preempt_wakeup(), doing

	while (!is_same_group(se, pse)) {
		se = parent_entity(se);
		pse = parent_entity(pse);
	}

may OOPS in a similar way if rq->curr or p did something like the above.

Perhaps we need something like the patch below; note that __setscheduler()
can't do set_task_cfs_rq().

Signed-off-by: Oleg Nesterov
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 908335e1781a..d62b759753f3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7087,8 +7087,10 @@ void sched_move_task(struct task_struct *tsk)
 
         rq = task_rq_lock(tsk, &flags);
 
-        if (tsk->sched_class != &fair_sched_class)
+        if (tsk->sched_class != &fair_sched_class) {
+                set_task_cfs_rq(tsk);
                 goto done;
+        }
 
         update_rq_clock(rq);
 
-- 
cgit v1.2.2

From ce96b5ac742801718ae86d2adf0500c5abef3782 Mon Sep 17 00:00:00 2001
From: Dmitry Adamushko
Date: Thu, 15 Nov 2007 20:57:40 +0100
Subject: sched: fix __set_task_cpu() SMP race

Grant Wilson has reported rare SCHED_FAIR_USER crashes on his quad-core
system; these crashes can only be explained by runqueue corruption.

There is a narrow SMP race in __set_task_cpu(): after ->cpu is set up to a
new value, task_rq_lock(p, ...) can be successfully executed on another CPU.
We must ensure that updates of per-task data have been completed by this
moment.

This bug has been hiding in the Linux scheduler for an eternity (we never had
any explicit barrier for task->cpu in set_task_cpu() - so the bug was
introduced in 2.5.1), but only became visible via set_task_cfs_rq() being
accidentally put after the task->cpu update.  It also probably needs a
sufficiently out-of-order CPU to trigger.
Reported-by: Grant Wilson
Signed-off-by: Dmitry Adamushko
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index d62b759753f3..db1f71e31310 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -216,15 +216,15 @@ static inline struct task_group *task_group(struct task_struct *p)
 }
 
 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
-static inline void set_task_cfs_rq(struct task_struct *p)
+static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu)
 {
-        p->se.cfs_rq = task_group(p)->cfs_rq[task_cpu(p)];
-        p->se.parent = task_group(p)->se[task_cpu(p)];
+        p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
+        p->se.parent = task_group(p)->se[cpu];
 }
 
 #else
 
-static inline void set_task_cfs_rq(struct task_struct *p) { }
+static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) { }
 
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
@@ -1022,10 +1022,16 @@ unsigned long weighted_cpuload(const int cpu)
 
 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 {
+        set_task_cfs_rq(p, cpu);
 #ifdef CONFIG_SMP
+        /*
+         * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
+         * successfuly executed on another CPU. We must ensure that updates of
+         * per-task data have been completed by this moment.
+         */
+        smp_wmb();
         task_thread_info(p)->cpu = cpu;
 #endif
-        set_task_cfs_rq(p);
 }
 
 #ifdef CONFIG_SMP
@@ -7088,7 +7094,7 @@ void sched_move_task(struct task_struct *tsk)
         rq = task_rq_lock(tsk, &flags);
 
         if (tsk->sched_class != &fair_sched_class) {
-                set_task_cfs_rq(tsk);
+                set_task_cfs_rq(tsk, task_cpu(tsk));
                 goto done;
         }
 
@@ -7103,7 +7109,7 @@ void sched_move_task(struct task_struct *tsk)
                 tsk->sched_class->put_prev_task(rq, tsk);
         }
 
-        set_task_cfs_rq(tsk);
+        set_task_cfs_rq(tsk, task_cpu(tsk));
 
         if (on_rq) {
                 if (unlikely(running))
-- 
cgit v1.2.2

From 94bc9a7bd97efdda4dfbe61d0df32ce19d41c0a9 Mon Sep 17 00:00:00 2001
From: Dmitry Adamushko
Date: Thu, 15 Nov 2007 20:57:40 +0100
Subject: sched: remove activate_idle_task()

cpu_down() code is ok wrt sched_idle_next() placing the 'idle' task not at
the beginning of the queue.

So get rid of activate_idle_task() and make use of activate_task() instead.
It is the same as activate_task(), except for the update_rq_clock(rq) call
that is redundant.

Code size goes down:

   text    data     bss     dec     hex filename
  47853    3934     336   52123    cb9b sched.o.before
  47828    3934     336   52098    cb82 sched.o.after

Signed-off-by: Dmitry Adamushko
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 22 ++++------------------
 1 file changed, 4 insertions(+), 18 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index db1f71e31310..cc8cb6f7d82e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5281,24 +5281,10 @@ static void migrate_live_tasks(int src_cpu)
         read_unlock(&tasklist_lock);
 }
 
-/*
- * activate_idle_task - move idle task to the _front_ of runqueue.
- */
-static void activate_idle_task(struct task_struct *p, struct rq *rq)
-{
-        update_rq_clock(rq);
-
-        if (p->state == TASK_UNINTERRUPTIBLE)
-                rq->nr_uninterruptible--;
-
-        enqueue_task(rq, p, 0);
-        inc_nr_running(p, rq);
-}
-
 /*
  * Schedules idle task to be the next runnable task on current CPU.
- * It does so by boosting its priority to highest possible and adding it to
- * the _front_ of the runqueue. Used by CPU offline code.
+ * It does so by boosting its priority to highest possible.
+ * Used by CPU offline code.
  */
 void sched_idle_next(void)
 {
@@ -5318,8 +5304,8 @@ void sched_idle_next(void)
 
         __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
 
-        /* Add idle task to the _front_ of its priority queue: */
-        activate_idle_task(p, rq);
+        update_rq_clock(rq);
+        activate_task(rq, p, 0);
 
         spin_unlock_irqrestore(&rq->lock, flags);
 }
-- 
cgit v1.2.2

From 9612633a21ae8424531caf977f0560f64285bf36 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Thu, 15 Nov 2007 20:57:40 +0100
Subject: sched: reorder SCHED_FEAT_ bits

reorder SCHED_FEAT_ bits so that the used ones come first.  Makes tuning
instructions easier.

Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index cc8cb6f7d82e..38933cafea8a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -455,18 +455,18 @@ static void update_rq_clock(struct rq *rq)
  */
 enum {
         SCHED_FEAT_NEW_FAIR_SLEEPERS = 1,
-        SCHED_FEAT_START_DEBIT       = 2,
-        SCHED_FEAT_TREE_AVG          = 4,
-        SCHED_FEAT_APPROX_AVG        = 8,
-        SCHED_FEAT_WAKEUP_PREEMPT    = 16,
+        SCHED_FEAT_WAKEUP_PREEMPT    = 2,
+        SCHED_FEAT_START_DEBIT       = 4,
+        SCHED_FEAT_TREE_AVG          = 8,
+        SCHED_FEAT_APPROX_AVG        = 16,
 };
 
 const_debug unsigned int sysctl_sched_features =
                 SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 |
+                SCHED_FEAT_WAKEUP_PREEMPT    * 1 |
                 SCHED_FEAT_START_DEBIT       * 1 |
                 SCHED_FEAT_TREE_AVG          * 0 |
-                SCHED_FEAT_APPROX_AVG        * 0 |
-                SCHED_FEAT_WAKEUP_PREEMPT    * 1;
+                SCHED_FEAT_APPROX_AVG        * 0;
 
 #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)
 
-- 
cgit v1.2.2
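
As an illustration of the ordering rule behind the __set_task_cpu() fix above,
the sketch below shows the same publish pattern in portable C11.  This is not
kernel code: struct task_demo, set_task_cpu_demo, observe_task_demo and
cfs_rq_id are invented names for this example, and C11 release/acquire
semantics stand in for the smp_wmb() added by the patch (plus the pairing the
reader side relies on).  The point is only that the per-task fields must be
written before the new cpu number becomes visible to another CPU.

    #include <stdatomic.h>
    #include <stdio.h>

    /* Stand-ins for the per-task scheduler state touched by the patch. */
    struct task_demo {
            int cfs_rq_id;    /* plays the role of p->se.cfs_rq / p->se.parent */
            _Atomic int cpu;  /* plays the role of task_thread_info(p)->cpu    */
    };

    /*
     * Writer side: update the per-task data first, then publish the new cpu.
     * The release store is a portable analogue of the smp_wmb() in the patch.
     */
    static void set_task_cpu_demo(struct task_demo *p, int cpu)
    {
            p->cfs_rq_id = cpu;
            atomic_store_explicit(&p->cpu, cpu, memory_order_release);
    }

    /*
     * Reader side: once the new cpu value is observed with acquire semantics,
     * the earlier per-task updates are also guaranteed to be visible, which is
     * the property the commit message says task_rq_lock() depends on.
     */
    static void observe_task_demo(struct task_demo *p)
    {
            int cpu = atomic_load_explicit(&p->cpu, memory_order_acquire);
            printf("cpu=%d cfs_rq_id=%d\n", cpu, p->cfs_rq_id);
    }

    int main(void)
    {
            struct task_demo t;

            t.cfs_rq_id = 0;
            atomic_init(&t.cpu, 0);

            set_task_cpu_demo(&t, 1);
            observe_task_demo(&t);
            return 0;
    }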