author     Vincent Guittot <vincent.guittot@linaro.org>  2013-04-18 12:34:26 -0400
committer  Ingo Molnar <mingo@kernel.org>                2013-04-21 05:22:52 -0400
commit     642dbc39ab1ea00f47e0fee1b8e8a27da036d940 (patch)
tree       337e3cd2be83a916577f78c4819fafe6d29c99c8
parent     9b89f6ba2ab56e4d9c00e7e591d6bc333137895e (diff)
sched: Fix wrong rq's runnable_avg update with rt tasks
The current update of the rq's load can be erroneous when RT tasks are involved.

The update of the load of a rq that becomes idle is done only if the avg_idle is less than sysctl_sched_migration_cost. If RT tasks and short idle durations alternate, the runnable_avg will not be updated correctly and the time will be accounted as idle time when a CFS task wakes up.

A new idle_enter function is called when the next task is the idle function, so the elapsed time will be accounted as run time in the load of the rq, whatever the average idle time is. The function update_rq_runnable_avg is removed from idle_balance.

When an RT task is scheduled on an idle CPU, the update of the rq's load is not done when the rq exits idle state because CFS's functions are not called. Then idle_balance, which is called just before entering the idle function, updates the rq's load and assumes that the elapsed time since the last update was only running time.

As a consequence, the rq's load of a CPU that only runs a periodic RT task is close to LOAD_AVG_MAX, whatever the running duration of the RT task is.

A new idle_exit function is called when the prev task is the idle function, so the elapsed time will be accounted as idle time in the rq's load.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: linaro-kernel@lists.linaro.org
Cc: peterz@infradead.org
Cc: pjt@google.com
Cc: fweisbec@gmail.com
Cc: efault@gmx.de
Link: http://lkml.kernel.org/r/1366302867-5055-1-git-send-email-vincent.guittot@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
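For illustration, here is a minimal user-space sketch of the accounting problem described above. It is not the kernel's per-entity load-tracking code; update_avg(), toy_max and the scenario in main() are made up for this example. It only borrows the shape of the tracking scheme (roughly 1 ms contributions of 1024, decayed by a factor y per millisecond with y^32 = 1/2) to show why charging elapsed idle time as run time drives a mostly idle rq's load toward its ceiling, while charging it as idle time keeps the load proportional to the actual duty cycle:

/*
 * Toy model of the rq load tracking discussed above -- NOT the kernel's
 * PELT code.  Every elapsed millisecond contributes 1024 when accounted
 * as running and 0 when accounted as idle; older contributions decay by
 * a factor y per millisecond, with y^32 == 1/2.
 */
#include <stdio.h>
#include <math.h>

static double decay_y;          /* y such that y^32 == 0.5 */

/* decay the running sum and add one contribution per elapsed millisecond */
static double update_avg(double avg, int ms, int running)
{
        while (ms-- > 0)
                avg = avg * decay_y + (running ? 1024.0 : 0.0);
        return avg;
}

int main(void)
{
        double buggy = 0.0, fixed = 0.0, toy_max;
        int i;

        decay_y = pow(0.5, 1.0 / 32.0);
        toy_max = 1024.0 / (1.0 - decay_y);     /* analogue of LOAD_AVG_MAX */

        /*
         * A periodic RT task: 1 ms of non-CFS run time followed by 9 ms of
         * idle, repeated for one second.  "buggy" charges every elapsed
         * millisecond as run time, the assumption idle_balance made before
         * this patch; "fixed" charges the idle part as idle, which is what
         * the new idle_enter/idle_exit hooks make possible.
         */
        for (i = 0; i < 100; i++) {
                buggy = update_avg(buggy, 10, 1);
                fixed = update_avg(fixed, 1, 1);
                fixed = update_avg(fixed, 9, 0);
        }

        printf("buggy avg: %6.0f (%3.0f%% of max)\n", buggy, 100.0 * buggy / toy_max);
        printf("fixed avg: %6.0f (%3.0f%% of max)\n", fixed, 100.0 * fixed / toy_max);
        return 0;
}

Built with something like gcc -O2 toy.c -lm, the buggy accounting saturates near 100% of the toy ceiling, while the corrected accounting settles around the simulated task's ~10% duty cycle, mirroring the LOAD_AVG_MAX symptom the changelog describes.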
-rw-r--r--  kernel/sched/fair.c       | 23
-rw-r--r--  kernel/sched/idle_task.c  | 16
-rw-r--r--  kernel/sched/sched.h      | 12
3 files changed, 49 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 155783b4e4bf..1c977350e322 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1563,6 +1563,27 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
                 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
         } /* migrations, e.g. sleep=0 leave decay_count == 0 */
 }
+
+/*
+ * Update the rq's load with the elapsed running time before entering
+ * idle. if the last scheduled task is not a CFS task, idle_enter will
+ * be the only way to update the runnable statistic.
+ */
+void idle_enter_fair(struct rq *this_rq)
+{
+        update_rq_runnable_avg(this_rq, 1);
+}
+
+/*
+ * Update the rq's load with the elapsed idle time before a task is
+ * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
+ * be the only way to update the runnable statistic.
+ */
+void idle_exit_fair(struct rq *this_rq)
+{
+        update_rq_runnable_avg(this_rq, 0);
+}
+
 #else
 static inline void update_entity_load_avg(struct sched_entity *se,
                                           int update_cfs_rq) {}
@@ -5217,8 +5238,6 @@ void idle_balance(int this_cpu, struct rq *this_rq)
         if (this_rq->avg_idle < sysctl_sched_migration_cost)
                 return;
 
-        update_rq_runnable_avg(this_rq, 1);
-
         /*
          * Drop the rq->lock, but keep IRQ/preempt disabled.
          */
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index b6baf370cae9..b8ce77328341 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -13,6 +13,16 @@ select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
 {
         return task_cpu(p); /* IDLE tasks as never migrated */
 }
+
+static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
+{
+        idle_exit_fair(rq);
+}
+
+static void post_schedule_idle(struct rq *rq)
+{
+        idle_enter_fair(rq);
+}
 #endif /* CONFIG_SMP */
 /*
  * Idle tasks are unconditionally rescheduled:
@@ -25,6 +35,10 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
 static struct task_struct *pick_next_task_idle(struct rq *rq)
 {
         schedstat_inc(rq, sched_goidle);
+#ifdef CONFIG_SMP
+        /* Trigger the post schedule to do an idle_enter for CFS */
+        rq->post_schedule = 1;
+#endif
         return rq->idle;
 }
 
@@ -86,6 +100,8 @@ const struct sched_class idle_sched_class = {
 
 #ifdef CONFIG_SMP
         .select_task_rq         = select_task_rq_idle,
+        .pre_schedule           = pre_schedule_idle,
+        .post_schedule          = post_schedule_idle,
 #endif
 
         .set_curr_task          = set_curr_task_idle,
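To make the wiring above concrete, here is a hedged toy sketch of the callback protocol these hooks rely on. The dispatcher, struct names and tasks below (toy_schedule, toy_rq, toy_sched_class, and friends) are invented for illustration; the real dispatch lives in the scheduler core, and this only mimics what the hunks show: pre_schedule is invoked for the class of the task being switched out, and post_schedule runs for the new current task only when rq->post_schedule was set, which is why pick_next_task_idle() sets the flag.

/* Toy sketch of the pre_schedule/post_schedule protocol -- not core.c. */
#include <stdio.h>

struct toy_rq;
struct toy_task;

struct toy_sched_class {
        const char *name;
        void (*pre_schedule)(struct toy_rq *rq, struct toy_task *prev);
        void (*post_schedule)(struct toy_rq *rq);
};

struct toy_task {
        const char *comm;
        const struct toy_sched_class *class;
};

struct toy_rq {
        struct toy_task *curr;
        int post_schedule;
};

/* stand-ins for idle_exit_fair() / idle_enter_fair() */
static void pre_schedule_idle(struct toy_rq *rq, struct toy_task *prev)
{
        printf("leaving idle: account elapsed time as idle time\n");
}

static void post_schedule_idle(struct toy_rq *rq)
{
        printf("entering idle: account elapsed time as run time\n");
}

static const struct toy_sched_class idle_class = {
        .name = "idle",
        .pre_schedule = pre_schedule_idle,
        .post_schedule = post_schedule_idle,
};
static const struct toy_sched_class rt_class = { .name = "rt" };

static struct toy_task idle_task = { "swapper", &idle_class };
static struct toy_task rt_task   = { "rt-worker", &rt_class };

static void toy_schedule(struct toy_rq *rq, struct toy_task *next)
{
        struct toy_task *prev = rq->curr;

        /* dispatched on the class of the task being switched out */
        if (prev->class->pre_schedule)
                prev->class->pre_schedule(rq, prev);

        /* in the patch, pick_next_task_idle() sets this flag; the toy does it here */
        if (next->class == &idle_class)
                rq->post_schedule = 1;
        rq->curr = next;

        /* flag-gated callback of the task being switched in */
        if (rq->post_schedule) {
                if (rq->curr->class->post_schedule)
                        rq->curr->class->post_schedule(rq);
                rq->post_schedule = 0;
        }
}

int main(void)
{
        struct toy_rq rq = { .curr = &rt_task };

        toy_schedule(&rq, &idle_task);  /* RT task blocks, CPU goes idle    */
        toy_schedule(&rq, &rt_task);    /* RT task wakes up, CPU exits idle */
        return 0;
}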
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8116cf8e350f..605426a63588 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1024,6 +1024,18 @@ extern void update_group_power(struct sched_domain *sd, int cpu);
 extern void trigger_load_balance(struct rq *rq, int cpu);
 extern void idle_balance(int this_cpu, struct rq *this_rq);
 
+/*
+ * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg
+ * becomes useful in lb
+ */
+#if defined(CONFIG_FAIR_GROUP_SCHED)
+extern void idle_enter_fair(struct rq *this_rq);
+extern void idle_exit_fair(struct rq *this_rq);
+#else
+static inline void idle_enter_fair(struct rq *this_rq) {}
+static inline void idle_exit_fair(struct rq *this_rq) {}
+#endif
+
 #else /* CONFIG_SMP */
 
 static inline void idle_balance(int cpu, struct rq *rq)