author    Peter Zijlstra <a.p.zijlstra@chello.nl>    2010-04-22 15:50:19 -0400
committer Ingo Molnar <mingo@elte.hu>    2010-04-23 05:02:02 -0400
commit    74f5187ac873042f502227701ed1727e7c5fbfa9 (patch)
tree      b200960d04b0a955aaf9a101d6f0a4ed34f07bb2
parent    09a40af5240de02d848247ab82440ad75b31ab11 (diff)
sched: Cure load average vs NO_HZ woes
Chase reported that due to us decrementing calc_load_tasks prematurely (before the next LOAD_FREQ sample), the load average could be skewed by as much as the number of CPUs in the machine.

This patch, based on Chase's patch, cures the problem by keeping the delta of the CPU going into NO_HZ idle separately and folding that in on the next LOAD_FREQ update. This restores the balance and we get strict LOAD_FREQ period samples.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Chase Douglas <chase.douglas@canonical.com>
LKML-Reference: <1271934490.1776.343.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--    kernel/sched.c             80
-rw-r--r--    kernel/sched_idletask.c     3
2 files changed, 68 insertions(+), 15 deletions(-)
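
To make the mechanism concrete, here is a minimal user-space sketch of the new accounting flow. It is an illustration only, not kernel code: plain longs stand in for the atomics, the jiffies/LOAD_FREQ timing check is elided, and the helper names fold_active(), account_idle() and account_active() are simplified stand-ins for calc_load_fold_active(), calc_load_account_idle() and calc_load_account_active() from the patch.

#include <stdio.h>

/* Simplified stand-in for the per-CPU runqueue fields used by the patch. */
struct rq {
	long nr_running;
	long nr_uninterruptible;
	long calc_load_active;		/* count last folded into the global sum */
};

static long calc_load_tasks;		/* global active count, sampled per LOAD_FREQ */
static long calc_load_tasks_idle;	/* deltas parked by CPUs that went NO_HZ idle */

/* Mirrors calc_load_fold_active(): per-CPU delta since the last fold. */
static long fold_active(struct rq *rq)
{
	long nr_active = rq->nr_running + rq->nr_uninterruptible;
	long delta = 0;

	if (nr_active != rq->calc_load_active) {
		delta = nr_active - rq->calc_load_active;
		rq->calc_load_active = nr_active;
	}
	return delta;
}

/* CPU goes NO_HZ idle: park its delta instead of touching the global count. */
static void account_idle(struct rq *rq)
{
	calc_load_tasks_idle += fold_active(rq);
}

/* LOAD_FREQ sample on a busy CPU: fold its own delta plus any parked idle deltas. */
static void account_active(struct rq *rq)
{
	long delta = fold_active(rq) + calc_load_tasks_idle;

	calc_load_tasks_idle = 0;
	calc_load_tasks += delta;
}

int main(void)
{
	struct rq cpu0 = { .nr_running = 2 }, cpu1 = { .nr_running = 1 };

	/* First LOAD_FREQ sample: both CPUs fold their active counts. */
	account_active(&cpu0);
	account_active(&cpu1);
	printf("sample 1: calc_load_tasks = %ld\n", calc_load_tasks);

	/* cpu1 goes NO_HZ idle between samples: its -1 delta is parked, not applied. */
	cpu1.nr_running = 0;
	account_idle(&cpu1);
	printf("between samples: calc_load_tasks = %ld, parked idle delta = %ld\n",
	       calc_load_tasks, calc_load_tasks_idle);

	/* Next sample: cpu0 folds the parked idle delta in with its own. */
	account_active(&cpu0);
	printf("sample 2: calc_load_tasks = %ld\n", calc_load_tasks);
	return 0;
}

Before this patch the idle path updated calc_load_tasks immediately, so the global count could drop between LOAD_FREQ samples; here the -1 delta waits in calc_load_tasks_idle until the next sample, which is what keeps the samples on strict LOAD_FREQ boundaries.
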
diff --git a/kernel/sched.c b/kernel/sched.c
index de0da71daf77..0cc913a8554f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1815,7 +1815,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 }
 #endif
 
-static void calc_load_account_active(struct rq *this_rq);
+static void calc_load_account_idle(struct rq *this_rq);
 static void update_sysctl(void);
 static int get_update_sysctl_factor(void);
 
@@ -2950,6 +2950,61 @@ static unsigned long calc_load_update;
 unsigned long avenrun[3];
 EXPORT_SYMBOL(avenrun);
 
+static long calc_load_fold_active(struct rq *this_rq)
+{
+	long nr_active, delta = 0;
+
+	nr_active = this_rq->nr_running;
+	nr_active += (long) this_rq->nr_uninterruptible;
+
+	if (nr_active != this_rq->calc_load_active) {
+		delta = nr_active - this_rq->calc_load_active;
+		this_rq->calc_load_active = nr_active;
+	}
+
+	return delta;
+}
+
+#ifdef CONFIG_NO_HZ
+/*
+ * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
+ *
+ * When making the ILB scale, we should try to pull this in as well.
+ */
+static atomic_long_t calc_load_tasks_idle;
+
+static void calc_load_account_idle(struct rq *this_rq)
+{
+	long delta;
+
+	delta = calc_load_fold_active(this_rq);
+	if (delta)
+		atomic_long_add(delta, &calc_load_tasks_idle);
+}
+
+static long calc_load_fold_idle(void)
+{
+	long delta = 0;
+
+	/*
+	 * Its got a race, we don't care...
+	 */
+	if (atomic_long_read(&calc_load_tasks_idle))
+		delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
+
+	return delta;
+}
+#else
+static void calc_load_account_idle(struct rq *this_rq)
+{
+}
+
+static inline long calc_load_fold_idle(void)
+{
+	return 0;
+}
+#endif
+
 /**
  * get_avenrun - get the load average array
  * @loads: pointer to dest load array
@@ -2996,20 +3051,22 @@ void calc_global_load(void)
 }
 
 /*
- * Either called from update_cpu_load() or from a cpu going idle
+ * Called from update_cpu_load() to periodically update this CPU's
+ * active count.
  */
 static void calc_load_account_active(struct rq *this_rq)
 {
-	long nr_active, delta;
+	long delta;
 
-	nr_active = this_rq->nr_running;
-	nr_active += (long) this_rq->nr_uninterruptible;
+	if (time_before(jiffies, this_rq->calc_load_update))
+		return;
 
-	if (nr_active != this_rq->calc_load_active) {
-		delta = nr_active - this_rq->calc_load_active;
-		this_rq->calc_load_active = nr_active;
+	delta  = calc_load_fold_active(this_rq);
+	delta += calc_load_fold_idle();
+	if (delta)
 		atomic_long_add(delta, &calc_load_tasks);
-	}
+
+	this_rq->calc_load_update += LOAD_FREQ;
 }
 
 /*
@@ -3041,10 +3098,7 @@ static void update_cpu_load(struct rq *this_rq)
 		this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
 	}
 
-	if (time_after_eq(jiffies, this_rq->calc_load_update)) {
-		this_rq->calc_load_update += LOAD_FREQ;
-		calc_load_account_active(this_rq);
-	}
+	calc_load_account_active(this_rq);
 }
 
 #ifdef CONFIG_SMP
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index bea2b8f12024..9fa0f402c87c 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -23,8 +23,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
 static struct task_struct *pick_next_task_idle(struct rq *rq)
 {
 	schedstat_inc(rq, sched_goidle);
-	/* adjust the active tasks as we might go into a long sleep */
-	calc_load_account_active(rq);
+	calc_load_account_idle(rq);
 	return rq->idle;
 }
 