author    Peter Zijlstra <a.p.zijlstra@chello.nl>    2010-04-22 15:50:19 -0400
committer Ingo Molnar <mingo@elte.hu>    2010-04-23 05:02:02 -0400
commit    74f5187ac873042f502227701ed1727e7c5fbfa9 (patch)
tree      b200960d04b0a955aaf9a101d6f0a4ed34f07bb2
parent    09a40af5240de02d848247ab82440ad75b31ab11 (diff)
sched: Cure load average vs NO_HZ woes
Chase reported that due to us decrementing calc_load_tasks prematurely (before the next LOAD_FREQ sample), the load average could be skewed by as much as the number of CPUs in the machine.

This patch, based on Chase's patch, cures the problem by keeping the delta of the CPU going into NO_HZ idle separately and folding that in on the next LOAD_FREQ update. This restores the balance and we get strict LOAD_FREQ period samples.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Chase Douglas <chase.douglas@canonical.com>
LKML-Reference: <1271934490.1776.343.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--    kernel/sched.c             80
-rw-r--r--    kernel/sched_idletask.c     3
2 files changed, 68 insertions(+), 15 deletions(-)
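
To make the mechanism concrete, here is a minimal user-space sketch of the new accounting flow. It is an illustration only, not kernel code: plain longs stand in for the atomics, the jiffies/LOAD_FREQ timing check is elided, and the helper names fold_active(), account_idle() and account_active() are simplified stand-ins for calc_load_fold_active(), calc_load_account_idle() and calc_load_account_active() from the patch.

#include <stdio.h>

/* Simplified stand-in for the per-CPU runqueue fields used by the patch. */
struct rq {
	long nr_running;
	long nr_uninterruptible;
	long calc_load_active;		/* count last folded into the global sum */
};

static long calc_load_tasks;		/* global active count, sampled per LOAD_FREQ */
static long calc_load_tasks_idle;	/* deltas parked by CPUs that went NO_HZ idle */

/* Mirrors calc_load_fold_active(): per-CPU delta since the last fold. */
static long fold_active(struct rq *rq)
{
	long nr_active = rq->nr_running + rq->nr_uninterruptible;
	long delta = 0;

	if (nr_active != rq->calc_load_active) {
		delta = nr_active - rq->calc_load_active;
		rq->calc_load_active = nr_active;
	}
	return delta;
}

/* CPU goes NO_HZ idle: park its delta instead of touching the global count. */
static void account_idle(struct rq *rq)
{
	calc_load_tasks_idle += fold_active(rq);
}

/* LOAD_FREQ sample on a busy CPU: fold its own delta plus any parked idle deltas. */
static void account_active(struct rq *rq)
{
	long delta = fold_active(rq) + calc_load_tasks_idle;

	calc_load_tasks_idle = 0;
	calc_load_tasks += delta;
}

int main(void)
{
	struct rq cpu0 = { .nr_running = 2 }, cpu1 = { .nr_running = 1 };

	/* First LOAD_FREQ sample: both CPUs fold their active counts. */
	account_active(&cpu0);
	account_active(&cpu1);
	printf("sample 1: calc_load_tasks = %ld\n", calc_load_tasks);

	/* cpu1 goes NO_HZ idle between samples: its -1 delta is parked, not applied. */
	cpu1.nr_running = 0;
	account_idle(&cpu1);
	printf("between samples: calc_load_tasks = %ld, parked idle delta = %ld\n",
	       calc_load_tasks, calc_load_tasks_idle);

	/* Next sample: cpu0 folds the parked idle delta in with its own. */
	account_active(&cpu0);
	printf("sample 2: calc_load_tasks = %ld\n", calc_load_tasks);
	return 0;
}

Before this patch the idle path updated calc_load_tasks immediately, so the global count could drop between LOAD_FREQ samples; here the -1 delta waits in calc_load_tasks_idle until the next sample, which is what keeps the samples on strict LOAD_FREQ boundaries.
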
diff --git a/kernel/sched.c b/kernel/sched.c
index de0da71daf77..0cc913a8554f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1815,7 +1815,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 }
 #endif
 
-static void calc_load_account_active(struct rq *this_rq);
+static void calc_load_account_idle(struct rq *this_rq);
 static void update_sysctl(void);
 static int get_update_sysctl_factor(void);
 
@@ -2950,6 +2950,61 @@ static unsigned long calc_load_update;
 unsigned long avenrun[3];
 EXPORT_SYMBOL(avenrun);
 
+static long calc_load_fold_active(struct rq *this_rq)
+{
+	long nr_active, delta = 0;
+
+	nr_active = this_rq->nr_running;
+	nr_active += (long) this_rq->nr_uninterruptible;
+
+	if (nr_active != this_rq->calc_load_active) {
+		delta = nr_active - this_rq->calc_load_active;
+		this_rq->calc_load_active = nr_active;
+	}
+
+	return delta;
+}
+
+#ifdef CONFIG_NO_HZ
+/*
+ * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
+ *
+ * When making the ILB scale, we should try to pull this in as well.
+ */
+static atomic_long_t calc_load_tasks_idle;
+
+static void calc_load_account_idle(struct rq *this_rq)
+{
+	long delta;
+
+	delta = calc_load_fold_active(this_rq);
+	if (delta)
+		atomic_long_add(delta, &calc_load_tasks_idle);
+}
+
+static long calc_load_fold_idle(void)
+{
+	long delta = 0;
+
+	/*
+	 * Its got a race, we don't care...
+	 */
+	if (atomic_long_read(&calc_load_tasks_idle))
+		delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
+
+	return delta;
+}
+#else
+static void calc_load_account_idle(struct rq *this_rq)
+{
+}
+
+static inline long calc_load_fold_idle(void)
+{
+	return 0;
+}
+#endif
+
 /**
  * get_avenrun - get the load average array
  * @loads: pointer to dest load array
@@ -2996,20 +3051,22 @@ void calc_global_load(void)
 }
 
 /*
- * Either called from update_cpu_load() or from a cpu going idle
+ * Called from update_cpu_load() to periodically update this CPU's
+ * active count.
  */
 static void calc_load_account_active(struct rq *this_rq)
 {
-	long nr_active, delta;
+	long delta;
 
-	nr_active = this_rq->nr_running;
-	nr_active += (long) this_rq->nr_uninterruptible;
+	if (time_before(jiffies, this_rq->calc_load_update))
+		return;
 
-	if (nr_active != this_rq->calc_load_active) {
-		delta = nr_active - this_rq->calc_load_active;
-		this_rq->calc_load_active = nr_active;
+	delta  = calc_load_fold_active(this_rq);
+	delta += calc_load_fold_idle();
+	if (delta)
 		atomic_long_add(delta, &calc_load_tasks);
-	}
+
+	this_rq->calc_load_update += LOAD_FREQ;
 }
 
 /*
@@ -3041,10 +3098,7 @@ static void update_cpu_load(struct rq *this_rq)
 		this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
 	}
 
-	if (time_after_eq(jiffies, this_rq->calc_load_update)) {
-		this_rq->calc_load_update += LOAD_FREQ;
-		calc_load_account_active(this_rq);
-	}
+	calc_load_account_active(this_rq);
 }
 
 #ifdef CONFIG_SMP
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index bea2b8f12024..9fa0f402c87c 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -23,8 +23,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
 static struct task_struct *pick_next_task_idle(struct rq *rq)
 {
 	schedstat_inc(rq, sched_goidle);
-	/* adjust the active tasks as we might go into a long sleep */
-	calc_load_account_active(rq);
+	calc_load_account_idle(rq);
 	return rq->idle;
 }
 