1 files changed, 236 insertions, 51 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index dc91a4d09ac3..297d1a0eedb0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -636,22 +636,18 @@ static inline struct task_group *task_group(struct task_struct *p)
 #endif /* CONFIG_CGROUP_SCHED */
-static u64 irq_time_cpu(int cpu);
+static void update_rq_clock_task(struct rq *rq, s64 delta);
-static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
-inline void update_rq_clock(struct rq *rq)
+static void update_rq_clock(struct rq *rq)
 {
-        if (!rq->skip_clock_update) {
+        s64 delta;
-                int cpu = cpu_of(rq);
-                u64 irq_time;
-                rq->clock = sched_clock_cpu(cpu);
+        if (rq->skip_clock_update)
-                irq_time = irq_time_cpu(cpu);
+                return;
-                if (rq->clock - irq_time > rq->clock_task)
-                        rq->clock_task = rq->clock - irq_time;
-                sched_irq_time_avg_update(rq, irq_time);
+        delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
-        }
+        rq->clock += delta;
+        update_rq_clock_task(rq, delta);
 }
 /*
@@ -1924,10 +1920,9 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
 * They are read and saved off onto struct rq in update_rq_clock().
 * This may result in other CPU reading this CPU's irq time and can
 * race with irq/account_system_vtime on this CPU. We would either get old
- * or new value (or semi updated value on 32 bit) with a side effect of
+ * or new value with a side effect of accounting a slice of irq time to wrong
- * accounting a slice of irq time to wrong task when irq is in progress
+ * task when irq is in progress while we read rq->clock. That is a worthy
- * while we read rq->clock. That is a worthy compromise in place of having
+ * compromise in place of having locks on each irq in account_system_time.
- * locks on each irq in account_system_time.
 */
 static DEFINE_PER_CPU(u64, cpu_hardirq_time);
 static DEFINE_PER_CPU(u64, cpu_softirq_time);
@@ -1945,19 +1940,58 @@ void disable_sched_clock_irqtime(void)
        sched_clock_irqtime = 0;
 }
-static u64 irq_time_cpu(int cpu)
+#ifndef CONFIG_64BIT
+static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
+static inline void irq_time_write_begin(void)
 {
-        if (!sched_clock_irqtime)
+        __this_cpu_inc(irq_time_seq.sequence);
-                return 0;
+        smp_wmb();
+}
+static inline void irq_time_write_end(void)
+{
+        smp_wmb();
+        __this_cpu_inc(irq_time_seq.sequence);
+}
+static inline u64 irq_time_read(int cpu)
+{
+        u64 irq_time;
+        unsigned seq;
+        do {
+                seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
+                irq_time = per_cpu(cpu_softirq_time, cpu) +
+                           per_cpu(cpu_hardirq_time, cpu);
+        } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
+        return irq_time;
+}
+#else /* CONFIG_64BIT */
+static inline void irq_time_write_begin(void)
+{
+}
+static inline void irq_time_write_end(void)
+{
+}
+static inline u64 irq_time_read(int cpu)
+{
        return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
 }
+#endif /* CONFIG_64BIT */
+/*
+ * Called before incrementing preempt_count on {soft,}irq_enter
+ * and before decrementing preempt_count on {soft,}irq_exit.
+ */
 void account_system_vtime(struct task_struct *curr)
 {
        unsigned long flags;
+        s64 delta;
        int cpu;
-        u64 now, delta;
        if (!sched_clock_irqtime)
                return;
@@ -1965,9 +1999,10 @@ void account_system_vtime(struct task_struct *curr)
        local_irq_save(flags);
        cpu = smp_processor_id();
-        now = sched_clock_cpu(cpu);
+        delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
-        delta = now - per_cpu(irq_start_time, cpu);
+        __this_cpu_add(irq_start_time, delta);
-        per_cpu(irq_start_time, cpu) = now;
+        irq_time_write_begin();
        /*
         * We do not account for softirq time from ksoftirqd here.
         * We want to continue accounting softirq time to ksoftirqd thread
@@ -1975,33 +2010,55 @@ void account_system_vtime(struct task_struct *curr)
         * that do not consume any time, but still wants to run.
         */
        if (hardirq_count())
-                per_cpu(cpu_hardirq_time, cpu) += delta;
+                __this_cpu_add(cpu_hardirq_time, delta);
        else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
-                per_cpu(cpu_softirq_time, cpu) += delta;
+                __this_cpu_add(cpu_softirq_time, delta);
+        irq_time_write_end();
        local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(account_system_vtime);
-static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
+static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
-        if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
+        s64 irq_delta;
-                u64 delta_irq = curr_irq_time - rq->prev_irq_time;
-                rq->prev_irq_time = curr_irq_time;
+        irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
-                sched_rt_avg_update(rq, delta_irq);
-        }
+        /*
+         * Since irq_time is only updated on {soft,}irq_exit, we might run into
+         * this case when a previous update_rq_clock() happened inside a
+         * {soft,}irq region.
+         *
+         * When this happens, we stop ->clock_task and only update the
+         * prev_irq_time stamp to account for the part that fit, so that a next
+         * update will consume the rest. This ensures ->clock_task is
+         * monotonic.
+         *
+         * It does however cause some slight misattribution of {soft,}irq
+         * time, a more accurate solution would be to update the irq_time using
+         * the current rq->clock timestamp, except that would require using
+         * atomic ops.
+         */
+        if (irq_delta > delta)
+                irq_delta = delta;
+        rq->prev_irq_time += irq_delta;
+        delta -= irq_delta;
+        rq->clock_task += delta;
+        if (irq_delta && sched_feat(NONIRQ_POWER))
+                sched_rt_avg_update(rq, irq_delta);
 }
-#else
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
-static u64 irq_time_cpu(int cpu)
+static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
-        return 0;
+        rq->clock_task += delta;
 }
-static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
-#endif
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -2129,7 +2186,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
         * A queue event has occurred, and we're going to schedule.  In
         * this case, we can save a useless back to back clock update.
         */
-        if (test_tsk_need_resched(rq->curr))
+        if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
                rq->skip_clock_update = 1;
 }
@@ -3119,6 +3176,15 @@ static long calc_load_fold_active(struct rq *this_rq)
        return delta;
 }
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+        load *= exp;
+        load += active * (FIXED_1 - exp);
+        load += 1UL << (FSHIFT - 1);
+        return load >> FSHIFT;
+}
 #ifdef CONFIG_NO_HZ
 /*
 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
@@ -3148,6 +3214,128 @@ static long calc_load_fold_idle(void)
        return delta;
 }
+/**
+ * fixed_power_int - compute: x^n, in O(log n) time
+ *
+ * @x:         base of the power
+ * @frac_bits: fractional bits of @x
+ * @n:         power to raise @x to.
+ *
+ * By exploiting the relation between the definition of the natural power
+ * function: x^n := x*x*...*x (x multiplied by itself for n times), and
+ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
+ * (where: n_i \elem {0, 1}, the binary vector representing n),
+ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
+ * of course trivially computable in O(log_2 n), the length of our binary
+ * vector.
+ */
+static unsigned long
+fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
+{
+        unsigned long result = 1UL << frac_bits;
+        if (n) for (;;) {
+                if (n & 1) {
+                        result *= x;
+                        result += 1UL << (frac_bits - 1);
+                        result >>= frac_bits;
+                }
+                n >>= 1;
+                if (!n)
+                        break;
+                x *= x;
+                x += 1UL << (frac_bits - 1);
+                x >>= frac_bits;
+        }
+        return result;
+}
+/*
+ * a1 = a0 * e + a * (1 - e)
+ *
+ * a2 = a1 * e + a * (1 - e)
+ *    = (a0 * e + a * (1 - e)) * e + a * (1 - e)
+ *    = a0 * e^2 + a * (1 - e) * (1 + e)
+ *
+ * a3 = a2 * e + a * (1 - e)
+ *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
+ *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
+ *
+ *  ...
+ *
+ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^(n-1)) [1]
+ *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
+ *    = a0 * e^n + a * (1 - e^n)
+ *
+ * [1] application of the geometric series:
+ *
+ *              n         1 - x^(n+1)
+ *     S_n := \Sum x^i = -------------
+ *             i=0          1 - x
+ */
+static unsigned long
+calc_load_n(unsigned long load, unsigned long exp,
+            unsigned long active, unsigned int n)
+{
+        return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
+}
+/*
+ * NO_HZ can leave us missing all per-cpu ticks calling
+ * calc_load_account_active(), but since an idle CPU folds its delta into
+ * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
+ * in the pending idle delta if our idle period crossed a load cycle boundary.
+ *
+ * Once we've updated the global active value, we need to apply the exponential
+ * weights adjusted to the number of cycles missed.
+ */
+static void calc_global_nohz(unsigned long ticks)
+{
+        long delta, active, n;
+        if (time_before(jiffies, calc_load_update))
+                return;
+        /*
+         * If we crossed a calc_load_update boundary, make sure to fold
+         * any pending idle changes, the respective CPUs might have
+         * missed the tick driven calc_load_account_active() update
+         * due to NO_HZ.
+         */
+        delta = calc_load_fold_idle();
+        if (delta)
+                atomic_long_add(delta, &calc_load_tasks);
+        /*
+         * If we were idle for multiple load cycles, apply them.
+         */
+        if (ticks >= LOAD_FREQ) {
+                n = ticks / LOAD_FREQ;
+                active = atomic_long_read(&calc_load_tasks);
+                active = active > 0 ? active * FIXED_1 : 0;
+                avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+                avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+                avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+                calc_load_update += n * LOAD_FREQ;
+        }
+        /*
+         * It's possible the remainder of the above division also crosses
+         * a LOAD_FREQ period, the regular check in calc_global_load()
+         * which comes after this will take care of that.
+         *
+         * Consider us being 11 ticks before a cycle completion, and us
+         * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
+         * age us 4 cycles, and the test in calc_global_load() will
+         * pick up the final one.
+         */
+}
 #else
 static void calc_load_account_idle(struct rq *this_rq)
 {
@@ -3157,6 +3345,10 @@ static inline long calc_load_fold_idle(void)
 {
        return 0;
 }
+static void calc_global_nohz(unsigned long ticks)
+{
+}
 #endif
 /**
@@ -3174,24 +3366,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
        loads[2] = (avenrun[2] + offset) << shift;
 }
-static unsigned long
-calc_load(unsigned long load, unsigned long exp, unsigned long active)
-{
-        load *= exp;
-        load += active * (FIXED_1 - exp);
-        return load >> FSHIFT;
-}
 /*
 * calc_load - update the avenrun load estimates 10 ticks after the
 * CPUs have updated calc_load_tasks.
 */
-void calc_global_load(void)
+void calc_global_load(unsigned long ticks)
 {
-        unsigned long upd = calc_load_update + 10;
        long active;
-        if (time_before(jiffies, upd))
+        calc_global_nohz(ticks);
+        if (time_before(jiffies, calc_load_update + 10))
                return;
        active = atomic_long_read(&calc_load_tasks);
@@ -3845,7 +4030,6 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
 {
        if (prev->se.on_rq)
                update_rq_clock(rq);
-        rq->skip_clock_update = 0;
        prev->sched_class->put_prev_task(rq, prev);
 }
@@ -3903,7 +4087,6 @@ need_resched_nonpreemptible:
                hrtick_clear(rq);
        raw_spin_lock_irq(&rq->lock);
-        clear_tsk_need_resched(prev);
        switch_count = &prev->nivcsw;
        if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@ -3935,6 +4118,8 @@ need_resched_nonpreemptible:
        put_prev_task(rq, prev);
        next = pick_next_task(rq);
+        clear_tsk_need_resched(prev);
+        rq->skip_clock_update = 0;
        if (likely(prev != next)) {
                sched_info_switch(prev, next);

diff --git a/kernel/sched.c b/kernel/sched.c index dc91a4d09ac3..297d1a0eedb0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c
@@ -636,22 +636,18 @@ static inline struct task_group *task_group(struct task_struct *p)
636		636
637	#endif /* CONFIG_CGROUP_SCHED */	637	#endif /* CONFIG_CGROUP_SCHED */
638		638
639	static u64 irq_time_cpu(int cpu);	639	static void update_rq_clock_task(struct rq *rq, s64 delta);
640	static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
641		640
642	inline void update_rq_clock(struct rq *rq)	641	static void update_rq_clock(struct rq *rq)
643	{	642	{
644	if (!rq->skip_clock_update) {	643	s64 delta;
645	int cpu = cpu_of(rq);
646	u64 irq_time;
647		644
648	rq->clock = sched_clock_cpu(cpu);	645	if (rq->skip_clock_update)
649	irq_time = irq_time_cpu(cpu);	646	return;
650	if (rq->clock - irq_time > rq->clock_task)
651	rq->clock_task = rq->clock - irq_time;
652		647
653	sched_irq_time_avg_update(rq, irq_time);	648	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
654	}	649	rq->clock += delta;
		650	update_rq_clock_task(rq, delta);
655	}	651	}
656		652
657	/*	653	/*
@@ -1924,10 +1920,9 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1924	* They are read and saved off onto struct rq in update_rq_clock().	1920	* They are read and saved off onto struct rq in update_rq_clock().
1925	* This may result in other CPU reading this CPU's irq time and can	1921	* This may result in other CPU reading this CPU's irq time and can
1926	* race with irq/account_system_vtime on this CPU. We would either get old	1922	* race with irq/account_system_vtime on this CPU. We would either get old
1927	* or new value (or semi updated value on 32 bit) with a side effect of	1923	* or new value with a side effect of accounting a slice of irq time to wrong
1928	* accounting a slice of irq time to wrong task when irq is in progress	1924	* task when irq is in progress while we read rq->clock. That is a worthy
1929	* while we read rq->clock. That is a worthy compromise in place of having	1925	* compromise in place of having locks on each irq in account_system_time.
1930	* locks on each irq in account_system_time.
1931	*/	1926	*/
1932	static DEFINE_PER_CPU(u64, cpu_hardirq_time);	1927	static DEFINE_PER_CPU(u64, cpu_hardirq_time);
1933	static DEFINE_PER_CPU(u64, cpu_softirq_time);	1928	static DEFINE_PER_CPU(u64, cpu_softirq_time);
@@ -1945,19 +1940,58 @@ void disable_sched_clock_irqtime(void)
1945	sched_clock_irqtime = 0;	1940	sched_clock_irqtime = 0;
1946	}	1941	}
1947		1942
1948	static u64 irq_time_cpu(int cpu)	1943	#ifndef CONFIG_64BIT
		1944	static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
		1945
		1946	static inline void irq_time_write_begin(void)
1949	{	1947	{
1950	if (!sched_clock_irqtime)	1948	__this_cpu_inc(irq_time_seq.sequence);
1951	return 0;	1949	smp_wmb();
		1950	}
		1951
		1952	static inline void irq_time_write_end(void)
		1953	{
		1954	smp_wmb();
		1955	__this_cpu_inc(irq_time_seq.sequence);
		1956	}
		1957
		1958	static inline u64 irq_time_read(int cpu)
		1959	{
		1960	u64 irq_time;
		1961	unsigned seq;
1952		1962
		1963	do {
		1964	seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
		1965	irq_time = per_cpu(cpu_softirq_time, cpu) +
		1966	per_cpu(cpu_hardirq_time, cpu);
		1967	} while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
		1968
		1969	return irq_time;
		1970	}
		1971	#else /* CONFIG_64BIT */
		1972	static inline void irq_time_write_begin(void)
		1973	{
		1974	}
		1975
		1976	static inline void irq_time_write_end(void)
		1977	{
		1978	}
		1979
		1980	static inline u64 irq_time_read(int cpu)
		1981	{
1953	return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);	1982	return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1954	}	1983	}
		1984	#endif /* CONFIG_64BIT */
1955		1985
		1986	/*
		1987	* Called before incrementing preempt_count on {soft,}irq_enter
		1988	* and before decrementing preempt_count on {soft,}irq_exit.
		1989	*/
1956	void account_system_vtime(struct task_struct *curr)	1990	void account_system_vtime(struct task_struct *curr)
1957	{	1991	{
1958	unsigned long flags;	1992	unsigned long flags;
		1993	s64 delta;
1959	int cpu;	1994	int cpu;
1960	u64 now, delta;
1961		1995
1962	if (!sched_clock_irqtime)	1996	if (!sched_clock_irqtime)
1963	return;	1997	return;
@@ -1965,9 +1999,10 @@ void account_system_vtime(struct task_struct *curr)
1965	local_irq_save(flags);	1999	local_irq_save(flags);
1966		2000
1967	cpu = smp_processor_id();	2001	cpu = smp_processor_id();
1968	now = sched_clock_cpu(cpu);	2002	delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
1969	delta = now - per_cpu(irq_start_time, cpu);	2003	__this_cpu_add(irq_start_time, delta);
1970	per_cpu(irq_start_time, cpu) = now;	2004
		2005	irq_time_write_begin();
1971	/*	2006	/*
1972	* We do not account for softirq time from ksoftirqd here.	2007	* We do not account for softirq time from ksoftirqd here.
1973	* We want to continue accounting softirq time to ksoftirqd thread	2008	* We want to continue accounting softirq time to ksoftirqd thread
@@ -1975,33 +2010,55 @@ void account_system_vtime(struct task_struct *curr)
1975	* that do not consume any time, but still wants to run.	2010	* that do not consume any time, but still wants to run.
1976	*/	2011	*/
1977	if (hardirq_count())	2012	if (hardirq_count())
1978	per_cpu(cpu_hardirq_time, cpu) += delta;	2013	__this_cpu_add(cpu_hardirq_time, delta);
1979	else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))	2014	else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
1980	per_cpu(cpu_softirq_time, cpu) += delta;	2015	__this_cpu_add(cpu_softirq_time, delta);
1981		2016
		2017	irq_time_write_end();
1982	local_irq_restore(flags);	2018	local_irq_restore(flags);
1983	}	2019	}
1984	EXPORT_SYMBOL_GPL(account_system_vtime);	2020	EXPORT_SYMBOL_GPL(account_system_vtime);
1985		2021
1986	static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)	2022	static void update_rq_clock_task(struct rq *rq, s64 delta)
1987	{	2023	{
1988	if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {	2024	s64 irq_delta;
1989	u64 delta_irq = curr_irq_time - rq->prev_irq_time;	2025
1990	rq->prev_irq_time = curr_irq_time;	2026	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
1991	sched_rt_avg_update(rq, delta_irq);	2027
1992	}	2028	/*
		2029	* Since irq_time is only updated on {soft,}irq_exit, we might run into
		2030	* this case when a previous update_rq_clock() happened inside a
		2031	* {soft,}irq region.
		2032	*
		2033	* When this happens, we stop ->clock_task and only update the
		2034	* prev_irq_time stamp to account for the part that fit, so that a next
		2035	* update will consume the rest. This ensures ->clock_task is
		2036	* monotonic.
		2037	*
		2038	* It does however cause some slight misattribution of {soft,}irq
		2039	* time, a more accurate solution would be to update the irq_time using
		2040	* the current rq->clock timestamp, except that would require using
		2041	* atomic ops.
		2042	*/
		2043	if (irq_delta > delta)
		2044	irq_delta = delta;
		2045
		2046	rq->prev_irq_time += irq_delta;
		2047	delta -= irq_delta;
		2048	rq->clock_task += delta;
		2049
		2050	if (irq_delta && sched_feat(NONIRQ_POWER))
		2051	sched_rt_avg_update(rq, irq_delta);
1993	}	2052	}
1994		2053
1995	#else	2054	#else /* CONFIG_IRQ_TIME_ACCOUNTING */
1996		2055
1997	static u64 irq_time_cpu(int cpu)	2056	static void update_rq_clock_task(struct rq *rq, s64 delta)
1998	{	2057	{
1999	return 0;	2058	rq->clock_task += delta;
2000	}	2059	}
2001		2060
2002	static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }	2061	#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
2003
2004	#endif
2005		2062
2006	#include "sched_idletask.c"	2063	#include "sched_idletask.c"
2007	#include "sched_fair.c"	2064	#include "sched_fair.c"
@@ -2129,7 +2186,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2129	* A queue event has occurred, and we're going to schedule. In	2186	* A queue event has occurred, and we're going to schedule. In
2130	* this case, we can save a useless back to back clock update.	2187	* this case, we can save a useless back to back clock update.
2131	*/	2188	*/
2132	if (test_tsk_need_resched(rq->curr))	2189	if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
2133	rq->skip_clock_update = 1;	2190	rq->skip_clock_update = 1;
2134	}	2191	}
2135		2192
@@ -3119,6 +3176,15 @@ static long calc_load_fold_active(struct rq *this_rq)
3119	return delta;	3176	return delta;
3120	}	3177	}
3121		3178
		3179	static unsigned long
		3180	calc_load(unsigned long load, unsigned long exp, unsigned long active)
		3181	{
		3182	load *= exp;
		3183	load += active * (FIXED_1 - exp);
		3184	load += 1UL << (FSHIFT - 1);
		3185	return load >> FSHIFT;
		3186	}
		3187
3122	#ifdef CONFIG_NO_HZ	3188	#ifdef CONFIG_NO_HZ
3123	/*	3189	/*
3124	* For NO_HZ we delay the active fold to the next LOAD_FREQ update.	3190	* For NO_HZ we delay the active fold to the next LOAD_FREQ update.
@@ -3148,6 +3214,128 @@ static long calc_load_fold_idle(void)
3148		3214
3149	return delta;	3215	return delta;
3150	}	3216	}
		3217
		3218	/**
		3219	* fixed_power_int - compute: x^n, in O(log n) time
		3220	*
		3221	* @x: base of the power
		3222	* @frac_bits: fractional bits of @x
		3223	* @n: power to raise @x to.
		3224	*
		3225	* By exploiting the relation between the definition of the natural power
		3226	* function: x^n := x*x*...*x (x multiplied by itself for n times), and
		3227	* the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
		3228	* (where: n_i \elem {0, 1}, the binary vector representing n),
		3229	* we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
		3230	* of course trivially computable in O(log_2 n), the length of our binary
		3231	* vector.
		3232	*/
		3233	static unsigned long
		3234	fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
		3235	{
		3236	unsigned long result = 1UL << frac_bits;
		3237
		3238	if (n) for (;;) {
		3239	if (n & 1) {
		3240	result *= x;
		3241	result += 1UL << (frac_bits - 1);
		3242	result >>= frac_bits;
		3243	}
		3244	n >>= 1;
		3245	if (!n)
		3246	break;
		3247	x *= x;
		3248	x += 1UL << (frac_bits - 1);
		3249	x >>= frac_bits;
		3250	}
		3251
		3252	return result;
		3253	}
		3254
		3255	/*
		3256	* a1 = a0 * e + a * (1 - e)
		3257	*
		3258	* a2 = a1 * e + a * (1 - e)
		3259	* = (a0 * e + a * (1 - e)) * e + a * (1 - e)
		3260	* = a0 * e^2 + a * (1 - e) * (1 + e)
		3261	*
		3262	* a3 = a2 * e + a * (1 - e)
		3263	* = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
		3264	* = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
		3265	*
		3266	* ...
		3267	*
		3268	* an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^(n-1)) [1]
		3269	* = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
		3270	* = a0 * e^n + a * (1 - e^n)
		3271	*
		3272	* [1] application of the geometric series:
		3273	*
		3274	*              n         1 - x^(n+1)
		3275	*     S_n := \Sum x^i = -------------
		3276	*             i=0          1 - x
		3277	*/
		3278	static unsigned long
		3279	calc_load_n(unsigned long load, unsigned long exp,
		3280	unsigned long active, unsigned int n)
		3281	{
		3282
		3283	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
		3284	}
		3285
		3286	/*
		3287	* NO_HZ can leave us missing all per-cpu ticks calling
		3288	* calc_load_account_active(), but since an idle CPU folds its delta into
		3289	* calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
		3290	* in the pending idle delta if our idle period crossed a load cycle boundary.
		3291	*
		3292	* Once we've updated the global active value, we need to apply the exponential
		3293	* weights adjusted to the number of cycles missed.
		3294	*/
		3295	static void calc_global_nohz(unsigned long ticks)
		3296	{
		3297	long delta, active, n;
		3298
		3299	if (time_before(jiffies, calc_load_update))
		3300	return;
		3301
		3302	/*
		3303	* If we crossed a calc_load_update boundary, make sure to fold
		3304	* any pending idle changes, the respective CPUs might have
		3305	* missed the tick driven calc_load_account_active() update
		3306	* due to NO_HZ.
		3307	*/
		3308	delta = calc_load_fold_idle();
		3309	if (delta)
		3310	atomic_long_add(delta, &calc_load_tasks);
		3311
		3312	/*
		3313	* If we were idle for multiple load cycles, apply them.
		3314	*/
		3315	if (ticks >= LOAD_FREQ) {
		3316	n = ticks / LOAD_FREQ;
		3317
		3318	active = atomic_long_read(&calc_load_tasks);
		3319	active = active > 0 ? active * FIXED_1 : 0;
		3320
		3321	avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
		3322	avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
		3323	avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
		3324
		3325	calc_load_update += n * LOAD_FREQ;
		3326	}
		3327
		3328	/*
		3329	* It's possible the remainder of the above division also crosses
		3330	* a LOAD_FREQ period, the regular check in calc_global_load()
		3331	* which comes after this will take care of that.
		3332	*
		3333	* Consider us being 11 ticks before a cycle completion, and us
		3334	* sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
		3335	* age us 4 cycles, and the test in calc_global_load() will
		3336	* pick up the final one.
		3337	*/
		3338	}
3151	#else	3339	#else
3152	static void calc_load_account_idle(struct rq *this_rq)	3340	static void calc_load_account_idle(struct rq *this_rq)
3153	{	3341	{
@@ -3157,6 +3345,10 @@ static inline long calc_load_fold_idle(void)
3157	{	3345	{
3158	return 0;	3346	return 0;
3159	}	3347	}
		3348
		3349	static void calc_global_nohz(unsigned long ticks)
		3350	{
		3351	}
3160	#endif	3352	#endif
3161		3353
3162	/**	3354	/**
@@ -3174,24 +3366,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
3174	loads[2] = (avenrun[2] + offset) << shift;	3366	loads[2] = (avenrun[2] + offset) << shift;
3175	}	3367	}
3176		3368
3177	static unsigned long
3178	calc_load(unsigned long load, unsigned long exp, unsigned long active)
3179	{
3180	load *= exp;
3181	load += active * (FIXED_1 - exp);
3182	return load >> FSHIFT;
3183	}
3184
3185	/*	3369	/*
3186	* calc_load - update the avenrun load estimates 10 ticks after the	3370	* calc_load - update the avenrun load estimates 10 ticks after the
3187	* CPUs have updated calc_load_tasks.	3371	* CPUs have updated calc_load_tasks.
3188	*/	3372	*/
3189	void calc_global_load(void)	3373	void calc_global_load(unsigned long ticks)
3190	{	3374	{
3191	unsigned long upd = calc_load_update + 10;
3192	long active;	3375	long active;
3193		3376
3194	if (time_before(jiffies, upd))	3377	calc_global_nohz(ticks);
		3378
		3379	if (time_before(jiffies, calc_load_update + 10))
3195	return;	3380	return;
3196		3381
3197	active = atomic_long_read(&calc_load_tasks);	3382	active = atomic_long_read(&calc_load_tasks);
@@ -3845,7 +4030,6 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
3845	{	4030	{
3846	if (prev->se.on_rq)	4031	if (prev->se.on_rq)
3847	update_rq_clock(rq);	4032	update_rq_clock(rq);
3848	rq->skip_clock_update = 0;
3849	prev->sched_class->put_prev_task(rq, prev);	4033	prev->sched_class->put_prev_task(rq, prev);
3850	}	4034	}
3851		4035
@@ -3903,7 +4087,6 @@ need_resched_nonpreemptible:
3903	hrtick_clear(rq);	4087	hrtick_clear(rq);
3904		4088
3905	raw_spin_lock_irq(&rq->lock);	4089	raw_spin_lock_irq(&rq->lock);
3906	clear_tsk_need_resched(prev);
3907		4090
3908	switch_count = &prev->nivcsw;	4091	switch_count = &prev->nivcsw;
3909	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {	4092	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@ -3935,6 +4118,8 @@ need_resched_nonpreemptible:
3935		4118
3936	put_prev_task(rq, prev);	4119	put_prev_task(rq, prev);
3937	next = pick_next_task(rq);	4120	next = pick_next_task(rq);
		4121	clear_tsk_need_resched(prev);
		4122	rq->skip_clock_update = 0;
3938		4123
3939	if (likely(prev != next)) {	4124	if (likely(prev != next)) {
3940	sched_info_switch(prev, next);	4125	sched_info_switch(prev, next);