author		Ingo Molnar <mingo@elte.hu>	2011-01-05 08:14:42 -0500
committer	Ingo Molnar <mingo@elte.hu>	2011-01-05 08:14:46 -0500
commit		27066fd484a32c80630136aa2b91c980f3198f9d (patch)
tree		78ddabdedbfd7525d13ecd62a745525843f1d0e8 /kernel/sched.c
parent		101e5f77bf35679809586e250b6c62193d2ed179 (diff)
parent		3c0eee3fe6a3a1c745379547c7e7c904aa64f6d5 (diff)
Merge commit 'v2.6.37' into sched/core
Merge reason: Merge the final .37 tree.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  287
1 file changed, 236 insertions(+), 51 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 9f9dd8dda53c..f2f914e0c47c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -642,22 +642,18 @@ static inline struct task_group *task_group(struct task_struct *p)
 
 #endif /* CONFIG_CGROUP_SCHED */
 
-static u64 irq_time_cpu(int cpu);
-static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
+static void update_rq_clock_task(struct rq *rq, s64 delta);
 
-inline void update_rq_clock(struct rq *rq)
+static void update_rq_clock(struct rq *rq)
 {
-        if (!rq->skip_clock_update) {
-                int cpu = cpu_of(rq);
-                u64 irq_time;
+        s64 delta;
 
-                rq->clock = sched_clock_cpu(cpu);
-                irq_time = irq_time_cpu(cpu);
-                if (rq->clock - irq_time > rq->clock_task)
-                        rq->clock_task = rq->clock - irq_time;
+        if (rq->skip_clock_update)
+                return;
 
-                sched_irq_time_avg_update(rq, irq_time);
-        }
+        delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+        rq->clock += delta;
+        update_rq_clock_task(rq, delta);
 }
 
 /*
@@ -1795,10 +1791,9 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
  * They are read and saved off onto struct rq in update_rq_clock().
  * This may result in other CPU reading this CPU's irq time and can
  * race with irq/account_system_vtime on this CPU. We would either get old
- * or new value (or semi updated value on 32 bit) with a side effect of
- * accounting a slice of irq time to wrong task when irq is in progress
- * while we read rq->clock. That is a worthy compromise in place of having
- * locks on each irq in account_system_time.
+ * or new value with a side effect of accounting a slice of irq time to wrong
+ * task when irq is in progress while we read rq->clock. That is a worthy
+ * compromise in place of having locks on each irq in account_system_time.
  */
 static DEFINE_PER_CPU(u64, cpu_hardirq_time);
 static DEFINE_PER_CPU(u64, cpu_softirq_time);
@@ -1816,19 +1811,58 @@ void disable_sched_clock_irqtime(void)
         sched_clock_irqtime = 0;
 }
 
-static u64 irq_time_cpu(int cpu)
+#ifndef CONFIG_64BIT
+static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
+
+static inline void irq_time_write_begin(void)
 {
-        if (!sched_clock_irqtime)
-                return 0;
+        __this_cpu_inc(irq_time_seq.sequence);
+        smp_wmb();
+}
+
+static inline void irq_time_write_end(void)
+{
+        smp_wmb();
+        __this_cpu_inc(irq_time_seq.sequence);
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+        u64 irq_time;
+        unsigned seq;
 
+        do {
+                seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
+                irq_time = per_cpu(cpu_softirq_time, cpu) +
+                           per_cpu(cpu_hardirq_time, cpu);
+        } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
+
+        return irq_time;
+}
+#else /* CONFIG_64BIT */
+static inline void irq_time_write_begin(void)
+{
+}
+
+static inline void irq_time_write_end(void)
+{
+}
+
+static inline u64 irq_time_read(int cpu)
+{
         return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
 }
+#endif /* CONFIG_64BIT */
 
+/*
+ * Called before incrementing preempt_count on {soft,}irq_enter
+ * and before decrementing preempt_count on {soft,}irq_exit.
+ */
 void account_system_vtime(struct task_struct *curr)
 {
         unsigned long flags;
+        s64 delta;
         int cpu;
-        u64 now, delta;
 
         if (!sched_clock_irqtime)
                 return;
@@ -1836,9 +1870,10 @@ void account_system_vtime(struct task_struct *curr)
         local_irq_save(flags);
 
         cpu = smp_processor_id();
-        now = sched_clock_cpu(cpu);
-        delta = now - per_cpu(irq_start_time, cpu);
-        per_cpu(irq_start_time, cpu) = now;
+        delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
+        __this_cpu_add(irq_start_time, delta);
+
+        irq_time_write_begin();
         /*
          * We do not account for softirq time from ksoftirqd here.
          * We want to continue accounting softirq time to ksoftirqd thread
@@ -1846,33 +1881,55 @@ void account_system_vtime(struct task_struct *curr)
          * that do not consume any time, but still wants to run.
          */
         if (hardirq_count())
-                per_cpu(cpu_hardirq_time, cpu) += delta;
+                __this_cpu_add(cpu_hardirq_time, delta);
         else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
-                per_cpu(cpu_softirq_time, cpu) += delta;
+                __this_cpu_add(cpu_softirq_time, delta);
 
+        irq_time_write_end();
         local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(account_system_vtime);
 
-static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
+static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
-        if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
-                u64 delta_irq = curr_irq_time - rq->prev_irq_time;
-                rq->prev_irq_time = curr_irq_time;
-                sched_rt_avg_update(rq, delta_irq);
-        }
+        s64 irq_delta;
+
+        irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+
+        /*
+         * Since irq_time is only updated on {soft,}irq_exit, we might run into
+         * this case when a previous update_rq_clock() happened inside a
+         * {soft,}irq region.
+         *
+         * When this happens, we stop ->clock_task and only update the
+         * prev_irq_time stamp to account for the part that fit, so that a next
+         * update will consume the rest. This ensures ->clock_task is
+         * monotonic.
+         *
+         * It does however cause some slight miss-attribution of {soft,}irq
+         * time, a more accurate solution would be to update the irq_time using
+         * the current rq->clock timestamp, except that would require using
+         * atomic ops.
+         */
+        if (irq_delta > delta)
+                irq_delta = delta;
+
+        rq->prev_irq_time += irq_delta;
+        delta -= irq_delta;
+        rq->clock_task += delta;
+
+        if (irq_delta && sched_feat(NONIRQ_POWER))
+                sched_rt_avg_update(rq, irq_delta);
 }
 
-#else
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
 
-static u64 irq_time_cpu(int cpu)
+static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
-        return 0;
+        rq->clock_task += delta;
 }
 
-static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
-
-#endif
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
 
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -2001,7 +2058,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
          * A queue event has occurred, and we're going to schedule. In
          * this case, we can save a useless back to back clock update.
          */
-        if (test_tsk_need_resched(rq->curr))
+        if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
                 rq->skip_clock_update = 1;
 }
 
@@ -2988,6 +3045,15 @@ static long calc_load_fold_active(struct rq *this_rq)
         return delta;
 }
 
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+        load *= exp;
+        load += active * (FIXED_1 - exp);
+        load += 1UL << (FSHIFT - 1);
+        return load >> FSHIFT;
+}
+
 #ifdef CONFIG_NO_HZ
 /*
  * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
@@ -3017,6 +3083,128 @@ static long calc_load_fold_idle(void)
 
         return delta;
 }
+
+/**
+ * fixed_power_int - compute: x^n, in O(log n) time
+ *
+ * @x: base of the power
+ * @frac_bits: fractional bits of @x
+ * @n: power to raise @x to.
+ *
+ * By exploiting the relation between the definition of the natural power
+ * function: x^n := x*x*...*x (x multiplied by itself for n times), and
+ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
+ * (where: n_i \elem {0, 1}, the binary vector representing n),
+ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
+ * of course trivially computable in O(log_2 n), the length of our binary
+ * vector.
+ */
+static unsigned long
+fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
+{
+        unsigned long result = 1UL << frac_bits;
+
+        if (n) for (;;) {
+                if (n & 1) {
+                        result *= x;
+                        result += 1UL << (frac_bits - 1);
+                        result >>= frac_bits;
+                }
+                n >>= 1;
+                if (!n)
+                        break;
+                x *= x;
+                x += 1UL << (frac_bits - 1);
+                x >>= frac_bits;
+        }
+
+        return result;
+}
+
+/*
+ * a1 = a0 * e + a * (1 - e)
+ *
+ * a2 = a1 * e + a * (1 - e)
+ *    = (a0 * e + a * (1 - e)) * e + a * (1 - e)
+ *    = a0 * e^2 + a * (1 - e) * (1 + e)
+ *
+ * a3 = a2 * e + a * (1 - e)
+ *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
+ *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
+ *
+ *  ...
+ *
+ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
+ *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
+ *    = a0 * e^n + a * (1 - e^n)
+ *
+ * [1] application of the geometric series:
+ *
+ *              n         1 - x^(n+1)
+ *     S_n := \Sum x^i = -------------
+ *             i=0          1 - x
+ */
+static unsigned long
+calc_load_n(unsigned long load, unsigned long exp,
+            unsigned long active, unsigned int n)
+{
+
+        return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
+}
+
+/*
+ * NO_HZ can leave us missing all per-cpu ticks calling
+ * calc_load_account_active(), but since an idle CPU folds its delta into
+ * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
+ * in the pending idle delta if our idle period crossed a load cycle boundary.
+ *
+ * Once we've updated the global active value, we need to apply the exponential
+ * weights adjusted to the number of cycles missed.
+ */
+static void calc_global_nohz(unsigned long ticks)
+{
+        long delta, active, n;
+
+        if (time_before(jiffies, calc_load_update))
+                return;
+
+        /*
+         * If we crossed a calc_load_update boundary, make sure to fold
+         * any pending idle changes, the respective CPUs might have
+         * missed the tick driven calc_load_account_active() update
+         * due to NO_HZ.
+         */
+        delta = calc_load_fold_idle();
+        if (delta)
+                atomic_long_add(delta, &calc_load_tasks);
+
+        /*
+         * If we were idle for multiple load cycles, apply them.
+         */
+        if (ticks >= LOAD_FREQ) {
+                n = ticks / LOAD_FREQ;
+
+                active = atomic_long_read(&calc_load_tasks);
+                active = active > 0 ? active * FIXED_1 : 0;
+
+                avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+                avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+                avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+
+                calc_load_update += n * LOAD_FREQ;
+        }
+
+        /*
+         * Its possible the remainder of the above division also crosses
+         * a LOAD_FREQ period, the regular check in calc_global_load()
+         * which comes after this will take care of that.
+         *
+         * Consider us being 11 ticks before a cycle completion, and us
+         * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
+         * age us 4 cycles, and the test in calc_global_load() will
+         * pick up the final one.
+         */
+}
 #else
 static void calc_load_account_idle(struct rq *this_rq)
 {
@@ -3026,6 +3214,10 @@ static inline long calc_load_fold_idle(void)
 {
         return 0;
 }
+
+static void calc_global_nohz(unsigned long ticks)
+{
+}
 #endif
 
 /**
@@ -3043,24 +3235,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
         loads[2] = (avenrun[2] + offset) << shift;
 }
 
-static unsigned long
-calc_load(unsigned long load, unsigned long exp, unsigned long active)
-{
-        load *= exp;
-        load += active * (FIXED_1 - exp);
-        return load >> FSHIFT;
-}
-
 /*
  * calc_load - update the avenrun load estimates 10 ticks after the
  * CPUs have updated calc_load_tasks.
  */
-void calc_global_load(void)
+void calc_global_load(unsigned long ticks)
 {
-        unsigned long upd = calc_load_update + 10;
         long active;
 
-        if (time_before(jiffies, upd))
+        calc_global_nohz(ticks);
+
+        if (time_before(jiffies, calc_load_update + 10))
                 return;
 
         active = atomic_long_read(&calc_load_tasks);
@@ -3714,7 +3899,6 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
 {
         if (prev->se.on_rq)
                 update_rq_clock(rq);
-        rq->skip_clock_update = 0;
         prev->sched_class->put_prev_task(rq, prev);
 }
 
@@ -3772,7 +3956,6 @@ need_resched_nonpreemptible:
         hrtick_clear(rq);
 
         raw_spin_lock_irq(&rq->lock);
-        clear_tsk_need_resched(prev);
 
         switch_count = &prev->nivcsw;
         if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@ -3804,6 +3987,8 @@ need_resched_nonpreemptible:
 
         put_prev_task(rq, prev);
         next = pick_next_task(rq);
+        clear_tsk_need_resched(prev);
+        rq->skip_clock_update = 0;
 
         if (likely(prev != next)) {
                 sched_info_switch(prev, next);
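
The calc_global_nohz()/calc_load_n() hunks above fold several missed LOAD_FREQ periods into the avenrun[] fixed-point averages in one step. For readers who want to see that math in isolation, here is a minimal user-space sketch, not part of the patch: calc_load(), fixed_power_int() and calc_load_n() are copied from the hunks above, the FSHIFT/FIXED_1/EXP_1 constants mirror include/linux/sched.h of that era, and the main() harness plus the decimal-printing expression are purely illustrative. It ages a 1-minute average across four idle cycles both ways, step by step and via calc_load_n(), which agree up to fixed-point rounding.

/* Illustration only: the scheduler's fixed-point load-average aging. */
#include <stdio.h>

#define FSHIFT  11                      /* bits of precision */
#define FIXED_1 (1UL << FSHIFT)         /* 1.0 in fixed-point */
#define EXP_1   1884                    /* 1/exp(5sec/1min) in fixed-point */

/* One LOAD_FREQ step: load = load*exp + active*(1 - exp), rounded. */
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        load += 1UL << (FSHIFT - 1);
        return load >> FSHIFT;
}

/* x^n in fixed-point, O(log n) multiplications, as in the hunk above. */
static unsigned long
fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
        unsigned long result = 1UL << frac_bits;

        if (n) for (;;) {
                if (n & 1) {
                        result *= x;
                        result += 1UL << (frac_bits - 1);
                        result >>= frac_bits;
                }
                n >>= 1;
                if (!n)
                        break;
                x *= x;
                x += 1UL << (frac_bits - 1);
                x >>= frac_bits;
        }

        return result;
}

/* n steps at once: load*exp^n + active*(1 - exp^n). */
static unsigned long
calc_load_n(unsigned long load, unsigned long exp,
            unsigned long active, unsigned int n)
{
        return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
}

int main(void)
{
        unsigned long load = 2 * FIXED_1;       /* 1-min average at 2.00 */
        unsigned long active = 0;               /* CPU goes idle */
        unsigned long step = load;
        unsigned int i, n = 4;                  /* four missed cycles */

        for (i = 0; i < n; i++)
                step = calc_load(step, EXP_1, active);

        /* Both prints come out around 1.43; tiny differences are rounding. */
        printf("four single steps: %lu.%02lu\n", step >> FSHIFT,
               ((step & (FIXED_1 - 1)) * 100) >> FSHIFT);
        step = calc_load_n(load, EXP_1, active, n);
        printf("calc_load_n(n=4) : %lu.%02lu\n", step >> FSHIFT,
               ((step & (FIXED_1 - 1)) * 100) >> FSHIFT);
        return 0;
}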