Diffstat (limited to 'kernel/sched.c')
 kernel/sched.c | 200 ++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 169 insertions(+), 31 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 8e2558c2ba67..2f28351892c9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -331,6 +331,13 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
  */
 static DEFINE_SPINLOCK(task_group_lock);
 
+#ifdef CONFIG_SMP
+static int root_task_group_empty(void)
+{
+	return list_empty(&root_task_group.children);
+}
+#endif
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 #ifdef CONFIG_USER_SCHED
 # define INIT_TASK_GROUP_LOAD	(2*NICE_0_LOAD)
@@ -391,6 +398,13 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
 
 #else
 
+#ifdef CONFIG_SMP
+static int root_task_group_empty(void)
+{
+	return 1;
+}
+#endif
+
 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
 static inline struct task_group *task_group(struct task_struct *p)
 {
@@ -467,11 +481,17 @@ struct rt_rq {
 	struct rt_prio_array active;
 	unsigned long rt_nr_running;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-	int highest_prio; /* highest queued rt task prio */
+	struct {
+		int curr; /* highest queued rt task prio */
+#ifdef CONFIG_SMP
+		int next; /* next highest */
+#endif
+	} highest_prio;
 #endif
 #ifdef CONFIG_SMP
 	unsigned long rt_nr_migratory;
 	int overloaded;
+	struct plist_head pushable_tasks;
 #endif
 	int rt_throttled;
 	u64 rt_time;
@@ -549,7 +569,6 @@ struct rq {
 	unsigned long nr_running;
 	#define CPU_LOAD_IDX_MAX 5
 	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
-	unsigned char idle_at_tick;
 #ifdef CONFIG_NO_HZ
 	unsigned long last_tick_seen;
 	unsigned char in_nohz_recently;
@@ -590,6 +609,7 @@ struct rq {
 	struct root_domain *rd;
 	struct sched_domain *sd;
 
+	unsigned char idle_at_tick;
 	/* For active balancing */
 	int active_balance;
 	int push_cpu;
@@ -1183,10 +1203,10 @@ static void resched_task(struct task_struct *p)
 
 	assert_spin_locked(&task_rq(p)->lock);
 
-	if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
+	if (test_tsk_need_resched(p))
 		return;
 
-	set_tsk_thread_flag(p, TIF_NEED_RESCHED);
+	set_tsk_need_resched(p);
 
 	cpu = task_cpu(p);
 	if (cpu == smp_processor_id())
@@ -1242,7 +1262,7 @@ void wake_up_idle_cpu(int cpu)
 	 * lockless. The worst case is that the other CPU runs the
 	 * idle task through an additional NOOP schedule()
 	 */
-	set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
+	set_tsk_need_resched(rq->idle);
 
 	/* NEED_RESCHED must be visible before we test polling */
 	smp_mb();
@@ -1610,21 +1630,42 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 
 #endif
 
+#ifdef CONFIG_PREEMPT
+
 /*
- * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ * fair double_lock_balance: Safely acquires both rq->locks in a fair
+ * way at the expense of forcing extra atomic operations in all
+ * invocations. This assures that the double_lock is acquired using the
+ * same underlying policy as the spinlock_t on this architecture, which
+ * reduces latency compared to the unfair variant below. However, it
+ * also adds more overhead and therefore may reduce throughput.
  */
-static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
+	__releases(this_rq->lock)
+	__acquires(busiest->lock)
+	__acquires(this_rq->lock)
+{
+	spin_unlock(&this_rq->lock);
+	double_rq_lock(this_rq, busiest);
+
+	return 1;
+}
+
+#else
+/*
+ * Unfair double_lock_balance: Optimizes throughput at the expense of
+ * latency by eliminating extra atomic operations when the locks are
+ * already in proper order on entry. This favors lower cpu-ids and will
+ * grant the double lock to lower cpus over higher ids under contention,
+ * regardless of entry order into the function.
+ */
+static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
 	__releases(this_rq->lock)
 	__acquires(busiest->lock)
 	__acquires(this_rq->lock)
 {
 	int ret = 0;
 
-	if (unlikely(!irqs_disabled())) {
-		/* printk() doesn't work good under rq->lock */
-		spin_unlock(&this_rq->lock);
-		BUG_ON(1);
-	}
 	if (unlikely(!spin_trylock(&busiest->lock))) {
 		if (busiest < this_rq) {
 			spin_unlock(&this_rq->lock);
@@ -1637,6 +1678,22 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
 	return ret;
 }
 
+#endif /* CONFIG_PREEMPT */
+
+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+{
+	if (unlikely(!irqs_disabled())) {
+		/* printk() doesn't work good under rq->lock */
+		spin_unlock(&this_rq->lock);
+		BUG_ON(1);
+	}
+
+	return _double_lock_balance(this_rq, busiest);
+}
+
 static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
 	__releases(busiest->lock)
 {
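[Illustration, not part of the patch] Both variants above ultimately avoid deadlock the same way: the pair of runqueue locks is completed in one global order. The unfair path compares the runqueue pointers directly (and, since runqueues are per-cpu, lower addresses track lower cpu ids), which is what the comment about favoring lower cpu-ids refers to. A minimal user-space sketch of that ordering rule, using a hypothetical lock_pair_in_order() helper in place of the kernel's locking primitives:

#include <pthread.h>

/* Sketch only: always take the lower-addressed lock first, so two
 * threads contending for the same pair can never wait on each other. */
static void lock_pair_in_order(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a < b) {
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);
		pthread_mutex_lock(a);
	}
}

static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	pthread_mutex_unlock(b);
	pthread_mutex_unlock(a);
}

The fair variant pays for this order unconditionally (drop this_rq->lock, then take both in order); the unfair variant first trylocks and only falls back to the ordered path on contention.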
@@ -1705,6 +1762,9 @@ static void update_avg(u64 *avg, u64 sample)
 
 static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
 {
+	if (wakeup)
+		p->se.start_runtime = p->se.sum_exec_runtime;
+
 	sched_info_queued(p);
 	p->sched_class->enqueue_task(rq, p, wakeup);
 	p->se.on_rq = 1;
@@ -1712,10 +1772,15 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
 
 static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
 {
-	if (sleep && p->se.last_wakeup) {
-		update_avg(&p->se.avg_overlap,
-			p->se.sum_exec_runtime - p->se.last_wakeup);
-		p->se.last_wakeup = 0;
+	if (sleep) {
+		if (p->se.last_wakeup) {
+			update_avg(&p->se.avg_overlap,
+				p->se.sum_exec_runtime - p->se.last_wakeup);
+			p->se.last_wakeup = 0;
+		} else {
+			update_avg(&p->se.avg_wakeup,
+				sysctl_sched_wakeup_granularity);
+		}
 	}
 
 	sched_info_dequeued(p);
@@ -2267,7 +2332,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 		sync = 0;
 
 #ifdef CONFIG_SMP
-	if (sched_feat(LB_WAKEUP_UPDATE)) {
+	if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
 		struct sched_domain *sd;
 
 		this_cpu = raw_smp_processor_id();
@@ -2345,6 +2410,22 @@ out_activate:
 	activate_task(rq, p, 1);
 	success = 1;
 
+	/*
+	 * Only attribute actual wakeups done by this task.
+	 */
+	if (!in_interrupt()) {
+		struct sched_entity *se = &current->se;
+		u64 sample = se->sum_exec_runtime;
+
+		if (se->last_wakeup)
+			sample -= se->last_wakeup;
+		else
+			sample -= se->start_runtime;
+		update_avg(&se->avg_wakeup, sample);
+
+		se->last_wakeup = se->sum_exec_runtime;
+	}
+
 out_running:
 	trace_sched_wakeup(rq, p, success);
 	check_preempt_curr(rq, p, sync);
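[Illustration, not part of the patch] The avg_wakeup samples taken above, and the avg_overlap/avg_wakeup updates in dequeue_task(), all go through update_avg(), which sched.c implements as a simple eighth-weight running average (an assumption worth checking against the tree this patch applies to). A self-contained user-space model of that accumulation; the 5 ms starting value is a placeholder for sysctl_sched_wakeup_granularity:

#include <stdio.h>
#include <stdint.h>

/* Assumed to match sched.c's update_avg(): move 1/8 of the way from the
 * old average toward the new sample. */
static void update_avg(uint64_t *avg, uint64_t sample)
{
	int64_t diff = (int64_t)(sample - *avg);

	*avg += diff >> 3;
}

int main(void)
{
	/* avg_wakeup starts out at sysctl_sched_wakeup_granularity (see the
	 * __sched_fork() hunk below); 5 ms is an assumed placeholder. */
	uint64_t avg_wakeup = 5000000;
	uint64_t samples[] = { 1000000, 1200000, 800000 };	/* ns of waker runtime between wakeups */
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		update_avg(&avg_wakeup, samples[i]);
		printf("avg_wakeup = %llu ns\n", (unsigned long long)avg_wakeup);
	}
	return 0;
}

A task that wakes others frequently thus pulls its avg_wakeup down toward the short runtime it accumulates between wakeups, while a task that goes to sleep without waking anyone is pushed back toward the granularity default in dequeue_task().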
@@ -2355,8 +2436,6 @@ out_running:
 		p->sched_class->task_wake_up(rq, p);
 #endif
 out:
-	current->se.last_wakeup = current->se.sum_exec_runtime;
-
 	task_rq_unlock(rq, &flags);
 
 	return success;
@@ -2386,6 +2465,8 @@ static void __sched_fork(struct task_struct *p)
 	p->se.prev_sum_exec_runtime = 0;
 	p->se.last_wakeup = 0;
 	p->se.avg_overlap = 0;
+	p->se.start_runtime = 0;
+	p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
 
 #ifdef CONFIG_SCHEDSTATS
 	p->se.wait_start = 0;
@@ -2448,6 +2529,8 @@ void sched_fork(struct task_struct *p, int clone_flags)
 	/* Want to start with kernel preemption disabled. */
 	task_thread_info(p)->preempt_count = 1;
 #endif
+	plist_node_init(&p->pushable_tasks, MAX_PRIO);
+
 	put_cpu();
 }
 
@@ -2588,6 +2671,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 {
 	struct mm_struct *mm = rq->prev_mm;
 	long prev_state;
+#ifdef CONFIG_SMP
+	int post_schedule = 0;
+
+	if (current->sched_class->needs_post_schedule)
+		post_schedule = current->sched_class->needs_post_schedule(rq);
+#endif
 
 	rq->prev_mm = NULL;
 
@@ -2606,7 +2695,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	finish_arch_switch(prev);
 	finish_lock_switch(rq, prev);
 #ifdef CONFIG_SMP
-	if (current->sched_class->post_schedule)
+	if (post_schedule)
 		current->sched_class->post_schedule(rq);
 #endif
 
@@ -2987,6 +3076,16 @@ next:
 	pulled++;
 	rem_load_move -= p->se.load.weight;
 
+#ifdef CONFIG_PREEMPT
+	/*
+	 * NEWIDLE balancing is a source of latency, so preemptible kernels
+	 * will stop after the first task is pulled to minimize the critical
+	 * section.
+	 */
+	if (idle == CPU_NEWLY_IDLE)
+		goto out;
+#endif
+
 	/*
 	 * We only want to steal up to the prescribed amount of weighted load.
 	 */
@@ -3033,9 +3132,15 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 				sd, idle, all_pinned, &this_best_prio);
 		class = class->next;
 
+#ifdef CONFIG_PREEMPT
+		/*
+		 * NEWIDLE balancing is a source of latency, so preemptible
+		 * kernels will stop after the first task is pulled to minimize
+		 * the critical section.
+		 */
 		if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
 			break;
-
+#endif
 	} while (class && max_load_move > total_load_moved);
 
 	return total_load_moved > 0;
@@ -4057,6 +4162,11 @@ static void run_rebalance_domains(struct softirq_action *h)
 #endif
 }
 
+static inline int on_null_domain(int cpu)
+{
+	return !rcu_dereference(cpu_rq(cpu)->sd);
+}
+
 /*
  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
  *
@@ -4114,7 +4224,9 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
 	    cpumask_test_cpu(cpu, nohz.cpu_mask))
 		return;
 #endif
-	if (time_after_eq(jiffies, rq->next_balance))
+	/* Don't need to rebalance while attached to NULL domain */
+	if (time_after_eq(jiffies, rq->next_balance) &&
+	    likely(!on_null_domain(cpu)))
 		raise_softirq(SCHED_SOFTIRQ);
 }
 
@@ -4508,11 +4620,33 @@ static inline void schedule_debug(struct task_struct *prev)
 #endif
 }
 
+static void put_prev_task(struct rq *rq, struct task_struct *prev)
+{
+	if (prev->state == TASK_RUNNING) {
+		u64 runtime = prev->se.sum_exec_runtime;
+
+		runtime -= prev->se.prev_sum_exec_runtime;
+		runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
+
+		/*
+		 * In order to avoid avg_overlap growing stale when we are
+		 * indeed overlapping and hence not getting put to sleep, grow
+		 * the avg_overlap on preemption.
+		 *
+		 * We use the average preemption runtime because that
+		 * correlates to the amount of cache footprint a task can
+		 * build up.
+		 */
+		update_avg(&prev->se.avg_overlap, runtime);
+	}
+	prev->sched_class->put_prev_task(rq, prev);
+}
+
 /*
  * Pick up the highest-prio task:
  */
 static inline struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev)
+pick_next_task(struct rq *rq)
 {
 	const struct sched_class *class;
 	struct task_struct *p;
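[Illustration, not part of the patch] A worked example of the clamp above, assuming the default sysctl_sched_migration_cost of 500000 ns and the same eighth-weight update_avg() as in the earlier sketch: a task preempted after running 10 ms contributes only min(10 ms, 2 * 0.5 ms) = 1 ms to avg_overlap, so a single long stretch on the CPU cannot swamp the running average.

#include <stdio.h>
#include <stdint.h>

#define MIGRATION_COST_NS	500000ULL	/* assumed default of sysctl_sched_migration_cost */

/* Same assumed eighth-weight running average as in the earlier sketch. */
static void update_avg(uint64_t *avg, uint64_t sample)
{
	*avg += (int64_t)(sample - *avg) >> 3;
}

int main(void)
{
	uint64_t avg_overlap = 0;
	uint64_t runtime = 10000000ULL;	/* ran 10 ms before being preempted */

	if (runtime > 2 * MIGRATION_COST_NS)
		runtime = 2 * MIGRATION_COST_NS;	/* clamped to 1 ms */

	update_avg(&avg_overlap, runtime);
	printf("avg_overlap after one preemption: %llu ns\n",
	       (unsigned long long)avg_overlap);	/* prints 125000 */
	return 0;
}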
@@ -4586,8 +4720,8 @@ need_resched_nonpreemptible:
 	if (unlikely(!rq->nr_running))
 		idle_balance(cpu, rq);
 
-	prev->sched_class->put_prev_task(rq, prev);
-	next = pick_next_task(rq, prev);
+	put_prev_task(rq, prev);
+	next = pick_next_task(rq);
 
 	if (likely(prev != next)) {
 		sched_info_switch(prev, next);
@@ -4642,7 +4776,7 @@ asmlinkage void __sched preempt_schedule(void)
 		 * between schedule and now.
 		 */
 		barrier();
-	} while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
+	} while (need_resched());
 }
 EXPORT_SYMBOL(preempt_schedule);
 
@@ -4671,7 +4805,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
 		 * between schedule and now.
 		 */
 		barrier();
-	} while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
+	} while (need_resched());
 }
 
 #endif /* CONFIG_PREEMPT */
@@ -5145,7 +5279,7 @@ SYSCALL_DEFINE1(nice, int, increment)
 	if (increment > 40)
 		increment = 40;
 
-	nice = PRIO_TO_NICE(current->static_prio) + increment;
+	nice = TASK_NICE(current) + increment;
 	if (nice < -20)
 		nice = -20;
 	if (nice > 19)
@@ -6423,7 +6557,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
 		if (!rq->nr_running)
 			break;
 		update_rq_clock(rq);
-		next = pick_next_task(rq, rq->curr);
+		next = pick_next_task(rq);
 		if (!next)
 			break;
 		next->sched_class->put_prev_task(rq, next);
@@ -8218,11 +8352,15 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 	__set_bit(MAX_RT_PRIO, array->bitmap);
 
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-	rt_rq->highest_prio = MAX_RT_PRIO;
+	rt_rq->highest_prio.curr = MAX_RT_PRIO;
+#ifdef CONFIG_SMP
+	rt_rq->highest_prio.next = MAX_RT_PRIO;
+#endif
 #endif
 #ifdef CONFIG_SMP
 	rt_rq->rt_nr_migratory = 0;
 	rt_rq->overloaded = 0;
+	plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
 #endif
 
 	rt_rq->rt_time = 0;
@@ -9598,7 +9736,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 	struct cpuacct *ca;
 	int cpu;
 
-	if (!cpuacct_subsys.active)
+	if (unlikely(!cpuacct_subsys.active))
 		return;
 
 	cpu = task_cpu(tsk);