1 files changed, 195 insertions, 78 deletions
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 37087a7fac22..42ac3c9f66f6 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -21,6 +21,7 @@
 */
 #include <linux/latencytop.h>
+#include <linux/sched.h>
 /*
 * Targeted preemption latency for CPU-bound tasks:
@@ -35,12 +36,26 @@
 *  run vmstat and monitor the context-switches (cs) field)
 */
 unsigned int sysctl_sched_latency = 5000000ULL;
+unsigned int normalized_sysctl_sched_latency = 5000000ULL;
+/*
+ * The initial- and re-scaling of tunables is configurable
+ * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
+ *
+ * Options are:
+ * SCHED_TUNABLESCALING_NONE - unscaled, always *1
+ * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
+ * SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
+ */
+enum sched_tunable_scaling sysctl_sched_tunable_scaling
+        = SCHED_TUNABLESCALING_LOG;
 /*
 * Minimal preemption granularity for CPU-bound tasks:
 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
 unsigned int sysctl_sched_min_granularity = 1000000ULL;
+unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL;
 /*
 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
@@ -70,6 +85,7 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
 * have immediate wakeup/sleep latencies.
 */
 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
+unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
@@ -383,11 +399,12 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 */
 #ifdef CONFIG_SCHED_DEBUG
-int sched_nr_latency_handler(struct ctl_table *table, int write,
+int sched_proc_update_handler(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp,
                loff_t *ppos)
 {
        int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+        int factor = get_update_sysctl_factor();
        if (ret || !write)
                return ret;
@@ -395,6 +412,14 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
        sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
                                        sysctl_sched_min_granularity);
+#define WRT_SYSCTL(name) \
+        (normalized_sysctl_##name = sysctl_##name / (factor))
+        WRT_SYSCTL(sched_min_granularity);
+        WRT_SYSCTL(sched_latency);
+        WRT_SYSCTL(sched_wakeup_granularity);
+        WRT_SYSCTL(sched_shares_ratelimit);
+#undef WRT_SYSCTL
        return 0;
 }
 #endif
@@ -485,6 +510,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
        curr->sum_exec_runtime += delta_exec;
        schedstat_add(cfs_rq, exec_clock, delta_exec);
        delta_exec_weighted = calc_delta_fair(delta_exec, curr);
        curr->vruntime += delta_exec_weighted;
        update_min_vruntime(cfs_rq);
 }
@@ -740,16 +766,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
        se->vruntime = vruntime;
 }
+#define ENQUEUE_WAKEUP  1
+#define ENQUEUE_MIGRATE 2
 static void
-enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
+enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
        /*
+         * Update the normalized vruntime before updating min_vruntime
+         * through calling update_curr().
+         */
+        if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE))
+                se->vruntime += cfs_rq->min_vruntime;
+        /*
         * Update run-time statistics of the 'current'.
         */
        update_curr(cfs_rq);
        account_entity_enqueue(cfs_rq, se);
-        if (wakeup) {
+        if (flags & ENQUEUE_WAKEUP) {
                place_entity(cfs_rq, se, 0);
                enqueue_sleeper(cfs_rq, se);
        }
@@ -803,6 +839,14 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
                __dequeue_entity(cfs_rq, se);
        account_entity_dequeue(cfs_rq, se);
        update_min_vruntime(cfs_rq);
+        /*
+         * Normalize the entity after updating the min_vruntime because the
+         * update can refer to the ->curr item and we need to reflect this
+         * movement in our normalized position.
+         */
+        if (!sleep)
+                se->vruntime -= cfs_rq->min_vruntime;
 }
 /*
@@ -1013,13 +1057,19 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
 {
        struct cfs_rq *cfs_rq;
        struct sched_entity *se = &p->se;
+        int flags = 0;
+        if (wakeup)
+                flags |= ENQUEUE_WAKEUP;
+        if (p->state == TASK_WAKING)
+                flags |= ENQUEUE_MIGRATE;
        for_each_sched_entity(se) {
                if (se->on_rq)
                        break;
                cfs_rq = cfs_rq_of(se);
-                enqueue_entity(cfs_rq, se, wakeup);
+                enqueue_entity(cfs_rq, se, flags);
-                wakeup = 1;
+                flags = ENQUEUE_WAKEUP;
        }
        hrtick_update(rq);
@@ -1095,6 +1145,14 @@ static void yield_task_fair(struct rq *rq)
 #ifdef CONFIG_SMP
+static void task_waking_fair(struct rq *rq, struct task_struct *p)
+{
+        struct sched_entity *se = &p->se;
+        struct cfs_rq *cfs_rq = cfs_rq_of(se);
+        se->vruntime -= cfs_rq->min_vruntime;
+}
 #ifdef CONFIG_FAIR_GROUP_SCHED
 /*
 * effective_load() calculates the load change as seen from the root_task_group
@@ -1345,6 +1403,37 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 }
 /*
+ * Try and locate an idle CPU in the sched_domain.
+ */
+static int
+select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
+{
+        int cpu = smp_processor_id();
+        int prev_cpu = task_cpu(p);
+        int i;
+        /*
+         * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE
+         * test in select_task_rq_fair) and the prev_cpu is idle then that's
+         * always a better target than the current cpu.
+         */
+        if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running)
+                return prev_cpu;
+        /*
+         * Otherwise, iterate the domain and find an eligible idle cpu.
+         */
+        for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
+                if (!cpu_rq(i)->cfs.nr_running) {
+                        target = i;
+                        break;
+                }
+        }
+        return target;
+}
+/*
 * sched_balance_self: balance the current task (running on cpu) in domains
 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
 * SD_BALANCE_EXEC.
@@ -1372,8 +1461,10 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
                new_cpu = prev_cpu;
        }
-        rcu_read_lock();
        for_each_domain(cpu, tmp) {
+                if (!(tmp->flags & SD_LOAD_BALANCE))
+                        continue;
                /*
                 * If power savings logic is enabled for a domain, see if we
                 * are not overloaded, if so, don't balance wider.
@@ -1398,11 +1489,35 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
                                want_sd = 0;
                }
-                if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
+                /*
-                    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+                 * While iterating the domains looking for a spanning
+                 * WAKE_AFFINE domain, adjust the affine target to any idle cpu
+                 * in cache sharing domains along the way.
+                 */
+                if (want_affine) {
+                        int target = -1;
+                        /*
+                         * If both cpu and prev_cpu are part of this domain,
+                         * cpu is a valid SD_WAKE_AFFINE target.
+                         */
+                        if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
+                                target = cpu;
-                        affine_sd = tmp;
+                        /*
-                        want_affine = 0;
+                         * If there's an idle sibling in this domain, make that
+                         * the wake_affine target instead of the current cpu.
+                         */
+                        if (tmp->flags & SD_PREFER_SIBLING)
+                                target = select_idle_sibling(p, tmp, target);
+                        if (target >= 0) {
+                                if (tmp->flags & SD_WAKE_AFFINE) {
+                                        affine_sd = tmp;
+                                        want_affine = 0;
+                                }
+                                cpu = target;
+                        }
                }
                if (!want_sd && !want_affine)
@@ -1429,10 +1544,8 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
                        update_shares(tmp);
        }
-        if (affine_sd && wake_affine(affine_sd, p, sync)) {
+        if (affine_sd && wake_affine(affine_sd, p, sync))
-                new_cpu = cpu;
+                return cpu;
-                goto out;
-        }
        while (sd) {
                int load_idx = sd->forkexec_idx;
@@ -1473,8 +1586,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
                /* while loop will break here if sd == NULL */
        }
-out:
-        rcu_read_unlock();
        return new_cpu;
 }
 #endif /* CONFIG_SMP */
@@ -1596,12 +1707,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
        int sync = wake_flags & WF_SYNC;
        int scale = cfs_rq->nr_running >= sched_nr_latency;
-        update_curr(cfs_rq);
+        if (unlikely(rt_prio(p->prio)))
+                goto preempt;
-        if (unlikely(rt_prio(p->prio))) {
-                resched_task(curr);
-                return;
-        }
        if (unlikely(p->sched_class != &fair_sched_class))
                return;
@@ -1627,50 +1734,44 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
                return;
        /* Idle tasks are by definition preempted by everybody. */
-        if (unlikely(curr->policy == SCHED_IDLE)) {
+        if (unlikely(curr->policy == SCHED_IDLE))
-                resched_task(curr);
+                goto preempt;
-                return;
-        }
-        if ((sched_feat(WAKEUP_SYNC) && sync) ||
+        if (sched_feat(WAKEUP_SYNC) && sync)
-            (sched_feat(WAKEUP_OVERLAP) &&
+                goto preempt;
-             (se->avg_overlap < sysctl_sched_migration_cost &&
-              pse->avg_overlap < sysctl_sched_migration_cost))) {
-                resched_task(curr);
-                return;
-        }
-        if (sched_feat(WAKEUP_RUNNING)) {
+        if (sched_feat(WAKEUP_OVERLAP) &&
-                if (pse->avg_running < se->avg_running) {
+                        se->avg_overlap < sysctl_sched_migration_cost &&
-                        set_next_buddy(pse);
+                        pse->avg_overlap < sysctl_sched_migration_cost)
-                        resched_task(curr);
+                goto preempt;
-                        return;
-                }
-        }
        if (!sched_feat(WAKEUP_PREEMPT))
                return;
+        update_curr(cfs_rq);
        find_matching_se(&se, &pse);
        BUG_ON(!pse);
+        if (wakeup_preempt_entity(se, pse) == 1)
+                goto preempt;
-        if (wakeup_preempt_entity(se, pse) == 1) {
+        return;
-                resched_task(curr);
-                /*
+preempt:
-                 * Only set the backward buddy when the current task is still
+        resched_task(curr);
-                 * on the rq. This can happen when a wakeup gets interleaved
+        /*
-                 * with schedule on the ->pre_schedule() or idle_balance()
+         * Only set the backward buddy when the current task is still
-                 * point, either of which can * drop the rq lock.
+         * on the rq. This can happen when a wakeup gets interleaved
-                 *
+         * with schedule on the ->pre_schedule() or idle_balance()
-                 * Also, during early boot the idle thread is in the fair class,
+         * point, either of which can drop the rq lock.
-                 * for obvious reasons its a bad idea to schedule back to it.
+         *
-                 */
+         * Also, during early boot the idle thread is in the fair class,
-                if (unlikely(!se->on_rq || curr == rq->idle))
+         * for obvious reasons it's a bad idea to schedule back to it.
-                        return;
+         */
-                if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
+        if (unlikely(!se->on_rq || curr == rq->idle))
-                        set_last_buddy(se);
+                return;
-        }
+        if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
+                set_last_buddy(se);
 }
 static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1679,7 +1780,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
        struct cfs_rq *cfs_rq = &rq->cfs;
        struct sched_entity *se;
-        if (unlikely(!cfs_rq->nr_running))
+        if (!cfs_rq->nr_running)
                return NULL;
        do {
@@ -1850,6 +1951,17 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
        return 0;
 }
+static void rq_online_fair(struct rq *rq)
+{
+        update_sysctl();
+}
+static void rq_offline_fair(struct rq *rq)
+{
+        update_sysctl();
+}
 #endif /* CONFIG_SMP */
 /*
@@ -1867,28 +1979,30 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 }
 /*
- * Share the fairness runtime between parent and child, thus the
+ * called on fork with the child task as argument from the parent's context
- * total amount of pressure for CPU stays equal - new tasks
+ *  - child not yet on the tasklist
- * get a chance to run but frequent forkers are not allowed to
+ *  - preemption disabled
- * monopolize the CPU. Note: the parent runqueue is locked,
- * the child is not running yet.
 */
-static void task_new_fair(struct rq *rq, struct task_struct *p)
+static void task_fork_fair(struct task_struct *p)
 {
-        struct cfs_rq *cfs_rq = task_cfs_rq(p);
+        struct cfs_rq *cfs_rq = task_cfs_rq(current);
        struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
        int this_cpu = smp_processor_id();
+        struct rq *rq = this_rq();
+        unsigned long flags;
-        sched_info_queued(p);
+        raw_spin_lock_irqsave(&rq->lock, flags);
+        if (unlikely(task_cpu(p) != this_cpu))
+                __set_task_cpu(p, this_cpu);
        update_curr(cfs_rq);
        if (curr)
                se->vruntime = curr->vruntime;
        place_entity(cfs_rq, se, 1);
-        /* 'curr' will be NULL if the child belongs to a different group */
+        if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
-        if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
-                        curr && entity_before(curr, se)) {
                /*
                 * Upon rescheduling, sched_class::put_prev_task() will place
                 * 'current' within the tree based on its new key value.
@@ -1897,7 +2011,9 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
                resched_task(rq->curr);
        }
-        enqueue_task_fair(rq, p, 0);
+        se->vruntime -= cfs_rq->min_vruntime;
+        raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 /*
@@ -1950,30 +2066,27 @@ static void set_curr_task_fair(struct rq *rq)
 }
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void moved_group_fair(struct task_struct *p)
+static void moved_group_fair(struct task_struct *p, int on_rq)
 {
        struct cfs_rq *cfs_rq = task_cfs_rq(p);
        update_curr(cfs_rq);
-        place_entity(cfs_rq, &p->se, 1);
+        if (!on_rq)
+                place_entity(cfs_rq, &p->se, 1);
 }
 #endif
-unsigned int get_rr_interval_fair(struct task_struct *task)
+unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
 {
        struct sched_entity *se = &task->se;
-        unsigned long flags;
-        struct rq *rq;
        unsigned int rr_interval = 0;
        /*
         * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
         * idle runqueue:
         */
-        rq = task_rq_lock(task, &flags);
        if (rq->cfs.load.weight)
                rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
-        task_rq_unlock(rq, &flags);
        return rr_interval;
 }
@@ -1997,11 +2110,15 @@ static const struct sched_class fair_sched_class = {
        .load_balance           = load_balance_fair,
        .move_one_task          = move_one_task_fair,
+        .rq_online              = rq_online_fair,
+        .rq_offline             = rq_offline_fair,
+        .task_waking            = task_waking_fair,
 #endif
        .set_curr_task          = set_curr_task_fair,
        .task_tick              = task_tick_fair,
-        .task_new               = task_new_fair,
+        .task_fork              = task_fork_fair,
        .prio_changed           = prio_changed_fair,
        .switched_to            = switched_to_fair,

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 37087a7fac22..42ac3c9f66f6 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c
@@ -21,6 +21,7 @@
21	*/	21	*/
22		22
23	#include <linux/latencytop.h>	23	#include <linux/latencytop.h>
		24	#include <linux/sched.h>
24		25
25	/*	26	/*
26	* Targeted preemption latency for CPU-bound tasks:	27	* Targeted preemption latency for CPU-bound tasks:
@@ -35,12 +36,26 @@
35	* run vmstat and monitor the context-switches (cs) field)	36	* run vmstat and monitor the context-switches (cs) field)
36	*/	37	*/
37	unsigned int sysctl_sched_latency = 5000000ULL;	38	unsigned int sysctl_sched_latency = 5000000ULL;
		39	unsigned int normalized_sysctl_sched_latency = 5000000ULL;
		40
		41	/*
		42	* The initial- and re-scaling of tunables is configurable
		43	* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
		44	*
		45	* Options are:
		46	* SCHED_TUNABLESCALING_NONE - unscaled, always *1
		47	* SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
		48	* SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
		49	*/
		50	enum sched_tunable_scaling sysctl_sched_tunable_scaling
		51	= SCHED_TUNABLESCALING_LOG;
38		52
39	/*	53	/*
40	* Minimal preemption granularity for CPU-bound tasks:	54	* Minimal preemption granularity for CPU-bound tasks:
41	* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)	55	* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
42	*/	56	*/
43	unsigned int sysctl_sched_min_granularity = 1000000ULL;	57	unsigned int sysctl_sched_min_granularity = 1000000ULL;
		58	unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL;
44		59
45	/*	60	/*
46	* is kept at sysctl_sched_latency / sysctl_sched_min_granularity	61	* is kept at sysctl_sched_latency / sysctl_sched_min_granularity
@@ -70,6 +85,7 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
70	* have immediate wakeup/sleep latencies.	85	* have immediate wakeup/sleep latencies.
71	*/	86	*/
72	unsigned int sysctl_sched_wakeup_granularity = 1000000UL;	87	unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
		88	unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
73		89
74	const_debug unsigned int sysctl_sched_migration_cost = 500000UL;	90	const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
75		91
@@ -383,11 +399,12 @@ static struct sched_entity __pick_last_entity(struct cfs_rq cfs_rq)
383	*/	399	*/
384		400
385	#ifdef CONFIG_SCHED_DEBUG	401	#ifdef CONFIG_SCHED_DEBUG
386	int sched_nr_latency_handler(struct ctl_table *table, int write,	402	int sched_proc_update_handler(struct ctl_table *table, int write,
387	void __user buffer, size_t lenp,	403	void __user buffer, size_t lenp,
388	loff_t *ppos)	404	loff_t *ppos)
389	{	405	{
390	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);	406	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
		407	int factor = get_update_sysctl_factor();
391		408
392	if (ret \|\| !write)	409	if (ret \|\| !write)
393	return ret;	410	return ret;
@@ -395,6 +412,14 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
395	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,	412	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
396	sysctl_sched_min_granularity);	413	sysctl_sched_min_granularity);
397		414
		415	#define WRT_SYSCTL(name) \
		416	(normalized_sysctl_##name = sysctl_##name / (factor))
		417	WRT_SYSCTL(sched_min_granularity);
		418	WRT_SYSCTL(sched_latency);
		419	WRT_SYSCTL(sched_wakeup_granularity);
		420	WRT_SYSCTL(sched_shares_ratelimit);
		421	#undef WRT_SYSCTL
		422
398	return 0;	423	return 0;
399	}	424	}
400	#endif	425	#endif
@@ -485,6 +510,7 @@ __update_curr(struct cfs_rq cfs_rq, struct sched_entity curr,
485	curr->sum_exec_runtime += delta_exec;	510	curr->sum_exec_runtime += delta_exec;
486	schedstat_add(cfs_rq, exec_clock, delta_exec);	511	schedstat_add(cfs_rq, exec_clock, delta_exec);
487	delta_exec_weighted = calc_delta_fair(delta_exec, curr);	512	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
		513
488	curr->vruntime += delta_exec_weighted;	514	curr->vruntime += delta_exec_weighted;
489	update_min_vruntime(cfs_rq);	515	update_min_vruntime(cfs_rq);
490	}	516	}
@@ -740,16 +766,26 @@ place_entity(struct cfs_rq cfs_rq, struct sched_entity se, int initial)
740	se->vruntime = vruntime;	766	se->vruntime = vruntime;
741	}	767	}
742		768
		769	#define ENQUEUE_WAKEUP 1
		770	#define ENQUEUE_MIGRATE 2
		771
743	static void	772	static void
744	enqueue_entity(struct cfs_rq cfs_rq, struct sched_entity se, int wakeup)	773	enqueue_entity(struct cfs_rq cfs_rq, struct sched_entity se, int flags)
745	{	774	{
746	/*	775	/*
		776	* Update the normalized vruntime before updating min_vruntime
		777	* through calling update_curr().
		778	*/
		779	if (!(flags & ENQUEUE_WAKEUP) \|\| (flags & ENQUEUE_MIGRATE))
		780	se->vruntime += cfs_rq->min_vruntime;
		781
		782	/*
747	* Update run-time statistics of the 'current'.	783	* Update run-time statistics of the 'current'.
748	*/	784	*/
749	update_curr(cfs_rq);	785	update_curr(cfs_rq);
750	account_entity_enqueue(cfs_rq, se);	786	account_entity_enqueue(cfs_rq, se);
751		787
752	if (wakeup) {	788	if (flags & ENQUEUE_WAKEUP) {
753	place_entity(cfs_rq, se, 0);	789	place_entity(cfs_rq, se, 0);
754	enqueue_sleeper(cfs_rq, se);	790	enqueue_sleeper(cfs_rq, se);
755	}	791	}
@@ -803,6 +839,14 @@ dequeue_entity(struct cfs_rq cfs_rq, struct sched_entity se, int sleep)
803	__dequeue_entity(cfs_rq, se);	839	__dequeue_entity(cfs_rq, se);
804	account_entity_dequeue(cfs_rq, se);	840	account_entity_dequeue(cfs_rq, se);
805	update_min_vruntime(cfs_rq);	841	update_min_vruntime(cfs_rq);
		842
		843	/*
		844	* Normalize the entity after updating the min_vruntime because the
		845	* update can refer to the ->curr item and we need to reflect this
		846	* movement in our normalized position.
		847	*/
		848	if (!sleep)
		849	se->vruntime -= cfs_rq->min_vruntime;
806	}	850	}
807		851
808	/*	852	/*
@@ -1013,13 +1057,19 @@ static void enqueue_task_fair(struct rq rq, struct task_struct p, int wakeup)
1013	{	1057	{
1014	struct cfs_rq *cfs_rq;	1058	struct cfs_rq *cfs_rq;
1015	struct sched_entity *se = &p->se;	1059	struct sched_entity *se = &p->se;
		1060	int flags = 0;
		1061
		1062	if (wakeup)
		1063	flags \|= ENQUEUE_WAKEUP;
		1064	if (p->state == TASK_WAKING)
		1065	flags \|= ENQUEUE_MIGRATE;
1016		1066
1017	for_each_sched_entity(se) {	1067	for_each_sched_entity(se) {
1018	if (se->on_rq)	1068	if (se->on_rq)
1019	break;	1069	break;
1020	cfs_rq = cfs_rq_of(se);	1070	cfs_rq = cfs_rq_of(se);
1021	enqueue_entity(cfs_rq, se, wakeup);	1071	enqueue_entity(cfs_rq, se, flags);
1022	wakeup = 1;	1072	flags = ENQUEUE_WAKEUP;
1023	}	1073	}
1024		1074
1025	hrtick_update(rq);	1075	hrtick_update(rq);
@@ -1095,6 +1145,14 @@ static void yield_task_fair(struct rq *rq)
1095		1145
1096	#ifdef CONFIG_SMP	1146	#ifdef CONFIG_SMP
1097		1147
		1148	static void task_waking_fair(struct rq rq, struct task_struct p)
		1149	{
		1150	struct sched_entity *se = &p->se;
		1151	struct cfs_rq *cfs_rq = cfs_rq_of(se);
		1152
		1153	se->vruntime -= cfs_rq->min_vruntime;
		1154	}
		1155
1098	#ifdef CONFIG_FAIR_GROUP_SCHED	1156	#ifdef CONFIG_FAIR_GROUP_SCHED
1099	/*	1157	/*
1100	* effective_load() calculates the load change as seen from the root_task_group	1158	* effective_load() calculates the load change as seen from the root_task_group
@@ -1345,6 +1403,37 @@ find_idlest_cpu(struct sched_group group, struct task_struct p, int this_cpu)
1345	}	1403	}
1346		1404
1347	/*	1405	/*
		1406	* Try and locate an idle CPU in the sched_domain.
		1407	*/
		1408	static int
		1409	select_idle_sibling(struct task_struct p, struct sched_domain sd, int target)
		1410	{
		1411	int cpu = smp_processor_id();
		1412	int prev_cpu = task_cpu(p);
		1413	int i;
		1414
		1415	/*
		1416	* If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE
		1417	* test in select_task_rq_fair) and the prev_cpu is idle then that's
		1418	* always a better target than the current cpu.
		1419	*/
		1420	if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running)
		1421	return prev_cpu;
		1422
		1423	/*
		1424	* Otherwise, iterate the domain and find an eligible idle cpu.
		1425	*/
		1426	for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
		1427	if (!cpu_rq(i)->cfs.nr_running) {
		1428	target = i;
		1429	break;
		1430	}
		1431	}
		1432
		1433	return target;
		1434	}
		1435
		1436	/*
1348	* sched_balance_self: balance the current task (running on cpu) in domains	1437	* sched_balance_self: balance the current task (running on cpu) in domains
1349	* that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and	1438	* that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1350	* SD_BALANCE_EXEC.	1439	* SD_BALANCE_EXEC.
@@ -1372,8 +1461,10 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1372	new_cpu = prev_cpu;	1461	new_cpu = prev_cpu;
1373	}	1462	}
1374		1463
1375	rcu_read_lock();
1376	for_each_domain(cpu, tmp) {	1464	for_each_domain(cpu, tmp) {
		1465	if (!(tmp->flags & SD_LOAD_BALANCE))
		1466	continue;
		1467
1377	/*	1468	/*
1378	* If power savings logic is enabled for a domain, see if we	1469	* If power savings logic is enabled for a domain, see if we
1379	* are not overloaded, if so, don't balance wider.	1470	* are not overloaded, if so, don't balance wider.
@@ -1398,11 +1489,35 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1398	want_sd = 0;	1489	want_sd = 0;
1399	}	1490	}
1400		1491
1401	if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&	1492	/*
1402	cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {	1493	* While iterating the domains looking for a spanning
		1494	* WAKE_AFFINE domain, adjust the affine target to any idle cpu
		1495	* in cache sharing domains along the way.
		1496	*/
		1497	if (want_affine) {
		1498	int target = -1;
		1499
		1500	/*
		1501	* If both cpu and prev_cpu are part of this domain,
		1502	* cpu is a valid SD_WAKE_AFFINE target.
		1503	*/
		1504	if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
		1505	target = cpu;
1403		1506
1404	affine_sd = tmp;	1507	/*
1405	want_affine = 0;	1508	* If there's an idle sibling in this domain, make that
		1509	* the wake_affine target instead of the current cpu.
		1510	*/
		1511	if (tmp->flags & SD_PREFER_SIBLING)
		1512	target = select_idle_sibling(p, tmp, target);
		1513
		1514	if (target >= 0) {
		1515	if (tmp->flags & SD_WAKE_AFFINE) {
		1516	affine_sd = tmp;
		1517	want_affine = 0;
		1518	}
		1519	cpu = target;
		1520	}
1406	}	1521	}
1407		1522
1408	if (!want_sd && !want_affine)	1523	if (!want_sd && !want_affine)
@@ -1429,10 +1544,8 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1429	update_shares(tmp);	1544	update_shares(tmp);
1430	}	1545	}
1431		1546
1432	if (affine_sd && wake_affine(affine_sd, p, sync)) {	1547	if (affine_sd && wake_affine(affine_sd, p, sync))
1433	new_cpu = cpu;	1548	return cpu;
1434	goto out;
1435	}
1436		1549
1437	while (sd) {	1550	while (sd) {
1438	int load_idx = sd->forkexec_idx;	1551	int load_idx = sd->forkexec_idx;
@@ -1473,8 +1586,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1473	/* while loop will break here if sd == NULL */	1586	/* while loop will break here if sd == NULL */
1474	}	1587	}
1475		1588
1476	out:
1477	rcu_read_unlock();
1478	return new_cpu;	1589	return new_cpu;
1479	}	1590	}
1480	#endif /* CONFIG_SMP */	1591	#endif /* CONFIG_SMP */
@@ -1596,12 +1707,8 @@ static void check_preempt_wakeup(struct rq rq, struct task_struct p, int wake_
1596	int sync = wake_flags & WF_SYNC;	1707	int sync = wake_flags & WF_SYNC;
1597	int scale = cfs_rq->nr_running >= sched_nr_latency;	1708	int scale = cfs_rq->nr_running >= sched_nr_latency;
1598		1709
1599	update_curr(cfs_rq);	1710	if (unlikely(rt_prio(p->prio)))
1600		1711	goto preempt;
1601	if (unlikely(rt_prio(p->prio))) {
1602	resched_task(curr);
1603	return;
1604	}
1605		1712
1606	if (unlikely(p->sched_class != &fair_sched_class))	1713	if (unlikely(p->sched_class != &fair_sched_class))
1607	return;	1714	return;
@@ -1627,50 +1734,44 @@ static void check_preempt_wakeup(struct rq rq, struct task_struct p, int wake_
1627	return;	1734	return;
1628		1735
1629	/* Idle tasks are by definition preempted by everybody. */	1736	/* Idle tasks are by definition preempted by everybody. */
1630	if (unlikely(curr->policy == SCHED_IDLE)) {	1737	if (unlikely(curr->policy == SCHED_IDLE))
1631	resched_task(curr);	1738	goto preempt;
1632	return;
1633	}
1634		1739
1635	if ((sched_feat(WAKEUP_SYNC) && sync) \|\|	1740	if (sched_feat(WAKEUP_SYNC) && sync)
1636	(sched_feat(WAKEUP_OVERLAP) &&	1741	goto preempt;
1637	(se->avg_overlap < sysctl_sched_migration_cost &&
1638	pse->avg_overlap < sysctl_sched_migration_cost))) {
1639	resched_task(curr);
1640	return;
1641	}
1642		1742
1643	if (sched_feat(WAKEUP_RUNNING)) {	1743	if (sched_feat(WAKEUP_OVERLAP) &&
1644	if (pse->avg_running < se->avg_running) {	1744	se->avg_overlap < sysctl_sched_migration_cost &&
1645	set_next_buddy(pse);	1745	pse->avg_overlap < sysctl_sched_migration_cost)
1646	resched_task(curr);	1746	goto preempt;
1647	return;
1648	}
1649	}
1650		1747
1651	if (!sched_feat(WAKEUP_PREEMPT))	1748	if (!sched_feat(WAKEUP_PREEMPT))
1652	return;	1749	return;
1653		1750
		1751	update_curr(cfs_rq);
1654	find_matching_se(&se, &pse);	1752	find_matching_se(&se, &pse);
1655
1656	BUG_ON(!pse);	1753	BUG_ON(!pse);
		1754	if (wakeup_preempt_entity(se, pse) == 1)
		1755	goto preempt;
1657		1756
1658	if (wakeup_preempt_entity(se, pse) == 1) {	1757	return;
1659	resched_task(curr);	1758
1660	/*	1759	preempt:
1661	* Only set the backward buddy when the current task is still	1760	resched_task(curr);
1662	* on the rq. This can happen when a wakeup gets interleaved	1761	/*
1663	* with schedule on the ->pre_schedule() or idle_balance()	1762	* Only set the backward buddy when the current task is still
1664	* point, either of which can * drop the rq lock.	1763	* on the rq. This can happen when a wakeup gets interleaved
1665	*	1764	* with schedule on the ->pre_schedule() or idle_balance()
1666	* Also, during early boot the idle thread is in the fair class,	1765	* point, either of which can drop the rq lock.
1667	* for obvious reasons its a bad idea to schedule back to it.	1766	*
1668	*/	1767	* Also, during early boot the idle thread is in the fair class,
1669	if (unlikely(!se->on_rq \|\| curr == rq->idle))	1768	* for obvious reasons it's a bad idea to schedule back to it.
1670	return;	1769	*/
1671	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))	1770	if (unlikely(!se->on_rq \|\| curr == rq->idle))
1672	set_last_buddy(se);	1771	return;
1673	}	1772
		1773	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
		1774	set_last_buddy(se);
1674	}	1775	}
1675		1776
1676	static struct task_struct *pick_next_task_fair(struct rq *rq)	1777	static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1679,7 +1780,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
1679	struct cfs_rq *cfs_rq = &rq->cfs;	1780	struct cfs_rq *cfs_rq = &rq->cfs;
1680	struct sched_entity *se;	1781	struct sched_entity *se;
1681		1782
1682	if (unlikely(!cfs_rq->nr_running))	1783	if (!cfs_rq->nr_running)
1683	return NULL;	1784	return NULL;
1684		1785
1685	do {	1786	do {
@@ -1850,6 +1951,17 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1850		1951
1851	return 0;	1952	return 0;
1852	}	1953	}
		1954
		1955	static void rq_online_fair(struct rq *rq)
		1956	{
		1957	update_sysctl();
		1958	}
		1959
		1960	static void rq_offline_fair(struct rq *rq)
		1961	{
		1962	update_sysctl();
		1963	}
		1964
1853	#endif /* CONFIG_SMP */	1965	#endif /* CONFIG_SMP */
1854		1966
1855	/*	1967	/*
@@ -1867,28 +1979,30 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
1867	}	1979	}
1868		1980
1869	/*	1981	/*
1870	* Share the fairness runtime between parent and child, thus the	1982	* called on fork with the child task as argument from the parent's context
1871	* total amount of pressure for CPU stays equal - new tasks	1983	* - child not yet on the tasklist
1872	* get a chance to run but frequent forkers are not allowed to	1984	* - preemption disabled
1873	* monopolize the CPU. Note: the parent runqueue is locked,
1874	* the child is not running yet.
1875	*/	1985	*/
1876	static void task_new_fair(struct rq *rq, struct task_struct *p)	1986	static void task_fork_fair(struct task_struct *p)
1877	{	1987	{
1878	struct cfs_rq *cfs_rq = task_cfs_rq(p);	1988	struct cfs_rq *cfs_rq = task_cfs_rq(current);
1879	struct sched_entity *se = &p->se, *curr = cfs_rq->curr;	1989	struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
1880	int this_cpu = smp_processor_id();	1990	int this_cpu = smp_processor_id();
		1991	struct rq *rq = this_rq();
		1992	unsigned long flags;
1881		1993
1882	sched_info_queued(p);	1994	raw_spin_lock_irqsave(&rq->lock, flags);
		1995
		1996	if (unlikely(task_cpu(p) != this_cpu))
		1997	__set_task_cpu(p, this_cpu);
1883		1998
1884	update_curr(cfs_rq);	1999	update_curr(cfs_rq);
		2000
1885	if (curr)	2001	if (curr)
1886	se->vruntime = curr->vruntime;	2002	se->vruntime = curr->vruntime;
1887	place_entity(cfs_rq, se, 1);	2003	place_entity(cfs_rq, se, 1);
1888		2004
1889	/* 'curr' will be NULL if the child belongs to a different group */	2005	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
1890	if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
1891	curr && entity_before(curr, se)) {
1892	/*	2006	/*
1893	* Upon rescheduling, sched_class::put_prev_task() will place	2007	* Upon rescheduling, sched_class::put_prev_task() will place
1894	* 'current' within the tree based on its new key value.	2008	* 'current' within the tree based on its new key value.
@@ -1897,7 +2011,9 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1897	resched_task(rq->curr);	2011	resched_task(rq->curr);
1898	}	2012	}
1899		2013
1900	enqueue_task_fair(rq, p, 0);	2014	se->vruntime -= cfs_rq->min_vruntime;
		2015
		2016	raw_spin_unlock_irqrestore(&rq->lock, flags);
1901	}	2017	}
1902		2018
1903	/*	2019	/*
@@ -1950,30 +2066,27 @@ static void set_curr_task_fair(struct rq *rq)
1950	}	2066	}
1951		2067
1952	#ifdef CONFIG_FAIR_GROUP_SCHED	2068	#ifdef CONFIG_FAIR_GROUP_SCHED
1953	static void moved_group_fair(struct task_struct *p)	2069	static void moved_group_fair(struct task_struct *p, int on_rq)
1954	{	2070	{
1955	struct cfs_rq *cfs_rq = task_cfs_rq(p);	2071	struct cfs_rq *cfs_rq = task_cfs_rq(p);
1956		2072
1957	update_curr(cfs_rq);	2073	update_curr(cfs_rq);
1958	place_entity(cfs_rq, &p->se, 1);	2074	if (!on_rq)
		2075	place_entity(cfs_rq, &p->se, 1);
1959	}	2076	}
1960	#endif	2077	#endif
1961		2078
1962	unsigned int get_rr_interval_fair(struct task_struct *task)	2079	unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
1963	{	2080	{
1964	struct sched_entity *se = &task->se;	2081	struct sched_entity *se = &task->se;
1965	unsigned long flags;
1966	struct rq *rq;
1967	unsigned int rr_interval = 0;	2082	unsigned int rr_interval = 0;
1968		2083
1969	/*	2084	/*
1970	* Time slice is 0 for SCHED_OTHER tasks that are on an otherwise	2085	* Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
1971	* idle runqueue:	2086	* idle runqueue:
1972	*/	2087	*/
1973	rq = task_rq_lock(task, &flags);
1974	if (rq->cfs.load.weight)	2088	if (rq->cfs.load.weight)
1975	rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));	2089	rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
1976	task_rq_unlock(rq, &flags);
1977		2090
1978	return rr_interval;	2091	return rr_interval;
1979	}	2092	}
@@ -1997,11 +2110,15 @@ static const struct sched_class fair_sched_class = {
1997		2110
1998	.load_balance = load_balance_fair,	2111	.load_balance = load_balance_fair,
1999	.move_one_task = move_one_task_fair,	2112	.move_one_task = move_one_task_fair,
		2113	.rq_online = rq_online_fair,
		2114	.rq_offline = rq_offline_fair,
		2115
		2116	.task_waking = task_waking_fair,
2000	#endif	2117	#endif
2001		2118
2002	.set_curr_task = set_curr_task_fair,	2119	.set_curr_task = set_curr_task_fair,
2003	.task_tick = task_tick_fair,	2120	.task_tick = task_tick_fair,
2004	.task_new = task_new_fair,	2121	.task_fork = task_fork_fair,
2005		2122
2006	.prio_changed = prio_changed_fair,	2123	.prio_changed = prio_changed_fair,
2007	.switched_to = switched_to_fair,	2124	.switched_to = switched_to_fair,