Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r--  kernel/sched_fair.c | 268
1 file changed, 182 insertions(+), 86 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ecc637a0d591..5bedf6e3ebf3 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -21,6 +21,7 @@
  */
 
 #include <linux/latencytop.h>
+#include <linux/sched.h>
 
 /*
  * Targeted preemption latency for CPU-bound tasks:
@@ -35,12 +36,26 @@
  * run vmstat and monitor the context-switches (cs) field)
  */
 unsigned int sysctl_sched_latency = 5000000ULL;
+unsigned int normalized_sysctl_sched_latency = 5000000ULL;
+
+/*
+ * The initial- and re-scaling of tunables is configurable
+ * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
+ *
+ * Options are:
+ * SCHED_TUNABLESCALING_NONE - unscaled, always *1
+ * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
+ * SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
+ */
+enum sched_tunable_scaling sysctl_sched_tunable_scaling
+	= SCHED_TUNABLESCALING_LOG;
 
 /*
  * Minimal preemption granularity for CPU-bound tasks:
  * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
 unsigned int sysctl_sched_min_granularity = 1000000ULL;
+unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL;
 
 /*
  * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
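Note: with the LOG policy above, every latency-style tunable is effectively multiplied by 1 + ilog(ncpus). A minimal userspace sketch of that factor; ilog2_approx() is an illustrative stand-in for the kernel's ilog2(), and the 5 ms base is the sysctl_sched_latency default from this hunk:

#include <stdio.h>

/* Illustrative stand-in for the kernel's ilog2(): floor(log2(n)). */
static unsigned int ilog2_approx(unsigned int n)
{
	unsigned int log = 0;

	while (n >>= 1)
		log++;
	return log;
}

int main(void)
{
	unsigned int base_latency = 5000000;	/* sysctl_sched_latency default, ns */
	unsigned int ncpus;

	for (ncpus = 1; ncpus <= 16; ncpus *= 2) {
		unsigned int factor = 1 + ilog2_approx(ncpus);	/* SCHED_TUNABLESCALING_LOG */

		printf("%2u cpus: factor %u -> sched_latency %u ns\n",
		       ncpus, factor, base_latency * factor);
	}
	return 0;
}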
@@ -70,6 +85,7 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
  * have immediate wakeup/sleep latencies.
  */
 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
+unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 
@@ -383,11 +399,12 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
  */
 
 #ifdef CONFIG_SCHED_DEBUG
-int sched_nr_latency_handler(struct ctl_table *table, int write,
-		struct file *filp, void __user *buffer, size_t *lenp,
+int sched_proc_update_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
 {
-	int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	int factor = get_update_sysctl_factor();
 
 	if (ret || !write)
 		return ret;
@@ -395,6 +412,14 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
 	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
 					sysctl_sched_min_granularity);
 
+#define WRT_SYSCTL(name) \
+	(normalized_sysctl_##name = sysctl_##name / (factor))
+	WRT_SYSCTL(sched_min_granularity);
+	WRT_SYSCTL(sched_latency);
+	WRT_SYSCTL(sched_wakeup_granularity);
+	WRT_SYSCTL(sched_shares_ratelimit);
+#undef WRT_SYSCTL
+
 	return 0;
 }
 #endif
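Note: the WRT_SYSCTL() lines store a CPU-count-independent copy of each tunable whenever it is written via proc. A small sketch of that round trip, reduced to one tunable; the assumption here is that the rescale path multiplies the normalized value back by the current factor (the kernel-side update_sysctl()/get_update_sysctl_factor() are not shown in this diff):

#include <stdio.h>

static unsigned int factor = 3;				/* e.g. 1 + ilog2(4) on a 4-CPU box */
static unsigned int sysctl_sched_latency = 15000000;	/* value written via proc, ns */
static unsigned int normalized_sysctl_sched_latency;

/* Same token-pasting idea as WRT_SYSCTL above, reduced to a single tunable. */
#define WRT_SYSCTL(name) \
	(normalized_sysctl_##name = sysctl_##name / (factor))

int main(void)
{
	/* On a proc write: remember the CPU-count-independent value. */
	WRT_SYSCTL(sched_latency);
	printf("normalized: %u ns\n", normalized_sysctl_sched_latency);

	/* On a later rescale (say a CPU comes online): reapply the new factor. */
	factor = 4;
	sysctl_sched_latency = normalized_sysctl_sched_latency * factor;
	printf("rescaled:   %u ns\n", sysctl_sched_latency);
	return 0;
}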
@@ -822,6 +847,26 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 		 * re-elected due to buddy favours.
 		 */
 		clear_buddies(cfs_rq, curr);
+		return;
+	}
+
+	/*
+	 * Ensure that a task that missed wakeup preemption by a
+	 * narrow margin doesn't have to wait for a full slice.
+	 * This also mitigates buddy induced latencies under load.
+	 */
+	if (!sched_feat(WAKEUP_PREEMPT))
+		return;
+
+	if (delta_exec < sysctl_sched_min_granularity)
+		return;
+
+	if (cfs_rq->nr_running > 1) {
+		struct sched_entity *se = __pick_next_entity(cfs_rq);
+		s64 delta = curr->vruntime - se->vruntime;
+
+		if (delta > ideal_runtime)
+			resched_task(rq_of(cfs_rq)->curr);
 	}
 }
 
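Note: the new tail of check_preempt_tick() also preempts when the leftmost waiter has fallen more than one ideal slice behind in vruntime, provided current has already run at least sysctl_sched_min_granularity. A standalone sketch of that decision with made-up numbers; sched_feat()/nr_running checks are omitted and sched_slice() plus the rbtree lookup are replaced by constants:

#include <stdio.h>

int main(void)
{
	/* Made-up numbers, all in nanoseconds. */
	long long ideal_runtime = 4000000;	/* stand-in for sched_slice()       */
	long long min_granularity = 1000000;	/* sysctl_sched_min_granularity     */
	long long delta_exec = 2500000;		/* how long current has run so far  */
	long long curr_vruntime = 10000000;
	long long left_vruntime = 5500000;	/* leftmost waiter in the rbtree    */

	if (delta_exec > ideal_runtime) {
		printf("slice used up: resched\n");
	} else if (delta_exec < min_granularity) {
		printf("below min granularity: keep running\n");
	} else if (curr_vruntime - left_vruntime > ideal_runtime) {
		/* Waiter lost a wakeup race by a lot: don't make it sit out the slice. */
		printf("vruntime gap exceeds one slice: resched\n");
	} else {
		printf("within one slice: keep running\n");
	}
	return 0;
}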
@@ -861,12 +906,18 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *se = __pick_next_entity(cfs_rq);
+	struct sched_entity *left = se;
 
-	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1)
-		return cfs_rq->next;
+	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
+		se = cfs_rq->next;
 
-	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1)
-		return cfs_rq->last;
+	/*
+	 * Prefer last buddy, try to return the CPU to a preempted task.
+	 */
+	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
+		se = cfs_rq->last;
+
+	clear_buddies(cfs_rq, se);
 
 	return se;
 }
@@ -1319,6 +1370,37 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 }
 
 /*
+ * Try and locate an idle CPU in the sched_domain.
+ */
+static int
+select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
+{
+	int cpu = smp_processor_id();
+	int prev_cpu = task_cpu(p);
+	int i;
+
+	/*
+	 * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE
+	 * test in select_task_rq_fair) and the prev_cpu is idle then that's
+	 * always a better target than the current cpu.
+	 */
+	if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running)
+		return prev_cpu;
+
+	/*
+	 * Otherwise, iterate the domain and find an eligible idle cpu.
+	 */
+	for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
+		if (!cpu_rq(i)->cfs.nr_running) {
+			target = i;
+			break;
+		}
+	}
+
+	return target;
+}
+
+/*
  * sched_balance_self: balance the current task (running on cpu) in domains
  * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
  * SD_BALANCE_EXEC.
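Note: select_idle_sibling() prefers an idle prev_cpu and otherwise scans the domain for any idle, allowed CPU. A toy userspace model of the same scan; the arrays stand in for cpu_rq() and the domain span, and the kernel's target == cpu precondition is dropped for brevity:

#include <stdio.h>

#define NR_CPUS 8

/*
 * Toy model: nr_running[i] stands in for cpu_rq(i)->cfs.nr_running and
 * allowed[i] for "i is in both the domain span and p->cpus_allowed".
 */
static int pick_idle_sibling(const int nr_running[], const int allowed[],
			     int target, int prev_cpu)
{
	int i;

	/* An idle previous CPU wins outright: warm caches, no migration. */
	if (allowed[prev_cpu] && !nr_running[prev_cpu])
		return prev_cpu;

	for (i = 0; i < NR_CPUS; i++) {
		if (allowed[i] && !nr_running[i])
			return i;
	}
	return target;			/* nothing idle: keep the original target */
}

int main(void)
{
	int nr_running[NR_CPUS] = { 2, 1, 0, 3, 1, 0, 4, 1 };
	int allowed[NR_CPUS]    = { 1, 1, 1, 1, 0, 0, 0, 0 };

	printf("wake target: cpu %d\n", pick_idle_sibling(nr_running, allowed, 0, 1));
	return 0;
}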
@@ -1346,7 +1428,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 		new_cpu = prev_cpu;
 	}
 
-	rcu_read_lock();
 	for_each_domain(cpu, tmp) {
 		/*
 		 * If power savings logic is enabled for a domain, see if we
@@ -1372,11 +1453,35 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 			want_sd = 0;
 		}
 
-		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
-		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+		/*
+		 * While iterating the domains looking for a spanning
+		 * WAKE_AFFINE domain, adjust the affine target to any idle cpu
+		 * in cache sharing domains along the way.
+		 */
+		if (want_affine) {
+			int target = -1;
 
-			affine_sd = tmp;
-			want_affine = 0;
+			/*
+			 * If both cpu and prev_cpu are part of this domain,
+			 * cpu is a valid SD_WAKE_AFFINE target.
+			 */
+			if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
+				target = cpu;
+
+			/*
+			 * If there's an idle sibling in this domain, make that
+			 * the wake_affine target instead of the current cpu.
+			 */
+			if (tmp->flags & SD_PREFER_SIBLING)
+				target = select_idle_sibling(p, tmp, target);
+
+			if (target >= 0) {
+				if (tmp->flags & SD_WAKE_AFFINE) {
+					affine_sd = tmp;
+					want_affine = 0;
+				}
+				cpu = target;
+			}
 		}
 
 		if (!want_sd && !want_affine)
@@ -1403,10 +1508,8 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 			update_shares(tmp);
 	}
 
-	if (affine_sd && wake_affine(affine_sd, p, sync)) {
-		new_cpu = cpu;
-		goto out;
-	}
+	if (affine_sd && wake_affine(affine_sd, p, sync))
+		return cpu;
 
 	while (sd) {
 		int load_idx = sd->forkexec_idx;
@@ -1447,8 +1550,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 		/* while loop will break here if sd == NULL */
 	}
 
-out:
-	rcu_read_unlock();
 	return new_cpu;
 }
 #endif /* CONFIG_SMP */
@@ -1568,13 +1669,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	struct sched_entity *se = &curr->se, *pse = &p->se;
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 	int sync = wake_flags & WF_SYNC;
+	int scale = cfs_rq->nr_running >= sched_nr_latency;
 
-	update_curr(cfs_rq);
-
-	if (unlikely(rt_prio(p->prio))) {
-		resched_task(curr);
-		return;
-	}
+	if (unlikely(rt_prio(p->prio)))
+		goto preempt;
 
 	if (unlikely(p->sched_class != &fair_sched_class))
 		return;
@@ -1582,18 +1680,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	if (unlikely(se == pse))
 		return;
 
-	/*
-	 * Only set the backward buddy when the current task is still on the
-	 * rq. This can happen when a wakeup gets interleaved with schedule on
-	 * the ->pre_schedule() or idle_balance() point, either of which can
-	 * drop the rq lock.
-	 *
-	 * Also, during early boot the idle thread is in the fair class, for
-	 * obvious reasons its a bad idea to schedule back to the idle thread.
-	 */
-	if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
-		set_last_buddy(se);
-	if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK))
+	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK))
 		set_next_buddy(pse);
 
 	/*
@@ -1611,36 +1698,44 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 		return;
 
 	/* Idle tasks are by definition preempted by everybody. */
-	if (unlikely(curr->policy == SCHED_IDLE)) {
-		resched_task(curr);
-		return;
-	}
+	if (unlikely(curr->policy == SCHED_IDLE))
+		goto preempt;
 
-	if ((sched_feat(WAKEUP_SYNC) && sync) ||
-	    (sched_feat(WAKEUP_OVERLAP) &&
-	     (se->avg_overlap < sysctl_sched_migration_cost &&
-	      pse->avg_overlap < sysctl_sched_migration_cost))) {
-		resched_task(curr);
-		return;
-	}
+	if (sched_feat(WAKEUP_SYNC) && sync)
+		goto preempt;
 
-	if (sched_feat(WAKEUP_RUNNING)) {
-		if (pse->avg_running < se->avg_running) {
-			set_next_buddy(pse);
-			resched_task(curr);
-			return;
-		}
-	}
+	if (sched_feat(WAKEUP_OVERLAP) &&
+	    se->avg_overlap < sysctl_sched_migration_cost &&
+	    pse->avg_overlap < sysctl_sched_migration_cost)
+		goto preempt;
 
 	if (!sched_feat(WAKEUP_PREEMPT))
 		return;
 
+	update_curr(cfs_rq);
 	find_matching_se(&se, &pse);
-
 	BUG_ON(!pse);
-
 	if (wakeup_preempt_entity(se, pse) == 1)
-		resched_task(curr);
+		goto preempt;
+
+	return;
+
+preempt:
+	resched_task(curr);
+	/*
+	 * Only set the backward buddy when the current task is still
+	 * on the rq. This can happen when a wakeup gets interleaved
+	 * with schedule on the ->pre_schedule() or idle_balance()
+	 * point, either of which can drop the rq lock.
+	 *
+	 * Also, during early boot the idle thread is in the fair class,
+	 * for obvious reasons it's a bad idea to schedule back to it.
+	 */
+	if (unlikely(!se->on_rq || curr == rq->idle))
+		return;
+
+	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
+		set_last_buddy(se);
 }
 
 static struct task_struct *pick_next_task_fair(struct rq *rq)
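Note: with the defaults earlier in this file, sched_nr_latency = DIV_ROUND_UP(5 ms, 1 ms) = 5, so the new scale test only arms the NEXT_BUDDY/LAST_BUDDY hints once at least five tasks are runnable on the cfs_rq. A quick check of that threshold, plain C with the values taken from this file:

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	/* Defaults from this file; the CPU scaling factor cancels out of the ratio. */
	unsigned int sysctl_sched_latency = 5000000;
	unsigned int sysctl_sched_min_granularity = 1000000;
	unsigned int sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
						     sysctl_sched_min_granularity);
	unsigned int nr_running;

	for (nr_running = 1; nr_running <= 8; nr_running++)
		printf("nr_running=%u: buddy hints %s\n", nr_running,
		       nr_running >= sched_nr_latency ? "armed" : "skipped");
	return 0;
}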
@@ -1649,21 +1744,11 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
 	struct cfs_rq *cfs_rq = &rq->cfs;
 	struct sched_entity *se;
 
-	if (unlikely(!cfs_rq->nr_running))
+	if (!cfs_rq->nr_running)
 		return NULL;
 
 	do {
 		se = pick_next_entity(cfs_rq);
-		/*
-		 * If se was a buddy, clear it so that it will have to earn
-		 * the favour again.
-		 *
-		 * If se was not a buddy, clear the buddies because neither
-		 * was elegible to run, let them earn it again.
-		 *
-		 * IOW. unconditionally clear buddies.
-		 */
-		__clear_buddies(cfs_rq, NULL);
 		set_next_entity(cfs_rq, se);
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);
@@ -1830,6 +1915,17 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 
 	return 0;
 }
+
+static void rq_online_fair(struct rq *rq)
+{
+	update_sysctl();
+}
+
+static void rq_offline_fair(struct rq *rq)
+{
+	update_sysctl();
+}
+
 #endif /* CONFIG_SMP */
 
 /*
@@ -1847,28 +1943,30 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 }
 
 /*
- * Share the fairness runtime between parent and child, thus the
- * total amount of pressure for CPU stays equal - new tasks
- * get a chance to run but frequent forkers are not allowed to
- * monopolize the CPU. Note: the parent runqueue is locked,
- * the child is not running yet.
+ * called on fork with the child task as argument from the parent's context
+ *  - child not yet on the tasklist
+ *  - preemption disabled
  */
-static void task_new_fair(struct rq *rq, struct task_struct *p)
+static void task_fork_fair(struct task_struct *p)
 {
-	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	struct cfs_rq *cfs_rq = task_cfs_rq(current);
 	struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
 	int this_cpu = smp_processor_id();
+	struct rq *rq = this_rq();
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&rq->lock, flags);
 
-	sched_info_queued(p);
+	if (unlikely(task_cpu(p) != this_cpu))
+		__set_task_cpu(p, this_cpu);
 
 	update_curr(cfs_rq);
+
 	if (curr)
 		se->vruntime = curr->vruntime;
 	place_entity(cfs_rq, se, 1);
 
-	/* 'curr' will be NULL if the child belongs to a different group */
-	if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
-	    curr && entity_before(curr, se)) {
+	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
 		/*
 		 * Upon rescheduling, sched_class::put_prev_task() will place
 		 * 'current' within the tree based on its new key value.
@@ -1877,7 +1975,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
 		resched_task(rq->curr);
 	}
 
-	enqueue_task_fair(rq, p, 0);
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 
 /*
@@ -1939,21 +2037,17 @@ static void moved_group_fair(struct task_struct *p)
 }
 #endif
 
-unsigned int get_rr_interval_fair(struct task_struct *task)
+unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
 {
 	struct sched_entity *se = &task->se;
-	unsigned long flags;
-	struct rq *rq;
 	unsigned int rr_interval = 0;
 
 	/*
 	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
 	 * idle runqueue:
 	 */
-	rq = task_rq_lock(task, &flags);
 	if (rq->cfs.load.weight)
 		rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
-	task_rq_unlock(rq, &flags);
 
 	return rr_interval;
 }
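Note: get_rr_interval_fair() now reports the task's CFS slice converted from nanoseconds to jiffies, using the rq passed in by the caller instead of taking the rq lock itself. A rough worked conversion; the HZ value is an assumption and the load weighting inside sched_slice() is simplified to an even split:

#include <stdio.h>

#define HZ 1000
#define NSEC_PER_SEC 1000000000ULL
#define NS_TO_JIFFIES(ns) ((unsigned long)((ns) / (NSEC_PER_SEC / HZ)))

int main(void)
{
	/* Hypothetical numbers: 15 ms of scaled latency shared by 3 runnable tasks. */
	unsigned long long latency_ns = 15000000;
	unsigned int nr_running = 3;
	unsigned long long slice_ns = latency_ns / nr_running;	/* crude sched_slice() */

	printf("slice = %llu ns = %lu jiffies at HZ=%d\n",
	       slice_ns, NS_TO_JIFFIES(slice_ns), HZ);
	return 0;
}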
@@ -1977,11 +2071,13 @@ static const struct sched_class fair_sched_class = {
 
 	.load_balance		= load_balance_fair,
 	.move_one_task		= move_one_task_fair,
+	.rq_online		= rq_online_fair,
+	.rq_offline		= rq_offline_fair,
 #endif
 
 	.set_curr_task		= set_curr_task_fair,
 	.task_tick		= task_tick_fair,
-	.task_new		= task_new_fair,
+	.task_fork		= task_fork_fair,
 
 	.prio_changed		= prio_changed_fair,
 	.switched_to		= switched_to_fair,