1 files changed, 148 insertions, 72 deletions
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 37087a7fac22..5bedf6e3ebf3 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -21,6 +21,7 @@
 */
 #include <linux/latencytop.h>
+#include <linux/sched.h>
 /*
 * Targeted preemption latency for CPU-bound tasks:
@@ -35,12 +36,26 @@
 *  run vmstat and monitor the context-switches (cs) field)
 */
 unsigned int sysctl_sched_latency = 5000000ULL;
+unsigned int normalized_sysctl_sched_latency = 5000000ULL;
+/*
+ * The initial- and re-scaling of tunables is configurable
+ * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
+ *
+ * Options are:
+ * SCHED_TUNABLESCALING_NONE - unscaled, always *1
+ * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *(1+ilog(ncpus))
+ * SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
+ */
+enum sched_tunable_scaling sysctl_sched_tunable_scaling
+        = SCHED_TUNABLESCALING_LOG;
 /*
 * Minimal preemption granularity for CPU-bound tasks:
 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
 unsigned int sysctl_sched_min_granularity = 1000000ULL;
+unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL;
 /*
 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
@@ -70,6 +85,7 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
 * have immediate wakeup/sleep latencies.
 */
 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
+unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
@@ -383,11 +399,12 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 */
 #ifdef CONFIG_SCHED_DEBUG
-int sched_nr_latency_handler(struct ctl_table *table, int write,
+int sched_proc_update_handler(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp,
                loff_t *ppos)
 {
        int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+        int factor = get_update_sysctl_factor();
        if (ret || !write)
                return ret;
@@ -395,6 +412,14 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
        sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
                                        sysctl_sched_min_granularity);
+#define WRT_SYSCTL(name) \
+        (normalized_sysctl_##name = sysctl_##name / (factor))
+        WRT_SYSCTL(sched_min_granularity);
+        WRT_SYSCTL(sched_latency);
+        WRT_SYSCTL(sched_wakeup_granularity);
+        WRT_SYSCTL(sched_shares_ratelimit);
+#undef WRT_SYSCTL
        return 0;
 }
 #endif
@@ -1345,6 +1370,37 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 }
 /*
+ * Try and locate an idle CPU in the sched_domain.
+ */
+static int
+select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
+{
+        int cpu = smp_processor_id();
+        int prev_cpu = task_cpu(p);
+        int i;
+        /*
+         * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE
+         * test in select_task_rq_fair) and the prev_cpu is idle then that's
+         * always a better target than the current cpu.
+         */
+        if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running)
+                return prev_cpu;
+        /*
+         * Otherwise, iterate the domain and find an eligible idle cpu.
+         */
+        for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
+                if (!cpu_rq(i)->cfs.nr_running) {
+                        target = i;
+                        break;
+                }
+        }
+        return target;
+}
+/*
 * sched_balance_self: balance the current task (running on cpu) in domains
 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
 * SD_BALANCE_EXEC.
@@ -1372,7 +1428,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
                new_cpu = prev_cpu;
        }
-        rcu_read_lock();
        for_each_domain(cpu, tmp) {
                /*
                 * If power savings logic is enabled for a domain, see if we
@@ -1398,11 +1453,35 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
                                want_sd = 0;
                }
-                if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
+                /*
-                    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+                 * While iterating the domains looking for a spanning
+                 * WAKE_AFFINE domain, adjust the affine target to any idle cpu
+                 * in cache sharing domains along the way.
+                 */
+                if (want_affine) {
+                        int target = -1;
+                        /*
+                         * If both cpu and prev_cpu are part of this domain,
+                         * cpu is a valid SD_WAKE_AFFINE target.
+                         */
+                        if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
+                                target = cpu;
-                        affine_sd = tmp;
+                        /*
-                        want_affine = 0;
+                         * If there's an idle sibling in this domain, make that
+                         * the wake_affine target instead of the current cpu.
+                         */
+                        if (tmp->flags & SD_PREFER_SIBLING)
+                                target = select_idle_sibling(p, tmp, target);
+                        if (target >= 0) {
+                                if (tmp->flags & SD_WAKE_AFFINE) {
+                                        affine_sd = tmp;
+                                        want_affine = 0;
+                                }
+                                cpu = target;
+                        }
                }
                if (!want_sd && !want_affine)
@@ -1429,10 +1508,8 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
                        update_shares(tmp);
        }
-        if (affine_sd && wake_affine(affine_sd, p, sync)) {
+        if (affine_sd && wake_affine(affine_sd, p, sync))
-                new_cpu = cpu;
+                return cpu;
-                goto out;
-        }
        while (sd) {
                int load_idx = sd->forkexec_idx;
@@ -1473,8 +1550,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
                /* while loop will break here if sd == NULL */
        }
-out:
-        rcu_read_unlock();
        return new_cpu;
 }
 #endif /* CONFIG_SMP */
@@ -1596,12 +1671,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
        int sync = wake_flags & WF_SYNC;
        int scale = cfs_rq->nr_running >= sched_nr_latency;
-        update_curr(cfs_rq);
+        if (unlikely(rt_prio(p->prio)))
+                goto preempt;
-        if (unlikely(rt_prio(p->prio))) {
-                resched_task(curr);
-                return;
-        }
        if (unlikely(p->sched_class != &fair_sched_class))
                return;
@@ -1627,50 +1698,44 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
                return;
        /* Idle tasks are by definition preempted by everybody. */
-        if (unlikely(curr->policy == SCHED_IDLE)) {
+        if (unlikely(curr->policy == SCHED_IDLE))
-                resched_task(curr);
+                goto preempt;
-                return;
-        }
-        if ((sched_feat(WAKEUP_SYNC) && sync) ||
+        if (sched_feat(WAKEUP_SYNC) && sync)
-            (sched_feat(WAKEUP_OVERLAP) &&
+                goto preempt;
-             (se->avg_overlap < sysctl_sched_migration_cost &&
-              pse->avg_overlap < sysctl_sched_migration_cost))) {
-                resched_task(curr);
-                return;
-        }
-        if (sched_feat(WAKEUP_RUNNING)) {
+        if (sched_feat(WAKEUP_OVERLAP) &&
-                if (pse->avg_running < se->avg_running) {
+                        se->avg_overlap < sysctl_sched_migration_cost &&
-                        set_next_buddy(pse);
+                        pse->avg_overlap < sysctl_sched_migration_cost)
-                        resched_task(curr);
+                goto preempt;
-                        return;
-                }
-        }
        if (!sched_feat(WAKEUP_PREEMPT))
                return;
+        update_curr(cfs_rq);
        find_matching_se(&se, &pse);
        BUG_ON(!pse);
+        if (wakeup_preempt_entity(se, pse) == 1)
+                goto preempt;
-        if (wakeup_preempt_entity(se, pse) == 1) {
+        return;
-                resched_task(curr);
-                /*
+preempt:
-                 * Only set the backward buddy when the current task is still
+        resched_task(curr);
-                 * on the rq. This can happen when a wakeup gets interleaved
+        /*
-                 * with schedule on the ->pre_schedule() or idle_balance()
+         * Only set the backward buddy when the current task is still
-                 * point, either of which can * drop the rq lock.
+         * on the rq. This can happen when a wakeup gets interleaved
-                 *
+         * with schedule on the ->pre_schedule() or idle_balance()
-                 * Also, during early boot the idle thread is in the fair class,
+         * point, either of which can drop the rq lock.
-                 * for obvious reasons its a bad idea to schedule back to it.
+         *
-                 */
+         * Also, during early boot the idle thread is in the fair class,
-                if (unlikely(!se->on_rq || curr == rq->idle))
+         * for obvious reasons it's a bad idea to schedule back to it.
-                        return;
+         */
-                if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
+        if (unlikely(!se->on_rq || curr == rq->idle))
-                        set_last_buddy(se);
+                return;
-        }
+        if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
+                set_last_buddy(se);
 }
 static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1679,7 +1744,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
        struct cfs_rq *cfs_rq = &rq->cfs;
        struct sched_entity *se;
-        if (unlikely(!cfs_rq->nr_running))
+        if (!cfs_rq->nr_running)
                return NULL;
        do {
@@ -1850,6 +1915,17 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
        return 0;
 }
+static void rq_online_fair(struct rq *rq)
+{
+        update_sysctl();
+}
+static void rq_offline_fair(struct rq *rq)
+{
+        update_sysctl();
+}
 #endif /* CONFIG_SMP */
 /*
@@ -1867,28 +1943,30 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 }
 /*
- * Share the fairness runtime between parent and child, thus the
+ * called on fork with the child task as argument from the parent's context
- * total amount of pressure for CPU stays equal - new tasks
+ *  - child not yet on the tasklist
- * get a chance to run but frequent forkers are not allowed to
+ *  - preemption disabled
- * monopolize the CPU. Note: the parent runqueue is locked,
- * the child is not running yet.
 */
-static void task_new_fair(struct rq *rq, struct task_struct *p)
+static void task_fork_fair(struct task_struct *p)
 {
-        struct cfs_rq *cfs_rq = task_cfs_rq(p);
+        struct cfs_rq *cfs_rq = task_cfs_rq(current);
        struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
        int this_cpu = smp_processor_id();
+        struct rq *rq = this_rq();
+        unsigned long flags;
+        raw_spin_lock_irqsave(&rq->lock, flags);
-        sched_info_queued(p);
+        if (unlikely(task_cpu(p) != this_cpu))
+                __set_task_cpu(p, this_cpu);
        update_curr(cfs_rq);
        if (curr)
                se->vruntime = curr->vruntime;
        place_entity(cfs_rq, se, 1);
-        /* 'curr' will be NULL if the child belongs to a different group */
+        if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
-        if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
-                        curr && entity_before(curr, se)) {
                /*
                 * Upon rescheduling, sched_class::put_prev_task() will place
                 * 'current' within the tree based on its new key value.
@@ -1897,7 +1975,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
                resched_task(rq->curr);
        }
-        enqueue_task_fair(rq, p, 0);
+        raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 /*
@@ -1959,21 +2037,17 @@ static void moved_group_fair(struct task_struct *p)
 }
 #endif
-unsigned int get_rr_interval_fair(struct task_struct *task)
+unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
 {
        struct sched_entity *se = &task->se;
-        unsigned long flags;
-        struct rq *rq;
        unsigned int rr_interval = 0;
        /*
         * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
         * idle runqueue:
         */
-        rq = task_rq_lock(task, &flags);
        if (rq->cfs.load.weight)
                rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
-        task_rq_unlock(rq, &flags);
        return rr_interval;
 }
@@ -1997,11 +2071,13 @@ static const struct sched_class fair_sched_class = {
        .load_balance           = load_balance_fair,
        .move_one_task          = move_one_task_fair,
+        .rq_online              = rq_online_fair,
+        .rq_offline             = rq_offline_fair,
 #endif
        .set_curr_task          = set_curr_task_fair,
        .task_tick              = task_tick_fair,
-        .task_new               = task_new_fair,
+        .task_fork              = task_fork_fair,
        .prio_changed           = prio_changed_fair,
        .switched_to            = switched_to_fair,

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 37087a7fac22..5bedf6e3ebf3 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c
@@ -21,6 +21,7 @@
21	*/	21	*/
22		22
23	#include <linux/latencytop.h>	23	#include <linux/latencytop.h>
		24	#include <linux/sched.h>
24		25
25	/*	26	/*
26	* Targeted preemption latency for CPU-bound tasks:	27	* Targeted preemption latency for CPU-bound tasks:
@@ -35,12 +36,26 @@
35	* run vmstat and monitor the context-switches (cs) field)	36	* run vmstat and monitor the context-switches (cs) field)
36	*/	37	*/
37	unsigned int sysctl_sched_latency = 5000000ULL;	38	unsigned int sysctl_sched_latency = 5000000ULL;
		39	unsigned int normalized_sysctl_sched_latency = 5000000ULL;
		40
		41	/*
		42	* The initial- and re-scaling of tunables is configurable
		43	* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
		44	*
		45	* Options are:
		46	* SCHED_TUNABLESCALING_NONE - unscaled, always *1
		47	* SCHED_TUNABLESCALING_LOG - scaled logarithmically, *(1+ilog(ncpus))
		48	* SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
		49	*/
		50	enum sched_tunable_scaling sysctl_sched_tunable_scaling
		51	= SCHED_TUNABLESCALING_LOG;
38		52
39	/*	53	/*
40	* Minimal preemption granularity for CPU-bound tasks:	54	* Minimal preemption granularity for CPU-bound tasks:
41	* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)	55	* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
42	*/	56	*/
43	unsigned int sysctl_sched_min_granularity = 1000000ULL;	57	unsigned int sysctl_sched_min_granularity = 1000000ULL;
		58	unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL;
44		59
45	/*	60	/*
46	* is kept at sysctl_sched_latency / sysctl_sched_min_granularity	61	* is kept at sysctl_sched_latency / sysctl_sched_min_granularity
@@ -70,6 +85,7 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
70	* have immediate wakeup/sleep latencies.	85	* have immediate wakeup/sleep latencies.
71	*/	86	*/
72	unsigned int sysctl_sched_wakeup_granularity = 1000000UL;	87	unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
		88	unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
73		89
74	const_debug unsigned int sysctl_sched_migration_cost = 500000UL;	90	const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
75		91
@@ -383,11 +399,12 @@ static struct sched_entity __pick_last_entity(struct cfs_rq cfs_rq)
383	*/	399	*/
384		400
385	#ifdef CONFIG_SCHED_DEBUG	401	#ifdef CONFIG_SCHED_DEBUG
386	int sched_nr_latency_handler(struct ctl_table *table, int write,	402	int sched_proc_update_handler(struct ctl_table *table, int write,
387	void __user *buffer, size_t *lenp,	403	void __user *buffer, size_t *lenp,
388	loff_t *ppos)	404	loff_t *ppos)
389	{	405	{
390	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);	406	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
		407	int factor = get_update_sysctl_factor();
391		408
392	if (ret \|\| !write)	409	if (ret \|\| !write)
393	return ret;	410	return ret;
@@ -395,6 +412,14 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
395	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,	412	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
396	sysctl_sched_min_granularity);	413	sysctl_sched_min_granularity);
397		414
		415	#define WRT_SYSCTL(name) \
		416	(normalized_sysctl_##name = sysctl_##name / (factor))
		417	WRT_SYSCTL(sched_min_granularity);
		418	WRT_SYSCTL(sched_latency);
		419	WRT_SYSCTL(sched_wakeup_granularity);
		420	WRT_SYSCTL(sched_shares_ratelimit);
		421	#undef WRT_SYSCTL
		422
398	return 0;	423	return 0;
399	}	424	}
400	#endif	425	#endif
@@ -1345,6 +1370,37 @@ find_idlest_cpu(struct sched_group group, struct task_struct p, int this_cpu)
1345	}	1370	}
1346		1371
1347	/*	1372	/*
		1373	* Try and locate an idle CPU in the sched_domain.
		1374	*/
		1375	static int
		1376	select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
		1377	{
		1378	int cpu = smp_processor_id();
		1379	int prev_cpu = task_cpu(p);
		1380	int i;
		1381
		1382	/*
		1383	* If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE
		1384	* test in select_task_rq_fair) and the prev_cpu is idle then that's
		1385	* always a better target than the current cpu.
		1386	*/
		1387	if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running)
		1388	return prev_cpu;
		1389
		1390	/*
		1391	* Otherwise, iterate the domain and find an eligible idle cpu.
		1392	*/
		1393	for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
		1394	if (!cpu_rq(i)->cfs.nr_running) {
		1395	target = i;
		1396	break;
		1397	}
		1398	}
		1399
		1400	return target;
		1401	}
		1402
		1403	/*
1348	* sched_balance_self: balance the current task (running on cpu) in domains	1404	* sched_balance_self: balance the current task (running on cpu) in domains
1349	* that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and	1405	* that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1350	* SD_BALANCE_EXEC.	1406	* SD_BALANCE_EXEC.
@@ -1372,7 +1428,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1372	new_cpu = prev_cpu;	1428	new_cpu = prev_cpu;
1373	}	1429	}
1374		1430
1375	rcu_read_lock();
1376	for_each_domain(cpu, tmp) {	1431	for_each_domain(cpu, tmp) {
1377	/*	1432	/*
1378	* If power savings logic is enabled for a domain, see if we	1433	* If power savings logic is enabled for a domain, see if we
@@ -1398,11 +1453,35 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1398	want_sd = 0;	1453	want_sd = 0;
1399	}	1454	}
1400		1455
1401	if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&	1456	/*
1402	cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {	1457	* While iterating the domains looking for a spanning
		1458	* WAKE_AFFINE domain, adjust the affine target to any idle cpu
		1459	* in cache sharing domains along the way.
		1460	*/
		1461	if (want_affine) {
		1462	int target = -1;
		1463
		1464	/*
		1465	* If both cpu and prev_cpu are part of this domain,
		1466	* cpu is a valid SD_WAKE_AFFINE target.
		1467	*/
		1468	if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
		1469	target = cpu;
1403		1470
1404	affine_sd = tmp;	1471	/*
1405	want_affine = 0;	1472	* If there's an idle sibling in this domain, make that
		1473	* the wake_affine target instead of the current cpu.
		1474	*/
		1475	if (tmp->flags & SD_PREFER_SIBLING)
		1476	target = select_idle_sibling(p, tmp, target);
		1477
		1478	if (target >= 0) {
		1479	if (tmp->flags & SD_WAKE_AFFINE) {
		1480	affine_sd = tmp;
		1481	want_affine = 0;
		1482	}
		1483	cpu = target;
		1484	}
1406	}	1485	}
1407		1486
1408	if (!want_sd && !want_affine)	1487	if (!want_sd && !want_affine)
@@ -1429,10 +1508,8 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1429	update_shares(tmp);	1508	update_shares(tmp);
1430	}	1509	}
1431		1510
1432	if (affine_sd && wake_affine(affine_sd, p, sync)) {	1511	if (affine_sd && wake_affine(affine_sd, p, sync))
1433	new_cpu = cpu;	1512	return cpu;
1434	goto out;
1435	}
1436		1513
1437	while (sd) {	1514	while (sd) {
1438	int load_idx = sd->forkexec_idx;	1515	int load_idx = sd->forkexec_idx;
@@ -1473,8 +1550,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1473	/* while loop will break here if sd == NULL */	1550	/* while loop will break here if sd == NULL */
1474	}	1551	}
1475		1552
1476	out:
1477	rcu_read_unlock();
1478	return new_cpu;	1553	return new_cpu;
1479	}	1554	}
1480	#endif /* CONFIG_SMP */	1555	#endif /* CONFIG_SMP */
@@ -1596,12 +1671,8 @@ static void check_preempt_wakeup(struct rq rq, struct task_struct p, int wake_
1596	int sync = wake_flags & WF_SYNC;	1671	int sync = wake_flags & WF_SYNC;
1597	int scale = cfs_rq->nr_running >= sched_nr_latency;	1672	int scale = cfs_rq->nr_running >= sched_nr_latency;
1598		1673
1599	update_curr(cfs_rq);	1674	if (unlikely(rt_prio(p->prio)))
1600		1675	goto preempt;
1601	if (unlikely(rt_prio(p->prio))) {
1602	resched_task(curr);
1603	return;
1604	}
1605		1676
1606	if (unlikely(p->sched_class != &fair_sched_class))	1677	if (unlikely(p->sched_class != &fair_sched_class))
1607	return;	1678	return;
@@ -1627,50 +1698,44 @@ static void check_preempt_wakeup(struct rq rq, struct task_struct p, int wake_
1627	return;	1698	return;
1628		1699
1629	/* Idle tasks are by definition preempted by everybody. */	1700	/* Idle tasks are by definition preempted by everybody. */
1630	if (unlikely(curr->policy == SCHED_IDLE)) {	1701	if (unlikely(curr->policy == SCHED_IDLE))
1631	resched_task(curr);	1702	goto preempt;
1632	return;
1633	}
1634		1703
1635	if ((sched_feat(WAKEUP_SYNC) && sync) \|\|	1704	if (sched_feat(WAKEUP_SYNC) && sync)
1636	(sched_feat(WAKEUP_OVERLAP) &&	1705	goto preempt;
1637	(se->avg_overlap < sysctl_sched_migration_cost &&
1638	pse->avg_overlap < sysctl_sched_migration_cost))) {
1639	resched_task(curr);
1640	return;
1641	}
1642		1706
1643	if (sched_feat(WAKEUP_RUNNING)) {	1707	if (sched_feat(WAKEUP_OVERLAP) &&
1644	if (pse->avg_running < se->avg_running) {	1708	se->avg_overlap < sysctl_sched_migration_cost &&
1645	set_next_buddy(pse);	1709	pse->avg_overlap < sysctl_sched_migration_cost)
1646	resched_task(curr);	1710	goto preempt;
1647	return;
1648	}
1649	}
1650		1711
1651	if (!sched_feat(WAKEUP_PREEMPT))	1712	if (!sched_feat(WAKEUP_PREEMPT))
1652	return;	1713	return;
1653		1714
		1715	update_curr(cfs_rq);
1654	find_matching_se(&se, &pse);	1716	find_matching_se(&se, &pse);
1655
1656	BUG_ON(!pse);	1717	BUG_ON(!pse);
		1718	if (wakeup_preempt_entity(se, pse) == 1)
		1719	goto preempt;
1657		1720
1658	if (wakeup_preempt_entity(se, pse) == 1) {	1721	return;
1659	resched_task(curr);	1722
1660	/*	1723	preempt:
1661	* Only set the backward buddy when the current task is still	1724	resched_task(curr);
1662	* on the rq. This can happen when a wakeup gets interleaved	1725	/*
1663	* with schedule on the ->pre_schedule() or idle_balance()	1726	* Only set the backward buddy when the current task is still
1664	* point, either of which can * drop the rq lock.	1727	* on the rq. This can happen when a wakeup gets interleaved
1665	*	1728	* with schedule on the ->pre_schedule() or idle_balance()
1666	* Also, during early boot the idle thread is in the fair class,	1729	* point, either of which can drop the rq lock.
1667	* for obvious reasons its a bad idea to schedule back to it.	1730	*
1668	*/	1731	* Also, during early boot the idle thread is in the fair class,
1669	if (unlikely(!se->on_rq \|\| curr == rq->idle))	1732	* for obvious reasons it's a bad idea to schedule back to it.
1670	return;	1733	*/
1671	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))	1734	if (unlikely(!se->on_rq \|\| curr == rq->idle))
1672	set_last_buddy(se);	1735	return;
1673	}	1736
		1737	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
		1738	set_last_buddy(se);
1674	}	1739	}
1675		1740
1676	static struct task_struct pick_next_task_fair(struct rq rq)	1741	static struct task_struct pick_next_task_fair(struct rq rq)
@@ -1679,7 +1744,7 @@ static struct task_struct pick_next_task_fair(struct rq rq)
1679	struct cfs_rq *cfs_rq = &rq->cfs;	1744	struct cfs_rq *cfs_rq = &rq->cfs;
1680	struct sched_entity *se;	1745	struct sched_entity *se;
1681		1746
1682	if (unlikely(!cfs_rq->nr_running))	1747	if (!cfs_rq->nr_running)
1683	return NULL;	1748	return NULL;
1684		1749
1685	do {	1750	do {
@@ -1850,6 +1915,17 @@ move_one_task_fair(struct rq this_rq, int this_cpu, struct rq busiest,
1850		1915
1851	return 0;	1916	return 0;
1852	}	1917	}
		1918
		1919	static void rq_online_fair(struct rq *rq)
		1920	{
		1921	update_sysctl();
		1922	}
		1923
		1924	static void rq_offline_fair(struct rq *rq)
		1925	{
		1926	update_sysctl();
		1927	}
		1928
1853	#endif /* CONFIG_SMP */	1929	#endif /* CONFIG_SMP */
1854		1930
1855	/*	1931	/*
@@ -1867,28 +1943,30 @@ static void task_tick_fair(struct rq rq, struct task_struct curr, int queued)
1867	}	1943	}
1868		1944
1869	/*	1945	/*
1870	* Share the fairness runtime between parent and child, thus the	1946	* called on fork with the child task as argument from the parent's context
1871	* total amount of pressure for CPU stays equal - new tasks	1947	* - child not yet on the tasklist
1872	* get a chance to run but frequent forkers are not allowed to	1948	* - preemption disabled
1873	* monopolize the CPU. Note: the parent runqueue is locked,
1874	* the child is not running yet.
1875	*/	1949	*/
1876	static void task_new_fair(struct rq *rq, struct task_struct *p)	1950	static void task_fork_fair(struct task_struct *p)
1877	{	1951	{
1878	struct cfs_rq *cfs_rq = task_cfs_rq(p);	1952	struct cfs_rq *cfs_rq = task_cfs_rq(current);
1879	struct sched_entity *se = &p->se, *curr = cfs_rq->curr;	1953	struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
1880	int this_cpu = smp_processor_id();	1954	int this_cpu = smp_processor_id();
		1955	struct rq *rq = this_rq();
		1956	unsigned long flags;
		1957
		1958	raw_spin_lock_irqsave(&rq->lock, flags);
1881		1959
1882	sched_info_queued(p);	1960	if (unlikely(task_cpu(p) != this_cpu))
		1961	__set_task_cpu(p, this_cpu);
1883		1962
1884	update_curr(cfs_rq);	1963	update_curr(cfs_rq);
		1964
1885	if (curr)	1965	if (curr)
1886	se->vruntime = curr->vruntime;	1966	se->vruntime = curr->vruntime;
1887	place_entity(cfs_rq, se, 1);	1967	place_entity(cfs_rq, se, 1);
1888		1968
1889	/* 'curr' will be NULL if the child belongs to a different group */	1969	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
1890	if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
1891	curr && entity_before(curr, se)) {
1892	/*	1970	/*
1893	* Upon rescheduling, sched_class::put_prev_task() will place	1971	* Upon rescheduling, sched_class::put_prev_task() will place
1894	* 'current' within the tree based on its new key value.	1972	* 'current' within the tree based on its new key value.
@@ -1897,7 +1975,7 @@ static void task_new_fair(struct rq rq, struct task_struct p)
1897	resched_task(rq->curr);	1975	resched_task(rq->curr);
1898	}	1976	}
1899		1977
1900	enqueue_task_fair(rq, p, 0);	1978	raw_spin_unlock_irqrestore(&rq->lock, flags);
1901	}	1979	}
1902		1980
1903	/*	1981	/*
@@ -1959,21 +2037,17 @@ static void moved_group_fair(struct task_struct *p)
1959	}	2037	}
1960	#endif	2038	#endif
1961		2039
1962	unsigned int get_rr_interval_fair(struct task_struct *task)	2040	unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
1963	{	2041	{
1964	struct sched_entity *se = &task->se;	2042	struct sched_entity *se = &task->se;
1965	unsigned long flags;
1966	struct rq *rq;
1967	unsigned int rr_interval = 0;	2043	unsigned int rr_interval = 0;
1968		2044
1969	/*	2045	/*
1970	* Time slice is 0 for SCHED_OTHER tasks that are on an otherwise	2046	* Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
1971	* idle runqueue:	2047	* idle runqueue:
1972	*/	2048	*/
1973	rq = task_rq_lock(task, &flags);
1974	if (rq->cfs.load.weight)	2049	if (rq->cfs.load.weight)
1975	rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));	2050	rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
1976	task_rq_unlock(rq, &flags);
1977		2051
1978	return rr_interval;	2052	return rr_interval;
1979	}	2053	}
@@ -1997,11 +2071,13 @@ static const struct sched_class fair_sched_class = {
1997		2071
1998	.load_balance = load_balance_fair,	2072	.load_balance = load_balance_fair,
1999	.move_one_task = move_one_task_fair,	2073	.move_one_task = move_one_task_fair,
		2074	.rq_online = rq_online_fair,
		2075	.rq_offline = rq_offline_fair,
2000	#endif	2076	#endif
2001		2077
2002	.set_curr_task = set_curr_task_fair,	2078	.set_curr_task = set_curr_task_fair,
2003	.task_tick = task_tick_fair,	2079	.task_tick = task_tick_fair,
2004	.task_new = task_new_fair,	2080	.task_fork = task_fork_fair,
2005		2081
2006	.prio_changed = prio_changed_fair,	2082	.prio_changed = prio_changed_fair,
2007	.switched_to = switched_to_fair,	2083	.switched_to = switched_to_fair,