Diffstat (limited to 'kernel/sched_fair.c')
 -rw-r--r--  kernel/sched_fair.c | 139
 1 file changed, 107 insertions(+), 32 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 4e777b47eeda..f61837ad336d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -822,6 +822,26 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 		 * re-elected due to buddy favours.
 		 */
 		clear_buddies(cfs_rq, curr);
+		return;
+	}
+
+	/*
+	 * Ensure that a task that missed wakeup preemption by a
+	 * narrow margin doesn't have to wait for a full slice.
+	 * This also mitigates buddy induced latencies under load.
+	 */
+	if (!sched_feat(WAKEUP_PREEMPT))
+		return;
+
+	if (delta_exec < sysctl_sched_min_granularity)
+		return;
+
+	if (cfs_rq->nr_running > 1) {
+		struct sched_entity *se = __pick_next_entity(cfs_rq);
+		s64 delta = curr->vruntime - se->vruntime;
+
+		if (delta > ideal_runtime)
+			resched_task(rq_of(cfs_rq)->curr);
 	}
 }
 
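The hunk above adds a second tick-time preemption trigger: besides exhausting its slice, a running task is now also preempted once its vruntime leads the leftmost waiter by more than one ideal slice, provided it has run at least sysctl_sched_min_granularity. A minimal userspace sketch of that rule follows; the function name, struct entity, and parameter names are invented stand-ins, and the sched_feat(WAKEUP_PREEMPT) gate is omitted.

#include <stdbool.h>
#include <stdint.h>

struct entity { int64_t vruntime; };

/* Sketch of the new tick-preemption rule; all names are stand-ins. */
static bool should_resched_at_tick(int64_t delta_exec, int64_t ideal_runtime,
				   int64_t min_granularity,
				   const struct entity *curr,
				   const struct entity *leftmost,
				   unsigned int nr_running)
{
	if (delta_exec > ideal_runtime)
		return true;	/* slice consumed: always preempt */
	if (delta_exec < min_granularity)
		return false;	/* ran too briefly to evict yet */
	/*
	 * A waiter that narrowly missed wakeup preemption: preempt once
	 * curr leads the leftmost entity by more than a full slice.
	 */
	return nr_running > 1 &&
	       curr->vruntime - leftmost->vruntime > ideal_runtime;
}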
@@ -861,12 +881,18 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *se = __pick_next_entity(cfs_rq);
+	struct sched_entity *left = se;
 
-	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1)
-		return cfs_rq->next;
+	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
+		se = cfs_rq->next;
 
-	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1)
-		return cfs_rq->last;
+	/*
+	 * Prefer last buddy, try to return the CPU to a preempted task.
+	 */
+	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
+		se = cfs_rq->last;
+
+	clear_buddies(cfs_rq, se);
 
 	return se;
 }
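Note the behavioural change: the old code returned cfs_rq->next as soon as it passed the fairness test, so the next buddy always beat the last buddy. By assigning into se instead, the last buddy is evaluated second and wins when both qualify, and the chosen entity's buddy status is cleared so it must earn the favour again. A toy model of the new precedence; the types and the granularity constant are invented.

struct ent { long vruntime; };

/* Crude stand-in for wakeup_preempt_entity(): result < 1 = acceptable. */
static int too_unfair(const struct ent *cand, const struct ent *left)
{
	return cand->vruntime - left->vruntime > 1000000 ? 1 : 0;
}

static const struct ent *
pick(const struct ent *left, const struct ent *next, const struct ent *last)
{
	const struct ent *se = left;

	if (next && too_unfair(next, left) < 1)
		se = next;
	/* evaluated last, so a qualifying 'last' buddy overrides 'next' */
	if (last && too_unfair(last, left) < 1)
		se = last;
	return se;
}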
@@ -1319,6 +1345,37 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 }
 
 /*
+ * Try and locate an idle CPU in the sched_domain.
+ */
+static int
+select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
+{
+	int cpu = smp_processor_id();
+	int prev_cpu = task_cpu(p);
+	int i;
+
+	/*
+	 * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE
+	 * test in select_task_rq_fair) and the prev_cpu is idle then that's
+	 * always a better target than the current cpu.
+	 */
+	if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running)
+		return prev_cpu;
+
+	/*
+	 * Otherwise, iterate the domain and find an eligible idle cpu.
+	 */
+	for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
+		if (!cpu_rq(i)->cfs.nr_running) {
+			target = i;
+			break;
+		}
+	}
+
+	return target;
+}
+
+/*
  * sched_balance_self: balance the current task (running on cpu) in domains
  * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
  * SD_BALANCE_EXEC.
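The new helper prefers a cache-hot idle prev_cpu when the waker's own CPU was the proposed target, and otherwise settles for the first idle CPU the task is allowed to run on within the domain. A self-contained model over a fixed CPU array; NCPU, the arrays, and the function name are invented.

#include <stdbool.h>

#define NCPU 4

/* idle[i] models !cpu_rq(i)->cfs.nr_running; allowed[i] models cpus_allowed */
static int idle_sibling(const bool idle[NCPU], const bool allowed[NCPU],
			int cpu, int prev_cpu, int target)
{
	/* waking to our own CPU, but prev_cpu is idle: prefer its warm cache */
	if (target == cpu && idle[prev_cpu])
		return prev_cpu;

	/* otherwise take the first idle, permitted CPU in the domain */
	for (int i = 0; i < NCPU; i++)
		if (idle[i] && allowed[i])
			return i;

	return target;	/* nobody idle: keep whatever we had */
}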
@@ -1372,11 +1429,35 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 		want_sd = 0;
 	}
 
-	if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
-	    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+	/*
+	 * While iterating the domains looking for a spanning
+	 * WAKE_AFFINE domain, adjust the affine target to any idle cpu
+	 * in cache sharing domains along the way.
+	 */
+	if (want_affine) {
+		int target = -1;
 
-		affine_sd = tmp;
-		want_affine = 0;
+		/*
+		 * If both cpu and prev_cpu are part of this domain,
+		 * cpu is a valid SD_WAKE_AFFINE target.
+		 */
+		if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
+			target = cpu;
+
+		/*
+		 * If there's an idle sibling in this domain, make that
+		 * the wake_affine target instead of the current cpu.
+		 */
+		if (tmp->flags & SD_PREFER_SIBLING)
+			target = select_idle_sibling(p, tmp, target);
+
+		if (target >= 0) {
+			if (tmp->flags & SD_WAKE_AFFINE) {
+				affine_sd = tmp;
+				want_affine = 0;
+			}
+			cpu = target;
+		}
 	}
 
 	if (!want_sd && !want_affine)
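The rewritten block separates two decisions the old code fused: whether this domain is a valid wake-affine level (it spans prev_cpu), and which CPU inside it to actually aim at (an idle sibling when SD_PREFER_SIBLING is set). A compressed sketch of the domain walk follows; struct domain and its fields are invented stand-ins for struct sched_domain and its flag tests, and the affine_sd/want_affine bookkeeping is dropped for brevity.

#include <stdbool.h>

struct domain {
	struct domain *parent;
	bool spans_prev_cpu;	/* cpumask_test_cpu(prev_cpu, span) */
	bool prefer_sibling;	/* SD_PREFER_SIBLING */
	int idle_cpu;		/* an idle CPU in this domain, or -1 */
};

/* Refine the affine target while climbing toward the root domain. */
static int affine_target(const struct domain *sd, int cpu)
{
	for (const struct domain *tmp = sd; tmp; tmp = tmp->parent) {
		int target = tmp->spans_prev_cpu ? cpu : -1;

		/* stand-in for select_idle_sibling(p, tmp, target) */
		if (tmp->prefer_sibling && tmp->idle_cpu >= 0)
			target = tmp->idle_cpu;
		if (target >= 0)
			cpu = target;
	}
	return cpu;
}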
@@ -1568,6 +1649,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	struct sched_entity *se = &curr->se, *pse = &p->se;
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 	int sync = wake_flags & WF_SYNC;
+	int scale = cfs_rq->nr_running >= sched_nr_latency;
 
 	update_curr(cfs_rq);
 
@@ -1582,18 +1664,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	if (unlikely(se == pse))
 		return;
 
-	/*
-	 * Only set the backward buddy when the current task is still on the
-	 * rq. This can happen when a wakeup gets interleaved with schedule on
-	 * the ->pre_schedule() or idle_balance() point, either of which can
-	 * drop the rq lock.
-	 *
-	 * Also, during early boot the idle thread is in the fair class, for
-	 * obvious reasons its a bad idea to schedule back to the idle thread.
-	 */
-	if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
-		set_last_buddy(se);
-	if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK))
+	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK))
 		set_next_buddy(pse);
 
 	/*
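Combined with the scale flag introduced two hunks up, the next-buddy hint now engages only under load (nr_running >= sched_nr_latency) and never for fork wakeups, while the last-buddy logic moves below the actual preemption decision (next hunk). A one-line model of the new gate; all parameter names are illustrative stand-ins.

static int want_next_buddy(int feat_next_buddy, unsigned int nr_running,
			   unsigned int nr_latency, int wake_is_fork)
{
	int scale = nr_running >= nr_latency;

	return feat_next_buddy && scale && !wake_is_fork;
}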
@@ -1639,8 +1710,22 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 
 	BUG_ON(!pse);
 
-	if (wakeup_preempt_entity(se, pse) == 1)
+	if (wakeup_preempt_entity(se, pse) == 1) {
 		resched_task(curr);
+		/*
+		 * Only set the backward buddy when the current task is still
+		 * on the rq. This can happen when a wakeup gets interleaved
+		 * with schedule on the ->pre_schedule() or idle_balance()
+		 * point, either of which can drop the rq lock.
+		 *
+		 * Also, during early boot the idle thread is in the fair class,
+		 * for obvious reasons it's a bad idea to schedule back to it.
+		 */
+		if (unlikely(!se->on_rq || curr == rq->idle))
+			return;
+		if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
+			set_last_buddy(se);
+	}
 }
 
 static struct task_struct *pick_next_task_fair(struct rq *rq)
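The last-buddy hint is now recorded only after a preemption was actually issued, so a preempted task gets first claim on winning the CPU back; the on_rq and idle checks carry over the old safety conditions. A small model of the flow follows, with simplified types and an invented granularity constant.

#include <stdbool.h>

struct task { long vruntime; bool on_rq; bool is_idle; };

static struct task *last_buddy;

/* Mirrors the path above: preempt when the wakee is sufficiently ahead,
 * then record the preempted task so pick_next_entity() can favour it. */
static void wakeup_preempt(struct task *curr, struct task *wakee,
			   bool feat_last_buddy, bool scale)
{
	long gran = 1000000;	/* invented wakeup granularity, in ns */

	if (curr->vruntime - wakee->vruntime <= gran)
		return;		/* not unfair enough: no preemption */

	/* resched_task(curr) happens here in the kernel */

	if (!curr->on_rq || curr->is_idle)
		return;		/* curr already leaving the rq, or idle */
	if (feat_last_buddy && scale)
		last_buddy = curr;
}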
@@ -1649,21 +1734,11 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
 	struct cfs_rq *cfs_rq = &rq->cfs;
 	struct sched_entity *se;
 
-	if (unlikely(!cfs_rq->nr_running))
+	if (!cfs_rq->nr_running)
 		return NULL;
 
 	do {
 		se = pick_next_entity(cfs_rq);
-		/*
-		 * If se was a buddy, clear it so that it will have to earn
-		 * the favour again.
-		 *
-		 * If se was not a buddy, clear the buddies because neither
-		 * was elegible to run, let them earn it again.
-		 *
-		 * IOW. unconditionally clear buddies.
-		 */
-		__clear_buddies(cfs_rq, NULL);
 		set_next_entity(cfs_rq, se);
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);
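With clear_buddies() moved into pick_next_entity(), the loop here shrinks to its essence: descend the group hierarchy, picking one entity per level, until the chosen entity is a task rather than a group. A compact model with invented types; the caller is assumed to have checked that the root runqueue is non-empty, as above.

#include <stddef.h>

struct centity;
struct crq { struct centity *leftmost; };
struct centity { struct crq *my_q; };	/* my_q == NULL for a task */

static struct centity *pick_task(struct crq *cfs_rq)
{
	struct centity *se = NULL;

	while (cfs_rq) {
		se = cfs_rq->leftmost;	/* stand-in for pick_next_entity() */
		cfs_rq = se->my_q;	/* descend while se is a group */
	}
	return se;
}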