author    Mike Galbraith <efault@gmx.de>    2012-06-11 23:18:32 -0400
committer Ingo Molnar <mingo@kernel.org>    2012-07-24 07:53:34 -0400
commit    970e178985cadbca660feb02f4d2ee3a09f7fdda (patch)
tree      20f47abb069a5d13940e8cdc48dae5f5563eb59a /kernel
parent    a1cd2b13f754b2c56fb87b8c4912c015f8f57c0c (diff)
sched: Improve scalability via 'CPU buddies', which withstand random perturbations
Traversing an entire package is not only expensive, it also leads to tasks
bouncing all over a partially idle and possibly quite large package.  Fix
that up by assigning a 'buddy' CPU to try to motivate.  Each buddy may try
to motivate that one other CPU, if it's busy, tough, it may then try its
SMT sibling, but that's all this optimization is allowed to cost.

Sibling cache buddies are cross-wired to prevent bouncing.

4 socket 40 core + SMT Westmere box, single 30 sec tbench runs, higher is better:

 clients     1       2       4        8       16       32       64      128
 ..........................................................................
 pre        30      41     118      645     3769     6214    12233    14312
 post      299     603    1211     2418     4697     6847    11606    14557

A nice increase in performance.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1339471112.7352.32.camel@marge.simpson.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
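The cross-wiring is easiest to see in isolation. Below is a minimal userspace
sketch, not kernel code, of the hop-and-flip walk for one domain level whose
groups each hold a single CPU; NR_CPUS and the standalone idle_buddy() helper
are hypothetical, chosen only to mirror the loop this patch adds to
update_top_cache_domain(). CPUs an even number of hops from the domain's first
CPU point right, odd ones point back left, so buddies form mutual pairs and a
task nudged onto its buddy gets nudged straight back rather than drifting
across the package.

/* buddy_pairs.c - toy model of the cross-wired buddy assignment.
 * One domain level, one CPU per sched_group, ring order 0..NR_CPUS-1,
 * id = 0.  Build: cc -o buddy_pairs buddy_pairs.c */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 8                       /* hypothetical core count */

static int idle_buddy(int cpu)
{
        int id = 0;                     /* first CPU of the domain span */
        int sg = id, prev = id;
        bool right = true;

        while (sg != cpu) {             /* hop toward cpu, flipping direction */
                prev = sg;
                sg = (sg + 1) % NR_CPUS;
                right = !right;
        }

        /* Never point the last CPU back at the domain start. */
        if (right && (sg + 1) % NR_CPUS == id)
                right = false;

        return right ? (sg + 1) % NR_CPUS : prev;
}

int main(void)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                printf("cpu %d -> buddy %d\n", cpu, idle_buddy(cpu));
        return 0;
}

It prints mutual pairs (0<->1, 2<->3, 4<->5, 6<->7): the self-canceling
behavior the changelog describes.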
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/sched/core.c  39
-rw-r--r--  kernel/sched/fair.c  28
2 files changed, 45 insertions, 22 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4b4a63d34396..536b213f0ce5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6024,6 +6024,11 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
  * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
  * allows us to avoid some pointer chasing select_idle_sibling().
  *
+ * Iterate domains and sched_groups downward, assigning CPUs to be
+ * select_idle_sibling() hw buddy.  Cross-wiring hw makes bouncing
+ * due to random perturbation self canceling, ie sw buddies pull
+ * their counterpart to their CPU's hw counterpart.
+ *
  * Also keep a unique ID per domain (we use the first cpu number in
  * the cpumask of the domain), this allows us to quickly tell if
  * two cpus are in the same cache domain, see cpus_share_cache().
@@ -6037,8 +6042,40 @@ static void update_top_cache_domain(int cpu)
 	int id = cpu;
 
 	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
-	if (sd)
+	if (sd) {
+		struct sched_domain *tmp = sd;
+		struct sched_group *sg, *prev;
+		bool right;
+
+		/*
+		 * Traverse to first CPU in group, and count hops
+		 * to cpu from there, switching direction on each
+		 * hop, never ever pointing the last CPU rightward.
+		 */
+		do {
+			id = cpumask_first(sched_domain_span(tmp));
+			prev = sg = tmp->groups;
+			right = 1;
+
+			while (cpumask_first(sched_group_cpus(sg)) != id)
+				sg = sg->next;
+
+			while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
+				prev = sg;
+				sg = sg->next;
+				right = !right;
+			}
+
+			/* A CPU went down, never point back to domain start. */
+			if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
+				right = false;
+
+			sg = right ? sg->next : prev;
+			tmp->idle_buddy = cpumask_first(sched_group_cpus(sg));
+		} while ((tmp = tmp->child));
+
 		id = cpumask_first(sched_domain_span(sd));
+	}
 
 	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
 	per_cpu(sd_llc_id, cpu) = id;
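To make the direction flipping and the wrap guard concrete, here is a hedged
standalone trace of what the loop above computes on a hypothetical 4-core SMT2
package (CPUs 0-7, siblings {0,1} {2,3} {4,5} {6,7}); NR_CPUS, SMT_WIDTH,
mc_buddy() and smt_buddy() are illustrative stand-ins, not kernel interfaces.

/* buddy_trace.c - per-level idle_buddy values on a toy topology.
 * Level 0: the cache (MC) level, groups = sibling pairs.
 * Level 1: the SMT level, groups = single CPUs.
 * Build: cc -o buddy_trace buddy_trace.c */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS         8
#define SMT_WIDTH       2
#define NR_GROUPS       (NR_CPUS / SMT_WIDTH)

/* Cache-level buddy: walk the ring of sibling-pair groups exactly as
 * the kernel loop does, flipping direction on each hop. */
static int mc_buddy(int cpu)
{
        int id = 0;                             /* first CPU of the package */
        int sg = id / SMT_WIDTH, prev = sg;     /* start at id's group */
        bool right = true;

        while (cpu / SMT_WIDTH != sg) {         /* hop to cpu's group */
                prev = sg;
                sg = (sg + 1) % NR_GROUPS;
                right = !right;
        }

        /* Wrap guard: never point the last group back at the start. */
        if (right && ((sg + 1) % NR_GROUPS) * SMT_WIDTH == id)
                right = false;

        return (right ? (sg + 1) % NR_GROUPS : prev) * SMT_WIDTH;
}

/* SMT-level buddy: on a two-CPU ring the same walk collapses to
 * "point at your sibling". */
static int smt_buddy(int cpu)
{
        return cpu ^ 1;
}

int main(void)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                printf("cpu %d: cache buddy %d, smt buddy %d\n",
                       cpu, mc_buddy(cpu), smt_buddy(cpu));
        return 0;
}

At the cache level the sibling pairs come out cross-wired (cores {0,1} and
{2,3} point at each other, as do {4,5} and {6,7}), so cpu 0 probes cpu 2 then
cpu 1, while cpu 2 probes cpu 0 then cpu 3.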
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c099cc6eebe3..dd00aaf44fda 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2637,8 +2637,6 @@ static int select_idle_sibling(struct task_struct *p, int target)
 	int cpu = smp_processor_id();
 	int prev_cpu = task_cpu(p);
 	struct sched_domain *sd;
-	struct sched_group *sg;
-	int i;
 
 	/*
 	 * If the task is going to be woken-up on this cpu and if it is
@@ -2655,29 +2653,17 @@ static int select_idle_sibling(struct task_struct *p, int target)
 		return prev_cpu;
 
 	/*
-	 * Otherwise, iterate the domains and find an elegible idle cpu.
+	 * Otherwise, check assigned siblings to find an elegible idle cpu.
 	 */
 	sd = rcu_dereference(per_cpu(sd_llc, target));
-	for_each_lower_domain(sd) {
-		sg = sd->groups;
-		do {
-			if (!cpumask_intersects(sched_group_cpus(sg),
-						tsk_cpus_allowed(p)))
-				goto next;
-
-			for_each_cpu(i, sched_group_cpus(sg)) {
-				if (!idle_cpu(i))
-					goto next;
-			}
-
-			target = cpumask_first_and(sched_group_cpus(sg),
-					tsk_cpus_allowed(p));
-			goto done;
-next:
-			sg = sg->next;
-		} while (sg != sd->groups);
-	}
-done:
+
+	for_each_lower_domain(sd) {
+		if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p)))
+			continue;
+		if (idle_cpu(sd->idle_buddy))
+			return sd->idle_buddy;
+	}
+
 	return target;
 }
 
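Taken with the buddy assignment in core.c above, this gives the wakeup path a
hard cost bound: one probe per domain level below the LLC (the cache-level
buddy, then the SMT-level one), however wide the package is, which is what
drives the low-client-count tbench gains in the changelog. A minimal sketch of
that shape, assuming hypothetical stand-ins cpu_allowed() and cpu_is_idle()
for the task's affinity test and idle_cpu():

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* One node per domain level, LLC downward, mirroring for_each_lower_domain(). */
struct level {
        int idle_buddy;         /* precomputed by the core.c loop */
        struct level *child;    /* next lower domain, NULL below SMT */
};

static int probe_buddies(const struct level *llc, int target,
                         bool (*cpu_allowed)(int), bool (*cpu_is_idle)(int))
{
        for (const struct level *l = llc; l; l = l->child) {
                if (!cpu_allowed(l->idle_buddy))
                        continue;               /* buddy outside the task's mask */
                if (cpu_is_idle(l->idle_buddy))
                        return l->idle_buddy;   /* at most one probe per level */
        }
        return target;                          /* nobody idle: keep the target */
}

static bool any_cpu(int cpu)   { (void)cpu; return true; }
static bool none_idle(int cpu) { (void)cpu; return false; }

int main(void)
{
        struct level smt = { .idle_buddy = 1, .child = NULL };
        struct level llc = { .idle_buddy = 2, .child = &smt };

        /* cpu 0 on the toy 4-core SMT2 box: probes cpu 2, then cpu 1,
         * and with nobody idle falls back to the wakeup target. */
        printf("picked cpu %d\n", probe_buddies(&llc, 0, any_cpu, none_idle));
        return 0;
}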