aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorMike Galbraith <efault@gmx.de>2012-06-11 23:18:32 -0400
committerIngo Molnar <mingo@kernel.org>2012-07-24 07:53:34 -0400
commit970e178985cadbca660feb02f4d2ee3a09f7fdda (patch)
tree20f47abb069a5d13940e8cdc48dae5f5563eb59a /kernel
parenta1cd2b13f754b2c56fb87b8c4912c015f8f57c0c (diff)
sched: Improve scalability via 'CPU buddies', which withstand random perturbations
Traversing an entire package is not only expensive, it also leads to tasks bouncing all over a partially idle and possibly quite large package. Fix that up by assigning a 'buddy' CPU to try to motivate. Each buddy may try to motivate that one other CPU; if it's busy, tough, it may then try its SMT sibling, but that's all this optimization is allowed to cost.

Sibling cache buddies are cross-wired to prevent bouncing.

4 socket 40 core + SMT Westmere box, single 30 sec tbench runs, higher is better:

  clients     1     2     4     8    16    32     64    128
  ..........................................................................
  pre        30    41   118   645  3769  6214  12233  14312
  post      299   603  1211  2418  4697  6847  11606  14557

A nice increase in performance.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1339471112.7352.32.camel@marge.simpson.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/sched/core.c39
-rw-r--r--kernel/sched/fair.c28
2 files changed, 45 insertions, 22 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4b4a63d3439..536b213f0ce 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6024,6 +6024,11 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
6024 * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this 6024 * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
6025 * allows us to avoid some pointer chasing select_idle_sibling(). 6025 * allows us to avoid some pointer chasing select_idle_sibling().
6026 * 6026 *
6027 * Iterate domains and sched_groups downward, assigning CPUs to be
6028 * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing
6029 * due to random perturbation self canceling, ie sw buddies pull
6030 * their counterpart to their CPU's hw counterpart.
6031 *
6027 * Also keep a unique ID per domain (we use the first cpu number in 6032 * Also keep a unique ID per domain (we use the first cpu number in
6028 * the cpumask of the domain), this allows us to quickly tell if 6033 * the cpumask of the domain), this allows us to quickly tell if
6029 * two cpus are in the same cache domain, see cpus_share_cache(). 6034 * two cpus are in the same cache domain, see cpus_share_cache().
@@ -6037,8 +6042,40 @@ static void update_top_cache_domain(int cpu)
6037 int id = cpu; 6042 int id = cpu;
6038 6043
6039 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); 6044 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
6040 if (sd) 6045 if (sd) {
6046 struct sched_domain *tmp = sd;
6047 struct sched_group *sg, *prev;
6048 bool right;
6049
6050 /*
6051 * Traverse to first CPU in group, and count hops
6052 * to cpu from there, switching direction on each
6053 * hop, never ever pointing the last CPU rightward.
6054 */
6055 do {
6056 id = cpumask_first(sched_domain_span(tmp));
6057 prev = sg = tmp->groups;
6058 right = 1;
6059
6060 while (cpumask_first(sched_group_cpus(sg)) != id)
6061 sg = sg->next;
6062
6063 while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
6064 prev = sg;
6065 sg = sg->next;
6066 right = !right;
6067 }
6068
6069 /* A CPU went down, never point back to domain start. */
6070 if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
6071 right = false;
6072
6073 sg = right ? sg->next : prev;
6074 tmp->idle_buddy = cpumask_first(sched_group_cpus(sg));
6075 } while ((tmp = tmp->child));
6076
6041 id = cpumask_first(sched_domain_span(sd)); 6077 id = cpumask_first(sched_domain_span(sd));
6078 }
6042 6079
6043 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 6080 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
6044 per_cpu(sd_llc_id, cpu) = id; 6081 per_cpu(sd_llc_id, cpu) = id;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c099cc6eebe..dd00aaf44fd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2637,8 +2637,6 @@ static int select_idle_sibling(struct task_struct *p, int target)
2637 int cpu = smp_processor_id(); 2637 int cpu = smp_processor_id();
2638 int prev_cpu = task_cpu(p); 2638 int prev_cpu = task_cpu(p);
2639 struct sched_domain *sd; 2639 struct sched_domain *sd;
2640 struct sched_group *sg;
2641 int i;
2642 2640
2643 /* 2641 /*
2644 * If the task is going to be woken-up on this cpu and if it is 2642 * If the task is going to be woken-up on this cpu and if it is
@@ -2655,29 +2653,17 @@ static int select_idle_sibling(struct task_struct *p, int target)
2655 return prev_cpu; 2653 return prev_cpu;
2656 2654
2657 /* 2655 /*
2658 * Otherwise, iterate the domains and find an elegible idle cpu. 2656 * Otherwise, check assigned siblings to find an elegible idle cpu.
2659 */ 2657 */
2660 sd = rcu_dereference(per_cpu(sd_llc, target)); 2658 sd = rcu_dereference(per_cpu(sd_llc, target));
2661 for_each_lower_domain(sd) {
2662 sg = sd->groups;
2663 do {
2664 if (!cpumask_intersects(sched_group_cpus(sg),
2665 tsk_cpus_allowed(p)))
2666 goto next;
2667
2668 for_each_cpu(i, sched_group_cpus(sg)) {
2669 if (!idle_cpu(i))
2670 goto next;
2671 }
2672 2659
2673 target = cpumask_first_and(sched_group_cpus(sg), 2660 for_each_lower_domain(sd) {
2674 tsk_cpus_allowed(p)); 2661 if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p)))
2675 goto done; 2662 continue;
2676next: 2663 if (idle_cpu(sd->idle_buddy))
2677 sg = sg->next; 2664 return sd->idle_buddy;
2678 } while (sg != sd->groups);
2679 } 2665 }
2680done: 2666
2681 return target; 2667 return target;
2682} 2668}
2683 2669