aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSiddha, Suresh B <suresh.b.siddha@intel.com>2006-03-27 04:15:22 -0500
committerLinus Torvalds <torvalds@g5.osdl.org>2006-03-27 11:44:43 -0500
commit1e9f28fa1eb9773bf65bae08288c6a0a38eef4a7 (patch)
treeccfa4927ebc7a8f663f9ac9e7789a713a33253ff
parent77e4bfbcf071f795b54862455dce8902b3fc29c2 (diff)
[PATCH] sched: new sched domain for representing multi-core
Add a new sched domain for representing multi-core with shared caches between cores. Consider a dual package system, each package containing two cores and with last level cache shared between cores with in a package. If there are two runnable processes, with this appended patch those two processes will be scheduled on different packages. On such systems, with this patch we have observed 8% perf improvement with specJBB(2 warehouse) benchmark and 35% improvement with CFP2000 rate(with 2 users). This new domain will come into play only on multi-core systems with shared caches. On other systems, this sched domain will be removed by domain degeneration code. This new domain can be also used for implementing power savings policy (see OLS 2005 CMP kernel scheduler paper for more details.. I will post another patch for power savings policy soon) Most of the arch/* file changes are for cpu_coregroup_map() implementation. Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com> Cc: Ingo Molnar <mingo@elte.hu> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--arch/i386/Kconfig9
-rw-r--r--arch/i386/kernel/cpu/common.c10
-rw-r--r--arch/i386/kernel/cpu/intel_cacheinfo.c22
-rw-r--r--arch/i386/kernel/smpboot.c24
-rw-r--r--arch/x86_64/Kconfig9
-rw-r--r--arch/x86_64/kernel/setup.c3
-rw-r--r--arch/x86_64/kernel/smpboot.c24
-rw-r--r--include/asm-i386/processor.h5
-rw-r--r--include/asm-i386/topology.h2
-rw-r--r--include/asm-x86_64/processor.h4
-rw-r--r--include/asm-x86_64/smp.h1
-rw-r--r--include/asm-x86_64/topology.h2
-rw-r--r--include/linux/topology.h9
-rw-r--r--kernel/sched.c73
14 files changed, 186 insertions, 11 deletions
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index f7db71d0b913..f17bd1d2707e 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -231,6 +231,15 @@ config SCHED_SMT
231 cost of slightly increased overhead in some places. If unsure say 231 cost of slightly increased overhead in some places. If unsure say
232 N here. 232 N here.
233 233
234config SCHED_MC
235 bool "Multi-core scheduler support"
236 depends on SMP
237 default y
238 help
239 Multi-core scheduler support improves the CPU scheduler's decision
240 making when dealing with multi-core CPU chips at a cost of slightly
241 increased overhead in some places. If unsure say N here.
242
234source "kernel/Kconfig.preempt" 243source "kernel/Kconfig.preempt"
235 244
236config X86_UP_APIC 245config X86_UP_APIC
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index 7e3d6b6a4e96..a06a49075f10 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -266,7 +266,7 @@ static void __init early_cpu_detect(void)
266void __cpuinit generic_identify(struct cpuinfo_x86 * c) 266void __cpuinit generic_identify(struct cpuinfo_x86 * c)
267{ 267{
268 u32 tfms, xlvl; 268 u32 tfms, xlvl;
269 int junk; 269 int ebx;
270 270
271 if (have_cpuid_p()) { 271 if (have_cpuid_p()) {
272 /* Get vendor name */ 272 /* Get vendor name */
@@ -282,7 +282,7 @@ void __cpuinit generic_identify(struct cpuinfo_x86 * c)
282 /* Intel-defined flags: level 0x00000001 */ 282 /* Intel-defined flags: level 0x00000001 */
283 if ( c->cpuid_level >= 0x00000001 ) { 283 if ( c->cpuid_level >= 0x00000001 ) {
284 u32 capability, excap; 284 u32 capability, excap;
285 cpuid(0x00000001, &tfms, &junk, &excap, &capability); 285 cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
286 c->x86_capability[0] = capability; 286 c->x86_capability[0] = capability;
287 c->x86_capability[4] = excap; 287 c->x86_capability[4] = excap;
288 c->x86 = (tfms >> 8) & 15; 288 c->x86 = (tfms >> 8) & 15;
@@ -292,6 +292,11 @@ void __cpuinit generic_identify(struct cpuinfo_x86 * c)
292 if (c->x86 >= 0x6) 292 if (c->x86 >= 0x6)
293 c->x86_model += ((tfms >> 16) & 0xF) << 4; 293 c->x86_model += ((tfms >> 16) & 0xF) << 4;
294 c->x86_mask = tfms & 15; 294 c->x86_mask = tfms & 15;
295#ifdef CONFIG_SMP
296 c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
297#else
298 c->apicid = (ebx >> 24) & 0xFF;
299#endif
295 } else { 300 } else {
296 /* Have CPUID level 0 only - unheard of */ 301 /* Have CPUID level 0 only - unheard of */
297 c->x86 = 4; 302 c->x86 = 4;
@@ -474,7 +479,6 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
474 479
475 cpuid(1, &eax, &ebx, &ecx, &edx); 480 cpuid(1, &eax, &ebx, &ecx, &edx);
476 481
477 c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
478 482
479 if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY)) 483 if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
480 return; 484 return;
diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c
index ce61921369e5..7e7fd4e67dd0 100644
--- a/arch/i386/kernel/cpu/intel_cacheinfo.c
+++ b/arch/i386/kernel/cpu/intel_cacheinfo.c
@@ -173,6 +173,10 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
173 unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; /* Cache sizes */ 173 unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; /* Cache sizes */
174 unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ 174 unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
175 unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ 175 unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
176 unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
177#ifdef CONFIG_SMP
178 unsigned int cpu = (c == &boot_cpu_data) ? 0 : (c - cpu_data);
179#endif
176 180
177 if (c->cpuid_level > 3) { 181 if (c->cpuid_level > 3) {
178 static int is_initialized; 182 static int is_initialized;
@@ -205,9 +209,15 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
205 break; 209 break;
206 case 2: 210 case 2:
207 new_l2 = this_leaf.size/1024; 211 new_l2 = this_leaf.size/1024;
212 num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
213 index_msb = get_count_order(num_threads_sharing);
214 l2_id = c->apicid >> index_msb;
208 break; 215 break;
209 case 3: 216 case 3:
210 new_l3 = this_leaf.size/1024; 217 new_l3 = this_leaf.size/1024;
218 num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
219 index_msb = get_count_order(num_threads_sharing);
220 l3_id = c->apicid >> index_msb;
211 break; 221 break;
212 default: 222 default:
213 break; 223 break;
@@ -273,11 +283,19 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
273 if (new_l1i) 283 if (new_l1i)
274 l1i = new_l1i; 284 l1i = new_l1i;
275 285
276 if (new_l2) 286 if (new_l2) {
277 l2 = new_l2; 287 l2 = new_l2;
288#ifdef CONFIG_SMP
289 cpu_llc_id[cpu] = l2_id;
290#endif
291 }
278 292
279 if (new_l3) 293 if (new_l3) {
280 l3 = new_l3; 294 l3 = new_l3;
295#ifdef CONFIG_SMP
296 cpu_llc_id[cpu] = l3_id;
297#endif
298 }
281 299
282 if ( trace ) 300 if ( trace )
283 printk (KERN_INFO "CPU: Trace cache: %dK uops", trace); 301 printk (KERN_INFO "CPU: Trace cache: %dK uops", trace);
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 82371d83bfa9..a6969903f2d6 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -72,6 +72,9 @@ int phys_proc_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
72/* Core ID of each logical CPU */ 72/* Core ID of each logical CPU */
73int cpu_core_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID}; 73int cpu_core_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
74 74
75/* Last level cache ID of each logical CPU */
76int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};
77
75/* representing HT siblings of each logical CPU */ 78/* representing HT siblings of each logical CPU */
76cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly; 79cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
77EXPORT_SYMBOL(cpu_sibling_map); 80EXPORT_SYMBOL(cpu_sibling_map);
@@ -440,6 +443,18 @@ static void __devinit smp_callin(void)
440 443
441static int cpucount; 444static int cpucount;
442 445
446/* maps the cpu to the sched domain representing multi-core */
447cpumask_t cpu_coregroup_map(int cpu)
448{
449 struct cpuinfo_x86 *c = cpu_data + cpu;
450 /*
451 * For perf, we return last level cache shared map.
452 * TBD: when power saving sched policy is added, we will return
453 * cpu_core_map when power saving policy is enabled
454 */
455 return c->llc_shared_map;
456}
457
443/* representing cpus for which sibling maps can be computed */ 458/* representing cpus for which sibling maps can be computed */
444static cpumask_t cpu_sibling_setup_map; 459static cpumask_t cpu_sibling_setup_map;
445 460
@@ -459,12 +474,16 @@ set_cpu_sibling_map(int cpu)
459 cpu_set(cpu, cpu_sibling_map[i]); 474 cpu_set(cpu, cpu_sibling_map[i]);
460 cpu_set(i, cpu_core_map[cpu]); 475 cpu_set(i, cpu_core_map[cpu]);
461 cpu_set(cpu, cpu_core_map[i]); 476 cpu_set(cpu, cpu_core_map[i]);
477 cpu_set(i, c[cpu].llc_shared_map);
478 cpu_set(cpu, c[i].llc_shared_map);
462 } 479 }
463 } 480 }
464 } else { 481 } else {
465 cpu_set(cpu, cpu_sibling_map[cpu]); 482 cpu_set(cpu, cpu_sibling_map[cpu]);
466 } 483 }
467 484
485 cpu_set(cpu, c[cpu].llc_shared_map);
486
468 if (current_cpu_data.x86_max_cores == 1) { 487 if (current_cpu_data.x86_max_cores == 1) {
469 cpu_core_map[cpu] = cpu_sibling_map[cpu]; 488 cpu_core_map[cpu] = cpu_sibling_map[cpu];
470 c[cpu].booted_cores = 1; 489 c[cpu].booted_cores = 1;
@@ -472,6 +491,11 @@ set_cpu_sibling_map(int cpu)
472 } 491 }
473 492
474 for_each_cpu_mask(i, cpu_sibling_setup_map) { 493 for_each_cpu_mask(i, cpu_sibling_setup_map) {
494 if (cpu_llc_id[cpu] != BAD_APICID &&
495 cpu_llc_id[cpu] == cpu_llc_id[i]) {
496 cpu_set(i, c[cpu].llc_shared_map);
497 cpu_set(cpu, c[i].llc_shared_map);
498 }
475 if (phys_proc_id[cpu] == phys_proc_id[i]) { 499 if (phys_proc_id[cpu] == phys_proc_id[i]) {
476 cpu_set(i, cpu_core_map[cpu]); 500 cpu_set(i, cpu_core_map[cpu]);
477 cpu_set(cpu, cpu_core_map[i]); 501 cpu_set(cpu, cpu_core_map[i]);
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 45efe0ca88f8..1cb4aa241c8c 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -250,6 +250,15 @@ config SCHED_SMT
250 cost of slightly increased overhead in some places. If unsure say 250 cost of slightly increased overhead in some places. If unsure say
251 N here. 251 N here.
252 252
253config SCHED_MC
254 bool "Multi-core scheduler support"
255 depends on SMP
256 default y
257 help
258 Multi-core scheduler support improves the CPU scheduler's decision
259 making when dealing with multi-core CPU chips at a cost of slightly
260 increased overhead in some places. If unsure say N here.
261
253source "kernel/Kconfig.preempt" 262source "kernel/Kconfig.preempt"
254 263
255config NUMA 264config NUMA
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index a57eec8311a7..d1f3e9272c05 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -962,7 +962,6 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
962 962
963 cpuid(1, &eax, &ebx, &ecx, &edx); 963 cpuid(1, &eax, &ebx, &ecx, &edx);
964 964
965 c->apicid = phys_pkg_id(0);
966 965
967 if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY)) 966 if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
968 return; 967 return;
@@ -1171,6 +1170,8 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
1171 c->x86_capability[2] = cpuid_edx(0x80860001); 1170 c->x86_capability[2] = cpuid_edx(0x80860001);
1172 } 1171 }
1173 1172
1173 c->apicid = phys_pkg_id(0);
1174
1174 /* 1175 /*
1175 * Vendor-specific initialization. In this section we 1176 * Vendor-specific initialization. In this section we
1176 * canonicalize the feature flags, meaning if there are 1177 * canonicalize the feature flags, meaning if there are
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 66e98659d077..ea48fa638070 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -68,6 +68,9 @@ u8 phys_proc_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
68/* core ID of each logical CPU */ 68/* core ID of each logical CPU */
69u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; 69u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
70 70
71/* Last level cache ID of each logical CPU */
72u8 cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};
73
71/* Bitmask of currently online CPUs */ 74/* Bitmask of currently online CPUs */
72cpumask_t cpu_online_map __read_mostly; 75cpumask_t cpu_online_map __read_mostly;
73 76
@@ -445,6 +448,18 @@ void __cpuinit smp_callin(void)
445 cpu_set(cpuid, cpu_callin_map); 448 cpu_set(cpuid, cpu_callin_map);
446} 449}
447 450
451/* maps the cpu to the sched domain representing multi-core */
452cpumask_t cpu_coregroup_map(int cpu)
453{
454 struct cpuinfo_x86 *c = cpu_data + cpu;
455 /*
456 * For perf, we return last level cache shared map.
457 * TBD: when power saving sched policy is added, we will return
458 * cpu_core_map when power saving policy is enabled
459 */
460 return c->llc_shared_map;
461}
462
448/* representing cpus for which sibling maps can be computed */ 463/* representing cpus for which sibling maps can be computed */
449static cpumask_t cpu_sibling_setup_map; 464static cpumask_t cpu_sibling_setup_map;
450 465
@@ -463,12 +478,16 @@ static inline void set_cpu_sibling_map(int cpu)
463 cpu_set(cpu, cpu_sibling_map[i]); 478 cpu_set(cpu, cpu_sibling_map[i]);
464 cpu_set(i, cpu_core_map[cpu]); 479 cpu_set(i, cpu_core_map[cpu]);
465 cpu_set(cpu, cpu_core_map[i]); 480 cpu_set(cpu, cpu_core_map[i]);
481 cpu_set(i, c[cpu].llc_shared_map);
482 cpu_set(cpu, c[i].llc_shared_map);
466 } 483 }
467 } 484 }
468 } else { 485 } else {
469 cpu_set(cpu, cpu_sibling_map[cpu]); 486 cpu_set(cpu, cpu_sibling_map[cpu]);
470 } 487 }
471 488
489 cpu_set(cpu, c[cpu].llc_shared_map);
490
472 if (current_cpu_data.x86_max_cores == 1) { 491 if (current_cpu_data.x86_max_cores == 1) {
473 cpu_core_map[cpu] = cpu_sibling_map[cpu]; 492 cpu_core_map[cpu] = cpu_sibling_map[cpu];
474 c[cpu].booted_cores = 1; 493 c[cpu].booted_cores = 1;
@@ -476,6 +495,11 @@ static inline void set_cpu_sibling_map(int cpu)
476 } 495 }
477 496
478 for_each_cpu_mask(i, cpu_sibling_setup_map) { 497 for_each_cpu_mask(i, cpu_sibling_setup_map) {
498 if (cpu_llc_id[cpu] != BAD_APICID &&
499 cpu_llc_id[cpu] == cpu_llc_id[i]) {
500 cpu_set(i, c[cpu].llc_shared_map);
501 cpu_set(cpu, c[i].llc_shared_map);
502 }
479 if (phys_proc_id[cpu] == phys_proc_id[i]) { 503 if (phys_proc_id[cpu] == phys_proc_id[i]) {
480 cpu_set(i, cpu_core_map[cpu]); 504 cpu_set(i, cpu_core_map[cpu]);
481 cpu_set(cpu, cpu_core_map[i]); 505 cpu_set(cpu, cpu_core_map[i]);
diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h
index feca5d961e2b..af4bfd012475 100644
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -20,6 +20,7 @@
20#include <linux/config.h> 20#include <linux/config.h>
21#include <linux/threads.h> 21#include <linux/threads.h>
22#include <asm/percpu.h> 22#include <asm/percpu.h>
23#include <linux/cpumask.h>
23 24
24/* flag for disabling the tsc */ 25/* flag for disabling the tsc */
25extern int tsc_disable; 26extern int tsc_disable;
@@ -67,6 +68,9 @@ struct cpuinfo_x86 {
67 char pad0; 68 char pad0;
68 int x86_power; 69 int x86_power;
69 unsigned long loops_per_jiffy; 70 unsigned long loops_per_jiffy;
71#ifdef CONFIG_SMP
72 cpumask_t llc_shared_map; /* cpus sharing the last level cache */
73#endif
70 unsigned char x86_max_cores; /* cpuid returned max cores value */ 74 unsigned char x86_max_cores; /* cpuid returned max cores value */
71 unsigned char booted_cores; /* number of cores as seen by OS */ 75 unsigned char booted_cores; /* number of cores as seen by OS */
72 unsigned char apicid; 76 unsigned char apicid;
@@ -103,6 +107,7 @@ extern struct cpuinfo_x86 cpu_data[];
103 107
104extern int phys_proc_id[NR_CPUS]; 108extern int phys_proc_id[NR_CPUS];
105extern int cpu_core_id[NR_CPUS]; 109extern int cpu_core_id[NR_CPUS];
110extern int cpu_llc_id[NR_CPUS];
106extern char ignore_fpu_irq; 111extern char ignore_fpu_irq;
107 112
108extern void identify_cpu(struct cpuinfo_x86 *); 113extern void identify_cpu(struct cpuinfo_x86 *);
diff --git a/include/asm-i386/topology.h b/include/asm-i386/topology.h
index aa958c6ee83e..b94e5eeef917 100644
--- a/include/asm-i386/topology.h
+++ b/include/asm-i386/topology.h
@@ -112,4 +112,6 @@ extern unsigned long node_remap_size[];
112 112
113#endif /* CONFIG_NUMA */ 113#endif /* CONFIG_NUMA */
114 114
115extern cpumask_t cpu_coregroup_map(int cpu);
116
115#endif /* _ASM_I386_TOPOLOGY_H */ 117#endif /* _ASM_I386_TOPOLOGY_H */
diff --git a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h
index 8c8d88c036ed..1aa2cee43344 100644
--- a/include/asm-x86_64/processor.h
+++ b/include/asm-x86_64/processor.h
@@ -20,6 +20,7 @@
20#include <asm/mmsegment.h> 20#include <asm/mmsegment.h>
21#include <asm/percpu.h> 21#include <asm/percpu.h>
22#include <linux/personality.h> 22#include <linux/personality.h>
23#include <linux/cpumask.h>
23 24
24#define TF_MASK 0x00000100 25#define TF_MASK 0x00000100
25#define IF_MASK 0x00000200 26#define IF_MASK 0x00000200
@@ -65,6 +66,9 @@ struct cpuinfo_x86 {
65 __u32 x86_power; 66 __u32 x86_power;
66 __u32 extended_cpuid_level; /* Max extended CPUID function supported */ 67 __u32 extended_cpuid_level; /* Max extended CPUID function supported */
67 unsigned long loops_per_jiffy; 68 unsigned long loops_per_jiffy;
69#ifdef CONFIG_SMP
70 cpumask_t llc_shared_map; /* cpus sharing the last level cache */
71#endif
68 __u8 apicid; 72 __u8 apicid;
69 __u8 booted_cores; /* number of cores as seen by OS */ 73 __u8 booted_cores; /* number of cores as seen by OS */
70} ____cacheline_aligned; 74} ____cacheline_aligned;
diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h
index 9ccbb2cfd5c0..a4fdaeb5c397 100644
--- a/include/asm-x86_64/smp.h
+++ b/include/asm-x86_64/smp.h
@@ -56,6 +56,7 @@ extern cpumask_t cpu_sibling_map[NR_CPUS];
56extern cpumask_t cpu_core_map[NR_CPUS]; 56extern cpumask_t cpu_core_map[NR_CPUS];
57extern u8 phys_proc_id[NR_CPUS]; 57extern u8 phys_proc_id[NR_CPUS];
58extern u8 cpu_core_id[NR_CPUS]; 58extern u8 cpu_core_id[NR_CPUS];
59extern u8 cpu_llc_id[NR_CPUS];
59 60
60#define SMP_TRAMPOLINE_BASE 0x6000 61#define SMP_TRAMPOLINE_BASE 0x6000
61 62
diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h
index c642f5d9882d..9db54e9d17bb 100644
--- a/include/asm-x86_64/topology.h
+++ b/include/asm-x86_64/topology.h
@@ -68,4 +68,6 @@ extern int __node_distance(int, int);
68 68
69#include <asm-generic/topology.h> 69#include <asm-generic/topology.h>
70 70
71extern cpumask_t cpu_coregroup_map(int cpu);
72
71#endif 73#endif
diff --git a/include/linux/topology.h b/include/linux/topology.h
index e8eb0040ce3a..a305ae2e44b6 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -164,6 +164,15 @@
164 .nr_balance_failed = 0, \ 164 .nr_balance_failed = 0, \
165} 165}
166 166
167#ifdef CONFIG_SCHED_MC
168#ifndef SD_MC_INIT
169/* for now its same as SD_CPU_INIT.
170 * TBD: Tune Domain parameters!
171 */
172#define SD_MC_INIT SD_CPU_INIT
173#endif
174#endif
175
167#ifdef CONFIG_NUMA 176#ifdef CONFIG_NUMA
168#ifndef SD_NODE_INIT 177#ifndef SD_NODE_INIT
169#error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!! 178#error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
diff --git a/kernel/sched.c b/kernel/sched.c
index a96a05d23262..8a8b71b5751b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5574,11 +5574,31 @@ static int cpu_to_cpu_group(int cpu)
5574} 5574}
5575#endif 5575#endif
5576 5576
5577#ifdef CONFIG_SCHED_MC
5578static DEFINE_PER_CPU(struct sched_domain, core_domains);
5579static struct sched_group sched_group_core[NR_CPUS];
5580#endif
5581
5582#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
5583static int cpu_to_core_group(int cpu)
5584{
5585 return first_cpu(cpu_sibling_map[cpu]);
5586}
5587#elif defined(CONFIG_SCHED_MC)
5588static int cpu_to_core_group(int cpu)
5589{
5590 return cpu;
5591}
5592#endif
5593
5577static DEFINE_PER_CPU(struct sched_domain, phys_domains); 5594static DEFINE_PER_CPU(struct sched_domain, phys_domains);
5578static struct sched_group sched_group_phys[NR_CPUS]; 5595static struct sched_group sched_group_phys[NR_CPUS];
5579static int cpu_to_phys_group(int cpu) 5596static int cpu_to_phys_group(int cpu)
5580{ 5597{
5581#ifdef CONFIG_SCHED_SMT 5598#if defined(CONFIG_SCHED_MC)
5599 cpumask_t mask = cpu_coregroup_map(cpu);
5600 return first_cpu(mask);
5601#elif defined(CONFIG_SCHED_SMT)
5582 return first_cpu(cpu_sibling_map[cpu]); 5602 return first_cpu(cpu_sibling_map[cpu]);
5583#else 5603#else
5584 return cpu; 5604 return cpu;
@@ -5676,6 +5696,17 @@ void build_sched_domains(const cpumask_t *cpu_map)
5676 sd->parent = p; 5696 sd->parent = p;
5677 sd->groups = &sched_group_phys[group]; 5697 sd->groups = &sched_group_phys[group];
5678 5698
5699#ifdef CONFIG_SCHED_MC
5700 p = sd;
5701 sd = &per_cpu(core_domains, i);
5702 group = cpu_to_core_group(i);
5703 *sd = SD_MC_INIT;
5704 sd->span = cpu_coregroup_map(i);
5705 cpus_and(sd->span, sd->span, *cpu_map);
5706 sd->parent = p;
5707 sd->groups = &sched_group_core[group];
5708#endif
5709
5679#ifdef CONFIG_SCHED_SMT 5710#ifdef CONFIG_SCHED_SMT
5680 p = sd; 5711 p = sd;
5681 sd = &per_cpu(cpu_domains, i); 5712 sd = &per_cpu(cpu_domains, i);
@@ -5701,6 +5732,19 @@ void build_sched_domains(const cpumask_t *cpu_map)
5701 } 5732 }
5702#endif 5733#endif
5703 5734
5735#ifdef CONFIG_SCHED_MC
5736 /* Set up multi-core groups */
5737 for_each_cpu_mask(i, *cpu_map) {
5738 cpumask_t this_core_map = cpu_coregroup_map(i);
5739 cpus_and(this_core_map, this_core_map, *cpu_map);
5740 if (i != first_cpu(this_core_map))
5741 continue;
5742 init_sched_build_groups(sched_group_core, this_core_map,
5743 &cpu_to_core_group);
5744 }
5745#endif
5746
5747
5704 /* Set up physical groups */ 5748 /* Set up physical groups */
5705 for (i = 0; i < MAX_NUMNODES; i++) { 5749 for (i = 0; i < MAX_NUMNODES; i++) {
5706 cpumask_t nodemask = node_to_cpumask(i); 5750 cpumask_t nodemask = node_to_cpumask(i);
@@ -5797,11 +5841,31 @@ void build_sched_domains(const cpumask_t *cpu_map)
5797 power = SCHED_LOAD_SCALE; 5841 power = SCHED_LOAD_SCALE;
5798 sd->groups->cpu_power = power; 5842 sd->groups->cpu_power = power;
5799#endif 5843#endif
5844#ifdef CONFIG_SCHED_MC
5845 sd = &per_cpu(core_domains, i);
5846 power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
5847 * SCHED_LOAD_SCALE / 10;
5848 sd->groups->cpu_power = power;
5849
5850 sd = &per_cpu(phys_domains, i);
5800 5851
5852 /*
5853 * This has to be < 2 * SCHED_LOAD_SCALE
5854 * Lets keep it SCHED_LOAD_SCALE, so that
5855 * while calculating NUMA group's cpu_power
5856 * we can simply do
5857 * numa_group->cpu_power += phys_group->cpu_power;
5858 *
5859 * See "only add power once for each physical pkg"
5860 * comment below
5861 */
5862 sd->groups->cpu_power = SCHED_LOAD_SCALE;
5863#else
5801 sd = &per_cpu(phys_domains, i); 5864 sd = &per_cpu(phys_domains, i);
5802 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * 5865 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5803 (cpus_weight(sd->groups->cpumask)-1) / 10; 5866 (cpus_weight(sd->groups->cpumask)-1) / 10;
5804 sd->groups->cpu_power = power; 5867 sd->groups->cpu_power = power;
5868#endif
5805 5869
5806#ifdef CONFIG_NUMA 5870#ifdef CONFIG_NUMA
5807 sd = &per_cpu(allnodes_domains, i); 5871 sd = &per_cpu(allnodes_domains, i);
@@ -5823,7 +5887,6 @@ void build_sched_domains(const cpumask_t *cpu_map)
5823next_sg: 5887next_sg:
5824 for_each_cpu_mask(j, sg->cpumask) { 5888 for_each_cpu_mask(j, sg->cpumask) {
5825 struct sched_domain *sd; 5889 struct sched_domain *sd;
5826 int power;
5827 5890
5828 sd = &per_cpu(phys_domains, j); 5891 sd = &per_cpu(phys_domains, j);
5829 if (j != first_cpu(sd->groups->cpumask)) { 5892 if (j != first_cpu(sd->groups->cpumask)) {
@@ -5833,10 +5896,8 @@ next_sg:
5833 */ 5896 */
5834 continue; 5897 continue;
5835 } 5898 }
5836 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5837 (cpus_weight(sd->groups->cpumask)-1) / 10;
5838 5899
5839 sg->cpu_power += power; 5900 sg->cpu_power += sd->groups->cpu_power;
5840 } 5901 }
5841 sg = sg->next; 5902 sg = sg->next;
5842 if (sg != sched_group_nodes[i]) 5903 if (sg != sched_group_nodes[i])
@@ -5849,6 +5910,8 @@ next_sg:
5849 struct sched_domain *sd; 5910 struct sched_domain *sd;
5850#ifdef CONFIG_SCHED_SMT 5911#ifdef CONFIG_SCHED_SMT
5851 sd = &per_cpu(cpu_domains, i); 5912 sd = &per_cpu(cpu_domains, i);
5913#elif defined(CONFIG_SCHED_MC)
5914 sd = &per_cpu(core_domains, i);
5852#else 5915#else
5853 sd = &per_cpu(phys_domains, i); 5916 sd = &per_cpu(phys_domains, i);
5854#endif 5917#endif