 arch/i386/Kconfig                       |  9
 arch/i386/kernel/cpu/common.c           | 10
 arch/i386/kernel/cpu/intel_cacheinfo.c  | 22
 arch/i386/kernel/smpboot.c              | 24
 arch/x86_64/Kconfig                     |  9
 arch/x86_64/kernel/setup.c              |  3
 arch/x86_64/kernel/smpboot.c            | 24
 include/asm-i386/processor.h            |  5
 include/asm-i386/topology.h             |  2
 include/asm-x86_64/processor.h          |  4
 include/asm-x86_64/smp.h                |  1
 include/asm-x86_64/topology.h           |  2
 include/linux/topology.h                |  9
 kernel/sched.c                          | 73
 14 files changed, 186 insertions(+), 11 deletions(-)
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index f7db71d0b913..f17bd1d2707e 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -231,6 +231,15 @@ config SCHED_SMT
 	  cost of slightly increased overhead in some places. If unsure say
 	  N here.
 
+config SCHED_MC
+	bool "Multi-core scheduler support"
+	depends on SMP
+	default y
+	help
+	  Multi-core scheduler support improves the CPU scheduler's decision
+	  making when dealing with multi-core CPU chips at a cost of slightly
+	  increased overhead in some places. If unsure say N here.
+
 source "kernel/Kconfig.preempt"
 
 config X86_UP_APIC
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index 7e3d6b6a4e96..a06a49075f10 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -266,7 +266,7 @@ static void __init early_cpu_detect(void)
 void __cpuinit generic_identify(struct cpuinfo_x86 * c)
 {
 	u32 tfms, xlvl;
-	int junk;
+	int ebx;
 
 	if (have_cpuid_p()) {
 		/* Get vendor name */
@@ -282,7 +282,7 @@ void __cpuinit generic_identify(struct cpuinfo_x86 * c)
 		/* Intel-defined flags: level 0x00000001 */
 		if ( c->cpuid_level >= 0x00000001 ) {
 			u32 capability, excap;
-			cpuid(0x00000001, &tfms, &junk, &excap, &capability);
+			cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
 			c->x86_capability[0] = capability;
 			c->x86_capability[4] = excap;
 			c->x86 = (tfms >> 8) & 15;
@@ -292,6 +292,11 @@ void __cpuinit generic_identify(struct cpuinfo_x86 * c)
 		if (c->x86 >= 0x6)
 			c->x86_model += ((tfms >> 16) & 0xF) << 4;
 		c->x86_mask = tfms & 15;
+#ifdef CONFIG_SMP
+		c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
+#else
+		c->apicid = (ebx >> 24) & 0xFF;
+#endif
 	} else {
 		/* Have CPUID level 0 only - unheard of */
 		c->x86 = 4;
@@ -474,7 +479,6 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
 
 	cpuid(1, &eax, &ebx, &ecx, &edx);
 
-	c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
 
 	if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
 		return;
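
Note: the hunks above move APIC-ID detection out of detect_ht() and into
generic_identify(), so c->apicid is populated even when HT is absent. The
ID comes from CPUID leaf 1, EBX bits 31:24. A minimal userspace sketch of
that extraction (not kernel code; __get_cpuid() is the GCC/clang wrapper
from <cpuid.h>):

	#include <stdio.h>
	#include <cpuid.h>

	int main(void)
	{
		unsigned int eax, ebx, ecx, edx;

		if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
			return 1;	/* CPUID leaf 1 unsupported */
		/* EBX[31:24] of leaf 1 is the initial APIC ID */
		printf("initial APIC ID: %u\n", (ebx >> 24) & 0xFF);
		return 0;
	}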
diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c
index ce61921369e5..7e7fd4e67dd0 100644
--- a/arch/i386/kernel/cpu/intel_cacheinfo.c
+++ b/arch/i386/kernel/cpu/intel_cacheinfo.c
@@ -173,6 +173,10 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
 	unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; /* Cache sizes */
 	unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
 	unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
+	unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
+#ifdef CONFIG_SMP
+	unsigned int cpu = (c == &boot_cpu_data) ? 0 : (c - cpu_data);
+#endif
 
 	if (c->cpuid_level > 3) {
 		static int is_initialized;
@@ -205,9 +209,15 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
 				break;
 			case 2:
 				new_l2 = this_leaf.size/1024;
+				num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
+				index_msb = get_count_order(num_threads_sharing);
+				l2_id = c->apicid >> index_msb;
 				break;
 			case 3:
 				new_l3 = this_leaf.size/1024;
+				num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
+				index_msb = get_count_order(num_threads_sharing);
+				l3_id = c->apicid >> index_msb;
 				break;
 			default:
 				break;
@@ -273,11 +283,19 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
 	if (new_l1i)
 		l1i = new_l1i;
 
-	if (new_l2)
+	if (new_l2) {
 		l2 = new_l2;
+#ifdef CONFIG_SMP
+		cpu_llc_id[cpu] = l2_id;
+#endif
+	}
 
-	if (new_l3)
+	if (new_l3) {
 		l3 = new_l3;
+#ifdef CONFIG_SMP
+		cpu_llc_id[cpu] = l3_id;
+#endif
+	}
 
 	if ( trace )
 		printk (KERN_INFO "CPU: Trace cache: %dK uops", trace);
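
The cache-ID math above: CPUID(4) reports, per cache level, how many
threads share that cache (EAX[25:14], stored plus one in
num_threads_sharing); get_count_order() rounds that up to a power-of-two
shift, and shifting the APIC ID right by it yields an ID common to all
sharers. A standalone sketch of the arithmetic (get_count_order() modeled
here as ceil(log2(n))):

	#include <stdio.h>

	/* models the kernel's get_count_order(): order after rounding
	 * count up to the next power of two */
	static int get_count_order(unsigned int count)
	{
		int order = 0;

		while ((1u << order) < count)
			order++;
		return order;
	}

	int main(void)
	{
		unsigned int num_threads_sharing = 2;	/* e.g. two HT siblings share L2 */
		int index_msb = get_count_order(num_threads_sharing);
		unsigned int apicid;

		for (apicid = 0; apicid < 4; apicid++)
			printf("apicid %u -> cache id %u\n",
			       apicid, apicid >> index_msb);
		return 0;
	}

With two sharers, APIC IDs 0 and 1 map to cache ID 0 and IDs 2 and 3 to
cache ID 1, which is what cpu_llc_id[] records.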
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 82371d83bfa9..a6969903f2d6 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -72,6 +72,9 @@ int phys_proc_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
 /* Core ID of each logical CPU */
 int cpu_core_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
 
+/* Last level cache ID of each logical CPU */
+int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};
+
 /* representing HT siblings of each logical CPU */
 cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
 EXPORT_SYMBOL(cpu_sibling_map);
@@ -440,6 +443,18 @@ static void __devinit smp_callin(void)
 
 static int cpucount;
 
+/* maps the cpu to the sched domain representing multi-core */
+cpumask_t cpu_coregroup_map(int cpu)
+{
+	struct cpuinfo_x86 *c = cpu_data + cpu;
+	/*
+	 * For perf, we return last level cache shared map.
+	 * TBD: when power saving sched policy is added, we will return
+	 *      cpu_core_map when power saving policy is enabled
+	 */
+	return c->llc_shared_map;
+}
+
 /* representing cpus for which sibling maps can be computed */
 static cpumask_t cpu_sibling_setup_map;
 
445 460
@@ -459,12 +474,16 @@ set_cpu_sibling_map(int cpu)
 				cpu_set(cpu, cpu_sibling_map[i]);
 				cpu_set(i, cpu_core_map[cpu]);
 				cpu_set(cpu, cpu_core_map[i]);
+				cpu_set(i, c[cpu].llc_shared_map);
+				cpu_set(cpu, c[i].llc_shared_map);
 			}
 		}
 	} else {
 		cpu_set(cpu, cpu_sibling_map[cpu]);
 	}
 
+	cpu_set(cpu, c[cpu].llc_shared_map);
+
 	if (current_cpu_data.x86_max_cores == 1) {
 		cpu_core_map[cpu] = cpu_sibling_map[cpu];
 		c[cpu].booted_cores = 1;
@@ -472,6 +491,11 @@ set_cpu_sibling_map(int cpu)
 	}
 
 	for_each_cpu_mask(i, cpu_sibling_setup_map) {
+		if (cpu_llc_id[cpu] != BAD_APICID &&
+		    cpu_llc_id[cpu] == cpu_llc_id[i]) {
+			cpu_set(i, c[cpu].llc_shared_map);
+			cpu_set(cpu, c[i].llc_shared_map);
+		}
 		if (phys_proc_id[cpu] == phys_proc_id[i]) {
 			cpu_set(i, cpu_core_map[cpu]);
 			cpu_set(cpu, cpu_core_map[i]);
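
Note the resulting containment: cpu_sibling_map (HT pairs) is a subset of
llc_shared_map (last-level-cache sharers), which is a subset of
cpu_core_map (the package). cpu_coregroup_map() returns the middle one, so
the new MC domain sits between the SMT and physical domains. A toy model
for one package of two dual-threaded cores with a per-core L2 as the LLC
(plain bitmasks, not the kernel cpumask API):

	#include <stdio.h>

	int main(void)
	{
		/* CPUs 0-3: {0,1} are core 0's HT pair, {2,3} core 1's */
		unsigned int sibling_map[4]    = { 0x3, 0x3, 0xc, 0xc };
		unsigned int llc_shared_map[4] = { 0x3, 0x3, 0xc, 0xc };
		unsigned int core_map[4]       = { 0xf, 0xf, 0xf, 0xf };
		int cpu;

		for (cpu = 0; cpu < 4; cpu++)
			printf("cpu%d: siblings %#x, coregroup %#x, package %#x\n",
			       cpu, sibling_map[cpu], llc_shared_map[cpu],
			       core_map[cpu]);
		return 0;
	}

On parts where the LLC is shared by the whole package, llc_shared_map
instead equals cpu_core_map and the coregroup spans the package.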
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 45efe0ca88f8..1cb4aa241c8c 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -250,6 +250,15 @@ config SCHED_SMT
 	  cost of slightly increased overhead in some places. If unsure say
 	  N here.
 
+config SCHED_MC
+	bool "Multi-core scheduler support"
+	depends on SMP
+	default y
+	help
+	  Multi-core scheduler support improves the CPU scheduler's decision
+	  making when dealing with multi-core CPU chips at a cost of slightly
+	  increased overhead in some places. If unsure say N here.
+
 source "kernel/Kconfig.preempt"
 
 config NUMA
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index a57eec8311a7..d1f3e9272c05 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -962,7 +962,6 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
 
 	cpuid(1, &eax, &ebx, &ecx, &edx);
 
-	c->apicid = phys_pkg_id(0);
 
 	if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
 		return;
@@ -1171,6 +1170,8 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 		c->x86_capability[2] = cpuid_edx(0x80860001);
 	}
 
+	c->apicid = phys_pkg_id(0);
+
 	/*
 	 * Vendor-specific initialization. In this section we
 	 * canonicalize the feature flags, meaning if there are
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 66e98659d077..ea48fa638070 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -68,6 +68,9 @@ u8 phys_proc_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
 /* core ID of each logical CPU */
 u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
 
+/* Last level cache ID of each logical CPU */
+u8 cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};
+
 /* Bitmask of currently online CPUs */
 cpumask_t cpu_online_map __read_mostly;
 
@@ -445,6 +448,18 @@ void __cpuinit smp_callin(void)
 	cpu_set(cpuid, cpu_callin_map);
 }
 
+/* maps the cpu to the sched domain representing multi-core */
+cpumask_t cpu_coregroup_map(int cpu)
+{
+	struct cpuinfo_x86 *c = cpu_data + cpu;
+	/*
+	 * For perf, we return last level cache shared map.
+	 * TBD: when power saving sched policy is added, we will return
+	 *      cpu_core_map when power saving policy is enabled
+	 */
+	return c->llc_shared_map;
+}
+
 /* representing cpus for which sibling maps can be computed */
 static cpumask_t cpu_sibling_setup_map;
 
450 465
@@ -463,12 +478,16 @@ static inline void set_cpu_sibling_map(int cpu)
 				cpu_set(cpu, cpu_sibling_map[i]);
 				cpu_set(i, cpu_core_map[cpu]);
 				cpu_set(cpu, cpu_core_map[i]);
+				cpu_set(i, c[cpu].llc_shared_map);
+				cpu_set(cpu, c[i].llc_shared_map);
 			}
 		}
 	} else {
 		cpu_set(cpu, cpu_sibling_map[cpu]);
 	}
 
+	cpu_set(cpu, c[cpu].llc_shared_map);
+
 	if (current_cpu_data.x86_max_cores == 1) {
 		cpu_core_map[cpu] = cpu_sibling_map[cpu];
 		c[cpu].booted_cores = 1;
@@ -476,6 +495,11 @@ static inline void set_cpu_sibling_map(int cpu)
 	}
 
 	for_each_cpu_mask(i, cpu_sibling_setup_map) {
+		if (cpu_llc_id[cpu] != BAD_APICID &&
+		    cpu_llc_id[cpu] == cpu_llc_id[i]) {
+			cpu_set(i, c[cpu].llc_shared_map);
+			cpu_set(cpu, c[i].llc_shared_map);
+		}
 		if (phys_proc_id[cpu] == phys_proc_id[i]) {
 			cpu_set(i, cpu_core_map[cpu]);
 			cpu_set(cpu, cpu_core_map[i]);
diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h
index feca5d961e2b..af4bfd012475 100644
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -20,6 +20,7 @@
 #include <linux/config.h>
 #include <linux/threads.h>
 #include <asm/percpu.h>
+#include <linux/cpumask.h>
 
 /* flag for disabling the tsc */
 extern int tsc_disable;
@@ -67,6 +68,9 @@ struct cpuinfo_x86 {
 	char	pad0;
 	int	x86_power;
 	unsigned long loops_per_jiffy;
+#ifdef CONFIG_SMP
+	cpumask_t llc_shared_map;	/* cpus sharing the last level cache */
+#endif
 	unsigned char x86_max_cores;	/* cpuid returned max cores value */
 	unsigned char booted_cores;	/* number of cores as seen by OS */
 	unsigned char apicid;
@@ -103,6 +107,7 @@ extern struct cpuinfo_x86 cpu_data[];
 
 extern int phys_proc_id[NR_CPUS];
 extern int cpu_core_id[NR_CPUS];
+extern int cpu_llc_id[NR_CPUS];
 extern char ignore_fpu_irq;
 
 extern void identify_cpu(struct cpuinfo_x86 *);
diff --git a/include/asm-i386/topology.h b/include/asm-i386/topology.h
index aa958c6ee83e..b94e5eeef917 100644
--- a/include/asm-i386/topology.h
+++ b/include/asm-i386/topology.h
@@ -112,4 +112,6 @@ extern unsigned long node_remap_size[];
 
 #endif /* CONFIG_NUMA */
 
+extern cpumask_t cpu_coregroup_map(int cpu);
+
 #endif /* _ASM_I386_TOPOLOGY_H */
diff --git a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h
index 8c8d88c036ed..1aa2cee43344 100644
--- a/include/asm-x86_64/processor.h
+++ b/include/asm-x86_64/processor.h
@@ -20,6 +20,7 @@
 #include <asm/mmsegment.h>
 #include <asm/percpu.h>
 #include <linux/personality.h>
+#include <linux/cpumask.h>
 
 #define TF_MASK		0x00000100
 #define IF_MASK		0x00000200
@@ -65,6 +66,9 @@ struct cpuinfo_x86 {
 	__u32	x86_power;
 	__u32	extended_cpuid_level;	/* Max extended CPUID function supported */
 	unsigned long loops_per_jiffy;
+#ifdef CONFIG_SMP
+	cpumask_t llc_shared_map;	/* cpus sharing the last level cache */
+#endif
 	__u8	apicid;
 	__u8	booted_cores;	/* number of cores as seen by OS */
 } ____cacheline_aligned;
diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h
index 9ccbb2cfd5c0..a4fdaeb5c397 100644
--- a/include/asm-x86_64/smp.h
+++ b/include/asm-x86_64/smp.h
@@ -56,6 +56,7 @@ extern cpumask_t cpu_sibling_map[NR_CPUS];
 extern cpumask_t cpu_core_map[NR_CPUS];
 extern u8 phys_proc_id[NR_CPUS];
 extern u8 cpu_core_id[NR_CPUS];
+extern u8 cpu_llc_id[NR_CPUS];
 
 #define SMP_TRAMPOLINE_BASE 0x6000
 
diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h
index c642f5d9882d..9db54e9d17bb 100644
--- a/include/asm-x86_64/topology.h
+++ b/include/asm-x86_64/topology.h
@@ -68,4 +68,6 @@ extern int __node_distance(int, int);
 
 #include <asm-generic/topology.h>
 
+extern cpumask_t cpu_coregroup_map(int cpu);
+
 #endif
diff --git a/include/linux/topology.h b/include/linux/topology.h
index e8eb0040ce3a..a305ae2e44b6 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -164,6 +164,15 @@
 	.nr_balance_failed	= 0,			\
 }
 
+#ifdef CONFIG_SCHED_MC
+#ifndef SD_MC_INIT
+/* for now its same as SD_CPU_INIT.
+ * TBD: Tune Domain parameters!
+ */
+#define SD_MC_INIT   SD_CPU_INIT
+#endif
+#endif
+
 #ifdef CONFIG_NUMA
 #ifndef SD_NODE_INIT
 #error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
diff --git a/kernel/sched.c b/kernel/sched.c
index a96a05d23262..8a8b71b5751b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5574,11 +5574,31 @@ static int cpu_to_cpu_group(int cpu)
 }
 #endif
 
+#ifdef CONFIG_SCHED_MC
+static DEFINE_PER_CPU(struct sched_domain, core_domains);
+static struct sched_group sched_group_core[NR_CPUS];
+#endif
+
+#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
+static int cpu_to_core_group(int cpu)
+{
+	return first_cpu(cpu_sibling_map[cpu]);
+}
+#elif defined(CONFIG_SCHED_MC)
+static int cpu_to_core_group(int cpu)
+{
+	return cpu;
+}
+#endif
+
 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
 static struct sched_group sched_group_phys[NR_CPUS];
 static int cpu_to_phys_group(int cpu)
 {
-#ifdef CONFIG_SCHED_SMT
+#if defined(CONFIG_SCHED_MC)
+	cpumask_t mask = cpu_coregroup_map(cpu);
+	return first_cpu(mask);
+#elif defined(CONFIG_SCHED_SMT)
 	return first_cpu(cpu_sibling_map[cpu]);
 #else
 	return cpu;
@@ -5676,6 +5696,17 @@ void build_sched_domains(const cpumask_t *cpu_map)
 		sd->parent = p;
 		sd->groups = &sched_group_phys[group];
 
+#ifdef CONFIG_SCHED_MC
+		p = sd;
+		sd = &per_cpu(core_domains, i);
+		group = cpu_to_core_group(i);
+		*sd = SD_MC_INIT;
+		sd->span = cpu_coregroup_map(i);
+		cpus_and(sd->span, sd->span, *cpu_map);
+		sd->parent = p;
+		sd->groups = &sched_group_core[group];
+#endif
+
 #ifdef CONFIG_SCHED_SMT
 		p = sd;
 		sd = &per_cpu(cpu_domains, i);
@@ -5701,6 +5732,19 @@ void build_sched_domains(const cpumask_t *cpu_map)
 	}
 #endif
 
+#ifdef CONFIG_SCHED_MC
+	/* Set up multi-core groups */
+	for_each_cpu_mask(i, *cpu_map) {
+		cpumask_t this_core_map = cpu_coregroup_map(i);
+		cpus_and(this_core_map, this_core_map, *cpu_map);
+		if (i != first_cpu(this_core_map))
+			continue;
+		init_sched_build_groups(sched_group_core, this_core_map,
+					&cpu_to_core_group);
+	}
+#endif
+
+
 	/* Set up physical groups */
 	for (i = 0; i < MAX_NUMNODES; i++) {
 		cpumask_t nodemask = node_to_cpumask(i);
@@ -5797,11 +5841,31 @@ void build_sched_domains(const cpumask_t *cpu_map)
 		power = SCHED_LOAD_SCALE;
 		sd->groups->cpu_power = power;
 #endif
+#ifdef CONFIG_SCHED_MC
+		sd = &per_cpu(core_domains, i);
+		power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
+					    * SCHED_LOAD_SCALE / 10;
+		sd->groups->cpu_power = power;
+
+		sd = &per_cpu(phys_domains, i);
 
+		/*
+		 * This has to be < 2 * SCHED_LOAD_SCALE
+		 * Lets keep it SCHED_LOAD_SCALE, so that
+		 * while calculating NUMA group's cpu_power
+		 * we can simply do
+		 *  numa_group->cpu_power += phys_group->cpu_power;
+		 *
+		 * See "only add power once for each physical pkg"
+		 * comment below
+		 */
+		sd->groups->cpu_power = SCHED_LOAD_SCALE;
+#else
 		sd = &per_cpu(phys_domains, i);
 		power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
 			(cpus_weight(sd->groups->cpumask)-1) / 10;
 		sd->groups->cpu_power = power;
+#endif
 
 #ifdef CONFIG_NUMA
 		sd = &per_cpu(allnodes_domains, i);
@@ -5823,7 +5887,6 @@ void build_sched_domains(const cpumask_t *cpu_map)
 next_sg:
 		for_each_cpu_mask(j, sg->cpumask) {
 			struct sched_domain *sd;
-			int power;
 
 			sd = &per_cpu(phys_domains, j);
 			if (j != first_cpu(sd->groups->cpumask)) {
@@ -5833,10 +5896,8 @@ next_sg:
 				 */
 				continue;
 			}
-			power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
-				(cpus_weight(sd->groups->cpumask)-1) / 10;
 
-			sg->cpu_power += power;
+			sg->cpu_power += sd->groups->cpu_power;
 		}
 		sg = sg->next;
 		if (sg != sched_group_nodes[i])
@@ -5849,6 +5910,8 @@ next_sg:
 		struct sched_domain *sd;
 #ifdef CONFIG_SCHED_SMT
 		sd = &per_cpu(cpu_domains, i);
+#elif defined(CONFIG_SCHED_MC)
+		sd = &per_cpu(core_domains, i);
 #else
 		sd = &per_cpu(phys_domains, i);
 #endif
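
For reference, the group power arithmetic introduced above, assuming
SCHED_LOAD_SCALE is 128 as in kernels of this vintage: a core-domain group
holding n HT siblings gets SCHED_LOAD_SCALE + (n-1)*SCHED_LOAD_SCALE/10,
i.e. each extra sibling is counted as only 10% of a CPU, while with
CONFIG_SCHED_MC the physical-package group is pinned to exactly
SCHED_LOAD_SCALE so the NUMA pass can simply sum package powers:

	#include <stdio.h>

	#define SCHED_LOAD_SCALE 128UL	/* assumed; see include/linux/sched.h */

	int main(void)
	{
		unsigned long n;

		for (n = 1; n <= 4; n++)
			printf("%lu sibling(s) -> group cpu_power %lu\n",
			       n, SCHED_LOAD_SCALE + (n - 1) * SCHED_LOAD_SCALE / 10);
		return 0;
	}

So a dual-threaded core is rated 140 against 128 for a single-threaded
one, biasing the balancer toward spreading across cores before siblings.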