author	Siddha, Suresh B <suresh.b.siddha@intel.com>	2006-03-27 04:15:22 -0500
committer	Linus Torvalds <torvalds@g5.osdl.org>	2006-03-27 11:44:43 -0500
commit	1e9f28fa1eb9773bf65bae08288c6a0a38eef4a7 (patch)
tree	ccfa4927ebc7a8f663f9ac9e7789a713a33253ff /kernel
parent	77e4bfbcf071f795b54862455dce8902b3fc29c2 (diff)
[PATCH] sched: new sched domain for representing multi-core
Add a new sched domain for representing multi-core with shared caches
between cores. Consider a dual-package system, each package containing two
cores with the last-level cache shared between the cores within a package.
If there are two runnable processes, with this patch those two processes
will be scheduled on different packages.

On such systems, with this patch we have observed an 8% perf improvement
with the specJBB (2 warehouse) benchmark and a 35% improvement with
CFP2000 rate (with 2 users).

This new domain will come into play only on multi-core systems with shared
caches. On other systems, this sched domain will be removed by the domain
degeneration code. This new domain can also be used for implementing a
power savings policy (see the OLS 2005 CMP kernel scheduler paper for more
details; I will post another patch for the power savings policy soon).

Most of the arch/* file changes are for the cpu_coregroup_map()
implementation.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
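[Editor's note] The per-arch cpu_coregroup_map() implementations live in the
arch/* parts of this patch and are not shown in this kernel/-only diff. As a
minimal sketch of the contract the scheduler code below depends on
(illustrative only, not the actual arch change): return the mask of CPUs
sharing a last-level cache with the given CPU. The x86 package-sibling mask
cpu_core_map[] is assumed here as a plausible stand-in for that set.

/*
 * Illustrative sketch only -- the real implementations are in the
 * arch/* portion of this patch.  Contract: return the mask of CPUs
 * that share a last-level cache with @cpu.  Here we assume the x86
 * package-sibling mask cpu_core_map[] approximates that set.
 */
cpumask_t cpu_coregroup_map(int cpu)
{
	return cpu_core_map[cpu];
}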
Diffstat (limited to 'kernel')
 kernel/sched.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 68 insertions(+), 5 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index a96a05d23262..8a8b71b5751b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5574,11 +5574,31 @@ static int cpu_to_cpu_group(int cpu)
 }
 #endif
 
+#ifdef CONFIG_SCHED_MC
+static DEFINE_PER_CPU(struct sched_domain, core_domains);
+static struct sched_group sched_group_core[NR_CPUS];
+#endif
+
+#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
+static int cpu_to_core_group(int cpu)
+{
+	return first_cpu(cpu_sibling_map[cpu]);
+}
+#elif defined(CONFIG_SCHED_MC)
+static int cpu_to_core_group(int cpu)
+{
+	return cpu;
+}
+#endif
+
 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
 static struct sched_group sched_group_phys[NR_CPUS];
 static int cpu_to_phys_group(int cpu)
 {
-#ifdef CONFIG_SCHED_SMT
+#if defined(CONFIG_SCHED_MC)
+	cpumask_t mask = cpu_coregroup_map(cpu);
+	return first_cpu(mask);
+#elif defined(CONFIG_SCHED_SMT)
 	return first_cpu(cpu_sibling_map[cpu]);
 #else
 	return cpu;
@@ -5676,6 +5696,17 @@ void build_sched_domains(const cpumask_t *cpu_map)
 		sd->parent = p;
 		sd->groups = &sched_group_phys[group];
 
+#ifdef CONFIG_SCHED_MC
+		p = sd;
+		sd = &per_cpu(core_domains, i);
+		group = cpu_to_core_group(i);
+		*sd = SD_MC_INIT;
+		sd->span = cpu_coregroup_map(i);
+		cpus_and(sd->span, sd->span, *cpu_map);
+		sd->parent = p;
+		sd->groups = &sched_group_core[group];
+#endif
+
 #ifdef CONFIG_SCHED_SMT
 		p = sd;
 		sd = &per_cpu(cpu_domains, i);
@@ -5701,6 +5732,19 @@ void build_sched_domains(const cpumask_t *cpu_map)
 	}
 #endif
 
+#ifdef CONFIG_SCHED_MC
+	/* Set up multi-core groups */
+	for_each_cpu_mask(i, *cpu_map) {
+		cpumask_t this_core_map = cpu_coregroup_map(i);
+		cpus_and(this_core_map, this_core_map, *cpu_map);
+		if (i != first_cpu(this_core_map))
+			continue;
+		init_sched_build_groups(sched_group_core, this_core_map,
+					&cpu_to_core_group);
+	}
+#endif
+
+
 	/* Set up physical groups */
 	for (i = 0; i < MAX_NUMNODES; i++) {
 		cpumask_t nodemask = node_to_cpumask(i);
@@ -5797,11 +5841,31 @@ void build_sched_domains(const cpumask_t *cpu_map)
 		power = SCHED_LOAD_SCALE;
 		sd->groups->cpu_power = power;
 #endif
+#ifdef CONFIG_SCHED_MC
+		sd = &per_cpu(core_domains, i);
+		power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
+					    * SCHED_LOAD_SCALE / 10;
+		sd->groups->cpu_power = power;
+
+		sd = &per_cpu(phys_domains, i);
 
+		/*
+		 * This has to be < 2 * SCHED_LOAD_SCALE
+		 * Lets keep it SCHED_LOAD_SCALE, so that
+		 * while calculating NUMA group's cpu_power
+		 * we can simply do
+		 *  numa_group->cpu_power += phys_group->cpu_power;
+		 *
+		 * See "only add power once for each physical pkg"
+		 * comment below
+		 */
+		sd->groups->cpu_power = SCHED_LOAD_SCALE;
+#else
 		sd = &per_cpu(phys_domains, i);
 		power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
 			(cpus_weight(sd->groups->cpumask)-1) / 10;
 		sd->groups->cpu_power = power;
+#endif
 
 #ifdef CONFIG_NUMA
 		sd = &per_cpu(allnodes_domains, i);
@@ -5823,7 +5887,6 @@ void build_sched_domains(const cpumask_t *cpu_map)
 next_sg:
 	for_each_cpu_mask(j, sg->cpumask) {
 		struct sched_domain *sd;
-		int power;
 
 		sd = &per_cpu(phys_domains, j);
 		if (j != first_cpu(sd->groups->cpumask)) {
@@ -5833,10 +5896,8 @@ next_sg:
 			 */
 			continue;
 		}
-		power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
-			(cpus_weight(sd->groups->cpumask)-1) / 10;
 
-		sg->cpu_power += power;
+		sg->cpu_power += sd->groups->cpu_power;
 	}
 	sg = sg->next;
 	if (sg != sched_group_nodes[i])
@@ -5849,6 +5910,8 @@ next_sg:
 		struct sched_domain *sd;
 #ifdef CONFIG_SCHED_SMT
 		sd = &per_cpu(cpu_domains, i);
+#elif defined(CONFIG_SCHED_MC)
+		sd = &per_cpu(core_domains, i);
 #else
 		sd = &per_cpu(phys_domains, i);
 #endif
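[Editor's note] To make the cpu_power arithmetic in the hunks above concrete:
assuming SCHED_LOAD_SCALE is 128 (1UL << 7, its value in this kernel series),
a core-domain group spanning two SMT siblings gets 128 + (2-1) * 128 / 10 =
140, while with CONFIG_SCHED_MC each physical group is pinned at exactly
SCHED_LOAD_SCALE so the NUMA code can accumulate physical-group power with a
plain sum. A standalone sketch of that arithmetic (the two-sibling group is
an assumed example, not taken from the patch):

#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL	/* 1UL << SCHED_LOAD_SHIFT in this era */

int main(void)
{
	/* Assumed example: a core group spanning two SMT siblings. */
	unsigned long weight = 2;
	unsigned long core_power = SCHED_LOAD_SCALE +
		(weight - 1) * SCHED_LOAD_SCALE / 10;

	/* One full CPU of capacity plus ~10% per extra sibling: 140. */
	printf("core group cpu_power = %lu\n", core_power);

	/*
	 * With CONFIG_SCHED_MC, each physical group is pinned to
	 * SCHED_LOAD_SCALE (128) so NUMA groups can simply sum them.
	 */
	printf("phys group cpu_power = %lu\n", SCHED_LOAD_SCALE);
	return 0;
}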