author		Siddha, Suresh B <suresh.b.siddha@intel.com>	2006-06-27 05:54:42 -0400
committer	Linus Torvalds <torvalds@g5.osdl.org>	2006-06-27 20:32:45 -0400
commit		5c45bf279d378d436ce45825c0f136696c7b6109 (patch)
tree		80e2fcf4866b84fccb787562e1a83b16f4bc8850 /kernel/sched.c
parent		369381694ddcf03f1de403501c8b97099b5109ec (diff)
[PATCH] sched: mc/smt power savings sched policy
sysfs entries 'sched_mc_power_savings' and 'sched_smt_power_savings' in
/sys/devices/system/cpu/ control the MC/SMT power savings policy for the
scheduler.
Based on the values of these controls (1 = enable, 0 = disable), sched group
cpu power is determined for the different domains.  When a power savings
policy is enabled and the system is lightly loaded, the scheduler minimizes
the number of physical packages/cpu cores carrying the load, and thus
conserves power (with a performance impact that depends on the workload
characteristics; see the OLS 2005 CMP kernel scheduler paper for more
details).
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Con Kolivas <kernel@kolivas.org>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	240
1 file changed, 215 insertions(+), 25 deletions(-)
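Usage note (not part of the patch): toggling these policies from userspace is just a write of '0' or '1' to the files named above; a minimal illustrative C sketch, assuming the sysfs path described in the changelog:

/* Illustrative only: enable the MC power savings policy from userspace.
 * The path and the accepted values ('0'/'1') are the ones described above. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/devices/system/cpu/sched_mc_power_savings", "w");

	if (!f)
		return 1;
	fputc('1', f);		/* '1' enables, '0' disables the policy */
	return fclose(f) ? 1 : 0;
}

The same applies to sched_smt_power_savings; writing either file rebuilds the sched domains via arch_reinit_sched_domains() (added below).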
diff --git a/kernel/sched.c b/kernel/sched.c
index 122b75584a13..54fa282657cc 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1162,6 +1162,11 @@ static int sched_balance_self(int cpu, int flag)
 	struct sched_domain *tmp, *sd = NULL;
 
 	for_each_domain(cpu, tmp) {
+		/*
+		 * If power savings logic is enabled for a domain, stop there.
+		 */
+		if (tmp->flags & SD_POWERSAVINGS_BALANCE)
+			break;
 		if (tmp->flags & flag)
 			sd = tmp;
 	}
@@ -2082,6 +2087,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	unsigned long busiest_load_per_task, busiest_nr_running;
 	unsigned long this_load_per_task, this_nr_running;
 	int load_idx;
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+	int power_savings_balance = 1;
+	unsigned long leader_nr_running = 0, min_load_per_task = 0;
+	unsigned long min_nr_running = ULONG_MAX;
+	struct sched_group *group_min = NULL, *group_leader = NULL;
+#endif
 
 	max_load = this_load = total_load = total_pwr = 0;
 	busiest_load_per_task = busiest_nr_running = 0;
@@ -2094,7 +2105,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	load_idx = sd->idle_idx;
 
 	do {
-		unsigned long load;
+		unsigned long load, group_capacity;
 		int local_group;
 		int i;
 		unsigned long sum_nr_running, sum_weighted_load;
@@ -2127,18 +2138,76 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		/* Adjust by relative CPU power of the group */
 		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
 
+		group_capacity = group->cpu_power / SCHED_LOAD_SCALE;
+
 		if (local_group) {
 			this_load = avg_load;
 			this = group;
 			this_nr_running = sum_nr_running;
 			this_load_per_task = sum_weighted_load;
 		} else if (avg_load > max_load &&
-			   sum_nr_running > group->cpu_power / SCHED_LOAD_SCALE) {
+			   sum_nr_running > group_capacity) {
 			max_load = avg_load;
 			busiest = group;
 			busiest_nr_running = sum_nr_running;
 			busiest_load_per_task = sum_weighted_load;
 		}
+
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+		/*
+		 * Busy processors will not participate in power savings
+		 * balance.
+		 */
+		if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+			goto group_next;
+
+		/*
+		 * If the local group is idle or completely loaded
+		 * no need to do power savings balance at this domain
+		 */
+		if (local_group && (this_nr_running >= group_capacity ||
+				    !this_nr_running))
+			power_savings_balance = 0;
+
+		/*
+		 * If a group is already running at full capacity or idle,
+		 * don't include that group in power savings calculations
+		 */
+		if (!power_savings_balance || sum_nr_running >= group_capacity
+		    || !sum_nr_running)
+			goto group_next;
+
+		/*
+		 * Calculate the group which has the least non-idle load.
+		 * This is the group from where we need to pick up the load
+		 * for saving power
+		 */
+		if ((sum_nr_running < min_nr_running) ||
+		    (sum_nr_running == min_nr_running &&
+		     first_cpu(group->cpumask) <
+		     first_cpu(group_min->cpumask))) {
+			group_min = group;
+			min_nr_running = sum_nr_running;
+			min_load_per_task = sum_weighted_load /
+						sum_nr_running;
+		}
+
+		/*
+		 * Calculate the group which is almost near its
+		 * capacity but still has some space to pick up some load
+		 * from other group and save more power
+		 */
+		if (sum_nr_running <= group_capacity - 1)
+			if (sum_nr_running > leader_nr_running ||
+			    (sum_nr_running == leader_nr_running &&
+			     first_cpu(group->cpumask) >
+			     first_cpu(group_leader->cpumask))) {
+				group_leader = group;
+				leader_nr_running = sum_nr_running;
+			}
+
+group_next:
+#endif
 		group = group->next;
 	} while (group != sd->groups);
 
@@ -2247,7 +2316,16 @@ small_imbalance:
 	return busiest;
 
 out_balanced:
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+	if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+		goto ret;
 
+	if (this == group_leader && group_leader != group_min) {
+		*imbalance = min_load_per_task;
+		return group_min;
+	}
+ret:
+#endif
 	*imbalance = 0;
 	return NULL;
 }
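A standalone sketch (toy data, simplified types; not the author's code) of the selection the two hunks above add to find_busiest_group(): group_min is the least-loaded non-idle group to pull tasks from, group_leader is the nearly-full group that still has room, and out_balanced returns group_min when the current CPU belongs to the leader:

/* Toy model of the power-savings selection: two dual-core packages,
 * one runnable task on each. capacity = cpu_power / SCHED_LOAD_SCALE. */
#include <stdio.h>

struct toy_group { int first_cpu; unsigned long capacity, nr_running; };

int main(void)
{
	struct toy_group g[] = {
		{ .first_cpu = 0, .capacity = 2, .nr_running = 1 },
		{ .first_cpu = 2, .capacity = 2, .nr_running = 1 },
	};
	int i, min = -1, leader = -1;
	unsigned long min_nr = 0, leader_nr = 0;

	for (i = 0; i < 2; i++) {
		/* idle or fully loaded groups are not candidates */
		if (!g[i].nr_running || g[i].nr_running >= g[i].capacity)
			continue;
		/* least non-idle load; ties broken by lowest first CPU */
		if (min < 0 || g[i].nr_running < min_nr ||
		    (g[i].nr_running == min_nr && g[i].first_cpu < g[min].first_cpu)) {
			min = i;
			min_nr = g[i].nr_running;
		}
		/* nearly full but with a free slot; ties broken by highest first CPU */
		if (g[i].nr_running + 1 <= g[i].capacity &&
		    (leader < 0 || g[i].nr_running > leader_nr ||
		     (g[i].nr_running == leader_nr && g[i].first_cpu > g[leader].first_cpu))) {
			leader = i;
			leader_nr = g[i].nr_running;
		}
	}
	/* prints: group_min=0 group_leader=1 */
	printf("group_min=%d group_leader=%d\n", min, leader);
	return 0;
}

With one task on each of two dual-core packages, the package starting at CPU 2 becomes group_leader and the other group_min, so the leader pulls the remaining task over and the first package can go idle.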
@@ -2300,7 +2378,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
 	int active_balance = 0;
 	int sd_idle = 0;
 
-	if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER)
+	if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
+	    !sched_smt_power_savings)
 		sd_idle = 1;
 
 	schedstat_inc(sd, lb_cnt[idle]);
@@ -2389,7 +2468,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
 		sd->balance_interval *= 2;
 	}
 
-	if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+	if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+	    !sched_smt_power_savings)
 		return -1;
 	return nr_moved;
 
@@ -2404,7 +2484,7 @@ out_one_pinned:
 			(sd->balance_interval < sd->max_interval))
 		sd->balance_interval *= 2;
 
-	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
 		return -1;
 	return 0;
 }
@@ -2425,7 +2505,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
 	int nr_moved = 0;
 	int sd_idle = 0;
 
-	if (sd->flags & SD_SHARE_CPUPOWER)
+	if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
 		sd_idle = 1;
 
 	schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
@@ -2466,7 +2546,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
 
 out_balanced:
 	schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
-	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
 		return -1;
 	sd->nr_balance_failed = 0;
 	return 0;
@@ -5732,6 +5812,7 @@ static cpumask_t sched_domain_node_span(int node)
 }
 #endif
 
+int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 /*
  * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
  * can switch it on easily if needed.
@@ -6113,37 +6194,72 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 #endif
 
 	/* Calculate CPU power for physical packages and nodes */
+#ifdef CONFIG_SCHED_SMT
 	for_each_cpu_mask(i, *cpu_map) {
-		int power;
 		struct sched_domain *sd;
-#ifdef CONFIG_SCHED_SMT
 		sd = &per_cpu(cpu_domains, i);
-		power = SCHED_LOAD_SCALE;
-		sd->groups->cpu_power = power;
+		sd->groups->cpu_power = SCHED_LOAD_SCALE;
+	}
 #endif
 #ifdef CONFIG_SCHED_MC
+	for_each_cpu_mask(i, *cpu_map) {
+		int power;
+		struct sched_domain *sd;
 		sd = &per_cpu(core_domains, i);
-		power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
+		if (sched_smt_power_savings)
+			power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
+		else
+			power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
 					    * SCHED_LOAD_SCALE / 10;
 		sd->groups->cpu_power = power;
+	}
+#endif
 
+	for_each_cpu_mask(i, *cpu_map) {
+		struct sched_domain *sd;
+#ifdef CONFIG_SCHED_MC
 		sd = &per_cpu(phys_domains, i);
+		if (i != first_cpu(sd->groups->cpumask))
+			continue;
 
-		/*
-		 * This has to be < 2 * SCHED_LOAD_SCALE
-		 * Lets keep it SCHED_LOAD_SCALE, so that
-		 * while calculating NUMA group's cpu_power
-		 * we can simply do
-		 *  numa_group->cpu_power += phys_group->cpu_power;
-		 *
-		 * See "only add power once for each physical pkg"
-		 * comment below
-		 */
-		sd->groups->cpu_power = SCHED_LOAD_SCALE;
+		sd->groups->cpu_power = 0;
+		if (sched_mc_power_savings || sched_smt_power_savings) {
+			int j;
+
+			for_each_cpu_mask(j, sd->groups->cpumask) {
+				struct sched_domain *sd1;
+				sd1 = &per_cpu(core_domains, j);
+				/*
+				 * for each core we will add once
+				 * to the group in physical domain
+				 */
+				if (j != first_cpu(sd1->groups->cpumask))
+					continue;
+
+				if (sched_smt_power_savings)
+					sd->groups->cpu_power += sd1->groups->cpu_power;
+				else
+					sd->groups->cpu_power += SCHED_LOAD_SCALE;
+			}
+		} else
+			/*
+			 * This has to be < 2 * SCHED_LOAD_SCALE
+			 * Lets keep it SCHED_LOAD_SCALE, so that
+			 * while calculating NUMA group's cpu_power
+			 * we can simply do
+			 *  numa_group->cpu_power += phys_group->cpu_power;
+			 *
			 * See "only add power once for each physical pkg"
+			 * comment below
+			 */
+			sd->groups->cpu_power = SCHED_LOAD_SCALE;
 #else
+		int power;
 		sd = &per_cpu(phys_domains, i);
-		power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
-			(cpus_weight(sd->groups->cpumask)-1) / 10;
+		if (sched_smt_power_savings)
+			power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
+		else
+			power = SCHED_LOAD_SCALE;
 		sd->groups->cpu_power = power;
 #endif
 	}
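A quick worked example (not from the patch) of the two cpu_power formulas that appear in the hunk above, for a sched group spanning two SMT siblings, assuming SCHED_LOAD_SCALE is 128 (its value in kernels of this era):

/* Worked example: cpu_power of a 2-sibling group with the default
 * "+10% per extra CPU" formula vs. the power-savings "full scale per CPU"
 * formula. Assumes SCHED_LOAD_SCALE == 128. */
#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL

int main(void)
{
	unsigned long siblings = 2;
	unsigned long def  = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * (siblings - 1) / 10;
	unsigned long save = SCHED_LOAD_SCALE * siblings;

	/* default: 140 -> group_capacity 1; power savings: 256 -> group_capacity 2 */
	printf("default:       cpu_power=%lu capacity=%lu\n", def, def / SCHED_LOAD_SCALE);
	printf("power savings: cpu_power=%lu capacity=%lu\n", save, save / SCHED_LOAD_SCALE);
	return 0;
}

With the default power the group looks nearly full after one task (group_capacity of 1 in find_busiest_group()), so load is spread across packages; with the power-savings value each sibling counts fully (capacity 2) and the scheduler is willing to keep two tasks on one package while the other idles.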
@@ -6244,6 +6360,80 @@ int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
 	return err;
 }
 
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+int arch_reinit_sched_domains(void)
+{
+	int err;
+
+	lock_cpu_hotplug();
+	detach_destroy_domains(&cpu_online_map);
+	err = arch_init_sched_domains(&cpu_online_map);
+	unlock_cpu_hotplug();
+
+	return err;
+}
+
+static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
+{
+	int ret;
+
+	if (buf[0] != '0' && buf[0] != '1')
+		return -EINVAL;
+
+	if (smt)
+		sched_smt_power_savings = (buf[0] == '1');
+	else
+		sched_mc_power_savings = (buf[0] == '1');
+
+	ret = arch_reinit_sched_domains();
+
+	return ret ? ret : count;
+}
+
+int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
+{
+	int err = 0;
+#ifdef CONFIG_SCHED_SMT
+	if (smt_capable())
+		err = sysfs_create_file(&cls->kset.kobj,
+					&attr_sched_smt_power_savings.attr);
+#endif
+#ifdef CONFIG_SCHED_MC
+	if (!err && mc_capable())
+		err = sysfs_create_file(&cls->kset.kobj,
+					&attr_sched_mc_power_savings.attr);
+#endif
+	return err;
+}
+#endif
+
+#ifdef CONFIG_SCHED_MC
+static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
+{
+	return sprintf(page, "%u\n", sched_mc_power_savings);
+}
+static ssize_t sched_mc_power_savings_store(struct sys_device *dev, const char *buf, size_t count)
+{
+	return sched_power_savings_store(buf, count, 0);
+}
+SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
+	    sched_mc_power_savings_store);
+#endif
+
+#ifdef CONFIG_SCHED_SMT
+static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
+{
+	return sprintf(page, "%u\n", sched_smt_power_savings);
+}
+static ssize_t sched_smt_power_savings_store(struct sys_device *dev, const char *buf, size_t count)
+{
+	return sched_power_savings_store(buf, count, 1);
+}
+SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
+	    sched_smt_power_savings_store);
+#endif
+
+
 #ifdef CONFIG_HOTPLUG_CPU
 /*
  * Force a reinitialization of the sched domains hierarchy. The domains