author    Linus Torvalds <torvalds@linux-foundation.org>  2012-06-08 17:59:29 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2012-06-08 17:59:29 -0400
commit    72494504498ff5ac2f086a83473d4dd1ca490bd3 (patch)
tree      7f1ceab43de3580235f1a56f2ae865901c09e4d7 /kernel
parent    cd96891d48a945ca2011fbeceda73813d6286195 (diff)
parent    a841f8cef4bb124f0f5563314d0beaf2e1249d72 (diff)
Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Ingo Molnar.

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched: Fix the relax_domain_level boot parameter
  sched: Validate assumptions in sched_init_numa()
  sched: Always initialize cpu-power
  sched: Fix domain iteration
  sched/rt: Fix lockdep annotation within find_lock_lowest_rq()
  sched/numa: Load balance between remote nodes
  sched/x86: Calculate booted cores after construction of sibling_mask
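The first fix listed, "sched: Fix the relax_domain_level boot parameter", appears in the core.c diff below as a switch from simple_strtoul() plus a premature sched_domain_level_max range check to kstrtoint(), which warns when the value cannot be parsed. As a rough, non-kernel illustration of that parsing pattern only, the following user-space sketch uses a hypothetical parse_int() helper built on strtol() as a stand-in for the kernel's kstrtoint():

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for kstrtoint(): 0 on success, -EINVAL otherwise. */
static int parse_int(const char *s, int base, int *res)
{
        char *end;
        long val;

        errno = 0;
        val = strtol(s, &end, base);
        /* reject empty, partially-parsed, or out-of-range input */
        if (errno || end == s || *end != '\0' || val < INT_MIN || val > INT_MAX)
                return -EINVAL;

        *res = (int)val;
        return 0;
}

int main(void)
{
        int relax_domain_level = -1;    /* models default_relax_domain_level */

        /* same shape as setup_relax_domain_level() after the fix */
        if (parse_int("2", 0, &relax_domain_level))
                fprintf(stderr, "Unable to set relax_domain_level\n");

        printf("relax_domain_level = %d\n", relax_domain_level);
        return 0;
}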
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/sched/core.c   187
-rw-r--r--  kernel/sched/fair.c     7
-rw-r--r--  kernel/sched/rt.c       2
-rw-r--r--  kernel/sched/sched.h    2
4 files changed, 159 insertions, 39 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c46958e26121..d5594a4268d4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5556,15 +5556,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
 
 #ifdef CONFIG_SCHED_DEBUG
 
-static __read_mostly int sched_domain_debug_enabled;
+static __read_mostly int sched_debug_enabled;
 
-static int __init sched_domain_debug_setup(char *str)
+static int __init sched_debug_setup(char *str)
 {
-        sched_domain_debug_enabled = 1;
+        sched_debug_enabled = 1;
 
         return 0;
 }
-early_param("sched_debug", sched_domain_debug_setup);
+early_param("sched_debug", sched_debug_setup);
+
+static inline bool sched_debug(void)
+{
+        return sched_debug_enabled;
+}
 
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                                   struct cpumask *groupmask)
@@ -5604,7 +5609,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                         break;
                 }
 
-                if (!group->sgp->power) {
+                /*
+                 * Even though we initialize ->power to something semi-sane,
+                 * we leave power_orig unset. This allows us to detect if
+                 * domain iteration is still funny without causing /0 traps.
+                 */
+                if (!group->sgp->power_orig) {
                         printk(KERN_CONT "\n");
                         printk(KERN_ERR "ERROR: domain->cpu_power not "
                                         "set\n");
@@ -5652,7 +5662,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 {
         int level = 0;
 
-        if (!sched_domain_debug_enabled)
+        if (!sched_debug_enabled)
                 return;
 
         if (!sd) {
@@ -5673,6 +5683,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 }
 #else /* !CONFIG_SCHED_DEBUG */
 # define sched_domain_debug(sd, cpu) do { } while (0)
+static inline bool sched_debug(void)
+{
+        return false;
+}
 #endif /* CONFIG_SCHED_DEBUG */
 
 static int sd_degenerate(struct sched_domain *sd)
@@ -5994,6 +6008,44 @@ struct sched_domain_topology_level {
         struct sd_data      data;
 };
 
+/*
+ * Build an iteration mask that can exclude certain CPUs from the upwards
+ * domain traversal.
+ *
+ * Asymmetric node setups can result in situations where the domain tree is of
+ * unequal depth, make sure to skip domains that already cover the entire
+ * range.
+ *
+ * In that case build_sched_domains() will have terminated the iteration early
+ * and our sibling sd spans will be empty. Domains should always include the
+ * cpu they're built on, so check that.
+ *
+ */
+static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
+{
+        const struct cpumask *span = sched_domain_span(sd);
+        struct sd_data *sdd = sd->private;
+        struct sched_domain *sibling;
+        int i;
+
+        for_each_cpu(i, span) {
+                sibling = *per_cpu_ptr(sdd->sd, i);
+                if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
+                        continue;
+
+                cpumask_set_cpu(i, sched_group_mask(sg));
+        }
+}
+
+/*
+ * Return the canonical balance cpu for this group, this is the first cpu
+ * of this group that's also in the iteration mask.
+ */
+int group_balance_cpu(struct sched_group *sg)
+{
+        return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
+}
+
 static int
 build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 {
@@ -6012,6 +6064,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
                 if (cpumask_test_cpu(i, covered))
                         continue;
 
+                child = *per_cpu_ptr(sdd->sd, i);
+
+                /* See the comment near build_group_mask(). */
+                if (!cpumask_test_cpu(i, sched_domain_span(child)))
+                        continue;
+
                 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
                                 GFP_KERNEL, cpu_to_node(cpu));
 
@@ -6019,8 +6077,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
                         goto fail;
 
                 sg_span = sched_group_cpus(sg);
-
-                child = *per_cpu_ptr(sdd->sd, i);
                 if (child->child) {
                         child = child->child;
                         cpumask_copy(sg_span, sched_domain_span(child));
@@ -6030,13 +6086,24 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
                 cpumask_or(covered, covered, sg_span);
 
                 sg->sgp = *per_cpu_ptr(sdd->sgp, i);
-                atomic_inc(&sg->sgp->ref);
+                if (atomic_inc_return(&sg->sgp->ref) == 1)
+                        build_group_mask(sd, sg);
 
+                /*
+                 * Initialize sgp->power such that even if we mess up the
+                 * domains and no possible iteration will get us here, we won't
+                 * die on a /0 trap.
+                 */
+                sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
+
+                /*
+                 * Make sure the first group of this domain contains the
+                 * canonical balance cpu. Otherwise the sched_domain iteration
+                 * breaks. See update_sg_lb_stats().
+                 */
                 if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
-                    cpumask_first(sg_span) == cpu) {
-                        WARN_ON_ONCE(!cpumask_test_cpu(cpu, sg_span));
+                    group_balance_cpu(sg) == cpu)
                         groups = sg;
-                }
 
                 if (!first)
                         first = sg;
@@ -6109,6 +6176,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
 
                 cpumask_clear(sched_group_cpus(sg));
                 sg->sgp->power = 0;
+                cpumask_setall(sched_group_mask(sg));
 
                 for_each_cpu(j, span) {
                         if (get_group(j, sdd, NULL) != group)
@@ -6150,7 +6218,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
                 sg = sg->next;
         } while (sg != sd->groups);
 
-        if (cpu != group_first_cpu(sg))
+        if (cpu != group_balance_cpu(sg))
                 return;
 
         update_group_power(sd, cpu);
@@ -6200,11 +6268,8 @@ int sched_domain_level_max;
 
 static int __init setup_relax_domain_level(char *str)
 {
-        unsigned long val;
-
-        val = simple_strtoul(str, NULL, 0);
-        if (val < sched_domain_level_max)
-                default_relax_domain_level = val;
+        if (kstrtoint(str, 0, &default_relax_domain_level))
+                pr_warn("Unable to set relax_domain_level\n");
 
         return 1;
 }
@@ -6314,14 +6379,13 @@ static struct sched_domain_topology_level *sched_domain_topology = default_topol
 #ifdef CONFIG_NUMA
 
 static int sched_domains_numa_levels;
-static int sched_domains_numa_scale;
 static int *sched_domains_numa_distance;
 static struct cpumask ***sched_domains_numa_masks;
 static int sched_domains_curr_level;
 
 static inline int sd_local_flags(int level)
 {
-        if (sched_domains_numa_distance[level] > REMOTE_DISTANCE)
+        if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
                 return 0;
 
         return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
@@ -6379,6 +6443,42 @@ static const struct cpumask *sd_numa_mask(int cpu)
         return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
 }
 
+static void sched_numa_warn(const char *str)
+{
+        static int done = false;
+        int i,j;
+
+        if (done)
+                return;
+
+        done = true;
+
+        printk(KERN_WARNING "ERROR: %s\n\n", str);
+
+        for (i = 0; i < nr_node_ids; i++) {
+                printk(KERN_WARNING "  ");
+                for (j = 0; j < nr_node_ids; j++)
+                        printk(KERN_CONT "%02d ", node_distance(i,j));
+                printk(KERN_CONT "\n");
+        }
+        printk(KERN_WARNING "\n");
+}
+
+static bool find_numa_distance(int distance)
+{
+        int i;
+
+        if (distance == node_distance(0, 0))
+                return true;
+
+        for (i = 0; i < sched_domains_numa_levels; i++) {
+                if (sched_domains_numa_distance[i] == distance)
+                        return true;
+        }
+
+        return false;
+}
+
 static void sched_init_numa(void)
 {
         int next_distance, curr_distance = node_distance(0, 0);
@@ -6386,7 +6486,6 @@ static void sched_init_numa(void)
         int level = 0;
         int i, j, k;
 
-        sched_domains_numa_scale = curr_distance;
         sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
         if (!sched_domains_numa_distance)
                 return;
@@ -6397,23 +6496,41 @@ static void sched_init_numa(void)
          *
          * Assumes node_distance(0,j) includes all distances in
          * node_distance(i,j) in order to avoid cubic time.
-         *
-         * XXX: could be optimized to O(n log n) by using sort()
          */
         next_distance = curr_distance;
         for (i = 0; i < nr_node_ids; i++) {
                 for (j = 0; j < nr_node_ids; j++) {
-                        int distance = node_distance(0, j);
-                        if (distance > curr_distance &&
-                                        (distance < next_distance ||
-                                         next_distance == curr_distance))
-                                next_distance = distance;
+                        for (k = 0; k < nr_node_ids; k++) {
+                                int distance = node_distance(i, k);
+
+                                if (distance > curr_distance &&
+                                    (distance < next_distance ||
+                                     next_distance == curr_distance))
+                                        next_distance = distance;
+
+                                /*
+                                 * While not a strong assumption it would be nice to know
+                                 * about cases where if node A is connected to B, B is not
+                                 * equally connected to A.
+                                 */
+                                if (sched_debug() && node_distance(k, i) != distance)
+                                        sched_numa_warn("Node-distance not symmetric");
+
+                                if (sched_debug() && i && !find_numa_distance(distance))
+                                        sched_numa_warn("Node-0 not representative");
+                        }
+                        if (next_distance != curr_distance) {
+                                sched_domains_numa_distance[level++] = next_distance;
+                                sched_domains_numa_levels = level;
+                                curr_distance = next_distance;
+                        } else break;
                 }
-                if (next_distance != curr_distance) {
-                        sched_domains_numa_distance[level++] = next_distance;
-                        sched_domains_numa_levels = level;
-                        curr_distance = next_distance;
-                } else break;
+
+                /*
+                 * In case of sched_debug() we verify the above assumption.
+                 */
+                if (!sched_debug())
+                        break;
         }
         /*
          * 'level' contains the number of unique distances, excluding the
@@ -6525,7 +6642,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 
                 *per_cpu_ptr(sdd->sg, j) = sg;
 
-                sgp = kzalloc_node(sizeof(struct sched_group_power),
+                sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
                                 GFP_KERNEL, cpu_to_node(j));
                 if (!sgp)
                         return -ENOMEM;
@@ -6578,7 +6695,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
         if (!sd)
                 return child;
 
-        set_domain_attribute(sd, attr);
         cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
         if (child) {
                 sd->level = child->level + 1;
@@ -6586,6 +6702,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
                 child->parent = sd;
         }
         sd->child = child;
+        set_domain_attribute(sd, attr);
 
         return sd;
 }
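The "Fix domain iteration" part of the core.c changes above introduces a per-group iteration mask (build_group_mask()) and a group_balance_cpu() helper that returns the first CPU that is both in the group's span and in that mask. The following stand-alone sketch is illustrative only: it models the two cpumasks as 64-bit integers and relies on the GCC/Clang builtin __builtin_ctzll(); the kernel itself operates on struct cpumask via cpumask_first_and().

#include <stdint.h>
#include <stdio.h>

/* Rough analogue of group_balance_cpu(): first cpu set in both masks. */
static int first_cpu_and(uint64_t group_cpus, uint64_t group_mask)
{
        uint64_t both = group_cpus & group_mask;

        if (!both)
                return -1;                      /* empty intersection: no balance cpu */
        return __builtin_ctzll(both);           /* index of the lowest set bit */
}

int main(void)
{
        uint64_t group_cpus = 0x0f;     /* CPUs 0-3 belong to the group */
        uint64_t group_mask = 0x0e;     /* CPU 0 excluded from upward iteration */

        /* first CPU present in both sets -> 1 */
        printf("balance cpu = %d\n", first_cpu_and(group_cpus, group_mask));
        return 0;
}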
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d5583f9588e7..c099cc6eebe3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3602,7 +3602,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
                 } while (group != child->groups);
         }
 
-        sdg->sgp->power = power;
+        sdg->sgp->power_orig = sdg->sgp->power = power;
 }
 
 /*
@@ -3652,7 +3652,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
         int i;
 
         if (local_group)
-                balance_cpu = group_first_cpu(group);
+                balance_cpu = group_balance_cpu(group);
 
         /* Tally up the load of all CPUs in the group */
         max_cpu_load = 0;
@@ -3667,7 +3667,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
                 /* Bias balancing toward cpus of our domain */
                 if (local_group) {
-                        if (idle_cpu(i) && !first_idle_cpu) {
+                        if (idle_cpu(i) && !first_idle_cpu &&
+                            cpumask_test_cpu(i, sched_group_mask(group))) {
                                 first_idle_cpu = 1;
                                 balance_cpu = i;
                         }
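The last fair.c hunk makes update_sg_lb_stats() hand the balance duty only to an idle CPU that is also present in the group's iteration mask. A rough user-space model of that selection, again with CPU sets as 64-bit masks and an invented idle_cpus bitmap standing in for idle_cpu(), might look like this:

#include <stdint.h>
#include <stdio.h>

static int pick_balance_cpu(uint64_t group_cpus, uint64_t group_mask,
                            uint64_t idle_cpus, int fallback_cpu)
{
        for (int i = 0; i < 64; i++) {
                uint64_t bit = 1ull << i;

                if (!(group_cpus & bit))
                        continue;
                /* idle AND allowed to act as the balance cpu for this group */
                if ((idle_cpus & bit) && (group_mask & bit))
                        return i;
        }
        return fallback_cpu;
}

int main(void)
{
        /* CPU 0 is idle but masked out, CPU 2 is idle and allowed -> picks 2 */
        printf("balance cpu = %d\n", pick_balance_cpu(0x0f, 0x0e, 0x05, 1));
        return 0;
}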
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 2a4e8dffbd6b..573e1ca01102 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1562,7 +1562,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
                                      task_running(rq, task) ||
                                      !task->on_rq)) {
 
-                        raw_spin_unlock(&lowest_rq->lock);
+                        double_unlock_balance(rq, lowest_rq);
                         lowest_rq = NULL;
                         break;
                 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ba9dccfd24ce..6d52cea7f33d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -526,6 +526,8 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
 DECLARE_PER_CPU(struct sched_domain *, sd_llc);
 DECLARE_PER_CPU(int, sd_llc_id);
 
+extern int group_balance_cpu(struct sched_group *sg);
+
 #endif /* CONFIG_SMP */
 
 #include "stats.h"