Diffstat (limited to 'kernel')

 -rw-r--r--  kernel/sched/core.c  | 64
 -rw-r--r--  kernel/sched/fair.c  |  5
 -rw-r--r--  kernel/sched/sched.h |  2

 3 files changed, 61 insertions, 10 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6546083af3e0..781acb91a50a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5994,6 +5994,44 @@ struct sched_domain_topology_level {
 	struct sd_data data;
 };
 
+/*
+ * Build an iteration mask that can exclude certain CPUs from the upwards
+ * domain traversal.
+ *
+ * Asymmetric node setups can result in situations where the domain tree is of
+ * unequal depth, make sure to skip domains that already cover the entire
+ * range.
+ *
+ * In that case build_sched_domains() will have terminated the iteration early
+ * and our sibling sd spans will be empty. Domains should always include the
+ * cpu they're built on, so check that.
+ *
+ */
+static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
+{
+	const struct cpumask *span = sched_domain_span(sd);
+	struct sd_data *sdd = sd->private;
+	struct sched_domain *sibling;
+	int i;
+
+	for_each_cpu(i, span) {
+		sibling = *per_cpu_ptr(sdd->sd, i);
+		if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
+			continue;
+
+		cpumask_set_cpu(i, sched_group_mask(sg));
+	}
+}
+
+/*
+ * Return the canonical balance cpu for this group, this is the first cpu
+ * of this group that's also in the iteration mask.
+ */
+int group_balance_cpu(struct sched_group *sg)
+{
+	return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
+}
+
 static int
 build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 {
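
Aside, not part of the patch: the two helpers added above boil down to "first set bit common to the group's span and its iteration mask". A minimal user-space sketch of that idea, using plain unsigned long bitmasks in place of struct cpumask; the toy_* names are hypothetical, not kernel API:

    #include <stdio.h>

    /* Toy stand-ins for sched_group_cpus() and sched_group_mask(). */
    struct toy_group {
            unsigned long cpus;     /* CPUs the group spans */
            unsigned long mask;     /* CPUs kept by the iteration mask */
    };

    /* Analogue of group_balance_cpu(): first CPU in both span and mask. */
    static int toy_balance_cpu(const struct toy_group *sg)
    {
            unsigned long both = sg->cpus & sg->mask;

            return both ? __builtin_ctzl(both) : -1;        /* GCC/Clang builtin */
    }

    int main(void)
    {
            /* Group spans CPUs 0-3; only CPUs 2-3 survive the mask. */
            struct toy_group sg = { .cpus = 0xf, .mask = 0xc };

            printf("balance cpu = %d\n", toy_balance_cpu(&sg));     /* prints 2 */
            return 0;
    }

With CPUs 0-3 in the span but only CPUs 2-3 in the mask, the sketch picks CPU 2, mirroring how group_balance_cpu() skips CPUs that build_group_mask() excluded.
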
@@ -6012,6 +6050,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 		if (cpumask_test_cpu(i, covered))
 			continue;
 
+		child = *per_cpu_ptr(sdd->sd, i);
+
+		/* See the comment near build_group_mask(). */
+		if (!cpumask_test_cpu(i, sched_domain_span(child)))
+			continue;
+
 		sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
 				GFP_KERNEL, cpu_to_node(cpu));
 
@@ -6019,8 +6063,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 			goto fail;
 
 		sg_span = sched_group_cpus(sg);
-
-		child = *per_cpu_ptr(sdd->sd, i);
 		if (child->child) {
 			child = child->child;
 			cpumask_copy(sg_span, sched_domain_span(child));
@@ -6030,13 +6072,18 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 		cpumask_or(covered, covered, sg_span);
 
 		sg->sgp = *per_cpu_ptr(sdd->sgp, i);
-		atomic_inc(&sg->sgp->ref);
+		if (atomic_inc_return(&sg->sgp->ref) == 1)
+			build_group_mask(sd, sg);
+
 
+		/*
+		 * Make sure the first group of this domain contains the
+		 * canonical balance cpu. Otherwise the sched_domain iteration
+		 * breaks. See update_sg_lb_stats().
+		 */
 		if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
-		    cpumask_first(sg_span) == cpu) {
-			WARN_ON_ONCE(!cpumask_test_cpu(cpu, sg_span));
+		    group_balance_cpu(sg) == cpu)
 			groups = sg;
-		}
 
 		if (!first)
 			first = sg;
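
Aside, not part of the patch: switching atomic_inc() to atomic_inc_return() == 1 means the shared sched_group_power has its mask built exactly once, by whichever group takes the first reference. A hedged C11 sketch of that "initialize on first reference" pattern, with illustrative names only:

    #include <stdatomic.h>
    #include <stdio.h>

    /* Toy shared object; 'ref' starts at 0, like a freshly kzalloc'ed
     * sched_group_power. */
    struct toy_shared {
            atomic_int ref;
            int mask_built;
    };

    static void toy_attach(struct toy_shared *s)
    {
            /* atomic_fetch_add() returns the old value, so old + 1 == 1 means
             * this caller took the very first reference and must initialize. */
            if (atomic_fetch_add(&s->ref, 1) + 1 == 1)
                    s->mask_built = 1;      /* stands in for build_group_mask() */
    }

    int main(void)
    {
            struct toy_shared s = { .ref = 0, .mask_built = 0 };

            toy_attach(&s);
            toy_attach(&s);
            printf("ref=%d mask_built=%d\n", atomic_load(&s.ref), s.mask_built);
            return 0;
    }
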
@@ -6109,6 +6156,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
 
 		cpumask_clear(sched_group_cpus(sg));
 		sg->sgp->power = 0;
+		cpumask_setall(sched_group_mask(sg));
 
 		for_each_cpu(j, span) {
 			if (get_group(j, sdd, NULL) != group)
@@ -6150,7 +6198,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 		sg = sg->next;
 	} while (sg != sd->groups);
 
-	if (cpu != group_first_cpu(sg))
+	if (cpu != group_balance_cpu(sg))
 		return;
 
 	update_group_power(sd, cpu);
@@ -6525,7 +6573,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 
 		*per_cpu_ptr(sdd->sg, j) = sg;
 
-		sgp = kzalloc_node(sizeof(struct sched_group_power),
+		sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
 				GFP_KERNEL, cpu_to_node(j));
 		if (!sgp)
 			return -ENOMEM;
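
Aside, not part of the patch: the __sdt_alloc() hunk grows the sched_group_power allocation by cpumask_size(), presumably so the new per-group mask can live in trailing storage behind the struct (the struct change itself is outside this kernel/-limited diff). A user-space sketch of that trailing-storage allocation pattern; toy_group_power and its fields are illustrative, not the kernel's definitions:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Toy version of the pattern: the bitmap lives in storage allocated right
     * behind the structure, so a single allocation covers both. */
    struct toy_group_power {
            int power;
            unsigned long cpumask[];    /* flexible trailing member */
    };

    int main(void)
    {
            size_t mask_bytes = sizeof(unsigned long);  /* stand-in for cpumask_size() */
            struct toy_group_power *sgp;

            /* like kzalloc_node(sizeof(struct sched_group_power) + cpumask_size()) */
            sgp = calloc(1, sizeof(*sgp) + mask_bytes);
            if (!sgp)
                    return 1;

            memset(sgp->cpumask, 0xff, mask_bytes);     /* like cpumask_setall() */
            printf("mask word = %#lx\n", sgp->cpumask[0]);

            free(sgp);
            return 0;
    }
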
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b2a2d236f27b..54cbaa4e7b37 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3652,7 +3652,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 	int i;
 
 	if (local_group)
-		balance_cpu = group_first_cpu(group);
+		balance_cpu = group_balance_cpu(group);
 
 	/* Tally up the load of all CPUs in the group */
 	max_cpu_load = 0;
@@ -3667,7 +3667,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
 		/* Bias balancing toward cpus of our domain */
 		if (local_group) {
-			if (idle_cpu(i) && !first_idle_cpu) {
+			if (idle_cpu(i) && !first_idle_cpu &&
+			    cpumask_test_cpu(i, sched_group_mask(group))) {
 				first_idle_cpu = 1;
 				balance_cpu = i;
 			}
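
Aside, not part of the patch: in update_sg_lb_stats() the "first idle CPU becomes balance_cpu" pick is now limited to CPUs that are also in the group's iteration mask, so a CPU excluded by build_group_mask() can no longer be chosen. A rough user-space sketch of that selection rule, with toy arrays and a hypothetical toy_pick_balance_cpu() helper:

    #include <stdio.h>

    /* Pick the first idle CPU that the mask also allows (the analogue of the
     * added sched_group_mask() test); keep the fallback when none qualifies. */
    static int toy_pick_balance_cpu(const int *idle, int nr_cpus,
                                    unsigned long mask, int fallback_cpu)
    {
            for (int i = 0; i < nr_cpus; i++) {
                    if (idle[i] && (mask & (1UL << i)))
                            return i;
            }
            return fallback_cpu;
    }

    int main(void)
    {
            int idle[4] = { 1, 0, 1, 0 };   /* CPUs 0 and 2 are idle */

            /* CPU 0 is idle but masked out (mask = CPUs 2-3), so CPU 2 wins. */
            printf("balance cpu = %d\n", toy_pick_balance_cpu(idle, 4, 0xc, 0));
            return 0;
    }
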
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ba9dccfd24ce..6d52cea7f33d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -526,6 +526,8 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
 DECLARE_PER_CPU(struct sched_domain *, sd_llc);
 DECLARE_PER_CPU(int, sd_llc_id);
 
+extern int group_balance_cpu(struct sched_group *sg);
+
 #endif /* CONFIG_SMP */
 
 #include "stats.h"
