7 files changed, 260 insertions, 532 deletions
diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile
index b242594be55b..307514f7a282 100644
--- a/arch/ia64/kernel/Makefile
+++ b/arch/ia64/kernel/Makefile
@@ -16,7 +16,7 @@ obj-$(CONFIG_IA64_HP_ZX1_SWIOTLB) += acpi-ext.o
 obj-$(CONFIG_IA64_PALINFO)      += palinfo.o
 obj-$(CONFIG_IOSAPIC)           += iosapic.o
 obj-$(CONFIG_MODULES)           += module.o
-obj-$(CONFIG_SMP)               += smp.o smpboot.o domain.o
+obj-$(CONFIG_SMP)               += smp.o smpboot.o
 obj-$(CONFIG_NUMA)              += numa.o
 obj-$(CONFIG_PERFMON)           += perfmon_default_smpl.o
 obj-$(CONFIG_IA64_CYCLONE)      += cyclone.o
diff --git a/arch/ia64/kernel/domain.c b/arch/ia64/kernel/domain.c
deleted file mode 100644
index e907109983f1..000000000000
--- a/arch/ia64/kernel/domain.c
+++ /dev/null
@@ -1,444 +0,0 @@
-/*
- * arch/ia64/kernel/domain.c
- * Architecture specific sched-domains builder.
- *
- * Copyright (C) 2004 Jesse Barnes
- * Copyright (C) 2004 Silicon Graphics, Inc.
- */
-#include <linux/sched.h>
-#include <linux/percpu.h>
-#include <linux/slab.h>
-#include <linux/cpumask.h>
-#include <linux/init.h>
-#include <linux/topology.h>
-#include <linux/nodemask.h>
-#define SD_NODES_PER_DOMAIN 16
-#ifdef CONFIG_NUMA
-/**
- * find_next_best_node - find the next node to include in a sched_domain
- * @node: node whose sched_domain we're building
- * @used_nodes: nodes already in the sched_domain
- *
- * Find the next node to include in a given scheduling domain.  Simply
- * finds the closest node not already in the @used_nodes map.
- *
- * Should use nodemask_t.
- */
-static int find_next_best_node(int node, unsigned long *used_nodes)
-{
-        int i, n, val, min_val, best_node = 0;
-        min_val = INT_MAX;
-        for (i = 0; i < MAX_NUMNODES; i++) {
-                /* Start at @node */
-                n = (node + i) % MAX_NUMNODES;
-                if (!nr_cpus_node(n))
-                        continue;
-                /* Skip already used nodes */
-                if (test_bit(n, used_nodes))
-                        continue;
-                /* Simple min distance search */
-                val = node_distance(node, n);
-                if (val < min_val) {
-                        min_val = val;
-                        best_node = n;
-                }
-        }
-        set_bit(best_node, used_nodes);
-        return best_node;
-}
-/**
- * sched_domain_node_span - get a cpumask for a node's sched_domain
- * @node: node whose cpumask we're constructing
- * @size: number of nodes to include in this span
- *
- * Given a node, construct a good cpumask for its sched_domain to span.  It
- * should be one that prevents unnecessary balancing, but also spreads tasks
- * out optimally.
- */
-static cpumask_t sched_domain_node_span(int node)
-{
-        int i;
-        cpumask_t span, nodemask;
-        DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
-        cpus_clear(span);
-        bitmap_zero(used_nodes, MAX_NUMNODES);
-        nodemask = node_to_cpumask(node);
-        cpus_or(span, span, nodemask);
-        set_bit(node, used_nodes);
-        for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
-                int next_node = find_next_best_node(node, used_nodes);
-                nodemask = node_to_cpumask(next_node);
-                cpus_or(span, span, nodemask);
-        }
-        return span;
-}
-#endif
-/*
- * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
- * can switch it on easily if needed.
- */
-#ifdef CONFIG_SCHED_SMT
-static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
-static struct sched_group sched_group_cpus[NR_CPUS];
-static int cpu_to_cpu_group(int cpu)
-{
-        return cpu;
-}
-#endif
-static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-static struct sched_group sched_group_phys[NR_CPUS];
-static int cpu_to_phys_group(int cpu)
-{
-#ifdef CONFIG_SCHED_SMT
-        return first_cpu(cpu_sibling_map[cpu]);
-#else
-        return cpu;
-#endif
-}
-#ifdef CONFIG_NUMA
-/*
- * The init_sched_build_groups can't handle what we want to do with node
- * groups, so roll our own. Now each node has its own list of groups which
- * gets dynamically allocated.
- */
-static DEFINE_PER_CPU(struct sched_domain, node_domains);
-static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
-static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
-static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
-static int cpu_to_allnodes_group(int cpu)
-{
-        return cpu_to_node(cpu);
-}
-#endif
-/*
- * Build sched domains for a given set of cpus and attach the sched domains
- * to the individual cpus
- */
-void build_sched_domains(const cpumask_t *cpu_map)
-{
-        int i;
-#ifdef CONFIG_NUMA
-        struct sched_group **sched_group_nodes = NULL;
-        struct sched_group *sched_group_allnodes = NULL;
-        /*
-         * Allocate the per-node list of sched groups
-         */
-        sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
-                                           GFP_ATOMIC);
-        if (!sched_group_nodes) {
-                printk(KERN_WARNING "Can not alloc sched group node list\n");
-                return;
-        }
-        sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
-#endif
-        /*
-         * Set up domains for cpus specified by the cpu_map.
-         */
-        for_each_cpu_mask(i, *cpu_map) {
-                int group;
-                struct sched_domain *sd = NULL, *p;
-                cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
-                cpus_and(nodemask, nodemask, *cpu_map);
-#ifdef CONFIG_NUMA
-                if (cpus_weight(*cpu_map)
-                                > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
-                        if (!sched_group_allnodes) {
-                                sched_group_allnodes
-                                        = kmalloc(sizeof(struct sched_group)
-                                                        * MAX_NUMNODES,
-                                                  GFP_KERNEL);
-                                if (!sched_group_allnodes) {
-                                        printk(KERN_WARNING
-                                        "Can not alloc allnodes sched group\n");
-                                        break;
-                                }
-                                sched_group_allnodes_bycpu[i]
-                                                = sched_group_allnodes;
-                        }
-                        sd = &per_cpu(allnodes_domains, i);
-                        *sd = SD_ALLNODES_INIT;
-                        sd->span = *cpu_map;
-                        group = cpu_to_allnodes_group(i);
-                        sd->groups = &sched_group_allnodes[group];
-                        p = sd;
-                } else
-                        p = NULL;
-                sd = &per_cpu(node_domains, i);
-                *sd = SD_NODE_INIT;
-                sd->span = sched_domain_node_span(cpu_to_node(i));
-                sd->parent = p;
-                cpus_and(sd->span, sd->span, *cpu_map);
-#endif
-                p = sd;
-                sd = &per_cpu(phys_domains, i);
-                group = cpu_to_phys_group(i);
-                *sd = SD_CPU_INIT;
-                sd->span = nodemask;
-                sd->parent = p;
-                sd->groups = &sched_group_phys[group];
-#ifdef CONFIG_SCHED_SMT
-                p = sd;
-                sd = &per_cpu(cpu_domains, i);
-                group = cpu_to_cpu_group(i);
-                *sd = SD_SIBLING_INIT;
-                sd->span = cpu_sibling_map[i];
-                cpus_and(sd->span, sd->span, *cpu_map);
-                sd->parent = p;
-                sd->groups = &sched_group_cpus[group];
-#endif
-        }
-#ifdef CONFIG_SCHED_SMT
-        /* Set up CPU (sibling) groups */
-        for_each_cpu_mask(i, *cpu_map) {
-                cpumask_t this_sibling_map = cpu_sibling_map[i];
-                cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
-                if (i != first_cpu(this_sibling_map))
-                        continue;
-                init_sched_build_groups(sched_group_cpus, this_sibling_map,
-                                                &cpu_to_cpu_group);
-        }
-#endif
-        /* Set up physical groups */
-        for (i = 0; i < MAX_NUMNODES; i++) {
-                cpumask_t nodemask = node_to_cpumask(i);
-                cpus_and(nodemask, nodemask, *cpu_map);
-                if (cpus_empty(nodemask))
-                        continue;
-                init_sched_build_groups(sched_group_phys, nodemask,
-                                                &cpu_to_phys_group);
-        }
-#ifdef CONFIG_NUMA
-        if (sched_group_allnodes)
-                init_sched_build_groups(sched_group_allnodes, *cpu_map,
-                                        &cpu_to_allnodes_group);
-        for (i = 0; i < MAX_NUMNODES; i++) {
-                /* Set up node groups */
-                struct sched_group *sg, *prev;
-                cpumask_t nodemask = node_to_cpumask(i);
-                cpumask_t domainspan;
-                cpumask_t covered = CPU_MASK_NONE;
-                int j;
-                cpus_and(nodemask, nodemask, *cpu_map);
-                if (cpus_empty(nodemask)) {
-                        sched_group_nodes[i] = NULL;
-                        continue;
-                }
-                domainspan = sched_domain_node_span(i);
-                cpus_and(domainspan, domainspan, *cpu_map);
-                sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
-                sched_group_nodes[i] = sg;
-                for_each_cpu_mask(j, nodemask) {
-                        struct sched_domain *sd;
-                        sd = &per_cpu(node_domains, j);
-                        sd->groups = sg;
-                        if (sd->groups == NULL) {
-                                /* Turn off balancing if we have no groups */
-                                sd->flags = 0;
-                        }
-                }
-                if (!sg) {
-                        printk(KERN_WARNING
-                        "Can not alloc domain group for node %d\n", i);
-                        continue;
-                }
-                sg->cpu_power = 0;
-                sg->cpumask = nodemask;
-                cpus_or(covered, covered, nodemask);
-                prev = sg;
-                for (j = 0; j < MAX_NUMNODES; j++) {
-                        cpumask_t tmp, notcovered;
-                        int n = (i + j) % MAX_NUMNODES;
-                        cpus_complement(notcovered, covered);
-                        cpus_and(tmp, notcovered, *cpu_map);
-                        cpus_and(tmp, tmp, domainspan);
-                        if (cpus_empty(tmp))
-                                break;
-                        nodemask = node_to_cpumask(n);
-                        cpus_and(tmp, tmp, nodemask);
-                        if (cpus_empty(tmp))
-                                continue;
-                        sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
-                        if (!sg) {
-                                printk(KERN_WARNING
-                                "Can not alloc domain group for node %d\n", j);
-                                break;
-                        }
-                        sg->cpu_power = 0;
-                        sg->cpumask = tmp;
-                        cpus_or(covered, covered, tmp);
-                        prev->next = sg;
-                        prev = sg;
-                }
-                prev->next = sched_group_nodes[i];
-        }
-#endif
-        /* Calculate CPU power for physical packages and nodes */
-        for_each_cpu_mask(i, *cpu_map) {
-                int power;
-                struct sched_domain *sd;
-#ifdef CONFIG_SCHED_SMT
-                sd = &per_cpu(cpu_domains, i);
-                power = SCHED_LOAD_SCALE;
-                sd->groups->cpu_power = power;
-#endif
-                sd = &per_cpu(phys_domains, i);
-                power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
-                                (cpus_weight(sd->groups->cpumask)-1) / 10;
-                sd->groups->cpu_power = power;
-#ifdef CONFIG_NUMA
-                sd = &per_cpu(allnodes_domains, i);
-                if (sd->groups) {
-                        power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
-                                (cpus_weight(sd->groups->cpumask)-1) / 10;
-                        sd->groups->cpu_power = power;
-                }
-#endif
-        }
-#ifdef CONFIG_NUMA
-        for (i = 0; i < MAX_NUMNODES; i++) {
-                struct sched_group *sg = sched_group_nodes[i];
-                int j;
-                if (sg == NULL)
-                        continue;
-next_sg:
-                for_each_cpu_mask(j, sg->cpumask) {
-                        struct sched_domain *sd;
-                        int power;
-                        sd = &per_cpu(phys_domains, j);
-                        if (j != first_cpu(sd->groups->cpumask)) {
-                                /*
-                                 * Only add "power" once for each
-                                 * physical package.
-                                 */
-                                continue;
-                        }
-                        power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
-                                (cpus_weight(sd->groups->cpumask)-1) / 10;
-                        sg->cpu_power += power;
-                }
-                sg = sg->next;
-                if (sg != sched_group_nodes[i])
-                        goto next_sg;
-        }
-#endif
-        /* Attach the domains */
-        for_each_cpu_mask(i, *cpu_map) {
-                struct sched_domain *sd;
-#ifdef CONFIG_SCHED_SMT
-                sd = &per_cpu(cpu_domains, i);
-#else
-                sd = &per_cpu(phys_domains, i);
-#endif
-                cpu_attach_domain(sd, i);
-        }
-}
-/*
- * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
- */
-void arch_init_sched_domains(const cpumask_t *cpu_map)
-{
-        cpumask_t cpu_default_map;
-        /*
-         * Setup mask for cpus without special case scheduling requirements.
-         * For now this just excludes isolated cpus, but could be used to
-         * exclude other special cases in the future.
-         */
-        cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
-        build_sched_domains(&cpu_default_map);
-}
-void arch_destroy_sched_domains(const cpumask_t *cpu_map)
-{
-#ifdef CONFIG_NUMA
-        int i;
-        int cpu;
-        for_each_cpu_mask(cpu, *cpu_map) {
-                struct sched_group *sched_group_allnodes
-                        = sched_group_allnodes_bycpu[cpu];
-                struct sched_group **sched_group_nodes
-                        = sched_group_nodes_bycpu[cpu];
-                if (sched_group_allnodes) {
-                        kfree(sched_group_allnodes);
-                        sched_group_allnodes_bycpu[cpu] = NULL;
-                }
-                if (!sched_group_nodes)
-                        continue;
-                for (i = 0; i < MAX_NUMNODES; i++) {
-                        cpumask_t nodemask = node_to_cpumask(i);
-                        struct sched_group *oldsg, *sg = sched_group_nodes[i];
-                        cpus_and(nodemask, nodemask, *cpu_map);
-                        if (cpus_empty(nodemask))
-                                continue;
-                        if (sg == NULL)
-                                continue;
-                        sg = sg->next;
-next_sg:
-                        oldsg = sg;
-                        sg = sg->next;
-                        kfree(oldsg);
-                        if (oldsg != sched_group_nodes[i])
-                                goto next_sg;
-                }
-                kfree(sched_group_nodes);
-                sched_group_nodes_bycpu[cpu] = NULL;
-        }
-#endif
-}
diff --git a/include/asm-ia64/processor.h b/include/asm-ia64/processor.h
index 91bbd1f22461..94e07e727395 100644
--- a/include/asm-ia64/processor.h
+++ b/include/asm-ia64/processor.h
@@ -20,9 +20,6 @@
 #include <asm/ptrace.h>
 #include <asm/ustack.h>
-/* Our arch specific arch_init_sched_domain is in arch/ia64/kernel/domain.c */
-#define ARCH_HAS_SCHED_DOMAIN
 #define IA64_NUM_DBG_REGS       8
 /*
 * Limits for PMC and PMD are set to less than maximum architected values
diff --git a/include/asm-ia64/topology.h b/include/asm-ia64/topology.h
index 399bc29729fd..a9f738bf18a7 100644
--- a/include/asm-ia64/topology.h
+++ b/include/asm-ia64/topology.h
@@ -98,29 +98,6 @@ void build_cpu_to_node_map(void);
        .nr_balance_failed      = 0,                    \
 }
-/* sched_domains SD_ALLNODES_INIT for IA64 NUMA machines */
-#define SD_ALLNODES_INIT (struct sched_domain) {        \
-        .span                   = CPU_MASK_NONE,        \
-        .parent                 = NULL,                 \
-        .groups                 = NULL,                 \
-        .min_interval           = 64,                   \
-        .max_interval           = 64*num_online_cpus(), \
-        .busy_factor            = 128,                  \
-        .imbalance_pct          = 133,                  \
-        .cache_hot_time         = (10*1000000),         \
-        .cache_nice_tries       = 1,                    \
-        .busy_idx               = 3,                    \
-        .idle_idx               = 3,                    \
-        .newidle_idx            = 0, /* unused */       \
-        .wake_idx               = 0, /* unused */       \
-        .forkexec_idx           = 0, /* unused */       \
-        .per_cpu_gain           = 100,                  \
-        .flags                  = SD_LOAD_BALANCE,      \
-        .last_balance           = jiffies,              \
-        .balance_interval       = 64,                   \
-        .nr_balance_failed      = 0,                    \
-}
 #endif /* CONFIG_NUMA */
 #include <asm-generic/topology.h>
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b5a22ea80045..ea1b5f32ec5c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -564,13 +564,6 @@ struct sched_domain {
 extern void partition_sched_domains(cpumask_t *partition1,
                                    cpumask_t *partition2);
-#ifdef ARCH_HAS_SCHED_DOMAIN
-/* Useful helpers that arch setup code may use. Defined in kernel/sched.c */
-extern cpumask_t cpu_isolated_map;
-extern void init_sched_build_groups(struct sched_group groups[],
-                                cpumask_t span, int (*group_fn)(int cpu));
-extern void cpu_attach_domain(struct sched_domain *sd, int cpu);
-#endif /* ARCH_HAS_SCHED_DOMAIN */
 #endif /* CONFIG_SMP */
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 0320225e96da..3df1d474e5c5 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -135,6 +135,29 @@
 }
 #endif
+/*
+ * sched_domains SD_ALLNODES_INIT for NUMA machines.
+ *
+ * Compound-literal initializer for the sched_domain that covers the whole
+ * cpu_map.  build_sched_domains() assigns it to a CPU's allnodes_domains
+ * entry and then sets ->span and ->groups itself, which is why both start
+ * out empty here.  Because this is a macro, num_online_cpus() and jiffies
+ * are evaluated wherever it is expanded.
+ */
+#define SD_ALLNODES_INIT (struct sched_domain) {        \
+        .span                   = CPU_MASK_NONE,        \
+        .parent                 = NULL,                 \
+        .groups                 = NULL,                 \
+        .min_interval           = 64,                   \
+        .max_interval           = 64*num_online_cpus(), \
+        .busy_factor            = 128,                  \
+        .imbalance_pct          = 133,                  \
+        .cache_hot_time         = (10*1000000),         \
+        .cache_nice_tries       = 1,                    \
+        .busy_idx               = 3,                    \
+        .idle_idx               = 3,                    \
+        .newidle_idx            = 0, /* unused */       \
+        .wake_idx               = 0, /* unused */       \
+        .forkexec_idx           = 0, /* unused */       \
+        .per_cpu_gain           = 100,                  \
+        .flags                  = SD_LOAD_BALANCE,      \
+        .last_balance           = jiffies,              \
+        .balance_interval       = 64,                   \
+        .nr_balance_failed      = 0,                    \
+}
 #ifdef CONFIG_NUMA
 #ifndef SD_NODE_INIT
 #error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
diff --git a/kernel/sched.c b/kernel/sched.c
index 5f889d0cbfcc..50860ad5b624 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4779,7 +4779,7 @@ static int sd_parent_degenerate(struct sched_domain *sd,
 * Attach the domain 'sd' to 'cpu' as its base domain.  Callers must
 * hold the hotplug lock.
 */
-void cpu_attach_domain(struct sched_domain *sd, int cpu)
+static void cpu_attach_domain(struct sched_domain *sd, int cpu)
 {
        runqueue_t *rq = cpu_rq(cpu);
        struct sched_domain *tmp;
@@ -4802,7 +4802,7 @@ void cpu_attach_domain(struct sched_domain *sd, int cpu)
 }
 /* cpus with isolated domains */
-cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
+static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
 /* Setup the mask of cpus configured for isolated domains */
 static int __init isolated_cpu_setup(char *str)
@@ -4830,8 +4830,8 @@ __setup ("isolcpus=", isolated_cpu_setup);
 * covered by the given span, and will set each group's ->cpumask correctly,
 * and ->cpu_power to 0.
 */
-void init_sched_build_groups(struct sched_group groups[],
+static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
-                        cpumask_t span, int (*group_fn)(int cpu))
+                                    int (*group_fn)(int cpu))
 {
        struct sched_group *first = NULL, *last = NULL;
        cpumask_t covered = CPU_MASK_NONE;
@@ -4864,12 +4864,85 @@ void init_sched_build_groups(struct sched_group groups[],
        last->next = first;
 }
+#define SD_NODES_PER_DOMAIN 16
-#ifdef ARCH_HAS_SCHED_DOMAIN
+#ifdef CONFIG_NUMA
-extern void build_sched_domains(const cpumask_t *cpu_map);
+/**
-extern void arch_init_sched_domains(const cpumask_t *cpu_map);
+ * find_next_best_node - find the next node to include in a sched_domain
-extern void arch_destroy_sched_domains(const cpumask_t *cpu_map);
+ * @node: node whose sched_domain we're building
-#else
+ * @used_nodes: nodes already in the sched_domain
+ *
+ * Find the next node to include in a given scheduling domain.  Simply
+ * finds the closest node not already in the @used_nodes map.
+ *
+ * Should use nodemask_t.
+ */
+static int find_next_best_node(int node, unsigned long *used_nodes)
+{
+        int closest = 0;
+        int shortest = INT_MAX;
+        int offset;
+        /* Visit every node id once, beginning at @node and wrapping around. */
+        for (offset = 0; offset < MAX_NUMNODES; offset++) {
+                int candidate = (node + offset) % MAX_NUMNODES;
+                int dist;
+                /*
+                 * Skip nodes for which nr_cpus_node() is zero and nodes
+                 * already present in @used_nodes.
+                 */
+                if (!nr_cpus_node(candidate) || test_bit(candidate, used_nodes))
+                        continue;
+                /* Strict '<' keeps the first node seen at a given distance. */
+                dist = node_distance(node, candidate);
+                if (dist < shortest) {
+                        shortest = dist;
+                        closest = candidate;
+                }
+        }
+        /* When no node qualified, this marks and returns node 0. */
+        set_bit(closest, used_nodes);
+        return closest;
+}
+/**
+ * sched_domain_node_span - build the cpumask spanned by a node's sched_domain
+ * @node: node at the centre of the span
+ *
+ * Starts from the CPUs of @node and merges in the CPUs of the nodes picked
+ * by SD_NODES_PER_DOMAIN - 1 successive find_next_best_node() calls.  The
+ * number of nodes is fixed by SD_NODES_PER_DOMAIN; it is not a parameter.
+ *
+ * Returns the mask by value; the callers in this file then restrict it to
+ * their cpu_map.
+ */
+static cpumask_t sched_domain_node_span(int node)
+{
+        DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
+        cpumask_t span, nodemask;
+        int slot;
+        bitmap_zero(used_nodes, MAX_NUMNODES);
+        /* Seed the span with @node itself and mark that node as taken. */
+        set_bit(node, used_nodes);
+        span = node_to_cpumask(node);
+        /* Slot 0 is @node; fill the remaining slots nearest-first. */
+        for (slot = 1; slot < SD_NODES_PER_DOMAIN; slot++) {
+                int next_node = find_next_best_node(node, used_nodes);
+                nodemask = node_to_cpumask(next_node);
+                cpus_or(span, span, nodemask);
+        }
+        return span;
+}
+#endif
+/*
+ * Sibling (SMT) level.  On ia64, where this builder originated,
+ * CONFIG_SCHED_SMT was never defined; the level is kept so that it is
+ * available whenever CONFIG_SCHED_SMT is enabled.
+ */
 #ifdef CONFIG_SCHED_SMT
 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
 static struct sched_group sched_group_cpus[NR_CPUS];
@@ -4891,36 +4964,20 @@ static int cpu_to_phys_group(int cpu)
 }
 #ifdef CONFIG_NUMA
-static DEFINE_PER_CPU(struct sched_domain, node_domains);
-static struct sched_group sched_group_nodes[MAX_NUMNODES];
-static int cpu_to_node_group(int cpu)
-{
-        return cpu_to_node(cpu);
-}
-#endif
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
 /*
- * The domains setup code relies on siblings not spanning
+ * The init_sched_build_groups can't handle what we want to do with node
- * multiple nodes. Make sure the architecture has a proper
+ * groups, so roll our own. Now each node has its own list of groups which
- * siblings map:
+ * gets dynamically allocated.
 */
-static void check_sibling_maps(void)
+static DEFINE_PER_CPU(struct sched_domain, node_domains);
-{
+static struct sched_group *sched_group_nodes[MAX_NUMNODES];
-        int i, j;
-        for_each_online_cpu(i) {
+static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
-                for_each_cpu_mask(j, cpu_sibling_map[i]) {
+static struct sched_group sched_group_allnodes[MAX_NUMNODES];
-                        if (cpu_to_node(i) != cpu_to_node(j)) {
-                                printk(KERN_INFO "warning: CPU %d siblings map "
+static int cpu_to_allnodes_group(int cpu)
-                                        "to different node - isolating "
+{
-                                        "them.\n", i);
+        return cpu_to_node(cpu);
-                                cpu_sibling_map[i] = cpumask_of_cpu(i);
-                                break;
-                        }
-                }
-        }
 }
 #endif
@@ -4928,7 +4985,7 @@ static void check_sibling_maps(void)
 * Build sched domains for a given set of cpus and attach the sched domains
 * to the individual cpus
 */
-static void build_sched_domains(const cpumask_t *cpu_map)
+void build_sched_domains(const cpumask_t *cpu_map)
 {
        int i;
@@ -4943,11 +5000,22 @@ static void build_sched_domains(const cpumask_t *cpu_map)
                cpus_and(nodemask, nodemask, *cpu_map);
 #ifdef CONFIG_NUMA
+                if (num_online_cpus()
+                                > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
+                        sd = &per_cpu(allnodes_domains, i);
+                        *sd = SD_ALLNODES_INIT;
+                        sd->span = *cpu_map;
+                        group = cpu_to_allnodes_group(i);
+                        sd->groups = &sched_group_allnodes[group];
+                        p = sd;
+                } else
+                        p = NULL;
                sd = &per_cpu(node_domains, i);
-                group = cpu_to_node_group(i);
                *sd = SD_NODE_INIT;
-                sd->span = *cpu_map;
+                sd->span = sched_domain_node_span(cpu_to_node(i));
-                sd->groups = &sched_group_nodes[group];
+                sd->parent = p;
+                cpus_and(sd->span, sd->span, *cpu_map);
 #endif
                p = sd;
@@ -4972,7 +5040,7 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 #ifdef CONFIG_SCHED_SMT
        /* Set up CPU (sibling) groups */
-        for_each_online_cpu(i) {
+        for_each_cpu_mask(i, *cpu_map) {
                cpumask_t this_sibling_map = cpu_sibling_map[i];
                cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
                if (i != first_cpu(this_sibling_map))
@@ -4997,8 +5065,74 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 #ifdef CONFIG_NUMA
        /* Set up node groups */
-        init_sched_build_groups(sched_group_nodes, *cpu_map,
+        init_sched_build_groups(sched_group_allnodes, *cpu_map,
-                                        &cpu_to_node_group);
+                                &cpu_to_allnodes_group);
+        /*
+         * For every node that has CPUs in cpu_map, build a circular list of
+         * sched_groups: first a group holding the node's own CPUs, then one
+         * group per other node whose CPUs fall inside this node's domain
+         * span.  The list head is recorded in sched_group_nodes[i].
+         */
+        for (i = 0; i < MAX_NUMNODES; i++) {
+                struct sched_group *sg, *prev;
+                cpumask_t nodemask = node_to_cpumask(i);
+                cpumask_t domainspan;
+                cpumask_t covered = CPU_MASK_NONE;
+                int j;
+                cpus_and(nodemask, nodemask, *cpu_map);
+                if (cpus_empty(nodemask))
+                        continue;
+                domainspan = sched_domain_node_span(i);
+                cpus_and(domainspan, domainspan, *cpu_map);
+                sg = kmalloc(sizeof(*sg), GFP_KERNEL);
+                sched_group_nodes[i] = sg;
+                /*
+                 * Point every CPU of this node at the head group.  This runs
+                 * before the allocation check on purpose: on failure the
+                 * domains get ->groups == NULL and balancing switched off.
+                 */
+                for_each_cpu_mask(j, nodemask) {
+                        struct sched_domain *sd;
+                        sd = &per_cpu(node_domains, j);
+                        sd->groups = sg;
+                        if (sd->groups == NULL) {
+                                /* Turn off balancing if we have no groups */
+                                sd->flags = 0;
+                        }
+                }
+                if (!sg) {
+                        printk(KERN_WARNING
+                        "Can not alloc domain group for node %d\n", i);
+                        continue;
+                }
+                sg->cpu_power = 0;
+                sg->cpumask = nodemask;
+                cpus_or(covered, covered, nodemask);
+                prev = sg;
+                /* Walk the other nodes, starting at i and wrapping around. */
+                for (j = 0; j < MAX_NUMNODES; j++) {
+                        cpumask_t tmp, notcovered;
+                        int n = (i + j) % MAX_NUMNODES;
+                        /* Stop once the whole domain span has a group. */
+                        cpus_complement(notcovered, covered);
+                        cpus_and(tmp, notcovered, *cpu_map);
+                        cpus_and(tmp, tmp, domainspan);
+                        if (cpus_empty(tmp))
+                                break;
+                        /* Keep only node n's still-uncovered CPUs. */
+                        nodemask = node_to_cpumask(n);
+                        cpus_and(tmp, tmp, nodemask);
+                        if (cpus_empty(tmp))
+                                continue;
+                        sg = kmalloc(sizeof(*sg), GFP_KERNEL);
+                        if (!sg) {
+                                /* Report the node (n), not the loop offset (j). */
+                                printk(KERN_WARNING
+                                "Can not alloc domain group for node %d\n", n);
+                                break;
+                        }
+                        sg->cpu_power = 0;
+                        sg->cpumask = tmp;
+                        cpus_or(covered, covered, tmp);
+                        prev->next = sg;
+                        prev = sg;
+                }
+                /* Close the ring, also after a failed allocation above. */
+                prev->next = sched_group_nodes[i];
+        }
 #endif
        /* Calculate CPU power for physical packages and nodes */
@@ -5017,14 +5151,46 @@ static void build_sched_domains(const cpumask_t *cpu_map)
                sd->groups->cpu_power = power;
 #ifdef CONFIG_NUMA
-                if (i == first_cpu(sd->groups->cpumask)) {
+                sd = &per_cpu(allnodes_domains, i);
-                        /* Only add "power" once for each physical package. */
+                if (sd->groups) {
-                        sd = &per_cpu(node_domains, i);
+                        power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
-                        sd->groups->cpu_power += power;
+                                (cpus_weight(sd->groups->cpumask)-1) / 10;
+                        sd->groups->cpu_power = power;
                }
 #endif
        }
+#ifdef CONFIG_NUMA
+        /* Add physical-package power to every group on each node's ring. */
+        for (i = 0; i < MAX_NUMNODES; i++) {
+                struct sched_group *head = sched_group_nodes[i];
+                struct sched_group *grp = head;
+                int cpu;
+                if (!head)
+                        continue;
+                do {
+                        for_each_cpu_mask(cpu, grp->cpumask) {
+                                struct sched_domain *phys;
+                                int power;
+                                phys = &per_cpu(phys_domains, cpu);
+                                /*
+                                 * Only add "power" once for each physical
+                                 * package: take it from the package's first
+                                 * CPU and ignore the others.
+                                 */
+                                if (cpu == first_cpu(phys->groups->cpumask)) {
+                                        power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
+                                                (cpus_weight(phys->groups->cpumask)-1) / 10;
+                                        grp->cpu_power += power;
+                                }
+                        }
+                        grp = grp->next;
+                } while (grp != head);
+        }
+#endif
        /* Attach the domains */
        for_each_cpu_mask(i, *cpu_map) {
                struct sched_domain *sd;
@@ -5039,13 +5205,10 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 /*
 * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
 */
-static void arch_init_sched_domains(cpumask_t *cpu_map)
+static void arch_init_sched_domains(const cpumask_t *cpu_map)
 {
        cpumask_t cpu_default_map;
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
-        check_sibling_maps();
-#endif
        /*
         * Setup mask for cpus without special case scheduling requirements.
         * For now this just excludes isolated cpus, but could be used to
@@ -5058,10 +5221,29 @@ static void arch_init_sched_domains(cpumask_t *cpu_map)
 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
 {
-        /* Do nothing: everything is statically allocated. */
+#ifdef CONFIG_NUMA
-}
+        int i;
+        for (i = 0; i < MAX_NUMNODES; i++) {    /* free each node's group ring */
+                cpumask_t nodemask = node_to_cpumask(i);
+                struct sched_group *oldsg, *sg = sched_group_nodes[i];
-#endif /* ARCH_HAS_SCHED_DOMAIN */
+                cpus_and(nodemask, nodemask, *cpu_map);
+                if (cpus_empty(nodemask))       /* node has no CPU in cpu_map */
+                        continue;
+                if (sg == NULL)                 /* no ring recorded for this node */
+                        continue;
+                sg = sg->next;                  /* start after the head so that the
+                                                 * head itself is freed last */
+next_sg:
+                oldsg = sg;
+                sg = sg->next;                  /* read the link before kfree() */
+                kfree(oldsg);
+                if (oldsg != sched_group_nodes[i])      /* head freed: ring done */
+                        goto next_sg;
+                sched_group_nodes[i] = NULL;
+        }
+#endif
+}
 /*
 * Detach sched domains from a group of cpus specified in cpu_map