Diffstat (limited to 'kernel/sched.c'):
 kernel/sched.c | 340 ++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 286 insertions(+), 54 deletions(-)
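For orientation only (this is not part of the patch), the following standalone C sketch reduces the greedy nearest-node selection implemented by the patch's new find_next_best_node()/sched_domain_node_span() helpers (in the diff below) to plain user-space code. The node count, the distance table and the NODES_PER_DOMAIN cap are made-up stand-ins for MAX_NUMNODES, node_distance() and SD_NODES_PER_DOMAIN: starting from a node, it repeatedly picks the closest node not yet in the span, which is the idea the patch uses to build each node's sched_domain span.

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_NODES         4	/* stand-in for MAX_NUMNODES */
#define NODES_PER_DOMAIN 16	/* stand-in for SD_NODES_PER_DOMAIN */

/* toy symmetric node-distance table, stand-in for node_distance() */
static const int distance[NR_NODES][NR_NODES] = {
	{ 10, 20, 40, 40 },
	{ 20, 10, 40, 40 },
	{ 40, 40, 10, 20 },
	{ 40, 40, 20, 10 },
};

/* pick the closest node not yet in the span, scanning from @node */
static int next_best_node(int node, bool used[NR_NODES])
{
	int best = 0, min_val = INT_MAX;

	for (int i = 0; i < NR_NODES; i++) {
		int n = (node + i) % NR_NODES;

		if (used[n])
			continue;
		if (distance[node][n] < min_val) {
			min_val = distance[node][n];
			best = n;
		}
	}
	used[best] = true;
	return best;
}

int main(void)
{
	bool used[NR_NODES] = { false };
	int node = 2;	/* node whose domain span we build */
	int span_nodes = NODES_PER_DOMAIN < NR_NODES ? NODES_PER_DOMAIN : NR_NODES;

	/* the starting node is always in its own span */
	used[node] = true;
	printf("span for node %d: %d", node, node);
	for (int i = 1; i < span_nodes; i++)
		printf(" %d", next_best_node(node, used));
	printf("\n");
	return 0;
}

Compiled and run, it prints "span for node 2: 2 3 0 1": the near node is added before the far ones, mirroring how the kernel code ORs each chosen node's cpumask into the domain span.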
diff --git a/kernel/sched.c b/kernel/sched.c
index 5f889d0cbfcc..18b95520a2e2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1478,6 +1478,7 @@ static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
 
 /**
  * finish_task_switch - clean up after a task-switch
+ * @rq: runqueue associated with task-switch
  * @prev: the thread we just switched away from.
  *
  * finish_task_switch must be called after the context switch, paired
@@ -4779,7 +4780,7 @@ static int sd_parent_degenerate(struct sched_domain *sd,
  * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
  * hold the hotplug lock.
  */
-void cpu_attach_domain(struct sched_domain *sd, int cpu)
+static void cpu_attach_domain(struct sched_domain *sd, int cpu)
 {
 	runqueue_t *rq = cpu_rq(cpu);
 	struct sched_domain *tmp;
@@ -4802,7 +4803,7 @@ void cpu_attach_domain(struct sched_domain *sd, int cpu)
 }
 
 /* cpus with isolated domains */
-cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
+static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
 
 /* Setup the mask of cpus configured for isolated domains */
 static int __init isolated_cpu_setup(char *str)
@@ -4830,8 +4831,8 @@ __setup ("isolcpus=", isolated_cpu_setup);
  * covered by the given span, and will set each group's ->cpumask correctly,
  * and ->cpu_power to 0.
  */
-void init_sched_build_groups(struct sched_group groups[],
-			cpumask_t span, int (*group_fn)(int cpu))
+static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
+					int (*group_fn)(int cpu))
 {
 	struct sched_group *first = NULL, *last = NULL;
 	cpumask_t covered = CPU_MASK_NONE;
@@ -4864,12 +4865,85 @@ void init_sched_build_groups(struct sched_group groups[],
 	last->next = first;
 }
 
+#define SD_NODES_PER_DOMAIN 16
 
-#ifdef ARCH_HAS_SCHED_DOMAIN
-extern void build_sched_domains(const cpumask_t *cpu_map);
-extern void arch_init_sched_domains(const cpumask_t *cpu_map);
-extern void arch_destroy_sched_domains(const cpumask_t *cpu_map);
-#else
+#ifdef CONFIG_NUMA
+/**
+ * find_next_best_node - find the next node to include in a sched_domain
+ * @node: node whose sched_domain we're building
+ * @used_nodes: nodes already in the sched_domain
+ *
+ * Find the next node to include in a given scheduling domain. Simply
+ * finds the closest node not already in the @used_nodes map.
+ *
+ * Should use nodemask_t.
+ */
+static int find_next_best_node(int node, unsigned long *used_nodes)
+{
+	int i, n, val, min_val, best_node = 0;
+
+	min_val = INT_MAX;
+
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		/* Start at @node */
+		n = (node + i) % MAX_NUMNODES;
+
+		if (!nr_cpus_node(n))
+			continue;
+
+		/* Skip already used nodes */
+		if (test_bit(n, used_nodes))
+			continue;
+
+		/* Simple min distance search */
+		val = node_distance(node, n);
+
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+
+	set_bit(best_node, used_nodes);
+	return best_node;
+}
+
+/**
+ * sched_domain_node_span - get a cpumask for a node's sched_domain
+ * @node: node whose cpumask we're constructing
+ * @size: number of nodes to include in this span
+ *
+ * Given a node, construct a good cpumask for its sched_domain to span. It
+ * should be one that prevents unnecessary balancing, but also spreads tasks
+ * out optimally.
+ */
+static cpumask_t sched_domain_node_span(int node)
+{
+	int i;
+	cpumask_t span, nodemask;
+	DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
+
+	cpus_clear(span);
+	bitmap_zero(used_nodes, MAX_NUMNODES);
+
+	nodemask = node_to_cpumask(node);
+	cpus_or(span, span, nodemask);
+	set_bit(node, used_nodes);
+
+	for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
+		int next_node = find_next_best_node(node, used_nodes);
+		nodemask = node_to_cpumask(next_node);
+		cpus_or(span, span, nodemask);
+	}
+
+	return span;
+}
+#endif
+
+/*
+ * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
+ * can switch it on easily if needed.
+ */
 #ifdef CONFIG_SCHED_SMT
 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
 static struct sched_group sched_group_cpus[NR_CPUS];
@@ -4891,36 +4965,20 @@ static int cpu_to_phys_group(int cpu)
 }
 
 #ifdef CONFIG_NUMA
-
-static DEFINE_PER_CPU(struct sched_domain, node_domains);
-static struct sched_group sched_group_nodes[MAX_NUMNODES];
-static int cpu_to_node_group(int cpu)
-{
-	return cpu_to_node(cpu);
-}
-#endif
-
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
 /*
- * The domains setup code relies on siblings not spanning
- * multiple nodes. Make sure the architecture has a proper
- * siblings map:
+ * The init_sched_build_groups can't handle what we want to do with node
+ * groups, so roll our own. Now each node has its own list of groups which
+ * gets dynamically allocated.
  */
-static void check_sibling_maps(void)
-{
-	int i, j;
+static DEFINE_PER_CPU(struct sched_domain, node_domains);
+static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
 
-	for_each_online_cpu(i) {
-		for_each_cpu_mask(j, cpu_sibling_map[i]) {
-			if (cpu_to_node(i) != cpu_to_node(j)) {
-				printk(KERN_INFO "warning: CPU %d siblings map "
-					"to different node - isolating "
-					"them.\n", i);
-				cpu_sibling_map[i] = cpumask_of_cpu(i);
-				break;
-			}
-		}
-	}
-}
+static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
+static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
+
+static int cpu_to_allnodes_group(int cpu)
+{
+	return cpu_to_node(cpu);
+}
 #endif
 
@@ -4928,9 +4986,24 @@ static void check_sibling_maps(void)
  * Build sched domains for a given set of cpus and attach the sched domains
  * to the individual cpus
  */
-static void build_sched_domains(const cpumask_t *cpu_map)
+void build_sched_domains(const cpumask_t *cpu_map)
 {
 	int i;
+#ifdef CONFIG_NUMA
+	struct sched_group **sched_group_nodes = NULL;
+	struct sched_group *sched_group_allnodes = NULL;
+
+	/*
+	 * Allocate the per-node list of sched groups
+	 */
+	sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
+					   GFP_ATOMIC);
+	if (!sched_group_nodes) {
+		printk(KERN_WARNING "Can not alloc sched group node list\n");
+		return;
+	}
+	sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
+#endif
 
 	/*
 	 * Set up domains for cpus specified by the cpu_map.
@@ -4943,11 +5016,35 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 		cpus_and(nodemask, nodemask, *cpu_map);
 
 #ifdef CONFIG_NUMA
+		if (cpus_weight(*cpu_map)
+				> SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
+			if (!sched_group_allnodes) {
+				sched_group_allnodes
+					= kmalloc(sizeof(struct sched_group)
+							* MAX_NUMNODES,
+						  GFP_KERNEL);
+				if (!sched_group_allnodes) {
+					printk(KERN_WARNING
+					"Can not alloc allnodes sched group\n");
+					break;
+				}
+				sched_group_allnodes_bycpu[i]
+						= sched_group_allnodes;
+			}
+			sd = &per_cpu(allnodes_domains, i);
+			*sd = SD_ALLNODES_INIT;
+			sd->span = *cpu_map;
+			group = cpu_to_allnodes_group(i);
+			sd->groups = &sched_group_allnodes[group];
+			p = sd;
+		} else
+			p = NULL;
+
 		sd = &per_cpu(node_domains, i);
-		group = cpu_to_node_group(i);
 		*sd = SD_NODE_INIT;
-		sd->span = *cpu_map;
-		sd->groups = &sched_group_nodes[group];
+		sd->span = sched_domain_node_span(cpu_to_node(i));
+		sd->parent = p;
+		cpus_and(sd->span, sd->span, *cpu_map);
 #endif
 
 		p = sd;
@@ -4972,7 +5069,7 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 
 #ifdef CONFIG_SCHED_SMT
 	/* Set up CPU (sibling) groups */
-	for_each_online_cpu(i) {
+	for_each_cpu_mask(i, *cpu_map) {
 		cpumask_t this_sibling_map = cpu_sibling_map[i];
 		cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
 		if (i != first_cpu(this_sibling_map))
@@ -4997,8 +5094,77 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 
 #ifdef CONFIG_NUMA
 	/* Set up node groups */
-	init_sched_build_groups(sched_group_nodes, *cpu_map,
-					&cpu_to_node_group);
+	if (sched_group_allnodes)
+		init_sched_build_groups(sched_group_allnodes, *cpu_map,
+					&cpu_to_allnodes_group);
+
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		/* Set up node groups */
+		struct sched_group *sg, *prev;
+		cpumask_t nodemask = node_to_cpumask(i);
+		cpumask_t domainspan;
+		cpumask_t covered = CPU_MASK_NONE;
+		int j;
+
+		cpus_and(nodemask, nodemask, *cpu_map);
+		if (cpus_empty(nodemask)) {
+			sched_group_nodes[i] = NULL;
+			continue;
+		}
+
+		domainspan = sched_domain_node_span(i);
+		cpus_and(domainspan, domainspan, *cpu_map);
+
+		sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+		sched_group_nodes[i] = sg;
+		for_each_cpu_mask(j, nodemask) {
+			struct sched_domain *sd;
+			sd = &per_cpu(node_domains, j);
+			sd->groups = sg;
+			if (sd->groups == NULL) {
+				/* Turn off balancing if we have no groups */
+				sd->flags = 0;
+			}
+		}
+		if (!sg) {
+			printk(KERN_WARNING
+			"Can not alloc domain group for node %d\n", i);
+			continue;
+		}
+		sg->cpu_power = 0;
+		sg->cpumask = nodemask;
+		cpus_or(covered, covered, nodemask);
+		prev = sg;
+
+		for (j = 0; j < MAX_NUMNODES; j++) {
+			cpumask_t tmp, notcovered;
+			int n = (i + j) % MAX_NUMNODES;
+
+			cpus_complement(notcovered, covered);
+			cpus_and(tmp, notcovered, *cpu_map);
+			cpus_and(tmp, tmp, domainspan);
+			if (cpus_empty(tmp))
+				break;
+
+			nodemask = node_to_cpumask(n);
+			cpus_and(tmp, tmp, nodemask);
+			if (cpus_empty(tmp))
+				continue;
+
+			sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+			if (!sg) {
+				printk(KERN_WARNING
+				"Can not alloc domain group for node %d\n", j);
+				break;
+			}
+			sg->cpu_power = 0;
+			sg->cpumask = tmp;
+			cpus_or(covered, covered, tmp);
+			prev->next = sg;
+			prev = sg;
+		}
+		prev->next = sched_group_nodes[i];
+	}
 #endif
 
 	/* Calculate CPU power for physical packages and nodes */
@@ -5017,14 +5183,46 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 		sd->groups->cpu_power = power;
 
 #ifdef CONFIG_NUMA
-		if (i == first_cpu(sd->groups->cpumask)) {
-			/* Only add "power" once for each physical package. */
-			sd = &per_cpu(node_domains, i);
-			sd->groups->cpu_power += power;
+		sd = &per_cpu(allnodes_domains, i);
+		if (sd->groups) {
+			power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
+				(cpus_weight(sd->groups->cpumask)-1) / 10;
+			sd->groups->cpu_power = power;
 		}
 #endif
 	}
 
+#ifdef CONFIG_NUMA
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		struct sched_group *sg = sched_group_nodes[i];
+		int j;
+
+		if (sg == NULL)
+			continue;
+next_sg:
+		for_each_cpu_mask(j, sg->cpumask) {
+			struct sched_domain *sd;
+			int power;
+
+			sd = &per_cpu(phys_domains, j);
+			if (j != first_cpu(sd->groups->cpumask)) {
+				/*
+				 * Only add "power" once for each
+				 * physical package.
+				 */
+				continue;
+			}
+			power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
+				(cpus_weight(sd->groups->cpumask)-1) / 10;
+
+			sg->cpu_power += power;
+		}
+		sg = sg->next;
+		if (sg != sched_group_nodes[i])
+			goto next_sg;
+	}
+#endif
+
 	/* Attach the domains */
 	for_each_cpu_mask(i, *cpu_map) {
 		struct sched_domain *sd;
@@ -5039,13 +5237,10 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 /*
  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
  */
-static void arch_init_sched_domains(cpumask_t *cpu_map)
+static void arch_init_sched_domains(const cpumask_t *cpu_map)
 {
 	cpumask_t cpu_default_map;
 
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
-	check_sibling_maps();
-#endif
 	/*
 	 * Setup mask for cpus without special case scheduling requirements.
 	 * For now this just excludes isolated cpus, but could be used to
@@ -5058,10 +5253,47 @@ static void arch_init_sched_domains(cpumask_t *cpu_map)
 
 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
 {
-	/* Do nothing: everything is statically allocated. */
-}
+#ifdef CONFIG_NUMA
+	int i;
+	int cpu;
+
+	for_each_cpu_mask(cpu, *cpu_map) {
+		struct sched_group *sched_group_allnodes
+			= sched_group_allnodes_bycpu[cpu];
+		struct sched_group **sched_group_nodes
+			= sched_group_nodes_bycpu[cpu];
+
+		if (sched_group_allnodes) {
+			kfree(sched_group_allnodes);
+			sched_group_allnodes_bycpu[cpu] = NULL;
+		}
+
+		if (!sched_group_nodes)
+			continue;
+
+		for (i = 0; i < MAX_NUMNODES; i++) {
+			cpumask_t nodemask = node_to_cpumask(i);
+			struct sched_group *oldsg, *sg = sched_group_nodes[i];
 
-#endif /* ARCH_HAS_SCHED_DOMAIN */
+			cpus_and(nodemask, nodemask, *cpu_map);
+			if (cpus_empty(nodemask))
+				continue;
+
+			if (sg == NULL)
+				continue;
+			sg = sg->next;
+next_sg:
+			oldsg = sg;
+			sg = sg->next;
+			kfree(oldsg);
+			if (oldsg != sched_group_nodes[i])
+				goto next_sg;
+		}
+		kfree(sched_group_nodes);
+		sched_group_nodes_bycpu[cpu] = NULL;
+	}
+#endif
+}
 
 /*
  * Detach sched domains from a group of cpus specified in cpu_map