Diffstat (limited to 'kernel/sched/core.c')
 -rw-r--r--  kernel/sched/core.c | 327
 1 file changed, 218 insertions(+), 109 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e5212ae294f6..bd314d7cd9f8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -692,8 +692,6 @@ int tg_nop(struct task_group *tg, void *data)
 }
 #endif

-void update_cpu_load(struct rq *this_rq);
-
 static void set_load_weight(struct task_struct *p)
 {
         int prio = p->static_prio - MAX_RT_PRIO;
@@ -2486,22 +2484,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
  * scheduler tick (TICK_NSEC). With tickless idle this will not be called
  * every tick. We fix it up based on jiffies.
  */
-void update_cpu_load(struct rq *this_rq)
+static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
+                              unsigned long pending_updates)
 {
-        unsigned long this_load = this_rq->load.weight;
-        unsigned long curr_jiffies = jiffies;
-        unsigned long pending_updates;
         int i, scale;

         this_rq->nr_load_updates++;

-        /* Avoid repeated calls on same jiffy, when moving in and out of idle */
-        if (curr_jiffies == this_rq->last_load_update_tick)
-                return;
-
-        pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-        this_rq->last_load_update_tick = curr_jiffies;
-
         /* Update our load: */
         this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
         for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
@@ -2526,9 +2515,45 @@ void update_cpu_load(struct rq *this_rq)
         sched_avg_update(this_rq);
 }

+/*
+ * Called from nohz_idle_balance() to update the load ratings before doing the
+ * idle balance.
+ */
+void update_idle_cpu_load(struct rq *this_rq)
+{
+        unsigned long curr_jiffies = jiffies;
+        unsigned long load = this_rq->load.weight;
+        unsigned long pending_updates;
+
+        /*
+         * Bloody broken means of dealing with nohz, but better than nothing..
+         * jiffies is updated by one cpu, another cpu can drift wrt the jiffy
+         * update and see 0 difference the one time and 2 the next, even though
+         * we ticked at roughly the same rate.
+         *
+         * Hence we only use this from nohz_idle_balance() and skip this
+         * nonsense when called from the scheduler_tick() since that's
+         * guaranteed a stable rate.
+         */
+        if (load || curr_jiffies == this_rq->last_load_update_tick)
+                return;
+
+        pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+        this_rq->last_load_update_tick = curr_jiffies;
+
+        __update_cpu_load(this_rq, load, pending_updates);
+}
+
+/*
+ * Called from scheduler_tick()
+ */
 static void update_cpu_load_active(struct rq *this_rq)
 {
-        update_cpu_load(this_rq);
+        /*
+         * See the mess in update_idle_cpu_load().
+         */
+        this_rq->last_load_update_tick = jiffies;
+        __update_cpu_load(this_rq, this_rq->load.weight, 1);

         calc_load_account_active(this_rq);
 }
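
For orientation, the hunk above splits the old update_cpu_load() in two: __update_cpu_load() does the per-index averaging for a given number of missed ticks, update_idle_cpu_load() computes pending_updates from the jiffy delta when coming out of nohz idle, and update_cpu_load_active() always passes pending_updates == 1 from the regular tick. The user-space sketch below only illustrates that idea; the names (toy_update_load, NR_IDX) are invented, and the plain per-tick decay loop stands in for the kernel's precomputed decay_load_missed() tables.

/*
 * Toy model, illustrative only: bring a per-index load average up to
 * date after "pending_updates" ticks (pending_updates must be >= 1).
 */
#include <stdio.h>

#define NR_IDX 5

static unsigned long toy_cpu_load[NR_IDX];

static void toy_update_load(unsigned long this_load,
                            unsigned long pending_updates)
{
        int i, scale;

        toy_cpu_load[0] = this_load;    /* idx 0 tracks the instantaneous load */
        for (i = 1, scale = 2; i < NR_IDX; i++, scale += scale) {
                unsigned long old = toy_cpu_load[i], j;

                /* decay the old average once per missed tick ... */
                for (j = 0; j < pending_updates - 1; j++)
                        old = old * (scale - 1) / scale;

                /* ... then blend in the current load for this tick */
                toy_cpu_load[i] = (old * (scale - 1) + this_load) / scale;
        }
}

int main(void)
{
        toy_update_load(1024, 1);       /* regular tick: one pending update */
        toy_update_load(0, 4);          /* leaving nohz idle: 3 ticks were missed */
        for (int i = 0; i < NR_IDX; i++)
                printf("cpu_load[%d] = %lu\n", i, toy_cpu_load[i]);
        return 0;
}
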
@@ -5560,7 +5585,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                         break;
                 }

-                if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
+                if (!(sd->flags & SD_OVERLAP) &&
+                    cpumask_intersects(groupmask, sched_group_cpus(group))) {
                         printk(KERN_CONT "\n");
                         printk(KERN_ERR "ERROR: repeated CPUs\n");
                         break;
@@ -5898,92 +5924,6 @@ static int __init isolated_cpu_setup(char *str)

 __setup("isolcpus=", isolated_cpu_setup);

-#ifdef CONFIG_NUMA
-
-/**
- * find_next_best_node - find the next node to include in a sched_domain
- * @node: node whose sched_domain we're building
- * @used_nodes: nodes already in the sched_domain
- *
- * Find the next node to include in a given scheduling domain. Simply
- * finds the closest node not already in the @used_nodes map.
- *
- * Should use nodemask_t.
- */
-static int find_next_best_node(int node, nodemask_t *used_nodes)
-{
-        int i, n, val, min_val, best_node = -1;
-
-        min_val = INT_MAX;
-
-        for (i = 0; i < nr_node_ids; i++) {
-                /* Start at @node */
-                n = (node + i) % nr_node_ids;
-
-                if (!nr_cpus_node(n))
-                        continue;
-
-                /* Skip already used nodes */
-                if (node_isset(n, *used_nodes))
-                        continue;
-
-                /* Simple min distance search */
-                val = node_distance(node, n);
-
-                if (val < min_val) {
-                        min_val = val;
-                        best_node = n;
-                }
-        }
-
-        if (best_node != -1)
-                node_set(best_node, *used_nodes);
-        return best_node;
-}
-
-/**
- * sched_domain_node_span - get a cpumask for a node's sched_domain
- * @node: node whose cpumask we're constructing
- * @span: resulting cpumask
- *
- * Given a node, construct a good cpumask for its sched_domain to span. It
- * should be one that prevents unnecessary balancing, but also spreads tasks
- * out optimally.
- */
-static void sched_domain_node_span(int node, struct cpumask *span)
-{
-        nodemask_t used_nodes;
-        int i;
-
-        cpumask_clear(span);
-        nodes_clear(used_nodes);
-
-        cpumask_or(span, span, cpumask_of_node(node));
-        node_set(node, used_nodes);
-
-        for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
-                int next_node = find_next_best_node(node, &used_nodes);
-                if (next_node < 0)
-                        break;
-                cpumask_or(span, span, cpumask_of_node(next_node));
-        }
-}
-
-static const struct cpumask *cpu_node_mask(int cpu)
-{
-        lockdep_assert_held(&sched_domains_mutex);
-
-        sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
-
-        return sched_domains_tmpmask;
-}
-
-static const struct cpumask *cpu_allnodes_mask(int cpu)
-{
-        return cpu_possible_mask;
-}
-#endif /* CONFIG_NUMA */
-
 static const struct cpumask *cpu_cpu_mask(int cpu)
 {
         return cpumask_of_node(cpu_to_node(cpu));
@@ -6020,6 +5960,7 @@ struct sched_domain_topology_level {
         sched_domain_init_f init;
         sched_domain_mask_f mask;
         int flags;
+        int numa_level;
         struct sd_data data;
 };

@@ -6211,10 +6152,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
 }

 SD_INIT_FUNC(CPU)
-#ifdef CONFIG_NUMA
- SD_INIT_FUNC(ALLNODES)
- SD_INIT_FUNC(NODE)
-#endif
 #ifdef CONFIG_SCHED_SMT
  SD_INIT_FUNC(SIBLING)
 #endif
@@ -6336,15 +6273,185 @@ static struct sched_domain_topology_level default_topology[] = {
         { sd_init_BOOK, cpu_book_mask, },
 #endif
         { sd_init_CPU, cpu_cpu_mask, },
-#ifdef CONFIG_NUMA
-        { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
-        { sd_init_ALLNODES, cpu_allnodes_mask, },
-#endif
         { NULL, },
 };

 static struct sched_domain_topology_level *sched_domain_topology = default_topology;

+#ifdef CONFIG_NUMA
+
+static int sched_domains_numa_levels;
+static int sched_domains_numa_scale;
+static int *sched_domains_numa_distance;
+static struct cpumask ***sched_domains_numa_masks;
+static int sched_domains_curr_level;
+
+static inline int sd_local_flags(int level)
+{
+        if (sched_domains_numa_distance[level] > REMOTE_DISTANCE)
+                return 0;
+
+        return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
+}
+
+static struct sched_domain *
+sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
+{
+        struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
+        int level = tl->numa_level;
+        int sd_weight = cpumask_weight(
+                        sched_domains_numa_masks[level][cpu_to_node(cpu)]);
+
+        *sd = (struct sched_domain){
+                .min_interval = sd_weight,
+                .max_interval = 2*sd_weight,
+                .busy_factor = 32,
+                .imbalance_pct = 125,
+                .cache_nice_tries = 2,
+                .busy_idx = 3,
+                .idle_idx = 2,
+                .newidle_idx = 0,
+                .wake_idx = 0,
+                .forkexec_idx = 0,
+
+                .flags = 1*SD_LOAD_BALANCE
+                       | 1*SD_BALANCE_NEWIDLE
+                       | 0*SD_BALANCE_EXEC
+                       | 0*SD_BALANCE_FORK
+                       | 0*SD_BALANCE_WAKE
+                       | 0*SD_WAKE_AFFINE
+                       | 0*SD_PREFER_LOCAL
+                       | 0*SD_SHARE_CPUPOWER
+                       | 0*SD_POWERSAVINGS_BALANCE
+                       | 0*SD_SHARE_PKG_RESOURCES
+                       | 1*SD_SERIALIZE
+                       | 0*SD_PREFER_SIBLING
+                       | sd_local_flags(level)
+                       ,
+                .last_balance = jiffies,
+                .balance_interval = sd_weight,
+        };
+        SD_INIT_NAME(sd, NUMA);
+        sd->private = &tl->data;
+
+        /*
+         * Ugly hack to pass state to sd_numa_mask()...
+         */
+        sched_domains_curr_level = tl->numa_level;
+
+        return sd;
+}
+
+static const struct cpumask *sd_numa_mask(int cpu)
+{
+        return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
+}
+
+static void sched_init_numa(void)
+{
+        int next_distance, curr_distance = node_distance(0, 0);
+        struct sched_domain_topology_level *tl;
+        int level = 0;
+        int i, j, k;
+
+        sched_domains_numa_scale = curr_distance;
+        sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
+        if (!sched_domains_numa_distance)
+                return;
+
+        /*
+         * O(nr_nodes^2) deduplicating selection sort -- in order to find the
+         * unique distances in the node_distance() table.
+         *
+         * Assumes node_distance(0,j) includes all distances in
+         * node_distance(i,j) in order to avoid cubic time.
+         *
+         * XXX: could be optimized to O(n log n) by using sort()
+         */
+        next_distance = curr_distance;
+        for (i = 0; i < nr_node_ids; i++) {
+                for (j = 0; j < nr_node_ids; j++) {
+                        int distance = node_distance(0, j);
+                        if (distance > curr_distance &&
+                            (distance < next_distance ||
+                             next_distance == curr_distance))
+                                next_distance = distance;
+                }
+                if (next_distance != curr_distance) {
+                        sched_domains_numa_distance[level++] = next_distance;
+                        sched_domains_numa_levels = level;
+                        curr_distance = next_distance;
+                } else break;
+        }
+        /*
+         * 'level' contains the number of unique distances, excluding the
+         * identity distance node_distance(i,i).
+         *
+         * The sched_domains_numa_distance[] array includes the actual distance
+         * numbers.
+         */
+
+        sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+        if (!sched_domains_numa_masks)
+                return;
+
+        /*
+         * Now for each level, construct a mask per node which contains all
+         * cpus of nodes that are that many hops away from us.
+         */
+        for (i = 0; i < level; i++) {
+                sched_domains_numa_masks[i] =
+                        kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
+                if (!sched_domains_numa_masks[i])
+                        return;
+
+                for (j = 0; j < nr_node_ids; j++) {
+                        struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j);
+                        if (!mask)
+                                return;
+
+                        sched_domains_numa_masks[i][j] = mask;
+
+                        for (k = 0; k < nr_node_ids; k++) {
+                                if (node_distance(j, k) > sched_domains_numa_distance[i])
+                                        continue;
+
+                                cpumask_or(mask, mask, cpumask_of_node(k));
+                        }
+                }
+        }
+
+        tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
+                        sizeof(struct sched_domain_topology_level), GFP_KERNEL);
+        if (!tl)
+                return;
+
+        /*
+         * Copy the default topology bits..
+         */
+        for (i = 0; default_topology[i].init; i++)
+                tl[i] = default_topology[i];
+
+        /*
+         * .. and append 'j' levels of NUMA goodness.
+         */
+        for (j = 0; j < level; i++, j++) {
+                tl[i] = (struct sched_domain_topology_level){
+                        .init = sd_numa_init,
+                        .mask = sd_numa_mask,
+                        .flags = SDTL_OVERLAP,
+                        .numa_level = j,
+                };
+        }
+
+        sched_domain_topology = tl;
+}
+#else
+static inline void sched_init_numa(void)
+{
+}
+#endif /* CONFIG_NUMA */
+
 static int __sdt_alloc(const struct cpumask *cpu_map)
 {
         struct sched_domain_topology_level *tl;
6348static int __sdt_alloc(const struct cpumask *cpu_map) 6455static int __sdt_alloc(const struct cpumask *cpu_map)
6349{ 6456{
6350 struct sched_domain_topology_level *tl; 6457 struct sched_domain_topology_level *tl;
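
As a rough illustration of what the new sched_init_numa() above computes, the stand-alone program below runs the same deduplicating pass over row 0 of an invented 4-node distance table (the kernel reads the real values through node_distance()) and prints, per level, which nodes each node's mask would span. The table, NR_NODES, and the printing are made up for this sketch.

/*
 * Stand-alone sketch of the distance handling in sched_init_numa().
 * The 4-node table and the node_distance() stub are invented.
 */
#include <stdio.h>

#define NR_NODES 4

static const int dist[NR_NODES][NR_NODES] = {
        { 10, 20, 20, 30 },
        { 20, 10, 30, 20 },
        { 20, 30, 10, 20 },
        { 30, 20, 20, 10 },
};

static int node_distance(int i, int j)
{
        return dist[i][j];
}

int main(void)
{
        int unique[NR_NODES];
        int level = 0, curr = node_distance(0, 0), next;
        int i, j, k;

        /*
         * Same deduplicating selection sort as above: repeatedly pick the
         * smallest distance in row 0 that is larger than the previous one.
         */
        next = curr;
        for (i = 0; i < NR_NODES; i++) {
                for (j = 0; j < NR_NODES; j++) {
                        int d = node_distance(0, j);
                        if (d > curr && (d < next || next == curr))
                                next = d;
                }
                if (next == curr)
                        break;
                unique[level++] = next;
                curr = next;
        }

        /* One mask per (level, node): all nodes within that distance. */
        for (i = 0; i < level; i++) {
                printf("level %d (distance <= %d):\n", i, unique[i]);
                for (j = 0; j < NR_NODES; j++) {
                        printf("  node %d spans:", j);
                        for (k = 0; k < NR_NODES; k++)
                                if (node_distance(j, k) <= unique[i])
                                        printf(" %d", k);
                        printf("\n");
                }
        }
        return 0;
}
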
@@ -6840,6 +6947,8 @@ void __init sched_init_smp(void)
         alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
         alloc_cpumask_var(&fallback_doms, GFP_KERNEL);

+        sched_init_numa();
+
         get_online_cpus();
         mutex_lock(&sched_domains_mutex);
         init_sched_domains(cpu_active_mask);