author	Peter Zijlstra <a.p.zijlstra@chello.nl>	2012-04-17 09:49:36 -0400
committer	Ingo Molnar <mingo@kernel.org>	2012-05-09 09:00:55 -0400
commit	cb83b629bae0327cf9f44f096adc38d150ceb913 (patch)
tree	13f7da07ee150a97c21aace57eaa817a30df9539
parent	bd939f45da24e25e08a8f5c993c50b1afada0fef (diff)
sched/numa: Rewrite the CONFIG_NUMA sched domain support
The current code groups up to 16 nodes in a level and then puts an
ALLNODES domain spanning the entire tree on top of that. This doesn't
reflect the NUMA topology, and especially for the smaller,
not-fully-connected machines out there today it can make a difference.

Therefore, build a proper NUMA topology based on node_distance().

Since there are no fixed NUMA layers anymore, the static SD_NODE_INIT
and SD_ALLNODES_INIT initializers aren't usable anymore; the new code
constructs something similar and scales some values either with the
number of CPUs in the domain and/or with the node_distance() ratio.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Anton Blanchard <anton@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: David Howells <dhowells@redhat.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: linux-alpha@vger.kernel.org
Cc: linux-ia64@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-mips@linux-mips.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-sh@vger.kernel.org
Cc: Matt Turner <mattst88@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Richard Henderson <rth@twiddle.net>
Cc: sparclinux@vger.kernel.org
Cc: Tony Luck <tony.luck@intel.com>
Cc: x86@kernel.org
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Greg Pearson <greg.pearson@hp.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: bob.picco@oracle.com
Cc: chris.mason@oracle.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/n/tip-r74n3n8hhuc2ynbrnp3vt954@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
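[Editorial note: as a user-space illustration of what the new sched_init_numa()
in this patch computes -- not part of the patch itself -- here is a small C
sketch. The 4-node SLIT-style distance table is made up; the sketch extracts
the unique node_distance() values as topology levels and then builds, per
level, the set of nodes within that distance of each node, mirroring the
dedup loop and mask construction in the diff below.]

	#include <stdio.h>

	#define NR_NODES 4

	/* Hypothetical SLIT-style table: 10 = local, larger = further away. */
	static const int node_distance[NR_NODES][NR_NODES] = {
		{ 10, 20, 20, 30 },
		{ 20, 10, 30, 20 },
		{ 20, 30, 10, 20 },
		{ 30, 20, 20, 10 },
	};

	int main(void)
	{
		int distances[NR_NODES];	/* unique distances, ascending */
		int levels = 0;
		int curr = node_distance[0][0];	/* the local distance */
		int i, j, level;

		/*
		 * Deduplicating selection "sort" over row 0, like
		 * sched_init_numa(): repeatedly pick the smallest distance
		 * strictly greater than the current one.
		 */
		for (;;) {
			int next = curr;

			for (j = 0; j < NR_NODES; j++) {
				int d = node_distance[0][j];
				if (d > curr && (d < next || next == curr))
					next = d;
			}
			if (next == curr)
				break;
			distances[levels++] = next;
			curr = next;
		}

		/*
		 * For each level, each node's "mask" is every node no
		 * further away than that level's distance.
		 */
		for (level = 0; level < levels; level++) {
			printf("level %d (distance <= %d):\n",
			       level, distances[level]);
			for (i = 0; i < NR_NODES; i++) {
				printf("  node %d spans:", i);
				for (j = 0; j < NR_NODES; j++) {
					if (node_distance[i][j] <= distances[level])
						printf(" %d", j);
				}
				printf("\n");
			}
		}
		return 0;
	}

On this table the sketch finds two levels (distances 20 and 30); the patch
appends one sched_domain_topology_level per such distance instead of the old
fixed 16-nodes-per-level-plus-ALLNODES layering.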
-rw-r--r--	arch/ia64/include/asm/topology.h	25
-rw-r--r--	arch/mips/include/asm/mach-ip27/topology.h	17
-rw-r--r--	arch/powerpc/include/asm/topology.h	36
-rw-r--r--	arch/sh/include/asm/topology.h	25
-rw-r--r--	arch/sparc/include/asm/topology_64.h	19
-rw-r--r--	arch/tile/include/asm/topology.h	26
-rw-r--r--	arch/x86/include/asm/topology.h	38
-rw-r--r--	include/linux/topology.h	37
-rw-r--r--	kernel/sched/core.c	280
9 files changed, 185 insertions, 318 deletions
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index 09f646753d1..a2496e449b7 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -70,31 +70,6 @@ void build_cpu_to_node_map(void);
 	.nr_balance_failed	= 0,			\
 }
 
-/* sched_domains SD_NODE_INIT for IA64 NUMA machines */
-#define SD_NODE_INIT (struct sched_domain) {		\
-	.parent			= NULL,			\
-	.child			= NULL,			\
-	.groups			= NULL,			\
-	.min_interval		= 8,			\
-	.max_interval		= 8*(min(num_online_cpus(), 32U)), \
-	.busy_factor		= 64,			\
-	.imbalance_pct		= 125,			\
-	.cache_nice_tries	= 2,			\
-	.busy_idx		= 3,			\
-	.idle_idx		= 2,			\
-	.newidle_idx		= 0,			\
-	.wake_idx		= 0,			\
-	.forkexec_idx		= 0,			\
-	.flags			= SD_LOAD_BALANCE	\
-				| SD_BALANCE_NEWIDLE	\
-				| SD_BALANCE_EXEC	\
-				| SD_BALANCE_FORK	\
-				| SD_SERIALIZE,		\
-	.last_balance		= jiffies,		\
-	.balance_interval	= 64,			\
-	.nr_balance_failed	= 0,			\
-}
-
 #endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_SMP
diff --git a/arch/mips/include/asm/mach-ip27/topology.h b/arch/mips/include/asm/mach-ip27/topology.h
index 1b1a7d1632b..b2cf641f206 100644
--- a/arch/mips/include/asm/mach-ip27/topology.h
+++ b/arch/mips/include/asm/mach-ip27/topology.h
@@ -36,23 +36,6 @@ extern unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES];
 
 #define node_distance(from, to)	(__node_distances[(from)][(to)])
 
-/* sched_domains SD_NODE_INIT for SGI IP27 machines */
-#define SD_NODE_INIT (struct sched_domain) {		\
-	.parent			= NULL,			\
-	.child			= NULL,			\
-	.groups			= NULL,			\
-	.min_interval		= 8,			\
-	.max_interval		= 32,			\
-	.busy_factor		= 32,			\
-	.imbalance_pct		= 125,			\
-	.cache_nice_tries	= 1,			\
-	.flags			= SD_LOAD_BALANCE |	\
-				  SD_BALANCE_EXEC,	\
-	.last_balance		= jiffies,		\
-	.balance_interval	= 1,			\
-	.nr_balance_failed	= 0,			\
-}
-
 #include <asm-generic/topology.h>
 
 #endif /* _ASM_MACH_TOPOLOGY_H */
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index c97185885c6..852ed1b384f 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -18,12 +18,6 @@ struct device_node;
  */
 #define RECLAIM_DISTANCE 10
 
-/*
- * Avoid creating an extra level of balancing (SD_ALLNODES) on the largest
- * POWER7 boxes which have a maximum of 32 nodes.
- */
-#define SD_NODES_PER_DOMAIN 32
-
 #include <asm/mmzone.h>
 
 static inline int cpu_to_node(int cpu)
@@ -51,36 +45,6 @@ static inline int pcibus_to_node(struct pci_bus *bus)
 				 cpu_all_mask :		\
 				 cpumask_of_node(pcibus_to_node(bus)))
 
-/* sched_domains SD_NODE_INIT for PPC64 machines */
-#define SD_NODE_INIT (struct sched_domain) {		\
-	.min_interval		= 8,			\
-	.max_interval		= 32,			\
-	.busy_factor		= 32,			\
-	.imbalance_pct		= 125,			\
-	.cache_nice_tries	= 1,			\
-	.busy_idx		= 3,			\
-	.idle_idx		= 1,			\
-	.newidle_idx		= 0,			\
-	.wake_idx		= 0,			\
-	.forkexec_idx		= 0,			\
-							\
-	.flags			= 1*SD_LOAD_BALANCE	\
-				| 0*SD_BALANCE_NEWIDLE	\
-				| 1*SD_BALANCE_EXEC	\
-				| 1*SD_BALANCE_FORK	\
-				| 0*SD_BALANCE_WAKE	\
-				| 1*SD_WAKE_AFFINE	\
-				| 0*SD_PREFER_LOCAL	\
-				| 0*SD_SHARE_CPUPOWER	\
-				| 0*SD_POWERSAVINGS_BALANCE \
-				| 0*SD_SHARE_PKG_RESOURCES \
-				| 1*SD_SERIALIZE	\
-				| 0*SD_PREFER_SIBLING	\
-				,			\
-	.last_balance		= jiffies,		\
-	.balance_interval	= 1,			\
-}
-
 extern int __node_distance(int, int);
 #define node_distance(a, b) __node_distance(a, b)
 
diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h
index 88e734069fa..b0a282d65f6 100644
--- a/arch/sh/include/asm/topology.h
+++ b/arch/sh/include/asm/topology.h
@@ -3,31 +3,6 @@
 
 #ifdef CONFIG_NUMA
 
-/* sched_domains SD_NODE_INIT for sh machines */
-#define SD_NODE_INIT (struct sched_domain) {		\
-	.parent			= NULL,			\
-	.child			= NULL,			\
-	.groups			= NULL,			\
-	.min_interval		= 8,			\
-	.max_interval		= 32,			\
-	.busy_factor		= 32,			\
-	.imbalance_pct		= 125,			\
-	.cache_nice_tries	= 2,			\
-	.busy_idx		= 3,			\
-	.idle_idx		= 2,			\
-	.newidle_idx		= 0,			\
-	.wake_idx		= 0,			\
-	.forkexec_idx		= 0,			\
-	.flags			= SD_LOAD_BALANCE	\
-				| SD_BALANCE_FORK	\
-				| SD_BALANCE_EXEC	\
-				| SD_BALANCE_NEWIDLE	\
-				| SD_SERIALIZE,		\
-	.last_balance		= jiffies,		\
-	.balance_interval	= 1,			\
-	.nr_balance_failed	= 0,			\
-}
-
 #define cpu_to_node(cpu)	((void)(cpu),0)
 #define parent_node(node)	((void)(node),0)
 
diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h
index 8b9c556d630..1754390a426 100644
--- a/arch/sparc/include/asm/topology_64.h
+++ b/arch/sparc/include/asm/topology_64.h
@@ -31,25 +31,6 @@ static inline int pcibus_to_node(struct pci_bus *pbus)
 				 cpu_all_mask :		\
 				 cpumask_of_node(pcibus_to_node(bus)))
 
-#define SD_NODE_INIT (struct sched_domain) {		\
-	.min_interval		= 8,			\
-	.max_interval		= 32,			\
-	.busy_factor		= 32,			\
-	.imbalance_pct		= 125,			\
-	.cache_nice_tries	= 2,			\
-	.busy_idx		= 3,			\
-	.idle_idx		= 2,			\
-	.newidle_idx		= 0,			\
-	.wake_idx		= 0,			\
-	.forkexec_idx		= 0,			\
-	.flags			= SD_LOAD_BALANCE	\
-				| SD_BALANCE_FORK	\
-				| SD_BALANCE_EXEC	\
-				| SD_SERIALIZE,		\
-	.last_balance		= jiffies,		\
-	.balance_interval	= 1,			\
-}
-
 #else /* CONFIG_NUMA */
 
 #include <asm-generic/topology.h>
diff --git a/arch/tile/include/asm/topology.h b/arch/tile/include/asm/topology.h
index 6fdd0c86019..7a7ce390534 100644
--- a/arch/tile/include/asm/topology.h
+++ b/arch/tile/include/asm/topology.h
@@ -78,32 +78,6 @@ static inline const struct cpumask *cpumask_of_node(int node)
 	.balance_interval	= 32,			\
 }
 
-/* sched_domains SD_NODE_INIT for TILE architecture */
-#define SD_NODE_INIT (struct sched_domain) {		\
-	.min_interval		= 16,			\
-	.max_interval		= 512,			\
-	.busy_factor		= 32,			\
-	.imbalance_pct		= 125,			\
-	.cache_nice_tries	= 1,			\
-	.busy_idx		= 3,			\
-	.idle_idx		= 1,			\
-	.newidle_idx		= 2,			\
-	.wake_idx		= 1,			\
-	.flags			= 1*SD_LOAD_BALANCE	\
-				| 1*SD_BALANCE_NEWIDLE	\
-				| 1*SD_BALANCE_EXEC	\
-				| 1*SD_BALANCE_FORK	\
-				| 0*SD_BALANCE_WAKE	\
-				| 0*SD_WAKE_AFFINE	\
-				| 0*SD_PREFER_LOCAL	\
-				| 0*SD_SHARE_CPUPOWER	\
-				| 0*SD_SHARE_PKG_RESOURCES \
-				| 1*SD_SERIALIZE	\
-				,			\
-	.last_balance		= jiffies,		\
-	.balance_interval	= 128,			\
-}
-
 /* By definition, we create nodes based on online memory. */
 #define node_has_online_mem(nid) 1
 
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index b9676ae37ad..095b21507b6 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -92,44 +92,6 @@ extern void setup_node_to_cpumask_map(void);
 
 #define pcibus_to_node(bus) __pcibus_to_node(bus)
 
-#ifdef CONFIG_X86_32
-# define SD_CACHE_NICE_TRIES	1
-# define SD_IDLE_IDX		1
-#else
-# define SD_CACHE_NICE_TRIES	2
-# define SD_IDLE_IDX		2
-#endif
-
-/* sched_domains SD_NODE_INIT for NUMA machines */
-#define SD_NODE_INIT (struct sched_domain) {		\
-	.min_interval		= 8,			\
-	.max_interval		= 32,			\
-	.busy_factor		= 32,			\
-	.imbalance_pct		= 125,			\
-	.cache_nice_tries	= SD_CACHE_NICE_TRIES,	\
-	.busy_idx		= 3,			\
-	.idle_idx		= SD_IDLE_IDX,		\
-	.newidle_idx		= 0,			\
-	.wake_idx		= 0,			\
-	.forkexec_idx		= 0,			\
-							\
-	.flags			= 1*SD_LOAD_BALANCE	\
-				| 1*SD_BALANCE_NEWIDLE	\
-				| 1*SD_BALANCE_EXEC	\
-				| 1*SD_BALANCE_FORK	\
-				| 0*SD_BALANCE_WAKE	\
-				| 1*SD_WAKE_AFFINE	\
-				| 0*SD_PREFER_LOCAL	\
-				| 0*SD_SHARE_CPUPOWER	\
-				| 0*SD_POWERSAVINGS_BALANCE \
-				| 0*SD_SHARE_PKG_RESOURCES \
-				| 1*SD_SERIALIZE	\
-				| 0*SD_PREFER_SIBLING	\
-				,			\
-	.last_balance		= jiffies,		\
-	.balance_interval	= 1,			\
-}
-
 extern int __node_distance(int, int);
 #define node_distance(a, b) __node_distance(a, b)
 
diff --git a/include/linux/topology.h b/include/linux/topology.h
index e26db031303..4f59bf36f0a 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -70,7 +70,6 @@ int arch_update_cpu_topology(void);
  * Below are the 3 major initializers used in building sched_domains:
  * SD_SIBLING_INIT, for SMT domains
  * SD_CPU_INIT, for SMP domains
- * SD_NODE_INIT, for NUMA domains
  *
  * Any architecture that cares to do any tuning to these values should do so
  * by defining their own arch-specific initializer in include/asm/topology.h.
@@ -176,48 +175,12 @@ int arch_update_cpu_topology(void);
 }
 #endif
 
-/* sched_domains SD_ALLNODES_INIT for NUMA machines */
-#define SD_ALLNODES_INIT (struct sched_domain) {	\
-	.min_interval		= 64,			\
-	.max_interval		= 64*num_online_cpus(),	\
-	.busy_factor		= 128,			\
-	.imbalance_pct		= 133,			\
-	.cache_nice_tries	= 1,			\
-	.busy_idx		= 3,			\
-	.idle_idx		= 3,			\
-	.flags			= 1*SD_LOAD_BALANCE	\
-				| 1*SD_BALANCE_NEWIDLE	\
-				| 0*SD_BALANCE_EXEC	\
-				| 0*SD_BALANCE_FORK	\
-				| 0*SD_BALANCE_WAKE	\
-				| 0*SD_WAKE_AFFINE	\
-				| 0*SD_SHARE_CPUPOWER	\
-				| 0*SD_POWERSAVINGS_BALANCE \
-				| 0*SD_SHARE_PKG_RESOURCES \
-				| 1*SD_SERIALIZE	\
-				| 0*SD_PREFER_SIBLING	\
-				,			\
-	.last_balance		= jiffies,		\
-	.balance_interval	= 64,			\
-}
-
-#ifndef SD_NODES_PER_DOMAIN
-#define SD_NODES_PER_DOMAIN 16
-#endif
-
 #ifdef CONFIG_SCHED_BOOK
 #ifndef SD_BOOK_INIT
 #error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!!
 #endif
 #endif /* CONFIG_SCHED_BOOK */
 
-#ifdef CONFIG_NUMA
-#ifndef SD_NODE_INIT
-#error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
-#endif
-
-#endif /* CONFIG_NUMA */
-
 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
 DECLARE_PER_CPU(int, numa_node);
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6001e5c3b4e..b4f2096980a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5560,7 +5560,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 			break;
 		}
 
-		if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
+		if (!(sd->flags & SD_OVERLAP) &&
+		    cpumask_intersects(groupmask, sched_group_cpus(group))) {
 			printk(KERN_CONT "\n");
 			printk(KERN_ERR "ERROR: repeated CPUs\n");
 			break;
@@ -5898,92 +5899,6 @@ static int __init isolated_cpu_setup(char *str)
 
 __setup("isolcpus=", isolated_cpu_setup);
 
-#ifdef CONFIG_NUMA
-
-/**
- * find_next_best_node - find the next node to include in a sched_domain
- * @node: node whose sched_domain we're building
- * @used_nodes: nodes already in the sched_domain
- *
- * Find the next node to include in a given scheduling domain. Simply
- * finds the closest node not already in the @used_nodes map.
- *
- * Should use nodemask_t.
- */
-static int find_next_best_node(int node, nodemask_t *used_nodes)
-{
-	int i, n, val, min_val, best_node = -1;
-
-	min_val = INT_MAX;
-
-	for (i = 0; i < nr_node_ids; i++) {
-		/* Start at @node */
-		n = (node + i) % nr_node_ids;
-
-		if (!nr_cpus_node(n))
-			continue;
-
-		/* Skip already used nodes */
-		if (node_isset(n, *used_nodes))
-			continue;
-
-		/* Simple min distance search */
-		val = node_distance(node, n);
-
-		if (val < min_val) {
-			min_val = val;
-			best_node = n;
-		}
-	}
-
-	if (best_node != -1)
-		node_set(best_node, *used_nodes);
-	return best_node;
-}
-
-/**
- * sched_domain_node_span - get a cpumask for a node's sched_domain
- * @node: node whose cpumask we're constructing
- * @span: resulting cpumask
- *
- * Given a node, construct a good cpumask for its sched_domain to span. It
- * should be one that prevents unnecessary balancing, but also spreads tasks
- * out optimally.
- */
-static void sched_domain_node_span(int node, struct cpumask *span)
-{
-	nodemask_t used_nodes;
-	int i;
-
-	cpumask_clear(span);
-	nodes_clear(used_nodes);
-
-	cpumask_or(span, span, cpumask_of_node(node));
-	node_set(node, used_nodes);
-
-	for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
-		int next_node = find_next_best_node(node, &used_nodes);
-		if (next_node < 0)
-			break;
-		cpumask_or(span, span, cpumask_of_node(next_node));
-	}
-}
-
-static const struct cpumask *cpu_node_mask(int cpu)
-{
-	lockdep_assert_held(&sched_domains_mutex);
-
-	sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
-
-	return sched_domains_tmpmask;
-}
-
-static const struct cpumask *cpu_allnodes_mask(int cpu)
-{
-	return cpu_possible_mask;
-}
-#endif /* CONFIG_NUMA */
-
 static const struct cpumask *cpu_cpu_mask(int cpu)
 {
 	return cpumask_of_node(cpu_to_node(cpu));
@@ -6020,6 +5935,7 @@ struct sched_domain_topology_level {
 	sched_domain_init_f init;
 	sched_domain_mask_f mask;
 	int		    flags;
+	int		    numa_level;
 	struct sd_data      data;
 };
 
@@ -6213,10 +6129,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
 }
 
 SD_INIT_FUNC(CPU)
-#ifdef CONFIG_NUMA
- SD_INIT_FUNC(ALLNODES)
- SD_INIT_FUNC(NODE)
-#endif
 #ifdef CONFIG_SCHED_SMT
  SD_INIT_FUNC(SIBLING)
 #endif
@@ -6338,15 +6250,191 @@ static struct sched_domain_topology_level default_topology[] = {
 	{ sd_init_BOOK, cpu_book_mask, },
 #endif
 	{ sd_init_CPU, cpu_cpu_mask, },
-#ifdef CONFIG_NUMA
-	{ sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
-	{ sd_init_ALLNODES, cpu_allnodes_mask, },
-#endif
 	{ NULL, },
 };
 
 static struct sched_domain_topology_level *sched_domain_topology = default_topology;
 
+#ifdef CONFIG_NUMA
+
+static int sched_domains_numa_levels;
+static int sched_domains_numa_scale;
+static int *sched_domains_numa_distance;
+static struct cpumask ***sched_domains_numa_masks;
+static int sched_domains_curr_level;
+
+static inline unsigned long numa_scale(unsigned long x, int level)
+{
+	return x * sched_domains_numa_distance[level] / sched_domains_numa_scale;
+}
+
+static inline int sd_local_flags(int level)
+{
+	if (sched_domains_numa_distance[level] > REMOTE_DISTANCE)
+		return 0;
+
+	return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
+}
+
+static struct sched_domain *
+sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
+{
+	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
+	int level = tl->numa_level;
+	int sd_weight = cpumask_weight(
+			sched_domains_numa_masks[level][cpu_to_node(cpu)]);
+
+	*sd = (struct sched_domain){
+		.min_interval		= sd_weight,
+		.max_interval		= 2*sd_weight,
+		.busy_factor		= 32,
+		.imbalance_pct		= 100 + numa_scale(25, level),
+		.cache_nice_tries	= 2,
+		.busy_idx		= 3,
+		.idle_idx		= 2,
+		.newidle_idx		= 0,
+		.wake_idx		= 0,
+		.forkexec_idx		= 0,
+
+		.flags			= 1*SD_LOAD_BALANCE
+					| 1*SD_BALANCE_NEWIDLE
+					| 0*SD_BALANCE_EXEC
+					| 0*SD_BALANCE_FORK
+					| 0*SD_BALANCE_WAKE
+					| 0*SD_WAKE_AFFINE
+					| 0*SD_PREFER_LOCAL
+					| 0*SD_SHARE_CPUPOWER
+					| 0*SD_POWERSAVINGS_BALANCE
+					| 0*SD_SHARE_PKG_RESOURCES
+					| 1*SD_SERIALIZE
+					| 0*SD_PREFER_SIBLING
+					| sd_local_flags(level)
+					,
+		.last_balance		= jiffies,
+		.balance_interval	= sd_weight,
+	};
+	SD_INIT_NAME(sd, NUMA);
+	sd->private = &tl->data;
+
+	/*
+	 * Ugly hack to pass state to sd_numa_mask()...
+	 */
+	sched_domains_curr_level = tl->numa_level;
+
+	return sd;
+}
+
+static const struct cpumask *sd_numa_mask(int cpu)
+{
+	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
+}
+
+static void sched_init_numa(void)
+{
+	int next_distance, curr_distance = node_distance(0, 0);
+	struct sched_domain_topology_level *tl;
+	int level = 0;
+	int i, j, k;
+
+	sched_domains_numa_scale = curr_distance;
+	sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
+	if (!sched_domains_numa_distance)
+		return;
+
+	/*
+	 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
+	 * unique distances in the node_distance() table.
+	 *
+	 * Assumes node_distance(0,j) includes all distances in
+	 * node_distance(i,j) in order to avoid cubic time.
+	 *
+	 * XXX: could be optimized to O(n log n) by using sort()
+	 */
+	next_distance = curr_distance;
+	for (i = 0; i < nr_node_ids; i++) {
+		for (j = 0; j < nr_node_ids; j++) {
+			int distance = node_distance(0, j);
+			if (distance > curr_distance &&
+					(distance < next_distance ||
+					 next_distance == curr_distance))
+				next_distance = distance;
+		}
+		if (next_distance != curr_distance) {
+			sched_domains_numa_distance[level++] = next_distance;
+			sched_domains_numa_levels = level;
+			curr_distance = next_distance;
+		} else break;
+	}
+	/*
+	 * 'level' contains the number of unique distances, excluding the
+	 * identity distance node_distance(i,i).
+	 *
+	 * The sched_domains_numa_distance[] array includes the actual distance
+	 * numbers.
+	 */
+
+	sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+	if (!sched_domains_numa_masks)
+		return;
+
+	/*
+	 * Now for each level, construct a mask per node which contains all
+	 * cpus of nodes that are that many hops away from us.
+	 */
+	for (i = 0; i < level; i++) {
+		sched_domains_numa_masks[i] =
+			kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
+		if (!sched_domains_numa_masks[i])
+			return;
+
+		for (j = 0; j < nr_node_ids; j++) {
+			struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j);
+			if (!mask)
+				return;
+
+			sched_domains_numa_masks[i][j] = mask;
+
+			for (k = 0; k < nr_node_ids; k++) {
+				if (node_distance(cpu_to_node(j), k) >
+						sched_domains_numa_distance[i])
+					continue;
+
+				cpumask_or(mask, mask, cpumask_of_node(k));
+			}
+		}
+	}
+
+	tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
+			sizeof(struct sched_domain_topology_level), GFP_KERNEL);
+	if (!tl)
+		return;
+
+	/*
+	 * Copy the default topology bits..
+	 */
+	for (i = 0; default_topology[i].init; i++)
+		tl[i] = default_topology[i];
+
+	/*
+	 * .. and append 'j' levels of NUMA goodness.
+	 */
+	for (j = 0; j < level; i++, j++) {
+		tl[i] = (struct sched_domain_topology_level){
+			.init = sd_numa_init,
+			.mask = sd_numa_mask,
+			.flags = SDTL_OVERLAP,
+			.numa_level = j,
+		};
+	}
+
+	sched_domain_topology = tl;
+}
+#else
+static inline void sched_init_numa(void)
+{
+}
+#endif /* CONFIG_NUMA */
+
 static int __sdt_alloc(const struct cpumask *cpu_map)
 {
 	struct sched_domain_topology_level *tl;
@@ -6840,6 +6928,8 @@ void __init sched_init_smp(void)
 	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
 	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
 
+	sched_init_numa();
+
 	get_online_cpus();
 	mutex_lock(&sched_domains_mutex);
 	init_sched_domains(cpu_active_mask);
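
[Editorial note: to make the "scales some values" part of the changelog
concrete, here is the arithmetic that sd_numa_init() above performs, worked
through for the hypothetical 4-node distance table from the earlier sketch
(local distance 10, levels at 20 and 30). This is only arithmetic on the
formulas visible in the diff, not additional kernel code:]

	/*
	 * imbalance_pct = 100 + numa_scale(25, level)
	 *               = 100 + 25 * distance[level] / local_distance
	 *
	 * level 0: 100 + 25 * 20 / 10 = 150
	 * level 1: 100 + 25 * 30 / 10 = 175
	 *
	 * min_interval and balance_interval track the domain's CPU count
	 * (sd_weight), and max_interval is twice that, so the wider and
	 * more remote a level is, the lazier its balancing gets. Levels
	 * whose distance exceeds REMOTE_DISTANCE additionally lose
	 * SD_BALANCE_EXEC, SD_BALANCE_FORK and SD_WAKE_AFFINE via
	 * sd_local_flags().
	 */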