Diffstat (limited to 'arch/x86/mm/numa_emulation.c')

 arch/x86/mm/numa_emulation.c | 492 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 492 insertions(+), 0 deletions(-)

diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
new file mode 100644
index 000000000000..d0ed086b6247
--- /dev/null
+++ b/arch/x86/mm/numa_emulation.c
@@ -0,0 +1,492 @@
/*
 * NUMA emulation
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/topology.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
#include <asm/dma.h>

#include "numa_internal.h"
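
/*
 * Note (informative): FAKE_NODE_MIN_SIZE and FAKE_NODE_MIN_HASH_MASK
 * used below come from the asm headers; at the time of writing
 * FAKE_NODE_MIN_SIZE is 32MB and the mask rounds sizes down to that
 * granularity.
 */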

static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata;
static char *emu_cmdline __initdata;

void __init numa_emu_cmdline(char *str)
{
        emu_cmdline = str;
}
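
/*
 * Example (informative): @str is the value of the numa=fake= boot
 * parameter, e.g. "8" to split RAM into eight interleaved nodes or
 * "128M"/"1G" to split it into fixed-size nodes; see numa_emulation()
 * for the parsing.
 */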

static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
{
        int i;

        for (i = 0; i < mi->nr_blks; i++)
                if (mi->blk[i].nid == nid)
                        return i;
        return -ENOENT;
}
/*
 * Sets up an emulated memblk of @size under emulated node @nid by
 * carving it out of physical block @phys_blk of @pi.  The return value
 * is -errno if something went wrong, 0 otherwise.
 */
static int __init emu_setup_memblk(struct numa_meminfo *ei,
                                   struct numa_meminfo *pi,
                                   int nid, int phys_blk, u64 size)
{
        struct numa_memblk *eb = &ei->blk[ei->nr_blks];
        struct numa_memblk *pb = &pi->blk[phys_blk];

        if (ei->nr_blks >= NR_NODE_MEMBLKS) {
                pr_err("NUMA: Too many emulated memblks, failing emulation\n");
                return -EINVAL;
        }

        ei->nr_blks++;
        eb->start = pb->start;
        eb->end = pb->start + size;
        eb->nid = nid;

        if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
                emu_nid_to_phys[nid] = pb->nid;

        pb->start += size;
        if (pb->start >= pb->end) {
                WARN_ON_ONCE(pb->start > pb->end);
                numa_remove_memblk_from(phys_blk, pi);
        }

        printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
               eb->start, eb->end, (eb->end - eb->start) >> 20);
        return 0;
}
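
/*
 * Worked example (informative): if physical block 0 covers [0, 4096M)
 * and emu_setup_memblk(ei, pi, 0, 0, 1024M) is called, emulated node 0
 * gets [0, 1024M), emu_nid_to_phys[0] becomes the block's physical nid,
 * and the physical block shrinks to [1024M, 4096M) for the next carve.
 */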

/*
 * Sets up @nr_nodes fake nodes interleaved over physical nodes ranging
 * from @addr to @max_addr.  Returns 0 on success or a negative value
 * on failure.
 */
static int __init split_nodes_interleave(struct numa_meminfo *ei,
                                         struct numa_meminfo *pi,
                                         u64 addr, u64 max_addr, int nr_nodes)
{
        nodemask_t physnode_mask = NODE_MASK_NONE;
        u64 size;
        int big;
        int nid = 0;
        int i, ret;

        if (nr_nodes <= 0)
                return -1;
        if (nr_nodes > MAX_NUMNODES) {
                pr_info("numa=fake=%d too large, reducing to %d\n",
                        nr_nodes, MAX_NUMNODES);
                nr_nodes = MAX_NUMNODES;
        }

        /*
         * Calculate target node size.  x86_32 freaks on __udivdi3() so do
         * the division in ulong number of pages and convert back.
         */
        size = max_addr - addr - memblock_x86_hole_size(addr, max_addr);
        size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);

        /*
         * Calculate the number of big nodes that can be allocated as a result
         * of consolidating the remainder.
         */
        big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
                FAKE_NODE_MIN_SIZE;

        size &= FAKE_NODE_MIN_HASH_MASK;
        if (!size) {
                pr_err("Not enough memory for each node.  NUMA emulation disabled.\n");
                return -1;
        }
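
        /*
         * Worked example (informative, assuming 32MB granularity): with
         * 4096MB usable and nr_nodes = 3, size starts at ~1365MB; the
         * ~21MB per-node remainder gives big = (3 * 21MB) / 32MB = 1,
         * and size rounds down to 1344MB.  One node then gets
         * 1344 + 32 = 1376MB, the other two 1344MB each, and the last
         * node absorbs the leftover when it extends to its limit below.
         */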

        for (i = 0; i < pi->nr_blks; i++)
                node_set(pi->blk[i].nid, physnode_mask);

        /*
         * Continue to fill physical nodes with fake nodes until there is no
         * memory left on any of them.
         */
        while (nodes_weight(physnode_mask)) {
                for_each_node_mask(i, physnode_mask) {
                        u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
                        u64 start, limit, end;
                        int phys_blk;

                        phys_blk = emu_find_memblk_by_nid(i, pi);
                        if (phys_blk < 0) {
                                node_clear(i, physnode_mask);
                                continue;
                        }
                        start = pi->blk[phys_blk].start;
                        limit = pi->blk[phys_blk].end;
                        end = start + size;

                        if (nid < big)
                                end += FAKE_NODE_MIN_SIZE;

                        /*
                         * Continue to add memory to this fake node if its
                         * non-reserved memory is less than the per-node size.
                         */
                        while (end - start -
                               memblock_x86_hole_size(start, end) < size) {
                                end += FAKE_NODE_MIN_SIZE;
                                if (end > limit) {
                                        end = limit;
                                        break;
                                }
                        }

                        /*
                         * If there won't be at least FAKE_NODE_MIN_SIZE of
                         * non-reserved memory in ZONE_DMA32 for the next node,
                         * this one must extend to the boundary.
                         */
                        if (end < dma32_end && dma32_end - end -
                            memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
                                end = dma32_end;

                        /*
                         * If there won't be enough non-reserved memory for the
                         * next node, this one must extend to the end of the
                         * physical node.
                         */
                        if (limit - end -
                            memblock_x86_hole_size(end, limit) < size)
                                end = limit;

                        ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
                                               phys_blk,
                                               min(end, limit) - start);
                        if (ret < 0)
                                return ret;
                }
        }
        return 0;
}
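
/*
 * Interleave example (informative): "numa=fake=4" on a machine with two
 * equally sized physical nodes assigns fake nid 0 to physical node 0,
 * nid 1 to physical node 1, nid 2 to physical node 0 and nid 3 to
 * physical node 1, i.e. emu_nid_to_phys[] ends up roughly {0, 1, 0, 1}.
 */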

/*
 * Returns the end address of a node so that there is at least `size' amount of
 * non-reserved memory or `max_addr' is reached.
 */
static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
{
        u64 end = start + size;

        while (end - start - memblock_x86_hole_size(start, end) < size) {
                end += FAKE_NODE_MIN_SIZE;
                if (end > max_addr) {
                        end = max_addr;
                        break;
                }
        }
        return end;
}
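
/*
 * Hole example (informative): if [start, start + size) contains a
 * reserved hole, the loop above bumps `end' in FAKE_NODE_MIN_SIZE steps
 * until the non-reserved span reaches `size' or `max_addr' is hit.
 */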

/*
 * Sets up fake nodes of `size' interleaved over physical nodes ranging
 * from `addr' to `max_addr'.  Returns 0 on success or a negative value
 * on failure.
 */
static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
                                              struct numa_meminfo *pi,
                                              u64 addr, u64 max_addr, u64 size)
{
        nodemask_t physnode_mask = NODE_MASK_NONE;
        u64 min_size;
        int nid = 0;
        int i, ret;

        if (!size)
                return -1;
        /*
         * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
         * increased accordingly if the requested size is too small.  This
         * creates a uniform distribution of node sizes across the entire
         * machine (but not necessarily over physical nodes).
         */
        min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) /
                MAX_NUMNODES;
        min_size = max(min_size, FAKE_NODE_MIN_SIZE);
        if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
                min_size = (min_size + FAKE_NODE_MIN_SIZE) &
                        FAKE_NODE_MIN_HASH_MASK;
        if (size < min_size) {
                pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
                       size >> 20, min_size >> 20);
                size = min_size;
        }
        size &= FAKE_NODE_MIN_HASH_MASK;
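
        /*
         * Rounding example (informative, assuming 32MB granularity and
         * MAX_NUMNODES = 512): on a 64GB machine min_size starts at
         * 64GB / 512 = 128MB, which is already a 32MB multiple, so a
         * request of "numa=fake=64M" would be bumped to 128MB here.
         */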

        for (i = 0; i < pi->nr_blks; i++)
                node_set(pi->blk[i].nid, physnode_mask);

        /*
         * Fill physical nodes with fake nodes of `size' until there is no
         * memory left on any of them.
         */
        while (nodes_weight(physnode_mask)) {
                for_each_node_mask(i, physnode_mask) {
                        u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
                        u64 start, limit, end;
                        int phys_blk;

                        phys_blk = emu_find_memblk_by_nid(i, pi);
                        if (phys_blk < 0) {
                                node_clear(i, physnode_mask);
                                continue;
                        }
                        start = pi->blk[phys_blk].start;
                        limit = pi->blk[phys_blk].end;

                        end = find_end_of_node(start, limit, size);
                        /*
                         * If there won't be at least FAKE_NODE_MIN_SIZE of
                         * non-reserved memory in ZONE_DMA32 for the next node,
                         * this one must extend to the boundary.
                         */
                        if (end < dma32_end && dma32_end - end -
                            memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
                                end = dma32_end;

                        /*
                         * If there won't be enough non-reserved memory for the
                         * next node, this one must extend to the end of the
                         * physical node.
                         */
                        if (limit - end -
                            memblock_x86_hole_size(end, limit) < size)
                                end = limit;

                        ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
                                               phys_blk,
                                               min(end, limit) - start);
                        if (ret < 0)
                                return ret;
                }
        }
        return 0;
}

/**
 * numa_emulation - Emulate NUMA nodes
 * @numa_meminfo: NUMA configuration to massage
 * @numa_dist_cnt: The size of the physical NUMA distance table
 *
 * Emulate NUMA nodes according to the numa=fake kernel parameter.
 * @numa_meminfo contains the physical memory configuration and is modified
 * to reflect the emulated configuration on success.  @numa_dist_cnt is
 * used to determine the size of the physical distance table.
 *
 * On success, the following modifications are made.
 *
 * - @numa_meminfo is updated to reflect the emulated nodes.
 *
 * - __apicid_to_node[] is updated such that APIC IDs are mapped to the
 *   emulated nodes.
 *
 * - NUMA distance table is rebuilt to represent distances between emulated
 *   nodes.  The distances are derived from how the emulated nodes map to
 *   physical nodes so that they match the actual physical distances.
 *
 * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
 *   nodes.  This is used by numa_add_cpu() and numa_remove_cpu().
 *
 * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
 * identity mapping and no other modification is made.
 */
void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
{
        static struct numa_meminfo ei __initdata;
        static struct numa_meminfo pi __initdata;
        const u64 max_addr = PFN_PHYS(max_pfn);
        u8 *phys_dist = NULL;
        size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
        int max_emu_nid, dfl_phys_nid;
        int i, j, ret;

        if (!emu_cmdline)
                goto no_emu;

        memset(&ei, 0, sizeof(ei));
        pi = *numa_meminfo;

        for (i = 0; i < MAX_NUMNODES; i++)
                emu_nid_to_phys[i] = NUMA_NO_NODE;

        /*
         * If the numa=fake command-line contains a 'M' or 'G', it represents
         * the fixed node size.  Otherwise, if it is just a single number N,
         * split the system RAM into N fake nodes.
         */
        if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
                u64 size;

                size = memparse(emu_cmdline, &emu_cmdline);
                ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
        } else {
                unsigned long n;

                n = simple_strtoul(emu_cmdline, NULL, 0);
                ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
        }
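
        /*
         * E.g. (informative): "numa=fake=8" takes the else branch and
         * interleaves eight nodes, while "numa=fake=128M" is parsed by
         * memparse() and goes through split_nodes_size_interleave().
         */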

        if (ret < 0)
                goto no_emu;

        if (numa_cleanup_meminfo(&ei) < 0) {
                pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
                goto no_emu;
        }

        /* copy the physical distance table */
        if (numa_dist_cnt) {
                u64 phys;

                phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
                                              phys_size, PAGE_SIZE);
                if (phys == MEMBLOCK_ERROR) {
                        pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
                        goto no_emu;
                }
                memblock_x86_reserve_range(phys, phys + phys_size, "TMP NUMA DIST");
                phys_dist = __va(phys);

                for (i = 0; i < numa_dist_cnt; i++)
                        for (j = 0; j < numa_dist_cnt; j++)
                                phys_dist[i * numa_dist_cnt + j] =
                                        node_distance(i, j);
        }

        /*
         * Determine the max emulated nid and the default phys nid to use
         * for unmapped nodes.
         */
        max_emu_nid = 0;
        dfl_phys_nid = NUMA_NO_NODE;
        for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
                if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
                        max_emu_nid = i;
                        if (dfl_phys_nid == NUMA_NO_NODE)
                                dfl_phys_nid = emu_nid_to_phys[i];
                }
        }
        if (dfl_phys_nid == NUMA_NO_NODE) {
                pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n");
                goto no_emu;
        }

        /* commit */
        *numa_meminfo = ei;

        /*
         * Transform __apicid_to_node table to use emulated nids by
         * reverse-mapping phys_nid.  The maps should always exist but fall
         * back to zero just in case.
         */
        for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
                if (__apicid_to_node[i] == NUMA_NO_NODE)
                        continue;
                for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
                        if (__apicid_to_node[i] == emu_nid_to_phys[j])
                                break;
                __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
        }
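
        /*
         * E.g. (informative): with emu_nid_to_phys[] = {0, 1, 0, 1}, an
         * APIC ID previously mapped to physical node 1 is remapped to
         * emulated nid 1, the first emulated node carved from that
         * physical node.
         */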

        /* make sure all emulated nodes are mapped to a physical node */
        for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
                if (emu_nid_to_phys[i] == NUMA_NO_NODE)
                        emu_nid_to_phys[i] = dfl_phys_nid;

        /* transform distance table */
        numa_reset_distance();
        for (i = 0; i < max_emu_nid + 1; i++) {
                for (j = 0; j < max_emu_nid + 1; j++) {
                        int physi = emu_nid_to_phys[i];
                        int physj = emu_nid_to_phys[j];
                        int dist;

                        if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
                                dist = physi == physj ?
                                        LOCAL_DISTANCE : REMOTE_DISTANCE;
                        else
                                dist = phys_dist[physi * numa_dist_cnt + physj];

                        numa_set_distance(i, j, dist);
                }
        }
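
        /*
         * Distance example (informative): with two physical nodes whose
         * distance table is {10, 20; 20, 10} and emu_nid_to_phys[] =
         * {0, 1, 0, 1}, emulated nodes 0 and 2 end up at distance 10
         * (same physical node) while nodes 0 and 1 end up at distance 20.
         */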

        /* free the copied physical distance table */
        if (phys_dist)
                memblock_x86_free_range(__pa(phys_dist), __pa(phys_dist) + phys_size);
        return;

no_emu:
        /* No emulation.  Build identity emu_nid_to_phys[] for numa_add_cpu() */
        for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
                emu_nid_to_phys[i] = i;
}

#ifndef CONFIG_DEBUG_PER_CPU_MAPS
void __cpuinit numa_add_cpu(int cpu)
{
        int physnid, nid;

        nid = early_cpu_to_node(cpu);
        BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));

        physnid = emu_nid_to_phys[nid];

        /*
         * Map the cpu to each emulated node that is allocated on the physical
         * node of the cpu's apic id.
         */
        for_each_online_node(nid)
                if (emu_nid_to_phys[nid] == physnid)
                        cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
}
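
/*
 * E.g. (informative): with emu_nid_to_phys[] = {0, 1, 0, 1}, a CPU on
 * physical node 0 is added to node_to_cpumask_map[0] and
 * node_to_cpumask_map[2], since both emulated nodes live on that
 * physical node.
 */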

void __cpuinit numa_remove_cpu(int cpu)
{
        int i;

        for_each_online_node(i)
                cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
}
#else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
static void __cpuinit numa_set_cpumask(int cpu, bool enable)
{
        int nid, physnid;

        nid = early_cpu_to_node(cpu);
        if (nid == NUMA_NO_NODE) {
                /* early_cpu_to_node() already emits a warning and trace */
                return;
        }

        physnid = emu_nid_to_phys[nid];

        for_each_online_node(nid) {
                if (emu_nid_to_phys[nid] != physnid)
                        continue;

                debug_cpumask_set_cpu(cpu, nid, enable);
        }
}

void __cpuinit numa_add_cpu(int cpu)
{
        numa_set_cpumask(cpu, true);
}

void __cpuinit numa_remove_cpu(int cpu)
{
        numa_set_cpumask(cpu, false);
}
#endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */