Diffstat (limited to 'arch/x86/mm/numa_emulation.c')
-rw-r--r--  arch/x86/mm/numa_emulation.c  494
1 file changed, 494 insertions, 0 deletions
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
new file mode 100644
index 000000000000..ad091e4cff17
--- /dev/null
+++ b/arch/x86/mm/numa_emulation.c
@@ -0,0 +1,494 @@
/*
 * NUMA emulation
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/topology.h>
#include <linux/memblock.h>
#include <asm/dma.h>

#include "numa_internal.h"

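/* reverse map: which physical nid each emulated nid was carved out of */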
static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata;
static char *emu_cmdline __initdata;

void __init numa_emu_cmdline(char *str)
{
	emu_cmdline = str;
}

static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
{
	int i;

	for (i = 0; i < mi->nr_blks; i++)
		if (mi->blk[i].nid == nid)
			return i;
	return -ENOENT;
}

/*
 * Sets up an emulated memblk of @size bytes for node @nid, carved from
 * physical block @phys_blk of @pi and appended to @ei.  The return value
 * is -errno if something went wrong, 0 otherwise.
 */
static int __init emu_setup_memblk(struct numa_meminfo *ei,
				   struct numa_meminfo *pi,
				   int nid, int phys_blk, u64 size)
{
	struct numa_memblk *eb = &ei->blk[ei->nr_blks];
	struct numa_memblk *pb = &pi->blk[phys_blk];

	if (ei->nr_blks >= NR_NODE_MEMBLKS) {
		pr_err("NUMA: Too many emulated memblks, failing emulation\n");
		return -EINVAL;
	}

	ei->nr_blks++;
	eb->start = pb->start;
	eb->end = pb->start + size;
	eb->nid = nid;

	if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
		emu_nid_to_phys[nid] = pb->nid;

	pb->start += size;
	if (pb->start >= pb->end) {
		WARN_ON_ONCE(pb->start > pb->end);
		numa_remove_memblk_from(phys_blk, pi);
	}

	printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
	       eb->start, eb->end, (eb->end - eb->start) >> 20);
	return 0;
}

/*
 * Sets up @nr_nodes fake nodes interleaved over physical nodes ranging from
 * @addr to @max_addr.  The return value is 0 on success, negative on error.
 */
static int __init split_nodes_interleave(struct numa_meminfo *ei,
					 struct numa_meminfo *pi,
					 u64 addr, u64 max_addr, int nr_nodes)
{
	nodemask_t physnode_mask = NODE_MASK_NONE;
	u64 size;
	int big;
	int nid = 0;
	int i, ret;

	if (nr_nodes <= 0)
		return -1;
	if (nr_nodes > MAX_NUMNODES) {
		pr_info("numa=fake=%d too large, reducing to %d\n",
			nr_nodes, MAX_NUMNODES);
		nr_nodes = MAX_NUMNODES;
	}

	size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes;
	/*
	 * Calculate the number of big nodes that can be allocated as a result
	 * of consolidating the remainder.
	 */
	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
		FAKE_NODE_MIN_SIZE;
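	/*
	 * Worked example (assuming FAKE_NODE_MIN_SIZE is 32MB): with
	 * 1000MB usable and nr_nodes = 3, size is ~333MB, leaving a
	 * ~13MB per-node remainder.  3 * 13MB buys one extra 32MB
	 * increment, so big = 1: one node gets 352MB, the others 320MB,
	 * and the last few MB are absorbed by the boundary checks below.
	 */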

	size &= FAKE_NODE_MIN_HASH_MASK;
	if (!size) {
		pr_err("Not enough memory for each node. NUMA emulation disabled.\n");
		return -1;
	}

	for (i = 0; i < pi->nr_blks; i++)
		node_set(pi->blk[i].nid, physnode_mask);

	/*
	 * Continue to fill physical nodes with fake nodes until there is no
	 * memory left on any of them.
	 */
	while (nodes_weight(physnode_mask)) {
		for_each_node_mask(i, physnode_mask) {
			u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
			u64 start, limit, end;
			int phys_blk;

			phys_blk = emu_find_memblk_by_nid(i, pi);
			if (phys_blk < 0) {
				node_clear(i, physnode_mask);
				continue;
			}
			start = pi->blk[phys_blk].start;
			limit = pi->blk[phys_blk].end;
			end = start + size;

			if (nid < big)
				end += FAKE_NODE_MIN_SIZE;

			/*
			 * Continue to add memory to this fake node if its
			 * non-reserved memory is less than the per-node size.
			 */
			while (end - start -
			       memblock_x86_hole_size(start, end) < size) {
				end += FAKE_NODE_MIN_SIZE;
				if (end > limit) {
					end = limit;
					break;
				}
			}

			/*
			 * If there won't be at least FAKE_NODE_MIN_SIZE of
			 * non-reserved memory in ZONE_DMA32 for the next node,
			 * this one must extend to the boundary.
			 */
			if (end < dma32_end && dma32_end - end -
			    memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
				end = dma32_end;

			/*
			 * If there won't be enough non-reserved memory for the
			 * next node, this one must extend to the end of the
			 * physical node.
			 */
			if (limit - end -
			    memblock_x86_hole_size(end, limit) < size)
				end = limit;

			ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
					       phys_blk,
					       min(end, limit) - start);
			if (ret < 0)
				return ret;
		}
	}
	return 0;
}

/*
 * Returns the end address of a node so that it contains at least @size of
 * non-reserved memory, or @max_addr if that is reached first.
 */
static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
{
	u64 end = start + size;

	while (end - start - memblock_x86_hole_size(start, end) < size) {
		end += FAKE_NODE_MIN_SIZE;
		if (end > max_addr) {
			end = max_addr;
			break;
		}
	}
	return end;
}

/*
 * Sets up fake nodes of @size bytes each, interleaved over physical nodes
 * ranging from @addr to @max_addr.  The return value is 0 on success,
 * negative on error.
 */
static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
					      struct numa_meminfo *pi,
					      u64 addr, u64 max_addr, u64 size)
{
	nodemask_t physnode_mask = NODE_MASK_NONE;
	u64 min_size;
	int nid = 0;
	int i, ret;

	if (!size)
		return -1;
	/*
	 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
	 * increased accordingly if the requested size is too small.  This
	 * creates a uniform distribution of node sizes across the entire
	 * machine (but not necessarily over physical nodes).
	 */
	min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) /
		MAX_NUMNODES;
	min_size = max(min_size, FAKE_NODE_MIN_SIZE);
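	/* round min_size up to the next FAKE_NODE_MIN_SIZE multiple */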
	if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
		min_size = (min_size + FAKE_NODE_MIN_SIZE) &
			FAKE_NODE_MIN_HASH_MASK;
	if (size < min_size) {
		pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
		       size >> 20, min_size >> 20);
		size = min_size;
	}
	size &= FAKE_NODE_MIN_HASH_MASK;

	for (i = 0; i < pi->nr_blks; i++)
		node_set(pi->blk[i].nid, physnode_mask);

	/*
	 * Fill physical nodes with fake nodes of size until there is no memory
	 * left on any of them.
	 */
	while (nodes_weight(physnode_mask)) {
		for_each_node_mask(i, physnode_mask) {
			u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
			u64 start, limit, end;
			int phys_blk;

			phys_blk = emu_find_memblk_by_nid(i, pi);
			if (phys_blk < 0) {
				node_clear(i, physnode_mask);
				continue;
			}
			start = pi->blk[phys_blk].start;
			limit = pi->blk[phys_blk].end;

			end = find_end_of_node(start, limit, size);
			/*
			 * If there won't be at least FAKE_NODE_MIN_SIZE of
			 * non-reserved memory in ZONE_DMA32 for the next node,
			 * this one must extend to the boundary.
			 */
			if (end < dma32_end && dma32_end - end -
			    memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
				end = dma32_end;

			/*
			 * If there won't be enough non-reserved memory for the
			 * next node, this one must extend to the end of the
			 * physical node.
			 */
			if (limit - end -
			    memblock_x86_hole_size(end, limit) < size)
				end = limit;

			ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
					       phys_blk,
					       min(end, limit) - start);
			if (ret < 0)
				return ret;
		}
	}
	return 0;
}

/**
 * numa_emulation - Emulate NUMA nodes
 * @numa_meminfo: NUMA configuration to massage
 * @numa_dist_cnt: The size of the physical NUMA distance table
 *
 * Emulate NUMA nodes according to the numa=fake kernel parameter.
 * @numa_meminfo contains the physical memory configuration and is modified
 * to reflect the emulated configuration on success.  @numa_dist_cnt is
 * used to determine the size of the physical distance table.
 *
 * On success, the following modifications are made.
 *
 * - @numa_meminfo is updated to reflect the emulated nodes.
 *
 * - __apicid_to_node[] is updated such that APIC IDs are mapped to the
 *   emulated nodes.
 *
 * - NUMA distance table is rebuilt to represent distances between emulated
 *   nodes.  The distances are determined considering how emulated nodes
 *   are mapped to physical nodes and match the actual distances.
 *
 * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
 *   nodes.  This is used by numa_add_cpu() and numa_remove_cpu().
 *
 * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
 * identity mapping and no other modification is made.
 */
void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
{
	static struct numa_meminfo ei __initdata;
	static struct numa_meminfo pi __initdata;
	const u64 max_addr = max_pfn << PAGE_SHIFT;
	u8 *phys_dist = NULL;
	size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
	int max_emu_nid, dfl_phys_nid;
	int i, j, ret;

	if (!emu_cmdline)
		goto no_emu;

	memset(&ei, 0, sizeof(ei));
	pi = *numa_meminfo;

	for (i = 0; i < MAX_NUMNODES; i++)
		emu_nid_to_phys[i] = NUMA_NO_NODE;

	/*
	 * If the numa=fake command-line contains a 'M' or 'G', it represents
	 * the fixed node size.  Otherwise, if it is just a single number N,
	 * split the system RAM into N fake nodes.
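	 * For example, "numa=fake=4" splits the machine into four
	 * interleaved nodes, while "numa=fake=512M" carves it into
	 * 512MB nodes.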
	 */
	if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
		u64 size;

		size = memparse(emu_cmdline, &emu_cmdline);
		ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
	} else {
		unsigned long n;

		n = simple_strtoul(emu_cmdline, NULL, 0);
		ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
	}

	if (ret < 0)
		goto no_emu;

	if (numa_cleanup_meminfo(&ei) < 0) {
		pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
		goto no_emu;
	}

	/* copy the physical distance table */
	if (numa_dist_cnt) {
		u64 phys;

		phys = memblock_find_in_range(0,
					      (u64)max_pfn_mapped << PAGE_SHIFT,
					      phys_size, PAGE_SIZE);
		if (phys == MEMBLOCK_ERROR) {
			pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
			goto no_emu;
		}
		memblock_x86_reserve_range(phys, phys + phys_size, "TMP NUMA DIST");
		phys_dist = __va(phys);

		for (i = 0; i < numa_dist_cnt; i++)
			for (j = 0; j < numa_dist_cnt; j++)
				phys_dist[i * numa_dist_cnt + j] =
					node_distance(i, j);
	}

	/*
	 * Determine the max emulated nid and the default phys nid to use
	 * for unmapped nodes.
	 */
	max_emu_nid = 0;
	dfl_phys_nid = NUMA_NO_NODE;
	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
		if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
			max_emu_nid = i;
			if (dfl_phys_nid == NUMA_NO_NODE)
				dfl_phys_nid = emu_nid_to_phys[i];
		}
	}
	if (dfl_phys_nid == NUMA_NO_NODE) {
		pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n");
		goto no_emu;
	}

	/* commit */
	*numa_meminfo = ei;

	/*
	 * Transform __apicid_to_node table to use emulated nids by
	 * reverse-mapping phys_nid.  The maps should always exist but fall
	 * back to zero just in case.
	 */
	for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
		if (__apicid_to_node[i] == NUMA_NO_NODE)
			continue;
		for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
			if (__apicid_to_node[i] == emu_nid_to_phys[j])
				break;
		__apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
	}

	/* make sure all emulated nodes are mapped to a physical node */
	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
		if (emu_nid_to_phys[i] == NUMA_NO_NODE)
			emu_nid_to_phys[i] = dfl_phys_nid;

	/* transform distance table */
	numa_reset_distance();
	for (i = 0; i < max_emu_nid + 1; i++) {
		for (j = 0; j < max_emu_nid + 1; j++) {
			int physi = emu_nid_to_phys[i];
			int physj = emu_nid_to_phys[j];
			int dist;

			if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
				dist = physi == physj ?
					LOCAL_DISTANCE : REMOTE_DISTANCE;
			else
				dist = phys_dist[physi * numa_dist_cnt + physj];

			numa_set_distance(i, j, dist);
		}
	}

	/* free the copied physical distance table */
	if (phys_dist)
		memblock_x86_free_range(__pa(phys_dist), __pa(phys_dist) + phys_size);
	return;

no_emu:
	/* No emulation.  Build identity emu_nid_to_phys[] for numa_add_cpu() */
	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
		emu_nid_to_phys[i] = i;
}

#ifndef CONFIG_DEBUG_PER_CPU_MAPS
void __cpuinit numa_add_cpu(int cpu)
{
	int physnid, nid;

	nid = early_cpu_to_node(cpu);
	BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));

	physnid = emu_nid_to_phys[nid];

	/*
	 * Map the cpu to each emulated node that is allocated on the physical
	 * node of the cpu's apic id.
	 */
	for_each_online_node(nid)
		if (emu_nid_to_phys[nid] == physnid)
			cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
}

void __cpuinit numa_remove_cpu(int cpu)
{
	int i;

	for_each_online_node(i)
		cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
}
#else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
static void __cpuinit numa_set_cpumask(int cpu, int enable)
{
	struct cpumask *mask;
	int nid, physnid, i;

	nid = early_cpu_to_node(cpu);
	if (nid == NUMA_NO_NODE) {
		/* early_cpu_to_node() already emits a warning and trace */
		return;
	}

	physnid = emu_nid_to_phys[nid];

	for_each_online_node(i) {
		/* only touch emulated nodes carved from this cpu's phys node */
		if (emu_nid_to_phys[i] != physnid)
			continue;

		mask = debug_cpumask_set_cpu(cpu, enable);
		if (!mask)
			return;

		if (enable)
			cpumask_set_cpu(cpu, mask);
		else
			cpumask_clear_cpu(cpu, mask);
	}
}

void __cpuinit numa_add_cpu(int cpu)
{
	numa_set_cpumask(cpu, 1);
}

void __cpuinit numa_remove_cpu(int cpu)
{
	numa_set_cpumask(cpu, 0);
}
#endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */
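
For context, the "numa=fake=" string that ends up in emu_cmdline is handed over
by the early parameter parser. A minimal sketch of that caller side, roughly as
it lived in arch/x86/mm/numa_64.c around this commit (shown for orientation
only; not part of this patch, and handling of other numa= options is omitted):

static __init int numa_setup(char *opt)
{
	if (!opt)
		return -EINVAL;
#ifdef CONFIG_NUMA_EMU
	/* pass everything after "fake=" (e.g. "8" or "512M") on */
	if (!strncmp(opt, "fake=", 5))
		numa_emu_cmdline(opt + 5);
#endif
	return 0;
}
early_param("numa", numa_setup);

With that wiring, booting with numa=fake=8 exercises split_nodes_interleave()
and numa=fake=512M exercises split_nodes_size_interleave() above.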