aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/x86_64/boot-options.txt8
-rw-r--r--arch/x86_64/mm/numa.c261
-rw-r--r--include/asm-x86_64/mmzone.h2
3 files changed, 161 insertions, 110 deletions
diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt
index 85f51e5a749f..7500aad95f3c 100644
--- a/Documentation/x86_64/boot-options.txt
+++ b/Documentation/x86_64/boot-options.txt
@@ -149,7 +149,13 @@ NUMA
149 149
150 numa=noacpi Don't parse the SRAT table for NUMA setup 150 numa=noacpi Don't parse the SRAT table for NUMA setup
151 151
152 numa=fake=X Fake X nodes and ignore NUMA setup of the actual machine. 152 numa=fake=CMDLINE
153 If a number, fakes CMDLINE nodes and ignores NUMA setup of the
154 actual machine. Otherwise, system memory is configured
155 depending on the sizes and coefficients listed. For example:
156 numa=fake=2*512,1024,4*256
157 gives two 512M nodes, a 1024M node, and four 256M nodes. The
158 remaining system RAM is allocated to an additional node.
153 159
154 numa=hotadd=percent 160 numa=hotadd=percent
155 Only allow hotadd memory to preallocate page structures upto 161 Only allow hotadd memory to preallocate page structures upto
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 41b8fb069924..c55936bc6be6 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -273,125 +273,172 @@ void __init numa_init_array(void)
273 273
274#ifdef CONFIG_NUMA_EMU 274#ifdef CONFIG_NUMA_EMU
275/* Numa emulation */ 275/* Numa emulation */
276int numa_fake __initdata = 0; 276#define E820_ADDR_HOLE_SIZE(start, end) \
277 (e820_hole_size((start) >> PAGE_SHIFT, (end) >> PAGE_SHIFT) << \
278 PAGE_SHIFT)
279char *cmdline __initdata;
277 280
278/* 281/*
279 * This function is used to find out if the start and end correspond to 282 * Setups up nid to range from addr to addr + size. If the end boundary is
280 * different zones. 283 * greater than max_addr, then max_addr is used instead. The return value is 0
284 * if there is additional memory left for allocation past addr and -1 otherwise.
285 * addr is adjusted to be at the end of the node.
281 */ 286 */
282int zone_cross_over(unsigned long start, unsigned long end) 287static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
288 u64 size, u64 max_addr)
283{ 289{
284 if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) && 290 int ret = 0;
285 (end >= (MAX_DMA32_PFN << PAGE_SHIFT))) 291 nodes[nid].start = *addr;
286 return 1; 292 *addr += size;
287 return 0; 293 if (*addr >= max_addr) {
294 *addr = max_addr;
295 ret = -1;
296 }
297 nodes[nid].end = *addr;
298 node_set_online(nid);
299 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
300 nodes[nid].start, nodes[nid].end,
301 (nodes[nid].end - nodes[nid].start) >> 20);
302 return ret;
288} 303}
289 304
290static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) 305/*
306 * Splits num_nodes nodes up equally starting at node_start. The return value
307 * is the number of nodes split up and addr is adjusted to be at the end of the
308 * last node allocated.
309 */
310static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
311 u64 max_addr, int node_start,
312 int num_nodes)
291{ 313{
292 int i, big; 314 unsigned int big;
293 struct bootnode nodes[MAX_NUMNODES]; 315 u64 size;
294 unsigned long sz, old_sz; 316 int i;
295 unsigned long hole_size;
296 unsigned long start, end;
297 unsigned long max_addr = (end_pfn << PAGE_SHIFT);
298
299 start = (start_pfn << PAGE_SHIFT);
300 hole_size = e820_hole_size(start, max_addr);
301 sz = (max_addr - start - hole_size) / numa_fake;
302
303 /* Kludge needed for the hash function */
304
305 old_sz = sz;
306 /*
307 * Round down to the nearest FAKE_NODE_MIN_SIZE.
308 */
309 sz &= FAKE_NODE_MIN_HASH_MASK;
310 317
318 if (num_nodes <= 0)
319 return -1;
320 if (num_nodes > MAX_NUMNODES)
321 num_nodes = MAX_NUMNODES;
322 size = (max_addr - *addr - E820_ADDR_HOLE_SIZE(*addr, max_addr)) /
323 num_nodes;
311 /* 324 /*
312 * We ensure that each node is at least 64MB big. Smaller than this 325 * Calculate the number of big nodes that can be allocated as a result
313 * size can cause VM hiccups. 326 * of consolidating the leftovers.
314 */ 327 */
315 if (sz == 0) { 328 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
316 printk(KERN_INFO "Not enough memory for %d nodes. Reducing " 329 FAKE_NODE_MIN_SIZE;
317 "the number of nodes\n", numa_fake); 330
318 numa_fake = (max_addr - start - hole_size) / FAKE_NODE_MIN_SIZE; 331 /* Round down to nearest FAKE_NODE_MIN_SIZE. */
319 printk(KERN_INFO "Number of fake nodes will be = %d\n", 332 size &= FAKE_NODE_MIN_HASH_MASK;
320 numa_fake); 333 if (!size) {
321 sz = FAKE_NODE_MIN_SIZE; 334 printk(KERN_ERR "Not enough memory for each node. "
335 "NUMA emulation disabled.\n");
336 return -1;
322 } 337 }
323 /* 338
324 * Find out how many nodes can get an extra NODE_MIN_SIZE granule. 339 for (i = node_start; i < num_nodes + node_start; i++) {
325 * This logic ensures the extra memory gets distributed among as many 340 u64 end = *addr + size;
326 * nodes as possible (as compared to one single node getting all that
327 * extra memory.
328 */
329 big = ((old_sz - sz) * numa_fake) / FAKE_NODE_MIN_SIZE;
330 printk(KERN_INFO "Fake node Size: %luMB hole_size: %luMB big nodes: "
331 "%d\n",
332 (sz >> 20), (hole_size >> 20), big);
333 memset(&nodes,0,sizeof(nodes));
334 end = start;
335 for (i = 0; i < numa_fake; i++) {
336 /*
337 * In case we are not able to allocate enough memory for all
338 * the nodes, we reduce the number of fake nodes.
339 */
340 if (end >= max_addr) {
341 numa_fake = i - 1;
342 break;
343 }
344 start = nodes[i].start = end;
345 /*
346 * Final node can have all the remaining memory.
347 */
348 if (i == numa_fake-1)
349 sz = max_addr - start;
350 end = nodes[i].start + sz;
351 /*
352 * Fir "big" number of nodes get extra granule.
353 */
354 if (i < big) 341 if (i < big)
355 end += FAKE_NODE_MIN_SIZE; 342 end += FAKE_NODE_MIN_SIZE;
356 /* 343 /*
357 * Iterate over the range to ensure that this node gets at 344 * The final node can have the remaining system RAM. Other
358 * least sz amount of RAM (excluding holes) 345 * nodes receive roughly the same amount of available pages.
359 */ 346 */
360 while ((end - start - e820_hole_size(start, end)) < sz) { 347 if (i == num_nodes + node_start - 1)
361 end += FAKE_NODE_MIN_SIZE; 348 end = max_addr;
362 if (end >= max_addr) 349 else
363 break; 350 while (end - *addr - E820_ADDR_HOLE_SIZE(*addr, end) <
351 size) {
352 end += FAKE_NODE_MIN_SIZE;
353 if (end > max_addr) {
354 end = max_addr;
355 break;
356 }
357 }
358 if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
359 break;
360 }
361 return i - node_start + 1;
362}
363
364/*
365 * Sets up the system RAM area from start_pfn to end_pfn according to the
366 * numa=fake command-line option.
367 */
368static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
369{
370 struct bootnode nodes[MAX_NUMNODES];
371 u64 addr = start_pfn << PAGE_SHIFT;
372 u64 max_addr = end_pfn << PAGE_SHIFT;
373 unsigned int coeff;
374 unsigned int num = 0;
375 int num_nodes = 0;
376 u64 size;
377 int i;
378
379 memset(&nodes, 0, sizeof(nodes));
380 /*
381 * If the numa=fake command-line is just a single number N, split the
382 * system RAM into N fake nodes.
383 */
384 if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
385 num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0,
386 simple_strtol(cmdline, NULL, 0));
387 if (num_nodes < 0)
388 return num_nodes;
389 goto out;
390 }
391
392 /* Parse the command line. */
393 for (coeff = 1; ; cmdline++) {
394 if (*cmdline && isdigit(*cmdline)) {
395 num = num * 10 + *cmdline - '0';
396 continue;
364 } 397 }
365 /* 398 if (*cmdline == '*')
366 * Look at the next node to make sure there is some real memory 399 coeff = num;
367 * to map. Bad things happen when the only memory present 400 if (!*cmdline || *cmdline == ',') {
368 * in a zone on a fake node is IO hole. 401 /*
369 */ 402 * Round down to the nearest FAKE_NODE_MIN_SIZE.
370 while (e820_hole_size(end, end + FAKE_NODE_MIN_SIZE) > 0) { 403 * Command-line coefficients are in megabytes.
371 if (zone_cross_over(start, end + sz)) { 404 */
372 end = (MAX_DMA32_PFN << PAGE_SHIFT); 405 size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
373 break; 406 if (size) {
407 for (i = 0; i < coeff; i++, num_nodes++)
408 if (setup_node_range(num_nodes, nodes,
409 &addr, size, max_addr) < 0)
410 goto done;
411 coeff = 1;
374 } 412 }
375 if (end >= max_addr)
376 break;
377 end += FAKE_NODE_MIN_SIZE;
378 } 413 }
379 if (end > max_addr) 414 if (!*cmdline)
380 end = max_addr; 415 break;
381 nodes[i].end = end; 416 num = 0;
382 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", 417 }
383 i, 418done:
384 nodes[i].start, nodes[i].end, 419 if (!num_nodes)
385 (nodes[i].end - nodes[i].start) >> 20); 420 return -1;
386 node_set_online(i); 421 /* Fill remainder of system RAM with a final node, if appropriate. */
387 } 422 if (addr < max_addr) {
388 memnode_shift = compute_hash_shift(nodes, numa_fake); 423 setup_node_range(num_nodes, nodes, &addr, max_addr - addr,
389 if (memnode_shift < 0) { 424 max_addr);
390 memnode_shift = 0; 425 num_nodes++;
391 printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n"); 426 }
392 return -1; 427out:
393 } 428 memnode_shift = compute_hash_shift(nodes, num_nodes);
394 for_each_online_node(i) { 429 if (memnode_shift < 0) {
430 memnode_shift = 0;
431 printk(KERN_ERR "No NUMA hash function found. NUMA emulation "
432 "disabled.\n");
433 return -1;
434 }
435
436 /*
437 * We need to vacate all active ranges that may have been registered by
438 * SRAT.
439 */
440 remove_all_active_ranges();
441 for_each_online_node(i) {
395 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, 442 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
396 nodes[i].end >> PAGE_SHIFT); 443 nodes[i].end >> PAGE_SHIFT);
397 setup_node_bootmem(i, nodes[i].start, nodes[i].end); 444 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
@@ -399,14 +446,15 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
399 numa_init_array(); 446 numa_init_array();
400 return 0; 447 return 0;
401} 448}
402#endif 449#undef E820_ADDR_HOLE_SIZE
450#endif /* CONFIG_NUMA_EMU */
403 451
404void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) 452void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
405{ 453{
406 int i; 454 int i;
407 455
408#ifdef CONFIG_NUMA_EMU 456#ifdef CONFIG_NUMA_EMU
409 if (numa_fake && !numa_emulation(start_pfn, end_pfn)) 457 if (cmdline && !numa_emulation(start_pfn, end_pfn))
410 return; 458 return;
411#endif 459#endif
412 460
@@ -486,11 +534,8 @@ static __init int numa_setup(char *opt)
486 if (!strncmp(opt,"off",3)) 534 if (!strncmp(opt,"off",3))
487 numa_off = 1; 535 numa_off = 1;
488#ifdef CONFIG_NUMA_EMU 536#ifdef CONFIG_NUMA_EMU
489 if(!strncmp(opt, "fake=", 5)) { 537 if (!strncmp(opt, "fake=", 5))
490 numa_fake = simple_strtoul(opt+5,NULL,0); ; 538 cmdline = opt + 5;
491 if (numa_fake >= MAX_NUMNODES)
492 numa_fake = MAX_NUMNODES;
493 }
494#endif 539#endif
495#ifdef CONFIG_ACPI_NUMA 540#ifdef CONFIG_ACPI_NUMA
496 if (!strncmp(opt,"noacpi",6)) 541 if (!strncmp(opt,"noacpi",6))
diff --git a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h
index fb558fb1d211..19a89377b123 100644
--- a/include/asm-x86_64/mmzone.h
+++ b/include/asm-x86_64/mmzone.h
@@ -49,7 +49,7 @@ extern int pfn_valid(unsigned long pfn);
49 49
50#ifdef CONFIG_NUMA_EMU 50#ifdef CONFIG_NUMA_EMU
51#define FAKE_NODE_MIN_SIZE (64*1024*1024) 51#define FAKE_NODE_MIN_SIZE (64*1024*1024)
52#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1ul)) 52#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1uL))
53#endif 53#endif
54 54
55#endif 55#endif