Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/mm/numa_64.c  | 211
-rw-r--r--  arch/x86/mm/srat_64.c  |   1
2 files changed, 184 insertions(+), 28 deletions(-)
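The heart of the change is the new split_nodes_interleave() below, which carves the requested number of fake nodes out of each detected physical node in turn rather than out of one flat range. Its sizing math is compact but easy to misread, so here is a standalone userspace C sketch of just that arithmetic; it assumes a FAKE_NODE_MIN_SIZE of 64MB (the value I'd expect in this era's arch/x86 headers) and a hole-free memory map, so e820_hole_size() is treated as returning 0:

#include <stdio.h>
#include <stdint.h>

#define FAKE_NODE_MIN_SIZE      (64ULL << 20)              /* assumed 64MB granule */
#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1ULL))

int main(void)
{
        uint64_t ram = 4ULL << 30;      /* assume 4GB of usable RAM */
        int nr_nodes = 6;               /* as if booted with numa=fake=6 */

        /* Raw per-node share before rounding. */
        uint64_t size = ram / nr_nodes;

        /*
         * Consolidate the sub-64MB remainders: each of the first 'big'
         * fake nodes is later grown by one extra 64MB granule.
         */
        int big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
                FAKE_NODE_MIN_SIZE;

        size &= FAKE_NODE_MIN_HASH_MASK;        /* round down to 64MB */

        printf("base node size: %llu MB, big nodes: %d\n",
               (unsigned long long)(size >> 20), big);
        return 0;
}

With 4GB split six ways this prints a 640MB base size and three "big" nodes of 704MB; the 64MB still left over is absorbed by the extension logic in the patch, which widens a node to the end of its physical node when too little memory would remain for the next one.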
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index d1a3d94efc8e..086f98a66d80 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -306,8 +306,71 @@ void __init numa_init_array(void)
 
 #ifdef CONFIG_NUMA_EMU
 /* Numa emulation */
+static struct bootnode nodes[MAX_NUMNODES] __initdata;
+static struct bootnode physnodes[MAX_NUMNODES] __initdata;
 static char *cmdline __initdata;
 
+static int __init setup_physnodes(unsigned long start, unsigned long end,
+                                  int acpi, int k8)
+{
+        int nr_nodes = 0;
+        int ret = 0;
+        int i;
+
+#ifdef CONFIG_ACPI_NUMA
+        if (acpi)
+                nr_nodes = acpi_get_nodes(physnodes);
+#endif
+#ifdef CONFIG_K8_NUMA
+        if (k8)
+                nr_nodes = k8_get_nodes(physnodes);
+#endif
+        /*
+         * Basic sanity checking on the physical node map: there may be errors
+         * if the SRAT or K8 incorrectly reported the topology or the mem=
+         * kernel parameter is used.
+         */
+        for (i = 0; i < nr_nodes; i++) {
+                if (physnodes[i].start == physnodes[i].end)
+                        continue;
+                if (physnodes[i].start > end) {
+                        physnodes[i].end = physnodes[i].start;
+                        continue;
+                }
+                if (physnodes[i].end < start) {
+                        physnodes[i].start = physnodes[i].end;
+                        continue;
+                }
+                if (physnodes[i].start < start)
+                        physnodes[i].start = start;
+                if (physnodes[i].end > end)
+                        physnodes[i].end = end;
+        }
+
+        /*
+         * Remove all nodes that have no memory or were truncated because of the
+         * limited address range.
+         */
+        for (i = 0; i < nr_nodes; i++) {
+                if (physnodes[i].start == physnodes[i].end)
+                        continue;
+                physnodes[ret].start = physnodes[i].start;
+                physnodes[ret].end = physnodes[i].end;
+                ret++;
+        }
+
+        /*
+         * If no physical topology was detected, a single node is faked to cover
+         * the entire address space.
+         */
+        if (!ret) {
+                physnodes[ret].start = start;
+                physnodes[ret].end = end;
+                ret = 1;
+        }
+        return ret;
+}
+
 /*
  * Setups up nid to range from addr to addr + size. If the end
  * boundary is greater than max_addr, then max_addr is used instead.
@@ -315,11 +378,9 @@ static char *cmdline __initdata;
  * allocation past addr and -1 otherwise. addr is adjusted to be at
  * the end of the node.
  */
-static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
-                                   u64 size, u64 max_addr)
+static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
 {
         int ret = 0;
-
         nodes[nid].start = *addr;
         *addr += size;
         if (*addr >= max_addr) {
@@ -335,12 +396,111 @@ static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
 }
 
 /*
+ * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
+ * to max_addr. The return value is the number of nodes allocated.
+ */
+static int __init split_nodes_interleave(u64 addr, u64 max_addr,
+                                         int nr_phys_nodes, int nr_nodes)
+{
+        nodemask_t physnode_mask = NODE_MASK_NONE;
+        u64 size;
+        int big;
+        int ret = 0;
+        int i;
+
+        if (nr_nodes <= 0)
+                return -1;
+        if (nr_nodes > MAX_NUMNODES) {
+                pr_info("numa=fake=%d too large, reducing to %d\n",
+                        nr_nodes, MAX_NUMNODES);
+                nr_nodes = MAX_NUMNODES;
+        }
+
+        size = (max_addr - addr - e820_hole_size(addr, max_addr)) / nr_nodes;
+        /*
+         * Calculate the number of big nodes that can be allocated as a result
+         * of consolidating the remainder.
+         */
+        big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
+                FAKE_NODE_MIN_SIZE;
+
+        size &= FAKE_NODE_MIN_HASH_MASK;
+        if (!size) {
+                pr_err("Not enough memory for each node. "
+                        "NUMA emulation disabled.\n");
+                return -1;
+        }
+
+        for (i = 0; i < nr_phys_nodes; i++)
+                if (physnodes[i].start != physnodes[i].end)
+                        node_set(i, physnode_mask);
+
+        /*
+         * Continue to fill physical nodes with fake nodes until there is no
+         * memory left on any of them.
+         */
+        while (nodes_weight(physnode_mask)) {
+                for_each_node_mask(i, physnode_mask) {
+                        u64 end = physnodes[i].start + size;
+                        u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
+
+                        if (ret < big)
+                                end += FAKE_NODE_MIN_SIZE;
+
+                        /*
+                         * Continue to add memory to this fake node if its
+                         * non-reserved memory is less than the per-node size.
+                         */
+                        while (end - physnodes[i].start -
+                                e820_hole_size(physnodes[i].start, end) < size) {
+                                end += FAKE_NODE_MIN_SIZE;
+                                if (end > physnodes[i].end) {
+                                        end = physnodes[i].end;
+                                        break;
+                                }
+                        }
+
+                        /*
+                         * If there won't be at least FAKE_NODE_MIN_SIZE of
+                         * non-reserved memory in ZONE_DMA32 for the next node,
+                         * this one must extend to the boundary.
+                         */
+                        if (end < dma32_end && dma32_end - end -
+                            e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+                                end = dma32_end;
+
+                        /*
+                         * If there won't be enough non-reserved memory for the
+                         * next node, this one must extend to the end of the
+                         * physical node.
+                         */
+                        if (physnodes[i].end - end -
+                            e820_hole_size(end, physnodes[i].end) < size)
+                                end = physnodes[i].end;
+
+                        /*
+                         * Avoid allocating more nodes than requested, which can
+                         * happen as a result of rounding down each node's size
+                         * to FAKE_NODE_MIN_SIZE.
+                         */
+                        if (nodes_weight(physnode_mask) + ret >= nr_nodes)
+                                end = physnodes[i].end;
+
+                        if (setup_node_range(ret++, &physnodes[i].start,
+                                             end - physnodes[i].start,
+                                             physnodes[i].end) < 0)
+                                node_clear(i, physnode_mask);
+                }
+        }
+        return ret;
+}
+
+/*
  * Splits num_nodes nodes up equally starting at node_start. The return value
  * is the number of nodes split up and addr is adjusted to be at the end of the
  * last node allocated.
  */
-static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
-                                      u64 max_addr, int node_start,
+static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start,
                                       int num_nodes)
 {
         unsigned int big;
@@ -388,7 +548,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
                                         break;
                                 }
                         }
-                if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
+                if (setup_node_range(i, addr, end - *addr, max_addr) < 0)
                         break;
         }
         return i - node_start + 1;
@@ -399,12 +559,12 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
  * always assigned to a final node and can be asymmetric. Returns the number of
  * nodes split.
  */
-static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
-                                      u64 max_addr, int node_start, u64 size)
+static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start,
+                                      u64 size)
 {
         int i = node_start;
         size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
-        while (!setup_node_range(i++, nodes, addr, size, max_addr))
+        while (!setup_node_range(i++, addr, size, max_addr))
                 ;
         return i - node_start;
 }
@@ -413,15 +573,15 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
  * Sets up the system RAM area from start_pfn to last_pfn according to the
  * numa=fake command-line option.
  */
-static struct bootnode nodes[MAX_NUMNODES] __initdata;
-
-static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
+static int __init numa_emulation(unsigned long start_pfn,
+                                 unsigned long last_pfn, int acpi, int k8)
 {
         u64 size, addr = start_pfn << PAGE_SHIFT;
         u64 max_addr = last_pfn << PAGE_SHIFT;
         int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
+        int num_phys_nodes;
 
-        memset(&nodes, 0, sizeof(nodes));
+        num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
         /*
          * If the numa=fake command-line is just a single number N, split the
          * system RAM into N fake nodes.
@@ -429,7 +589,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
         if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
                 long n = simple_strtol(cmdline, NULL, 0);
 
-                num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
+                num_nodes = split_nodes_interleave(addr, max_addr,
+                                                   num_phys_nodes, n);
                 if (num_nodes < 0)
                         return num_nodes;
                 goto out;
@@ -456,8 +617,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
                         size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
                         if (size)
                                 for (i = 0; i < coeff; i++, num_nodes++)
-                                        if (setup_node_range(num_nodes, nodes,
-                                                &addr, size, max_addr) < 0)
+                                        if (setup_node_range(num_nodes, &addr,
+                                                size, max_addr) < 0)
                                                 goto done;
                         if (!*cmdline)
                                 break;
@@ -473,7 +634,7 @@ done:
         if (addr < max_addr) {
                 if (coeff_flag && coeff < 0) {
                         /* Split remaining nodes into num-sized chunks */
-                        num_nodes += split_nodes_by_size(nodes, &addr, max_addr,
+                        num_nodes += split_nodes_by_size(&addr, max_addr,
                                                          num_nodes, num);
                         goto out;
                 }
@@ -482,7 +643,7 @@ done:
                         /* Split remaining nodes into coeff chunks */
                         if (coeff <= 0)
                                 break;
-                        num_nodes += split_nodes_equally(nodes, &addr, max_addr,
+                        num_nodes += split_nodes_equally(&addr, max_addr,
                                                          num_nodes, coeff);
                         break;
                 case ',':
@@ -490,8 +651,8 @@ done:
                         break;
                 default:
                         /* Give one final node */
-                        setup_node_range(num_nodes, nodes, &addr,
-                                         max_addr - addr, max_addr);
+                        setup_node_range(num_nodes, &addr, max_addr - addr,
+                                         max_addr);
                         num_nodes++;
                 }
         }
@@ -505,14 +666,10 @@ out:
         }
 
         /*
-         * We need to vacate all active ranges that may have been registered by
-         * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
-         * true. NUMA emulation has succeeded so we will not scan ACPI nodes.
+         * We need to vacate all active ranges that may have been registered for
+         * the e820 memory map.
          */
         remove_all_active_ranges();
-#ifdef CONFIG_ACPI_NUMA
-        acpi_numa = -1;
-#endif
         for_each_node_mask(i, node_possible_map) {
                 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
                                                 nodes[i].end >> PAGE_SHIFT);
@@ -533,7 +690,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
         nodes_clear(node_online_map);
 
 #ifdef CONFIG_NUMA_EMU
-        if (cmdline && !numa_emulation(start_pfn, last_pfn))
+        if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8))
                 return;
         nodes_clear(node_possible_map);
         nodes_clear(node_online_map);
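
Before moving on to the srat_64.c side, the round-robin effect of the new allocator is easiest to see in isolation. The sketch below is a deliberately stripped-down userspace model of the loop in split_nodes_interleave() above: the physical layout is hypothetical, and the e820 holes, ZONE_DMA32 boundary handling and "big"-node remainder logic are all omitted.

#include <stdio.h>
#include <stdint.h>

struct range { uint64_t start, end; };

int main(void)
{
        /* Hypothetical layout: two 2GB physical nodes, no holes. */
        struct range phys[] = {
                { 0x000000000ULL, 0x080000000ULL },
                { 0x080000000ULL, 0x100000000ULL },
        };
        int nr_phys = 2, nr_fake = 8, ret = 0;
        uint64_t size = 0;

        for (int i = 0; i < nr_phys; i++)
                size += phys[i].end - phys[i].start;
        size /= nr_fake;                        /* 512MB per fake node */

        while (ret < nr_fake) {
                int progress = 0;

                for (int i = 0; i < nr_phys && ret < nr_fake; i++) {
                        uint64_t end = phys[i].start + size;

                        if (phys[i].start == phys[i].end)
                                continue;       /* this node is exhausted */
                        if (end > phys[i].end)  /* clamp, as setup_node_range does */
                                end = phys[i].end;
                        printf("fake node %d: %#010llx-%#010llx (phys %d)\n",
                               ret, (unsigned long long)phys[i].start,
                               (unsigned long long)end, i);
                        phys[i].start = end;    /* consume the carved range */
                        ret++;
                        progress = 1;
                }
                if (!progress)
                        break;                  /* all physical memory used */
        }
        return 0;
}

For numa=fake=8 over two 2GB physical nodes this prints fake nodes 0, 2, 4, 6 on physical node 0 and 1, 3, 5, 7 on physical node 1: consecutive fake node IDs land on different physical nodes, which is exactly the interleaving property the patch is after.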
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 891cbe65b2d5..34aa438d60b6 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -468,7 +468,6 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
         for (i = 0; i < num_nodes; i++)
                 if (fake_nodes[i].start != fake_nodes[i].end)
                         node_set(i, nodes_parsed);
-        WARN_ON(!nodes_cover_memory(fake_nodes));
 }
 
 static int null_slit_node_compare(int a, int b)
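
For reference, and inferring the command-line semantics from the pre-existing parser in numa_emulation() rather than from this patch: a plain count such as numa=fake=8 contains neither '*' nor ',' and therefore now takes the new split_nodes_interleave() path, while size-based forms such as numa=fake=2*512,1024 (two 512MB nodes followed by one 1024MB node, sizes in megabytes rounded down to FAKE_NODE_MIN_SIZE) continue to be carved from a single flat range via setup_node_range(), split_nodes_equally() and split_nodes_by_size().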