about | summary | refs | log | tree | commit | diff | stats
path: root/arch/x86/mm/numa_64.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/mm/numa_64.c')
-rw-r--r-- arch/x86/mm/numa_64.c 235
1 files changed, 101 insertions, 134 deletions
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 83bbc70d11bb..3307ea8bd43a 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -427,7 +427,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr,
427 * Calculate the number of big nodes that can be allocated as a result 427 * Calculate the number of big nodes that can be allocated as a result
428 * of consolidating the remainder. 428 * of consolidating the remainder.
429 */ 429 */
430 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) & nr_nodes) / 430 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
431 FAKE_NODE_MIN_SIZE; 431 FAKE_NODE_MIN_SIZE;
432 432
433 size &= FAKE_NODE_MIN_HASH_MASK; 433 size &= FAKE_NODE_MIN_HASH_MASK;
@@ -502,77 +502,99 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr,
502} 502}
503 503
504/* 504/*
505 * Splits num_nodes nodes up equally starting at node_start. The return value 505 * Returns the end address of a node so that there is at least `size' amount of
506 * is the number of nodes split up and addr is adjusted to be at the end of the 506 * non-reserved memory or `max_addr' is reached.
507 * last node allocated.
508 */ 507 */
509static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start, 508static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
510 int num_nodes)
511{ 509{
512 unsigned int big; 510 u64 end = start + size;
513 u64 size;
514 int i;
515
516 if (num_nodes <= 0)
517 return -1;
518 if (num_nodes > MAX_NUMNODES)
519 num_nodes = MAX_NUMNODES;
520 size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) /
521 num_nodes;
522 /*
523 * Calculate the number of big nodes that can be allocated as a result
524 * of consolidating the leftovers.
525 */
526 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
527 FAKE_NODE_MIN_SIZE;
528
529 /* Round down to nearest FAKE_NODE_MIN_SIZE. */
530 size &= FAKE_NODE_MIN_HASH_MASK;
531 if (!size) {
532 printk(KERN_ERR "Not enough memory for each node. "
533 "NUMA emulation disabled.\n");
534 return -1;
535 }
536
537 for (i = node_start; i < num_nodes + node_start; i++) {
538 u64 end = *addr + size;
539 511
540 if (i < big) 512 while (end - start - e820_hole_size(start, end) < size) {
541 end += FAKE_NODE_MIN_SIZE; 513 end += FAKE_NODE_MIN_SIZE;
542 /* 514 if (end > max_addr) {
543 * The final node can have the remaining system RAM. Other
544 * nodes receive roughly the same amount of available pages.
545 */
546 if (i == num_nodes + node_start - 1)
547 end = max_addr; 515 end = max_addr;
548 else
549 while (end - *addr - e820_hole_size(*addr, end) <
550 size) {
551 end += FAKE_NODE_MIN_SIZE;
552 if (end > max_addr) {
553 end = max_addr;
554 break;
555 }
556 }
557 if (setup_node_range(i, addr, end - *addr, max_addr) < 0)
558 break; 516 break;
517 }
559 } 518 }
560 return i - node_start + 1; 519 return end;
561} 520}
562 521
563/* 522/*
564 * Splits the remaining system RAM into chunks of size. The remaining memory is 523 * Sets up fake nodes of `size' interleaved over physical nodes ranging from
565 * always assigned to a final node and can be asymmetric. Returns the number of 524 * `addr' to `max_addr'. The return value is the number of nodes allocated.
566 * nodes split.
567 */ 525 */
568static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start, 526static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
569 u64 size)
570{ 527{
571 int i = node_start; 528 nodemask_t physnode_mask = NODE_MASK_NONE;
572 size = (size << 20) & FAKE_NODE_MIN_HASH_MASK; 529 u64 min_size;
573 while (!setup_node_range(i++, addr, size, max_addr)) 530 int ret = 0;
574 ; 531 int i;
575 return i - node_start; 532
533 if (!size)
534 return -1;
535 /*
536 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
537 * increased accordingly if the requested size is too small. This
538 * creates a uniform distribution of node sizes across the entire
539 * machine (but not necessarily over physical nodes).
540 */
541 min_size = (max_addr - addr - e820_hole_size(addr, max_addr)) /
542 MAX_NUMNODES;
543 min_size = max(min_size, FAKE_NODE_MIN_SIZE);
544 if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
545 min_size = (min_size + FAKE_NODE_MIN_SIZE) &
546 FAKE_NODE_MIN_HASH_MASK;
547 if (size < min_size) {
548 pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
549 size >> 20, min_size >> 20);
550 size = min_size;
551 }
552 size &= FAKE_NODE_MIN_HASH_MASK;
553
554 for (i = 0; i < MAX_NUMNODES; i++)
555 if (physnodes[i].start != physnodes[i].end)
556 node_set(i, physnode_mask);
557 /*
558 * Fill physical nodes with fake nodes of size until there is no memory
559 * left on any of them.
560 */
561 while (nodes_weight(physnode_mask)) {
562 for_each_node_mask(i, physnode_mask) {
563 u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
564 u64 end;
565
566 end = find_end_of_node(physnodes[i].start,
567 physnodes[i].end, size);
568 /*
569 * If there won't be at least FAKE_NODE_MIN_SIZE of
570 * non-reserved memory in ZONE_DMA32 for the next node,
571 * this one must extend to the boundary.
572 */
573 if (end < dma32_end && dma32_end - end -
574 e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
575 end = dma32_end;
576
577 /*
578 * If there won't be enough non-reserved memory for the
579 * next node, this one must extend to the end of the
580 * physical node.
581 */
582 if (physnodes[i].end - end -
583 e820_hole_size(end, physnodes[i].end) < size)
584 end = physnodes[i].end;
585
586 /*
587 * Setup the fake node that will be allocated as bootmem
588 * later. If setup_node_range() returns non-zero, there
589 * is no more memory available on this physical node.
590 */
591 if (setup_node_range(ret++, &physnodes[i].start,
592 end - physnodes[i].start,
593 physnodes[i].end) < 0)
594 node_clear(i, physnode_mask);
595 }
596 }
597 return ret;
576} 598}
577 599
578/* 600/*
@@ -582,87 +604,32 @@ static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start,
582static int __init numa_emulation(unsigned long start_pfn, 604static int __init numa_emulation(unsigned long start_pfn,
583 unsigned long last_pfn, int acpi, int k8) 605 unsigned long last_pfn, int acpi, int k8)
584{ 606{
585 u64 size, addr = start_pfn << PAGE_SHIFT; 607 u64 addr = start_pfn << PAGE_SHIFT;
586 u64 max_addr = last_pfn << PAGE_SHIFT; 608 u64 max_addr = last_pfn << PAGE_SHIFT;
587 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
588 int num_phys_nodes; 609 int num_phys_nodes;
610 int num_nodes;
611 int i;
589 612
590 num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8); 613 num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
591 /* 614 /*
592 * If the numa=fake command-line is just a single number N, split the 615 * If the numa=fake command-line contains a 'M' or 'G', it represents
593 * system RAM into N fake nodes. 616 * the fixed node size. Otherwise, if it is just a single number N,
617 * split the system RAM into N fake nodes.
594 */ 618 */
595 if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { 619 if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) {
596 long n = simple_strtol(cmdline, NULL, 0); 620 u64 size;
597
598 num_nodes = split_nodes_interleave(addr, max_addr,
599 num_phys_nodes, n);
600 if (num_nodes < 0)
601 return num_nodes;
602 goto out;
603 }
604 621
605 /* Parse the command line. */ 622 size = memparse(cmdline, &cmdline);
606 for (coeff_flag = 0; ; cmdline++) { 623 num_nodes = split_nodes_size_interleave(addr, max_addr, size);
607 if (*cmdline && isdigit(*cmdline)) { 624 } else {
608 num = num * 10 + *cmdline - '0'; 625 unsigned long n;
609 continue; 626
610 } 627 n = simple_strtoul(cmdline, NULL, 0);
611 if (*cmdline == '*') { 628 num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n);
612 if (num > 0)
613 coeff = num;
614 coeff_flag = 1;
615 }
616 if (!*cmdline || *cmdline == ',') {
617 if (!coeff_flag)
618 coeff = 1;
619 /*
620 * Round down to the nearest FAKE_NODE_MIN_SIZE.
621 * Command-line coefficients are in megabytes.
622 */
623 size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
624 if (size)
625 for (i = 0; i < coeff; i++, num_nodes++)
626 if (setup_node_range(num_nodes, &addr,
627 size, max_addr) < 0)
628 goto done;
629 if (!*cmdline)
630 break;
631 coeff_flag = 0;
632 coeff = -1;
633 }
634 num = 0;
635 }
636done:
637 if (!num_nodes)
638 return -1;
639 /* Fill remainder of system RAM, if appropriate. */
640 if (addr < max_addr) {
641 if (coeff_flag && coeff < 0) {
642 /* Split remaining nodes into num-sized chunks */
643 num_nodes += split_nodes_by_size(&addr, max_addr,
644 num_nodes, num);
645 goto out;
646 }
647 switch (*(cmdline - 1)) {
648 case '*':
649 /* Split remaining nodes into coeff chunks */
650 if (coeff <= 0)
651 break;
652 num_nodes += split_nodes_equally(&addr, max_addr,
653 num_nodes, coeff);
654 break;
655 case ',':
656 /* Do not allocate remaining system RAM */
657 break;
658 default:
659 /* Give one final node */
660 setup_node_range(num_nodes, &addr, max_addr - addr,
661 max_addr);
662 num_nodes++;
663 }
664 } 629 }
665out: 630
631 if (num_nodes < 0)
632 return num_nodes;
666 memnode_shift = compute_hash_shift(nodes, num_nodes, NULL); 633 memnode_shift = compute_hash_shift(nodes, num_nodes, NULL);
667 if (memnode_shift < 0) { 634 if (memnode_shift < 0) {
668 memnode_shift = 0; 635 memnode_shift = 0;