Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/mm/numa_64.c | 211
-rw-r--r--  arch/x86/mm/srat_64.c |   1
2 files changed, 184 insertions(+), 28 deletions(-)
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index d1a3d94efc8e..086f98a66d80 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -306,8 +306,71 @@ void __init numa_init_array(void)
 
 #ifdef CONFIG_NUMA_EMU
 /* Numa emulation */
+static struct bootnode nodes[MAX_NUMNODES] __initdata;
+static struct bootnode physnodes[MAX_NUMNODES] __initdata;
 static char *cmdline __initdata;
 
+static int __init setup_physnodes(unsigned long start, unsigned long end,
+				  int acpi, int k8)
+{
+	int nr_nodes = 0;
+	int ret = 0;
+	int i;
+
+#ifdef CONFIG_ACPI_NUMA
+	if (acpi)
+		nr_nodes = acpi_get_nodes(physnodes);
+#endif
+#ifdef CONFIG_K8_NUMA
+	if (k8)
+		nr_nodes = k8_get_nodes(physnodes);
+#endif
+	/*
+	 * Basic sanity checking on the physical node map: there may be errors
+	 * if the SRAT or K8 incorrectly reported the topology or the mem=
+	 * kernel parameter is used.
+	 */
+	for (i = 0; i < nr_nodes; i++) {
+		if (physnodes[i].start == physnodes[i].end)
+			continue;
+		if (physnodes[i].start > end) {
+			physnodes[i].end = physnodes[i].start;
+			continue;
+		}
+		if (physnodes[i].end < start) {
+			physnodes[i].start = physnodes[i].end;
+			continue;
+		}
+		if (physnodes[i].start < start)
+			physnodes[i].start = start;
+		if (physnodes[i].end > end)
+			physnodes[i].end = end;
+	}
+
+	/*
+	 * Remove all nodes that have no memory or were truncated because of the
+	 * limited address range.
+	 */
+	for (i = 0; i < nr_nodes; i++) {
+		if (physnodes[i].start == physnodes[i].end)
+			continue;
+		physnodes[ret].start = physnodes[i].start;
+		physnodes[ret].end = physnodes[i].end;
+		ret++;
+	}
+
+	/*
+	 * If no physical topology was detected, a single node is faked to cover
+	 * the entire address space.
+	 */
+	if (!ret) {
+		physnodes[ret].start = start;
+		physnodes[ret].end = end;
+		ret = 1;
+	}
+	return ret;
+}
+
 /*
  * Setups up nid to range from addr to addr + size.  If the end
  * boundary is greater than max_addr, then max_addr is used instead.
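The two passes in setup_physnodes() above (clamp each reported node to the usable range, then compact away the emptied entries) are easy to exercise in isolation. The following is a minimal userspace sketch, not kernel code: bootnode is redeclared locally, and the sample layout (two 2 GB nodes clamped as if by an effective mem=3G) is invented for illustration.

	#include <stdio.h>

	struct bootnode { unsigned long long start, end; };

	/* model of the two passes above: clamp to [start, end), then compact */
	static int setup_physnodes_model(struct bootnode *n, int nr,
					 unsigned long long start,
					 unsigned long long end)
	{
		int ret = 0, i;

		for (i = 0; i < nr; i++) {
			if (n[i].start == n[i].end)
				continue;
			if (n[i].start > end) {		/* wholly above: mark empty */
				n[i].end = n[i].start;
				continue;
			}
			if (n[i].end < start) {		/* wholly below: mark empty */
				n[i].start = n[i].end;
				continue;
			}
			if (n[i].start < start)
				n[i].start = start;
			if (n[i].end > end)
				n[i].end = end;
		}
		for (i = 0; i < nr; i++) {		/* drop the emptied nodes */
			if (n[i].start == n[i].end)
				continue;
			n[ret].start = n[i].start;
			n[ret].end = n[i].end;
			ret++;
		}
		return ret;
	}

	int main(void)
	{
		/* two 2 GB physical nodes, limited to a 3 GB usable range */
		struct bootnode n[] = {
			{ 0x00000000ULL, 0x080000000ULL },
			{ 0x080000000ULL, 0x100000000ULL },
		};
		int nr = setup_physnodes_model(n, 2, 0, 0x0c0000000ULL);

		/* prints "2 nodes"; node 1 is truncated to [2 GB, 3 GB) */
		printf("%d nodes\n", nr);
		return 0;
	}

Note that a node lying wholly outside the range is not deleted in the first pass but merely marked empty (start == end), which is what makes the second, compacting pass necessary.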
@@ -315,11 +378,9 @@ static char *cmdline __initdata;
  * allocation past addr and -1 otherwise.  addr is adjusted to be at
  * the end of the node.
  */
-static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
-				   u64 size, u64 max_addr)
+static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
 {
 	int ret = 0;
-
 	nodes[nid].start = *addr;
 	*addr += size;
 	if (*addr >= max_addr) {
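setup_node_range() loses its nodes parameter now that the array is file-static, but its contract (spelled out in the comment above) is unchanged. A toy model of that contract, an illustration only since most of the function body is elided from this hunk:

	/* toy model of the documented contract, not the kernel body */
	struct range_model { unsigned long long start, end; };

	static int node_range_model(struct range_model *node,
				    unsigned long long *addr,
				    unsigned long long size,
				    unsigned long long max_addr)
	{
		int ret = 0;

		node->start = *addr;
		*addr += size;
		if (*addr >= max_addr) {
			*addr = max_addr;	/* final node: clamp to the boundary */
			ret = -1;		/* no memory left past addr */
		}
		node->end = *addr;
		return ret;
	}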
@@ -335,12 +396,111 @@ static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
 }
 
 /*
+ * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
+ * to max_addr. The return value is the number of nodes allocated.
+ */
+static int __init split_nodes_interleave(u64 addr, u64 max_addr,
+					 int nr_phys_nodes, int nr_nodes)
+{
+	nodemask_t physnode_mask = NODE_MASK_NONE;
+	u64 size;
+	int big;
+	int ret = 0;
+	int i;
+
+	if (nr_nodes <= 0)
+		return -1;
+	if (nr_nodes > MAX_NUMNODES) {
+		pr_info("numa=fake=%d too large, reducing to %d\n",
+			nr_nodes, MAX_NUMNODES);
+		nr_nodes = MAX_NUMNODES;
+	}
+
+	size = (max_addr - addr - e820_hole_size(addr, max_addr)) / nr_nodes;
+	/*
+	 * Calculate the number of big nodes that can be allocated as a result
+	 * of consolidating the remainder.
+	 */
+	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
+		FAKE_NODE_MIN_SIZE;
+
+	size &= FAKE_NODE_MIN_HASH_MASK;
+	if (!size) {
+		pr_err("Not enough memory for each node. "
+			"NUMA emulation disabled.\n");
+		return -1;
+	}
+
+	for (i = 0; i < nr_phys_nodes; i++)
+		if (physnodes[i].start != physnodes[i].end)
+			node_set(i, physnode_mask);
+
+	/*
+	 * Continue to fill physical nodes with fake nodes until there is no
+	 * memory left on any of them.
+	 */
+	while (nodes_weight(physnode_mask)) {
+		for_each_node_mask(i, physnode_mask) {
+			u64 end = physnodes[i].start + size;
+			u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
+
+			if (ret < big)
+				end += FAKE_NODE_MIN_SIZE;
+
+			/*
+			 * Continue to add memory to this fake node if its
+			 * non-reserved memory is less than the per-node size.
+			 */
+			while (end - physnodes[i].start -
+				e820_hole_size(physnodes[i].start, end) < size) {
+				end += FAKE_NODE_MIN_SIZE;
+				if (end > physnodes[i].end) {
+					end = physnodes[i].end;
+					break;
+				}
+			}
+
+			/*
+			 * If there won't be at least FAKE_NODE_MIN_SIZE of
+			 * non-reserved memory in ZONE_DMA32 for the next node,
+			 * this one must extend to the boundary.
+			 */
+			if (end < dma32_end && dma32_end - end -
+			    e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+				end = dma32_end;
+
+			/*
+			 * If there won't be enough non-reserved memory for the
+			 * next node, this one must extend to the end of the
+			 * physical node.
+			 */
+			if (physnodes[i].end - end -
+			    e820_hole_size(end, physnodes[i].end) < size)
+				end = physnodes[i].end;
+
+			/*
+			 * Avoid allocating more nodes than requested, which can
+			 * happen as a result of rounding down each node's size
+			 * to FAKE_NODE_MIN_SIZE.
+			 */
+			if (nodes_weight(physnode_mask) + ret >= nr_nodes)
+				end = physnodes[i].end;
+
+			if (setup_node_range(ret++, &physnodes[i].start,
+						end - physnodes[i].start,
+						physnodes[i].end) < 0)
+				node_clear(i, physnode_mask);
+		}
+	}
+	return ret;
+}
+
+/*
  * Splits num_nodes nodes up equally starting at node_start. The return value
  * is the number of nodes split up and addr is adjusted to be at the end of the
  * last node allocated.
  */
-static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
-				      u64 max_addr, int node_start,
+static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start,
 				      int num_nodes)
 {
 	unsigned int big;
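The size/big arithmetic near the top of split_nodes_interleave() deserves a worked example: each node's raw share is rounded down to a multiple of FAKE_NODE_MIN_SIZE, and the amount shaved off all the shares is consolidated so that big of the fake nodes receive one extra minimum-size chunk. A standalone sketch, assuming a 64 MB FAKE_NODE_MIN_SIZE (the numa_64 value of this era) and ignoring e820 holes:

	#include <stdio.h>

	#define MIN_SIZE	(64ULL << 20)		/* assumed FAKE_NODE_MIN_SIZE */
	#define HASH_MASK	(~(MIN_SIZE - 1))	/* assumed FAKE_NODE_MIN_HASH_MASK */

	int main(void)
	{
		unsigned long long ram = 1000ULL << 20;		/* 1000 MB usable */
		int nr_nodes = 4;
		unsigned long long size = ram / nr_nodes;	/* 250 MB raw share */
		/* remainder shaved off each share, pooled across all nodes */
		int big = (int)(((size & ~HASH_MASK) * nr_nodes) / MIN_SIZE);

		size &= HASH_MASK;
		/* prints "size=192 MB, big=3": three nodes get an extra 64 MB chunk */
		printf("size=%llu MB, big=%d\n", size >> 20, big);
		return 0;
	}

Any remainder that still does not fit is absorbed by the boundary checks in the allocation loop, which extend a node to dma32_end or to the end of its physical node.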
@@ -388,7 +548,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
 				break;
 			}
 		}
-		if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
+		if (setup_node_range(i, addr, end - *addr, max_addr) < 0)
 			break;
 	}
 	return i - node_start + 1;
@@ -399,12 +559,12 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
  * always assigned to a final node and can be asymmetric. Returns the number of
  * nodes split.
  */
-static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
-				      u64 max_addr, int node_start, u64 size)
+static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start,
+				      u64 size)
 {
 	int i = node_start;
 	size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
-	while (!setup_node_range(i++, nodes, addr, size, max_addr))
+	while (!setup_node_range(i++, addr, size, max_addr))
 		;
 	return i - node_start;
 }
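split_nodes_by_size() keeps its megabyte-to-bytes conversion: (size << 20) & FAKE_NODE_MIN_HASH_MASK shifts the command-line value into bytes and rounds it down to a minimum-size multiple, so a request for 100 MB nodes quietly becomes 64 MB nodes. A tiny sketch, again assuming a 64 MB FAKE_NODE_MIN_SIZE:

	#include <stdio.h>

	int main(void)
	{
		unsigned long long min_size = 64ULL << 20;	/* assumed FAKE_NODE_MIN_SIZE */
		unsigned long long size = 100;			/* "numa=fake=100," => 100 MB nodes */

		size = (size << 20) & ~(min_size - 1);
		printf("%llu MB per node\n", size >> 20);	/* prints "64 MB per node" */
		return 0;
	}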
@@ -413,15 +573,15 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
  * Sets up the system RAM area from start_pfn to last_pfn according to the
  * numa=fake command-line option.
  */
-static struct bootnode nodes[MAX_NUMNODES] __initdata;
-
-static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
+static int __init numa_emulation(unsigned long start_pfn,
+				 unsigned long last_pfn, int acpi, int k8)
 {
 	u64 size, addr = start_pfn << PAGE_SHIFT;
 	u64 max_addr = last_pfn << PAGE_SHIFT;
 	int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
+	int num_phys_nodes;
 
-	memset(&nodes, 0, sizeof(nodes));
+	num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
 	/*
 	 * If the numa=fake command-line is just a single number N, split the
 	 * system RAM into N fake nodes.
@@ -429,7 +589,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
 	if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
 		long n = simple_strtol(cmdline, NULL, 0);
 
-		num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
+		num_nodes = split_nodes_interleave(addr, max_addr,
+							num_phys_nodes, n);
 		if (num_nodes < 0)
 			return num_nodes;
 		goto out;
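Usage note: with this hunk, a plain numa=fake=N no longer carves N nodes linearly out of the flat address range. The interleave path hands out one fake node per physical node on each pass of its round-robin loop, so on a machine whose SRAT reports two equally sized physical nodes, booting with numa=fake=8 should yield four fake nodes drawn from each physical node's memory, keeping every fake node local to a real one.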
@@ -456,8 +617,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
 		size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
 		if (size)
 			for (i = 0; i < coeff; i++, num_nodes++)
-				if (setup_node_range(num_nodes, nodes,
-						&addr, size, max_addr) < 0)
+				if (setup_node_range(num_nodes, &addr,
+						size, max_addr) < 0)
 					goto done;
 		if (!*cmdline)
 			break;
@@ -473,7 +634,7 @@ done:
 	if (addr < max_addr) {
 		if (coeff_flag && coeff < 0) {
 			/* Split remaining nodes into num-sized chunks */
-			num_nodes += split_nodes_by_size(nodes, &addr, max_addr,
+			num_nodes += split_nodes_by_size(&addr, max_addr,
 						   num_nodes, num);
 			goto out;
 		}
@@ -482,7 +643,7 @@ done:
 			/* Split remaining nodes into coeff chunks */
 			if (coeff <= 0)
 				break;
-			num_nodes += split_nodes_equally(nodes, &addr, max_addr,
+			num_nodes += split_nodes_equally(&addr, max_addr,
 							 num_nodes, coeff);
 			break;
 		case ',':
@@ -490,8 +651,8 @@ done:
 			break;
 		default:
 			/* Give one final node */
-			setup_node_range(num_nodes, nodes, &addr,
-					 max_addr - addr, max_addr);
+			setup_node_range(num_nodes, &addr, max_addr - addr,
+					 max_addr);
 			num_nodes++;
 		}
 	}
@@ -505,14 +666,10 @@ out:
 	}
 
 	/*
-	 * We need to vacate all active ranges that may have been registered by
-	 * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
-	 * true.  NUMA emulation has succeeded so we will not scan ACPI nodes.
+	 * We need to vacate all active ranges that may have been registered for
+	 * the e820 memory map.
 	 */
 	remove_all_active_ranges();
-#ifdef CONFIG_ACPI_NUMA
-	acpi_numa = -1;
-#endif
 	for_each_node_mask(i, node_possible_map) {
 		e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
 						nodes[i].end >> PAGE_SHIFT);
@@ -533,7 +690,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
 	nodes_clear(node_online_map);
 
 #ifdef CONFIG_NUMA_EMU
-	if (cmdline && !numa_emulation(start_pfn, last_pfn))
+	if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8))
 		return;
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 891cbe65b2d5..34aa438d60b6 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -468,7 +468,6 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
 	for (i = 0; i < num_nodes; i++)
 		if (fake_nodes[i].start != fake_nodes[i].end)
 			node_set(i, nodes_parsed);
-	WARN_ON(!nodes_cover_memory(fake_nodes));
 }
 
 static int null_slit_node_compare(int a, int b)