diff options
author | Nishanth Aravamudan <nacc@linux.vnet.ibm.com> | 2014-07-17 19:15:12 -0400 |
---|---|---|
committer | Benjamin Herrenschmidt <benh@kernel.crashing.org> | 2014-08-13 01:14:05 -0400 |
commit | 2fabf084b6ad6337675d700b159a6091023544f2 (patch) | |
tree | bd3258f45d01c46af601b3a0b21e304a73a3bbb3 | |
parent | d6589722846a57a4ddf7af595a7f854ff5180950 (diff) |
powerpc: reorder per-cpu NUMA information's initialization
There is an issue currently where NUMA information is used on powerpc
(and possibly ia64) before it has been read from the device-tree, which
leads to large slab consumption with CONFIG_SLUB and memoryless nodes.
NUMA powerpc non-boot CPU's cpu_to_node/cpu_to_mem is only accurate
after start_secondary(), similar to ia64, which is invoked via
smp_init().
Commit 6ee0578b4daae ("workqueue: mark init_workqueues() as
early_initcall()") made init_workqueues() be invoked via
do_pre_smp_initcalls(), which is obviously before the secondary
processors are online.
Additionally, the following commits changed init_workqueues() to use
cpu_to_node to determine the node to use for kthread_create_on_node:
bce903809ab3f ("workqueue: add wq_numa_tbl_len and
wq_numa_possible_cpumask[]")
f3f90ad469342 ("workqueue: determine NUMA node of workers accourding to
the allowed cpumask")
Therefore, when init_workqueues() runs, it sees all CPUs as being on
Node 0. On LPARs or KVM guests where Node 0 is memoryless, this leads to
a high number of slab deactivations
(http://www.spinics.net/lists/linux-mm/msg67489.html).
Fix this by initializing the powerpc-specific CPU<->node/local memory
node mapping as early as possible, which on powerpc is
do_init_bootmem(). Currently that function initializes the mapping for
the boot CPU, but we extend it to setup the mapping for all possible
CPUs. Then, in smp_prepare_cpus(), we can correspondingly set the
per-cpu values for all possible CPUs. That ensures that before the
early_initcalls run (and really as early as possible), the per-cpu NUMA
mapping is accurate.
While testing memoryless nodes on PowerKVM guests with a fix to the
workqueue logic to use cpu_to_mem() instead of cpu_to_node(), with a
guest topology of:
available: 2 nodes (0-1)
node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
node 0 size: 0 MB
node 0 free: 0 MB
node 1 cpus: 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99
node 1 size: 16336 MB
node 1 free: 15329 MB
node distances:
node 0 1
0: 10 40
1: 40 10
the slab consumption decreases from
Slab: 932416 kB
SUnreclaim: 902336 kB
to
Slab: 395264 kB
SUnreclaim: 359424 kB
And we a corresponding increase in the slab efficiency from
slab mem objs slabs
used active active
------------------------------------------------------------
kmalloc-16384 337 MB 11.28% 100.00%
task_struct 288 MB 9.93% 100.00%
to
slab mem objs slabs
used active active
------------------------------------------------------------
kmalloc-16384 37 MB 100.00% 100.00%
task_struct 31 MB 100.00% 100.00%
Powerpc didn't support memoryless nodes until recently (64bb80d87f01
"powerpc/numa: Enable CONFIG_HAVE_MEMORYLESS_NODES" and 8c272261194d
"powerpc/numa: Enable USE_PERCPU_NUMA_NODE_ID"). Those commits also
helped improve memory consumption with these kind of environments.
Signed-off-by: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
-rw-r--r-- | arch/powerpc/kernel/smp.c | 11 | ||||
-rw-r--r-- | arch/powerpc/mm/numa.c | 13 |
2 files changed, 15 insertions, 9 deletions
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 1007fb802e6b..a0738af4aba6 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c | |||
@@ -376,6 +376,11 @@ void __init smp_prepare_cpus(unsigned int max_cpus) | |||
376 | GFP_KERNEL, cpu_to_node(cpu)); | 376 | GFP_KERNEL, cpu_to_node(cpu)); |
377 | zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu), | 377 | zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu), |
378 | GFP_KERNEL, cpu_to_node(cpu)); | 378 | GFP_KERNEL, cpu_to_node(cpu)); |
379 | /* | ||
380 | * numa_node_id() works after this. | ||
381 | */ | ||
382 | set_cpu_numa_node(cpu, numa_cpu_lookup_table[cpu]); | ||
383 | set_cpu_numa_mem(cpu, local_memory_node(numa_cpu_lookup_table[cpu])); | ||
379 | } | 384 | } |
380 | 385 | ||
381 | cpumask_set_cpu(boot_cpuid, cpu_sibling_mask(boot_cpuid)); | 386 | cpumask_set_cpu(boot_cpuid, cpu_sibling_mask(boot_cpuid)); |
@@ -723,12 +728,6 @@ void start_secondary(void *unused) | |||
723 | } | 728 | } |
724 | traverse_core_siblings(cpu, true); | 729 | traverse_core_siblings(cpu, true); |
725 | 730 | ||
726 | /* | ||
727 | * numa_node_id() works after this. | ||
728 | */ | ||
729 | set_numa_node(numa_cpu_lookup_table[cpu]); | ||
730 | set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu])); | ||
731 | |||
732 | smp_wmb(); | 731 | smp_wmb(); |
733 | notify_cpu_starting(cpu); | 732 | notify_cpu_starting(cpu); |
734 | set_cpu_online(cpu, true); | 733 | set_cpu_online(cpu, true); |
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index d3e9a78eaed3..d7737a542fd7 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c | |||
@@ -1049,7 +1049,7 @@ static void __init mark_reserved_regions_for_nid(int nid) | |||
1049 | 1049 | ||
1050 | void __init do_init_bootmem(void) | 1050 | void __init do_init_bootmem(void) |
1051 | { | 1051 | { |
1052 | int nid; | 1052 | int nid, cpu; |
1053 | 1053 | ||
1054 | min_low_pfn = 0; | 1054 | min_low_pfn = 0; |
1055 | max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; | 1055 | max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; |
@@ -1122,8 +1122,15 @@ void __init do_init_bootmem(void) | |||
1122 | 1122 | ||
1123 | reset_numa_cpu_lookup_table(); | 1123 | reset_numa_cpu_lookup_table(); |
1124 | register_cpu_notifier(&ppc64_numa_nb); | 1124 | register_cpu_notifier(&ppc64_numa_nb); |
1125 | cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE, | 1125 | /* |
1126 | (void *)(unsigned long)boot_cpuid); | 1126 | * We need the numa_cpu_lookup_table to be accurate for all CPUs, |
1127 | * even before we online them, so that we can use cpu_to_{node,mem} | ||
1128 | * early in boot, cf. smp_prepare_cpus(). | ||
1129 | */ | ||
1130 | for_each_possible_cpu(cpu) { | ||
1131 | cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE, | ||
1132 | (void *)(unsigned long)cpu); | ||
1133 | } | ||
1127 | } | 1134 | } |
1128 | 1135 | ||
1129 | void __init paging_init(void) | 1136 | void __init paging_init(void) |