author     Ingo Molnar <mingo@elte.hu>   2008-07-08 05:59:23 -0400
committer  Ingo Molnar <mingo@elte.hu>   2008-07-08 05:59:23 -0400
commit     2b4fa851b2f06fdb04cac808b57324f5e51e1578 (patch)
tree       97db3ad5adda7683923630982f68b8b52c86e790
parent     3de352bbd86f890dd0c5e1c09a6a1b0b29e0f8ce (diff)
parent     46f68e1c6b04a04772e828ff3bcd07ed708805c2 (diff)
Merge branch 'x86/numa' into x86/devel
Conflicts:

        arch/x86/Kconfig
        arch/x86/kernel/e820.c
        arch/x86/kernel/efi_64.c
        arch/x86/kernel/mpparse.c
        arch/x86/kernel/setup.c
        arch/x86/kernel/setup_32.c
        arch/x86/mm/init_64.c
        include/asm-x86/proto.h

Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--  arch/x86/Kconfig             |  35
-rw-r--r--  arch/x86/Kconfig.debug       |   2
-rw-r--r--  arch/x86/kernel/apic_32.c    |   9
-rw-r--r--  arch/x86/kernel/apic_64.c    |  11
-rw-r--r--  arch/x86/kernel/head64.c     |  22
-rw-r--r--  arch/x86/kernel/nmi_64.c     |   4
-rw-r--r--  arch/x86/kernel/setup.c      | 297
-rw-r--r--  arch/x86/kernel/setup64.c    |   8
-rw-r--r--  arch/x86/kernel/setup_32.c   |  24
-rw-r--r--  arch/x86/kernel/setup_64.c   |   9
-rw-r--r--  arch/x86/kernel/smpboot.c    |  81
-rw-r--r--  arch/x86/mm/numa_64.c        |  87
-rw-r--r--  arch/x86/mm/srat_64.c        |   2
-rw-r--r--  arch/x86/pci/acpi.c          |  17
-rw-r--r--  drivers/base/topology.c      |  25
-rw-r--r--  include/asm-x86/numa_64.h    |  19
-rw-r--r--  include/asm-x86/pda.h        |   5
-rw-r--r--  include/asm-x86/percpu.h     |  46
-rw-r--r--  include/asm-x86/smp.h        |  15
-rw-r--r--  include/asm-x86/topology.h   | 152
-rw-r--r--  include/linux/mm.h           |   1
-rw-r--r--  kernel/sched.c               |  18
22 files changed, 611 insertions(+), 278 deletions(-)
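
Note: the centerpiece of this merge is the "early_per_cpu" accessor family added to
include/asm-x86/percpu.h below, which lets boot code read per-cpu maps (such as
x86_cpu_to_apicid) from a static early array until setup_per_cpu_areas() has copied
them into the real per-cpu storage. The following is only a simplified, userspace
sketch of that pattern for orientation; the identifiers are illustrative stand-ins,
not symbols from this merge, and it builds with gcc (the range designator is a GNU
extension):

#include <stdio.h>

#define NR_CPUS    4
#define BAD_APICID 0xFFFFu

/* early map; in the kernel this would be __initdata and discarded after boot */
static unsigned short cpu_to_apicid_early_map[NR_CPUS] = {
        [0 ... NR_CPUS - 1] = BAD_APICID
};
static unsigned short *cpu_to_apicid_early_ptr = cpu_to_apicid_early_map;

/* stand-in for the per_cpu() storage that exists after setup_per_cpu_areas() */
static unsigned short cpu_to_apicid[NR_CPUS];

/* mirrors early_per_cpu(): prefer the early map while it is still live */
static unsigned short lookup_apicid(int cpu)
{
        if (cpu_to_apicid_early_ptr)
                return cpu_to_apicid_early_ptr[cpu];
        return cpu_to_apicid[cpu];
}

/* mirrors setup_per_cpu_maps() in arch/x86/kernel/setup.c below */
static void copy_early_maps(void)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                cpu_to_apicid[cpu] = cpu_to_apicid_early_map[cpu];
        cpu_to_apicid_early_ptr = NULL; /* early static array is now retired */
}

int main(void)
{
        cpu_to_apicid_early_map[0] = 0; /* boot CPU discovered before per-cpu setup */
        printf("before per-cpu setup: cpu0 -> apicid %#x\n", lookup_apicid(0));
        copy_early_maps();
        printf("after  per-cpu setup: cpu0 -> apicid %#x\n", lookup_apicid(0));
        return 0;
}
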
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 640dc62a7fa0..112afd368c77 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -121,7 +121,7 @@ config ARCH_HAS_CACHE_LINE_SIZE
121 def_bool y 121 def_bool y
122 122
123config HAVE_SETUP_PER_CPU_AREA 123config HAVE_SETUP_PER_CPU_AREA
124 def_bool X86_64 || (X86_SMP && !X86_VOYAGER) 124 def_bool X86_64_SMP || (X86_SMP && !X86_VOYAGER)
125 125
126config HAVE_CPUMASK_OF_CPU_MAP 126config HAVE_CPUMASK_OF_CPU_MAP
127 def_bool X86_64_SMP 127 def_bool X86_64_SMP
@@ -579,7 +579,21 @@ config SWIOTLB
579 579
580config IOMMU_HELPER 580config IOMMU_HELPER
581 def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB) 581 def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB)
582config MAXSMP
583 bool "Configure Maximum number of SMP Processors and NUMA Nodes"
584 depends on X86_64 && SMP
585 default n
586 help
587 Configure maximum number of CPUS and NUMA Nodes for this architecture.
588 If unsure, say N.
582 589
590if MAXSMP
591config NR_CPUS
592 int
593 default "4096"
594endif
595
596if !MAXSMP
583config NR_CPUS 597config NR_CPUS
584 int "Maximum number of CPUs (2-4096)" 598 int "Maximum number of CPUs (2-4096)"
585 range 2 4096 599 range 2 4096
@@ -592,7 +606,8 @@ config NR_CPUS
592 minimum value which makes sense is 2. 606 minimum value which makes sense is 2.
593 607
594 This is purely to save memory - each supported CPU adds 608 This is purely to save memory - each supported CPU adds
595 approximately one kilobyte to the kernel image. 609 approximately eight kilobytes to the kernel image.
610endif
596 611
597config SCHED_SMT 612config SCHED_SMT
598 bool "SMT (Hyperthreading) scheduler support" 613 bool "SMT (Hyperthreading) scheduler support"
@@ -983,13 +998,25 @@ config NUMA_EMU
983 into virtual nodes when booted with "numa=fake=N", where N is the 998 into virtual nodes when booted with "numa=fake=N", where N is the
984 number of nodes. This is only useful for debugging. 999 number of nodes. This is only useful for debugging.
985 1000
1001if MAXSMP
1002
1003config NODES_SHIFT
1004 int
1005 default "9"
1006endif
1007
1008if !MAXSMP
986config NODES_SHIFT 1009config NODES_SHIFT
987 int "Max num nodes shift(1-9)" 1010 int "Maximum NUMA Nodes (as a power of 2)"
988 range 1 9 if X86_64 1011 range 1 9 if X86_64
989 default "6" if X86_64 1012 default "6" if X86_64
990 default "4" if X86_NUMAQ 1013 default "4" if X86_NUMAQ
991 default "3" 1014 default "3"
992 depends on NEED_MULTIPLE_NODES 1015 depends on NEED_MULTIPLE_NODES
1016 help
1017 Specify the maximum number of NUMA Nodes available on the target
1018 system. Increases memory reserved to accomodate various tables.
1019endif
993 1020
994config HAVE_ARCH_BOOTMEM_NODE 1021config HAVE_ARCH_BOOTMEM_NODE
995 def_bool y 1022 def_bool y
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index f0684bb74faf..acc0271920f2 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -68,7 +68,7 @@ config DEBUG_PAGEALLOC
68config DEBUG_PER_CPU_MAPS 68config DEBUG_PER_CPU_MAPS
69 bool "Debug access to per_cpu maps" 69 bool "Debug access to per_cpu maps"
70 depends on DEBUG_KERNEL 70 depends on DEBUG_KERNEL
71 depends on X86_64_SMP 71 depends on X86_SMP
72 default n 72 default n
73 help 73 help
74 Say Y to verify that the per_cpu map being accessed has 74 Say Y to verify that the per_cpu map being accessed has
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index 570c362eca8c..84ce106b33c8 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -52,9 +52,6 @@
52 52
53unsigned long mp_lapic_addr; 53unsigned long mp_lapic_addr;
54 54
55DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID;
56EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
57
58/* 55/*
59 * Knob to control our willingness to enable the local APIC. 56 * Knob to control our willingness to enable the local APIC.
60 * 57 *
@@ -1546,9 +1543,9 @@ void __cpuinit generic_processor_info(int apicid, int version)
1546 } 1543 }
1547#ifdef CONFIG_SMP 1544#ifdef CONFIG_SMP
1548 /* are we being called early in kernel startup? */ 1545 /* are we being called early in kernel startup? */
1549 if (x86_cpu_to_apicid_early_ptr) { 1546 if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
1550 u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr; 1547 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
1551 u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr; 1548 u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
1552 1549
1553 cpu_to_apicid[cpu] = apicid; 1550 cpu_to_apicid[cpu] = apicid;
1554 bios_cpu_apicid[cpu] = apicid; 1551 bios_cpu_apicid[cpu] = apicid;
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index d7406aa1c985..e494809fc508 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -90,9 +90,6 @@ static unsigned long apic_phys;
90 90
91unsigned long mp_lapic_addr; 91unsigned long mp_lapic_addr;
92 92
93DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID;
94EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
95
96unsigned int __cpuinitdata maxcpus = NR_CPUS; 93unsigned int __cpuinitdata maxcpus = NR_CPUS;
97/* 94/*
98 * Get the LAPIC version 95 * Get the LAPIC version
@@ -1075,9 +1072,9 @@ void __cpuinit generic_processor_info(int apicid, int version)
1075 max_physical_apicid = apicid; 1072 max_physical_apicid = apicid;
1076 1073
1077 /* are we being called early in kernel startup? */ 1074 /* are we being called early in kernel startup? */
1078 if (x86_cpu_to_apicid_early_ptr) { 1075 if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
1079 u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr; 1076 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
1080 u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr; 1077 u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
1081 1078
1082 cpu_to_apicid[cpu] = apicid; 1079 cpu_to_apicid[cpu] = apicid;
1083 bios_cpu_apicid[cpu] = apicid; 1080 bios_cpu_apicid[cpu] = apicid;
@@ -1253,7 +1250,7 @@ __cpuinit int apic_is_clustered_box(void)
1253 if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box()) 1250 if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box())
1254 return 0; 1251 return 0;
1255 1252
1256 bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr; 1253 bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
1257 bitmap_zero(clustermap, NUM_APIC_CLUSTERS); 1254 bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
1258 1255
1259 for (i = 0; i < NR_CPUS; i++) { 1256 for (i = 0; i < NR_CPUS; i++) {
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 5fbed459ff3b..c970929bb15d 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -25,6 +25,20 @@
25#include <asm/e820.h> 25#include <asm/e820.h>
26#include <asm/bios_ebda.h> 26#include <asm/bios_ebda.h>
27 27
28/* boot cpu pda */
29static struct x8664_pda _boot_cpu_pda __read_mostly;
30
31#ifdef CONFIG_SMP
32/*
33 * We install an empty cpu_pda pointer table to indicate to early users
34 * (numa_set_node) that the cpu_pda pointer table for cpus other than
35 * the boot cpu is not yet setup.
36 */
37static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
38#else
39static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
40#endif
41
28static void __init zap_identity_mappings(void) 42static void __init zap_identity_mappings(void)
29{ 43{
30 pgd_t *pgd = pgd_offset_k(0UL); 44 pgd_t *pgd = pgd_offset_k(0UL);
@@ -88,10 +102,12 @@ void __init x86_64_start_kernel(char * real_mode_data)
88 102
89 early_printk("Kernel alive\n"); 103 early_printk("Kernel alive\n");
90 104
91 for (i = 0; i < NR_CPUS; i++) 105 _cpu_pda = __cpu_pda;
92 cpu_pda(i) = &boot_cpu_pda[i]; 106 cpu_pda(0) = &_boot_cpu_pda;
93
94 pda_init(0); 107 pda_init(0);
108
109 early_printk("Kernel really alive\n");
110
95 copy_bootdata(__va(real_mode_data)); 111 copy_bootdata(__va(real_mode_data));
96 112
97 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); 113 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c
index 0060e44e8989..d62f3b66b529 100644
--- a/arch/x86/kernel/nmi_64.c
+++ b/arch/x86/kernel/nmi_64.c
@@ -90,7 +90,7 @@ int __init check_nmi_watchdog(void)
90 if (!atomic_read(&nmi_active)) 90 if (!atomic_read(&nmi_active))
91 return 0; 91 return 0;
92 92
93 prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); 93 prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
94 if (!prev_nmi_count) 94 if (!prev_nmi_count)
95 goto error; 95 goto error;
96 96
@@ -101,7 +101,7 @@ int __init check_nmi_watchdog(void)
101 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); 101 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
102#endif 102#endif
103 103
104 for (cpu = 0; cpu < NR_CPUS; cpu++) 104 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
105 prev_nmi_count[cpu] = cpu_pda(cpu)->__nmi_count; 105 prev_nmi_count[cpu] = cpu_pda(cpu)->__nmi_count;
106 local_irq_enable(); 106 local_irq_enable();
107 mdelay((20*1000)/nmi_hz); // wait 20 ticks 107 mdelay((20*1000)/nmi_hz); // wait 20 ticks
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 5b0de38cde48..ebb0a2bcdc08 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -20,13 +20,34 @@ unsigned int boot_cpu_physical_apicid = -1U;
20unsigned int max_physical_apicid; 20unsigned int max_physical_apicid;
21EXPORT_SYMBOL(boot_cpu_physical_apicid); 21EXPORT_SYMBOL(boot_cpu_physical_apicid);
22 22
23DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
24EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
25
26/* Bitmask of physically existing CPUs */ 23/* Bitmask of physically existing CPUs */
27physid_mask_t phys_cpu_present_map; 24physid_mask_t phys_cpu_present_map;
28#endif 25#endif
29 26
27/* map cpu index to physical APIC ID */
28DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
29DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
30EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
31EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
32
33#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
34#define X86_64_NUMA 1
35
36/* map cpu index to node index */
37DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
38EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
39
40/* which logical CPUs are on which nodes */
41cpumask_t *node_to_cpumask_map;
42EXPORT_SYMBOL(node_to_cpumask_map);
43
44/* setup node_to_cpumask_map */
45static void __init setup_node_to_cpumask_map(void);
46
47#else
48static inline void setup_node_to_cpumask_map(void) { }
49#endif
50
30#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP) 51#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
31/* 52/*
32 * Copy data used in early init routines from the initial arrays to the 53 * Copy data used in early init routines from the initial arrays to the
@@ -38,20 +59,21 @@ static void __init setup_per_cpu_maps(void)
38 int cpu; 59 int cpu;
39 60
40 for_each_possible_cpu(cpu) { 61 for_each_possible_cpu(cpu) {
41 per_cpu(x86_cpu_to_apicid, cpu) = x86_cpu_to_apicid_init[cpu]; 62 per_cpu(x86_cpu_to_apicid, cpu) =
63 early_per_cpu_map(x86_cpu_to_apicid, cpu);
42 per_cpu(x86_bios_cpu_apicid, cpu) = 64 per_cpu(x86_bios_cpu_apicid, cpu) =
43 x86_bios_cpu_apicid_init[cpu]; 65 early_per_cpu_map(x86_bios_cpu_apicid, cpu);
44#ifdef CONFIG_NUMA 66#ifdef X86_64_NUMA
45 per_cpu(x86_cpu_to_node_map, cpu) = 67 per_cpu(x86_cpu_to_node_map, cpu) =
46 x86_cpu_to_node_map_init[cpu]; 68 early_per_cpu_map(x86_cpu_to_node_map, cpu);
47#endif 69#endif
48 } 70 }
49 71
50 /* indicate the early static arrays will soon be gone */ 72 /* indicate the early static arrays will soon be gone */
51 x86_cpu_to_apicid_early_ptr = NULL; 73 early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
52 x86_bios_cpu_apicid_early_ptr = NULL; 74 early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
53#ifdef CONFIG_NUMA 75#ifdef X86_64_NUMA
54 x86_cpu_to_node_map_early_ptr = NULL; 76 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
55#endif 77#endif
56} 78}
57 79
@@ -80,6 +102,50 @@ static inline void setup_cpumask_of_cpu(void) { }
80 */ 102 */
81unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; 103unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
82EXPORT_SYMBOL(__per_cpu_offset); 104EXPORT_SYMBOL(__per_cpu_offset);
105static inline void setup_cpu_pda_map(void) { }
106
107#elif !defined(CONFIG_SMP)
108static inline void setup_cpu_pda_map(void) { }
109
110#else /* CONFIG_SMP && CONFIG_X86_64 */
111
112/*
113 * Allocate cpu_pda pointer table and array via alloc_bootmem.
114 */
115static void __init setup_cpu_pda_map(void)
116{
117 char *pda;
118 struct x8664_pda **new_cpu_pda;
119 unsigned long size;
120 int cpu;
121
122 size = roundup(sizeof(struct x8664_pda), cache_line_size());
123
124 /* allocate cpu_pda array and pointer table */
125 {
126 unsigned long tsize = nr_cpu_ids * sizeof(void *);
127 unsigned long asize = size * (nr_cpu_ids - 1);
128
129 tsize = roundup(tsize, cache_line_size());
130 new_cpu_pda = alloc_bootmem(tsize + asize);
131 pda = (char *)new_cpu_pda + tsize;
132 }
133
134 /* initialize pointer table to static pda's */
135 for_each_possible_cpu(cpu) {
136 if (cpu == 0) {
137 /* leave boot cpu pda in place */
138 new_cpu_pda[0] = cpu_pda(0);
139 continue;
140 }
141 new_cpu_pda[cpu] = (struct x8664_pda *)pda;
142 new_cpu_pda[cpu]->in_bootmem = 1;
143 pda += size;
144 }
145
146 /* point to new pointer table */
147 _cpu_pda = new_cpu_pda;
148}
83#endif 149#endif
84 150
85/* 151/*
@@ -89,50 +155,52 @@ EXPORT_SYMBOL(__per_cpu_offset);
89 */ 155 */
90void __init setup_per_cpu_areas(void) 156void __init setup_per_cpu_areas(void)
91{ 157{
92 int i, highest_cpu = 0; 158 ssize_t size = PERCPU_ENOUGH_ROOM;
93 unsigned long size; 159 char *ptr;
160 int cpu;
94 161
95#ifdef CONFIG_HOTPLUG_CPU 162#ifdef CONFIG_HOTPLUG_CPU
96 prefill_possible_map(); 163 prefill_possible_map();
164#else
165 nr_cpu_ids = num_processors;
97#endif 166#endif
98 167
168 /* Setup cpu_pda map */
169 setup_cpu_pda_map();
170
99 /* Copy section for each CPU (we discard the original) */ 171 /* Copy section for each CPU (we discard the original) */
100 size = PERCPU_ENOUGH_ROOM; 172 size = PERCPU_ENOUGH_ROOM;
101 printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", 173 printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
102 size); 174 size);
103 175
104 for_each_possible_cpu(i) { 176 for_each_possible_cpu(cpu) {
105 char *ptr;
106#ifndef CONFIG_NEED_MULTIPLE_NODES 177#ifndef CONFIG_NEED_MULTIPLE_NODES
107 ptr = alloc_bootmem_pages(size); 178 ptr = alloc_bootmem_pages(size);
108#else 179#else
109 int node = early_cpu_to_node(i); 180 int node = early_cpu_to_node(cpu);
110 if (!node_online(node) || !NODE_DATA(node)) { 181 if (!node_online(node) || !NODE_DATA(node)) {
111 ptr = alloc_bootmem_pages(size); 182 ptr = alloc_bootmem_pages(size);
112 printk(KERN_INFO 183 printk(KERN_INFO
113 "cpu %d has no node or node-local memory\n", i); 184 "cpu %d has no node %d or node-local memory\n",
185 cpu, node);
114 } 186 }
115 else 187 else
116 ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); 188 ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
117#endif 189#endif
118 if (!ptr) 190 per_cpu_offset(cpu) = ptr - __per_cpu_start;
119 panic("Cannot allocate cpu data for CPU %d\n", i);
120#ifdef CONFIG_X86_64
121 cpu_pda(i)->data_offset = ptr - __per_cpu_start;
122#else
123 __per_cpu_offset[i] = ptr - __per_cpu_start;
124#endif
125 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); 191 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
126 192
127 highest_cpu = i;
128 } 193 }
129 194
130 nr_cpu_ids = highest_cpu + 1; 195 printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
131 printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d\n", NR_CPUS, nr_cpu_ids); 196 NR_CPUS, nr_cpu_ids, nr_node_ids);
132 197
133 /* Setup percpu data maps */ 198 /* Setup percpu data maps */
134 setup_per_cpu_maps(); 199 setup_per_cpu_maps();
135 200
201 /* Setup node to cpumask map */
202 setup_node_to_cpumask_map();
203
136 /* Setup cpumask_of_cpu map */ 204 /* Setup cpumask_of_cpu map */
137 setup_cpumask_of_cpu(); 205 setup_cpumask_of_cpu();
138} 206}
@@ -163,3 +231,176 @@ void __init parse_setup_data(void)
163 early_iounmap(data, PAGE_SIZE); 231 early_iounmap(data, PAGE_SIZE);
164 } 232 }
165} 233}
234
235#ifdef X86_64_NUMA
236
237/*
238 * Allocate node_to_cpumask_map based on number of available nodes
239 * Requires node_possible_map to be valid.
240 *
241 * Note: node_to_cpumask() is not valid until after this is done.
242 */
243static void __init setup_node_to_cpumask_map(void)
244{
245 unsigned int node, num = 0;
246 cpumask_t *map;
247
248 /* setup nr_node_ids if not done yet */
249 if (nr_node_ids == MAX_NUMNODES) {
250 for_each_node_mask(node, node_possible_map)
251 num = node;
252 nr_node_ids = num + 1;
253 }
254
255 /* allocate the map */
256 map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
257
258 Dprintk(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n",
259 map, nr_node_ids);
260
261 /* node_to_cpumask() will now work */
262 node_to_cpumask_map = map;
263}
264
265void __cpuinit numa_set_node(int cpu, int node)
266{
267 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
268
269 if (cpu_pda(cpu) && node != NUMA_NO_NODE)
270 cpu_pda(cpu)->nodenumber = node;
271
272 if (cpu_to_node_map)
273 cpu_to_node_map[cpu] = node;
274
275 else if (per_cpu_offset(cpu))
276 per_cpu(x86_cpu_to_node_map, cpu) = node;
277
278 else
279 Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu);
280}
281
282void __cpuinit numa_clear_node(int cpu)
283{
284 numa_set_node(cpu, NUMA_NO_NODE);
285}
286
287#ifndef CONFIG_DEBUG_PER_CPU_MAPS
288
289void __cpuinit numa_add_cpu(int cpu)
290{
291 cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
292}
293
294void __cpuinit numa_remove_cpu(int cpu)
295{
296 cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]);
297}
298
299#else /* CONFIG_DEBUG_PER_CPU_MAPS */
300
301/*
302 * --------- debug versions of the numa functions ---------
303 */
304static void __cpuinit numa_set_cpumask(int cpu, int enable)
305{
306 int node = cpu_to_node(cpu);
307 cpumask_t *mask;
308 char buf[64];
309
310 if (node_to_cpumask_map == NULL) {
311 printk(KERN_ERR "node_to_cpumask_map NULL\n");
312 dump_stack();
313 return;
314 }
315
316 mask = &node_to_cpumask_map[node];
317 if (enable)
318 cpu_set(cpu, *mask);
319 else
320 cpu_clear(cpu, *mask);
321
322 cpulist_scnprintf(buf, sizeof(buf), *mask);
323 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
324 enable? "numa_add_cpu":"numa_remove_cpu", cpu, node, buf);
325 }
326
327void __cpuinit numa_add_cpu(int cpu)
328{
329 numa_set_cpumask(cpu, 1);
330}
331
332void __cpuinit numa_remove_cpu(int cpu)
333{
334 numa_set_cpumask(cpu, 0);
335}
336
337int cpu_to_node(int cpu)
338{
339 if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
340 printk(KERN_WARNING
341 "cpu_to_node(%d): usage too early!\n", cpu);
342 dump_stack();
343 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
344 }
345 return per_cpu(x86_cpu_to_node_map, cpu);
346}
347EXPORT_SYMBOL(cpu_to_node);
348
349/*
350 * Same function as cpu_to_node() but used if called before the
351 * per_cpu areas are setup.
352 */
353int early_cpu_to_node(int cpu)
354{
355 if (early_per_cpu_ptr(x86_cpu_to_node_map))
356 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
357
358 if (!per_cpu_offset(cpu)) {
359 printk(KERN_WARNING
360 "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
361 dump_stack();
362 return NUMA_NO_NODE;
363 }
364 return per_cpu(x86_cpu_to_node_map, cpu);
365}
366
367/*
368 * Returns a pointer to the bitmask of CPUs on Node 'node'.
369 */
370cpumask_t *_node_to_cpumask_ptr(int node)
371{
372 if (node_to_cpumask_map == NULL) {
373 printk(KERN_WARNING
374 "_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n",
375 node);
376 dump_stack();
377 return &cpu_online_map;
378 }
379 BUG_ON(node >= nr_node_ids);
380 return &node_to_cpumask_map[node];
381}
382EXPORT_SYMBOL(_node_to_cpumask_ptr);
383
384/*
385 * Returns a bitmask of CPUs on Node 'node'.
386 */
387cpumask_t node_to_cpumask(int node)
388{
389 if (node_to_cpumask_map == NULL) {
390 printk(KERN_WARNING
391 "node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
392 dump_stack();
393 return cpu_online_map;
394 }
395 BUG_ON(node >= nr_node_ids);
396 return node_to_cpumask_map[node];
397}
398EXPORT_SYMBOL(node_to_cpumask);
399
400/*
401 * --------- end of debug versions of the numa functions ---------
402 */
403
404#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
405
406#endif /* X86_64_NUMA */
diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c
index aee0e8200777..631ea6cc01d8 100644
--- a/arch/x86/kernel/setup64.c
+++ b/arch/x86/kernel/setup64.c
@@ -12,6 +12,7 @@
12#include <linux/bitops.h> 12#include <linux/bitops.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/kgdb.h> 14#include <linux/kgdb.h>
15#include <linux/topology.h>
15#include <asm/pda.h> 16#include <asm/pda.h>
16#include <asm/pgtable.h> 17#include <asm/pgtable.h>
17#include <asm/processor.h> 18#include <asm/processor.h>
@@ -34,9 +35,8 @@ struct boot_params boot_params;
34 35
35cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; 36cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
36 37
37struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly; 38struct x8664_pda **_cpu_pda __read_mostly;
38EXPORT_SYMBOL(_cpu_pda); 39EXPORT_SYMBOL(_cpu_pda);
39struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
40 40
41struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; 41struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
42 42
@@ -114,8 +114,10 @@ void pda_init(int cpu)
114 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); 114 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
115 if (!pda->irqstackptr) 115 if (!pda->irqstackptr)
116 panic("cannot allocate irqstack for cpu %d", cpu); 116 panic("cannot allocate irqstack for cpu %d", cpu);
117 }
118 117
118 if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
119 pda->nodenumber = cpu_to_node(cpu);
120 }
119 121
120 pda->irqstackptr += IRQSTACKSIZE-64; 122 pda->irqstackptr += IRQSTACKSIZE-64;
121} 123}
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 7e06ecd83174..a9b19ad24edb 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -659,18 +659,6 @@ static void set_mca_bus(int x)
659static void set_mca_bus(int x) { } 659static void set_mca_bus(int x) { }
660#endif 660#endif
661 661
662#ifdef CONFIG_NUMA
663/*
664 * In the golden day, when everything among i386 and x86_64 will be
665 * integrated, this will not live here
666 */
667void *x86_cpu_to_node_map_early_ptr;
668int x86_cpu_to_node_map_init[NR_CPUS] = {
669 [0 ... NR_CPUS-1] = NUMA_NO_NODE
670};
671DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
672#endif
673
674static void probe_roms(void); 662static void probe_roms(void);
675 663
676/* 664/*
@@ -866,18 +854,6 @@ void __init setup_arch(char **cmdline_p)
866 854
867 paravirt_post_allocator_init(); 855 paravirt_post_allocator_init();
868 856
869#ifdef CONFIG_X86_SMP
870 /*
871 * setup to use the early static init tables during kernel startup
872 * X86_SMP will exclude sub-arches that don't deal well with it.
873 */
874 x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
875 x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
876#ifdef CONFIG_NUMA
877 x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
878#endif
879#endif
880
881#ifdef CONFIG_X86_GENERICARCH 857#ifdef CONFIG_X86_GENERICARCH
882 generic_apic_probe(); 858 generic_apic_probe();
883#endif 859#endif
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 9a87113ba996..16ef53ab538a 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -376,15 +376,6 @@ void __init setup_arch(char **cmdline_p)
376 kvmclock_init(); 376 kvmclock_init();
377#endif 377#endif
378 378
379#ifdef CONFIG_SMP
380 /* setup to use the early static init tables during kernel startup */
381 x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
382 x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
383#ifdef CONFIG_NUMA
384 x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
385#endif
386#endif
387
388#ifdef CONFIG_ACPI 379#ifdef CONFIG_ACPI
389 /* 380 /*
390 * Initialize the ACPI boot-time table parser (gets the RSDP and SDT). 381 * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 6be701f3027f..ae0a7a200421 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -67,22 +67,6 @@
67#include <mach_wakecpu.h> 67#include <mach_wakecpu.h>
68#include <smpboot_hooks.h> 68#include <smpboot_hooks.h>
69 69
70/*
71 * FIXME: For x86_64, those are defined in other files. But moving them here,
72 * would make the setup areas dependent on smp, which is a loss. When we
73 * integrate apic between arches, we can probably do a better job, but
74 * right now, they'll stay here -- glommer
75 */
76
77/* which logical CPU number maps to which CPU (physical APIC ID) */
78u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata =
79 { [0 ... NR_CPUS-1] = BAD_APICID };
80void *x86_cpu_to_apicid_early_ptr;
81
82u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata
83 = { [0 ... NR_CPUS-1] = BAD_APICID };
84void *x86_bios_cpu_apicid_early_ptr;
85
86#ifdef CONFIG_X86_32 70#ifdef CONFIG_X86_32
87u8 apicid_2_node[MAX_APICID]; 71u8 apicid_2_node[MAX_APICID];
88static int low_mappings; 72static int low_mappings;
@@ -814,6 +798,45 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
814 complete(&c_idle->done); 798 complete(&c_idle->done);
815} 799}
816 800
801#ifdef CONFIG_X86_64
802/*
803 * Allocate node local memory for the AP pda.
804 *
805 * Must be called after the _cpu_pda pointer table is initialized.
806 */
807static int __cpuinit get_local_pda(int cpu)
808{
809 struct x8664_pda *oldpda, *newpda;
810 unsigned long size = sizeof(struct x8664_pda);
811 int node = cpu_to_node(cpu);
812
813 if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem)
814 return 0;
815
816 oldpda = cpu_pda(cpu);
817 newpda = kmalloc_node(size, GFP_ATOMIC, node);
818 if (!newpda) {
819 printk(KERN_ERR "Could not allocate node local PDA "
820 "for CPU %d on node %d\n", cpu, node);
821
822 if (oldpda)
823 return 0; /* have a usable pda */
824 else
825 return -1;
826 }
827
828 if (oldpda) {
829 memcpy(newpda, oldpda, size);
830 if (!after_bootmem)
831 free_bootmem((unsigned long)oldpda, size);
832 }
833
834 newpda->in_bootmem = 0;
835 cpu_pda(cpu) = newpda;
836 return 0;
837}
838#endif /* CONFIG_X86_64 */
839
817static int __cpuinit do_boot_cpu(int apicid, int cpu) 840static int __cpuinit do_boot_cpu(int apicid, int cpu)
818/* 841/*
819 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad 842 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -839,19 +862,11 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
839 } 862 }
840 863
841 /* Allocate node local memory for AP pdas */ 864 /* Allocate node local memory for AP pdas */
842 if (cpu_pda(cpu) == &boot_cpu_pda[cpu]) { 865 if (cpu > 0) {
843 struct x8664_pda *newpda, *pda; 866 boot_error = get_local_pda(cpu);
844 int node = cpu_to_node(cpu); 867 if (boot_error)
845 pda = cpu_pda(cpu); 868 goto restore_state;
846 newpda = kmalloc_node(sizeof(struct x8664_pda), GFP_ATOMIC, 869 /* if can't get pda memory, can't start cpu */
847 node);
848 if (newpda) {
849 memcpy(newpda, pda, sizeof(struct x8664_pda));
850 cpu_pda(cpu) = newpda;
851 } else
852 printk(KERN_ERR
853 "Could not allocate node local PDA for CPU %d on node %d\n",
854 cpu, node);
855 } 870 }
856#endif 871#endif
857 872
@@ -970,11 +985,13 @@ do_rest:
970 } 985 }
971 } 986 }
972 987
988restore_state:
989
973 if (boot_error) { 990 if (boot_error) {
974 /* Try to put things back the way they were before ... */ 991 /* Try to put things back the way they were before ... */
975 unmap_cpu_to_logical_apicid(cpu); 992 unmap_cpu_to_logical_apicid(cpu);
976#ifdef CONFIG_X86_64 993#ifdef CONFIG_X86_64
977 clear_node_cpumask(cpu); /* was set by numa_add_cpu */ 994 numa_remove_cpu(cpu); /* was set by numa_add_cpu */
978#endif 995#endif
979 cpu_clear(cpu, cpu_callout_map); /* was set by do_boot_cpu() */ 996 cpu_clear(cpu, cpu_callout_map); /* was set by do_boot_cpu() */
980 cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */ 997 cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
@@ -1347,6 +1364,8 @@ __init void prefill_possible_map(void)
1347 1364
1348 for (i = 0; i < possible; i++) 1365 for (i = 0; i < possible; i++)
1349 cpu_set(i, cpu_possible_map); 1366 cpu_set(i, cpu_possible_map);
1367
1368 nr_cpu_ids = possible;
1350} 1369}
1351 1370
1352static void __ref remove_cpu_from_maps(int cpu) 1371static void __ref remove_cpu_from_maps(int cpu)
@@ -1357,7 +1376,7 @@ static void __ref remove_cpu_from_maps(int cpu)
1357 cpu_clear(cpu, cpu_callin_map); 1376 cpu_clear(cpu, cpu_callin_map);
1358 /* was set by cpu_init() */ 1377 /* was set by cpu_init() */
1359 clear_bit(cpu, (unsigned long *)&cpu_initialized); 1378 clear_bit(cpu, (unsigned long *)&cpu_initialized);
1360 clear_node_cpumask(cpu); 1379 numa_remove_cpu(cpu);
1361#endif 1380#endif
1362} 1381}
1363 1382
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index afb07ffb931d..c4557e25f60c 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -27,30 +27,17 @@
27struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 27struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
28EXPORT_SYMBOL(node_data); 28EXPORT_SYMBOL(node_data);
29 29
30bootmem_data_t plat_node_bdata[MAX_NUMNODES]; 30static bootmem_data_t plat_node_bdata[MAX_NUMNODES];
31 31
32struct memnode memnode; 32struct memnode memnode;
33 33
34#ifdef CONFIG_SMP
35int x86_cpu_to_node_map_init[NR_CPUS] = {
36 [0 ... NR_CPUS-1] = NUMA_NO_NODE
37};
38void *x86_cpu_to_node_map_early_ptr;
39EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr);
40#endif
41DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
42EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map);
43
44s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { 34s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
45 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE 35 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
46}; 36};
47 37
48cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly;
49EXPORT_SYMBOL(node_to_cpumask_map);
50
51int numa_off __initdata; 38int numa_off __initdata;
52unsigned long __initdata nodemap_addr; 39static unsigned long __initdata nodemap_addr;
53unsigned long __initdata nodemap_size; 40static unsigned long __initdata nodemap_size;
54 41
55/* 42/*
56 * Given a shift value, try to populate memnodemap[] 43 * Given a shift value, try to populate memnodemap[]
@@ -192,7 +179,7 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
192void __init setup_node_bootmem(int nodeid, unsigned long start, 179void __init setup_node_bootmem(int nodeid, unsigned long start,
193 unsigned long end) 180 unsigned long end)
194{ 181{
195 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size; 182 unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
196 unsigned long bootmap_start, nodedata_phys; 183 unsigned long bootmap_start, nodedata_phys;
197 void *bootmap; 184 void *bootmap;
198 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); 185 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
@@ -204,7 +191,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
204 start, end); 191 start, end);
205 192
206 start_pfn = start >> PAGE_SHIFT; 193 start_pfn = start >> PAGE_SHIFT;
207 end_pfn = end >> PAGE_SHIFT; 194 last_pfn = end >> PAGE_SHIFT;
208 195
209 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size, 196 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
210 SMP_CACHE_BYTES); 197 SMP_CACHE_BYTES);
@@ -217,7 +204,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
217 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); 204 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
218 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; 205 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
219 NODE_DATA(nodeid)->node_start_pfn = start_pfn; 206 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
220 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; 207 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
221 208
222 /* 209 /*
223 * Find a place for the bootmem map 210 * Find a place for the bootmem map
@@ -226,7 +213,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
226 * early_node_mem will get that with find_e820_area instead 213 * early_node_mem will get that with find_e820_area instead
227 * of alloc_bootmem, that could clash with reserved range 214 * of alloc_bootmem, that could clash with reserved range
228 */ 215 */
229 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); 216 bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
230 nid = phys_to_nid(nodedata_phys); 217 nid = phys_to_nid(nodedata_phys);
231 if (nid == nodeid) 218 if (nid == nodeid)
232 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); 219 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
@@ -248,7 +235,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
248 235
249 bootmap_size = init_bootmem_node(NODE_DATA(nodeid), 236 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
250 bootmap_start >> PAGE_SHIFT, 237 bootmap_start >> PAGE_SHIFT,
251 start_pfn, end_pfn); 238 start_pfn, last_pfn);
252 239
253 printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n", 240 printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n",
254 bootmap_start, bootmap_start + bootmap_size - 1, 241 bootmap_start, bootmap_start + bootmap_size - 1,
@@ -309,7 +296,7 @@ void __init numa_init_array(void)
309 296
310#ifdef CONFIG_NUMA_EMU 297#ifdef CONFIG_NUMA_EMU
311/* Numa emulation */ 298/* Numa emulation */
312char *cmdline __initdata; 299static char *cmdline __initdata;
313 300
314/* 301/*
315 * Setups up nid to range from addr to addr + size. If the end 302 * Setups up nid to range from addr to addr + size. If the end
@@ -413,15 +400,15 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
413} 400}
414 401
415/* 402/*
416 * Sets up the system RAM area from start_pfn to end_pfn according to the 403 * Sets up the system RAM area from start_pfn to last_pfn according to the
417 * numa=fake command-line option. 404 * numa=fake command-line option.
418 */ 405 */
419static struct bootnode nodes[MAX_NUMNODES] __initdata; 406static struct bootnode nodes[MAX_NUMNODES] __initdata;
420 407
421static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) 408static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
422{ 409{
423 u64 size, addr = start_pfn << PAGE_SHIFT; 410 u64 size, addr = start_pfn << PAGE_SHIFT;
424 u64 max_addr = end_pfn << PAGE_SHIFT; 411 u64 max_addr = last_pfn << PAGE_SHIFT;
425 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; 412 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
426 413
427 memset(&nodes, 0, sizeof(nodes)); 414 memset(&nodes, 0, sizeof(nodes));
@@ -527,7 +514,7 @@ out:
527} 514}
528#endif /* CONFIG_NUMA_EMU */ 515#endif /* CONFIG_NUMA_EMU */
529 516
530void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) 517void __init numa_initmem_init(unsigned long start_pfn, unsigned long last_pfn)
531{ 518{
532 int i; 519 int i;
533 520
@@ -535,7 +522,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
535 nodes_clear(node_online_map); 522 nodes_clear(node_online_map);
536 523
537#ifdef CONFIG_NUMA_EMU 524#ifdef CONFIG_NUMA_EMU
538 if (cmdline && !numa_emulation(start_pfn, end_pfn)) 525 if (cmdline && !numa_emulation(start_pfn, last_pfn))
539 return; 526 return;
540 nodes_clear(node_possible_map); 527 nodes_clear(node_possible_map);
541 nodes_clear(node_online_map); 528 nodes_clear(node_online_map);
@@ -543,7 +530,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
543 530
544#ifdef CONFIG_ACPI_NUMA 531#ifdef CONFIG_ACPI_NUMA
545 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, 532 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
546 end_pfn << PAGE_SHIFT)) 533 last_pfn << PAGE_SHIFT))
547 return; 534 return;
548 nodes_clear(node_possible_map); 535 nodes_clear(node_possible_map);
549 nodes_clear(node_online_map); 536 nodes_clear(node_online_map);
@@ -551,7 +538,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
551 538
552#ifdef CONFIG_K8_NUMA 539#ifdef CONFIG_K8_NUMA
553 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, 540 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT,
554 end_pfn<<PAGE_SHIFT)) 541 last_pfn<<PAGE_SHIFT))
555 return; 542 return;
556 nodes_clear(node_possible_map); 543 nodes_clear(node_possible_map);
557 nodes_clear(node_online_map); 544 nodes_clear(node_online_map);
@@ -561,7 +548,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
561 548
562 printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 549 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
563 start_pfn << PAGE_SHIFT, 550 start_pfn << PAGE_SHIFT,
564 end_pfn << PAGE_SHIFT); 551 last_pfn << PAGE_SHIFT);
565 /* setup dummy node covering all memory */ 552 /* setup dummy node covering all memory */
566 memnode_shift = 63; 553 memnode_shift = 63;
567 memnodemap = memnode.embedded_map; 554 memnodemap = memnode.embedded_map;
@@ -570,29 +557,8 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
570 node_set(0, node_possible_map); 557 node_set(0, node_possible_map);
571 for (i = 0; i < NR_CPUS; i++) 558 for (i = 0; i < NR_CPUS; i++)
572 numa_set_node(i, 0); 559 numa_set_node(i, 0);
573 /* cpumask_of_cpu() may not be available during early startup */ 560 e820_register_active_regions(0, start_pfn, last_pfn);
574 memset(&node_to_cpumask_map[0], 0, sizeof(node_to_cpumask_map[0])); 561 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
575 cpu_set(0, node_to_cpumask_map[0]);
576 e820_register_active_regions(0, start_pfn, end_pfn);
577 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
578}
579
580__cpuinit void numa_add_cpu(int cpu)
581{
582 set_bit(cpu,
583 (unsigned long *)&node_to_cpumask_map[early_cpu_to_node(cpu)]);
584}
585
586void __cpuinit numa_set_node(int cpu, int node)
587{
588 int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr;
589
590 if(cpu_to_node_map)
591 cpu_to_node_map[cpu] = node;
592 else if(per_cpu_offset(cpu))
593 per_cpu(x86_cpu_to_node_map, cpu) = node;
594 else
595 Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu);
596} 562}
597 563
598unsigned long __init numa_free_all_bootmem(void) 564unsigned long __init numa_free_all_bootmem(void)
@@ -641,6 +607,7 @@ static __init int numa_setup(char *opt)
641} 607}
642early_param("numa", numa_setup); 608early_param("numa", numa_setup);
643 609
610#ifdef CONFIG_NUMA
644/* 611/*
645 * Setup early cpu_to_node. 612 * Setup early cpu_to_node.
646 * 613 *
@@ -652,14 +619,19 @@ early_param("numa", numa_setup);
652 * is already initialized in a round robin manner at numa_init_array, 619 * is already initialized in a round robin manner at numa_init_array,
653 * prior to this call, and this initialization is good enough 620 * prior to this call, and this initialization is good enough
654 * for the fake NUMA cases. 621 * for the fake NUMA cases.
622 *
623 * Called before the per_cpu areas are setup.
655 */ 624 */
656void __init init_cpu_to_node(void) 625void __init init_cpu_to_node(void)
657{ 626{
658 int i; 627 int cpu;
628 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
659 629
660 for (i = 0; i < NR_CPUS; i++) { 630 BUG_ON(cpu_to_apicid == NULL);
631
632 for_each_possible_cpu(cpu) {
661 int node; 633 int node;
662 u16 apicid = x86_cpu_to_apicid_init[i]; 634 u16 apicid = cpu_to_apicid[cpu];
663 635
664 if (apicid == BAD_APICID) 636 if (apicid == BAD_APICID)
665 continue; 637 continue;
@@ -668,8 +640,9 @@ void __init init_cpu_to_node(void)
668 continue; 640 continue;
669 if (!node_online(node)) 641 if (!node_online(node))
670 continue; 642 continue;
671 numa_set_node(i, node); 643 numa_set_node(cpu, node);
672 } 644 }
673} 645}
646#endif
674 647
675 648
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 391d51035871..b67f5a16755f 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -376,7 +376,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
376 if (node == NUMA_NO_NODE) 376 if (node == NUMA_NO_NODE)
377 continue; 377 continue;
378 if (!node_isset(node, node_possible_map)) 378 if (!node_isset(node, node_possible_map))
379 numa_set_node(i, NUMA_NO_NODE); 379 numa_clear_node(i);
380 } 380 }
381 numa_init_array(); 381 numa_init_array();
382 return 0; 382 return 0;
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 464279da49c4..4fa52d3dc848 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -171,8 +171,11 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
171 if (node != -1) 171 if (node != -1)
172 set_mp_bus_to_node(busnum, node); 172 set_mp_bus_to_node(busnum, node);
173 else 173 else
174 node = get_mp_bus_to_node(busnum);
175#endif 174#endif
175 node = get_mp_bus_to_node(busnum);
176
177 if (node != -1 && !node_online(node))
178 node = -1;
176 179
177 /* Allocate per-root-bus (not per bus) arch-specific data. 180 /* Allocate per-root-bus (not per bus) arch-specific data.
178 * TODO: leak; this memory is never freed. 181 * TODO: leak; this memory is never freed.
@@ -204,14 +207,16 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
204 if (!bus) 207 if (!bus)
205 kfree(sd); 208 kfree(sd);
206 209
210 if (bus && node != -1) {
207#ifdef CONFIG_ACPI_NUMA 211#ifdef CONFIG_ACPI_NUMA
208 if (bus) { 212 if (pxm >= 0)
209 if (pxm >= 0) {
210 printk(KERN_DEBUG "bus %02x -> pxm %d -> node %d\n", 213 printk(KERN_DEBUG "bus %02x -> pxm %d -> node %d\n",
211 busnum, pxm, pxm_to_node(pxm)); 214 busnum, pxm, node);
212 } 215#else
213 } 216 printk(KERN_DEBUG "bus %02x -> node %d\n",
217 busnum, node);
214#endif 218#endif
219 }
215 220
216 if (bus && (pci_probe & PCI_USE__CRS)) 221 if (bus && (pci_probe & PCI_USE__CRS))
217 get_current_resources(device, busnum, domain, bus); 222 get_current_resources(device, busnum, domain, bus);
diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index fdf4044d2e74..1efe162e16d7 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -40,6 +40,7 @@ static ssize_t show_##name(struct sys_device *dev, char *buf) \
40 return sprintf(buf, "%d\n", topology_##name(cpu)); \ 40 return sprintf(buf, "%d\n", topology_##name(cpu)); \
41} 41}
42 42
43#if defined(topology_thread_siblings) || defined(topology_core_siblings)
43static ssize_t show_cpumap(int type, cpumask_t *mask, char *buf) 44static ssize_t show_cpumap(int type, cpumask_t *mask, char *buf)
44{ 45{
45 ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf; 46 ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf;
@@ -54,21 +55,41 @@ static ssize_t show_cpumap(int type, cpumask_t *mask, char *buf)
54 } 55 }
55 return n; 56 return n;
56} 57}
58#endif
57 59
60#ifdef arch_provides_topology_pointers
58#define define_siblings_show_map(name) \ 61#define define_siblings_show_map(name) \
59static inline ssize_t show_##name(struct sys_device *dev, char *buf) \ 62static ssize_t show_##name(struct sys_device *dev, char *buf) \
60{ \ 63{ \
61 unsigned int cpu = dev->id; \ 64 unsigned int cpu = dev->id; \
62 return show_cpumap(0, &(topology_##name(cpu)), buf); \ 65 return show_cpumap(0, &(topology_##name(cpu)), buf); \
63} 66}
64 67
65#define define_siblings_show_list(name) \ 68#define define_siblings_show_list(name) \
66static inline ssize_t show_##name##_list(struct sys_device *dev, char *buf) \ 69static ssize_t show_##name##_list(struct sys_device *dev, char *buf) \
67{ \ 70{ \
68 unsigned int cpu = dev->id; \ 71 unsigned int cpu = dev->id; \
69 return show_cpumap(1, &(topology_##name(cpu)), buf); \ 72 return show_cpumap(1, &(topology_##name(cpu)), buf); \
70} 73}
71 74
75#else
76#define define_siblings_show_map(name) \
77static ssize_t show_##name(struct sys_device *dev, char *buf) \
78{ \
79 unsigned int cpu = dev->id; \
80 cpumask_t mask = topology_##name(cpu); \
81 return show_cpumap(0, &mask, buf); \
82}
83
84#define define_siblings_show_list(name) \
85static ssize_t show_##name##_list(struct sys_device *dev, char *buf) \
86{ \
87 unsigned int cpu = dev->id; \
88 cpumask_t mask = topology_##name(cpu); \
89 return show_cpumap(1, &mask, buf); \
90}
91#endif
92
72#define define_siblings_show_func(name) \ 93#define define_siblings_show_func(name) \
73 define_siblings_show_map(name); define_siblings_show_list(name) 94 define_siblings_show_map(name); define_siblings_show_list(name)
74 95
diff --git a/include/asm-x86/numa_64.h b/include/asm-x86/numa_64.h
index 22e87c9f6a80..b510daf4f4d8 100644
--- a/include/asm-x86/numa_64.h
+++ b/include/asm-x86/numa_64.h
@@ -14,11 +14,9 @@ extern int compute_hash_shift(struct bootnode *nodes, int numblks,
14 14
15#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) 15#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
16 16
17extern void numa_add_cpu(int cpu);
18extern void numa_init_array(void); 17extern void numa_init_array(void);
19extern int numa_off; 18extern int numa_off;
20 19
21extern void numa_set_node(int cpu, int node);
22extern void srat_reserve_add_area(int nodeid); 20extern void srat_reserve_add_area(int nodeid);
23extern int hotadd_percent; 21extern int hotadd_percent;
24 22
@@ -31,15 +29,16 @@ extern void setup_node_bootmem(int nodeid, unsigned long start,
31 29
32#ifdef CONFIG_NUMA 30#ifdef CONFIG_NUMA
33extern void __init init_cpu_to_node(void); 31extern void __init init_cpu_to_node(void);
34 32extern void __cpuinit numa_set_node(int cpu, int node);
35static inline void clear_node_cpumask(int cpu) 33extern void __cpuinit numa_clear_node(int cpu);
36{ 34extern void __cpuinit numa_add_cpu(int cpu);
37 clear_bit(cpu, (unsigned long *)&node_to_cpumask_map[cpu_to_node(cpu)]); 35extern void __cpuinit numa_remove_cpu(int cpu);
38}
39
40#else 36#else
41#define init_cpu_to_node() do {} while (0) 37static inline void init_cpu_to_node(void) { }
42#define clear_node_cpumask(cpu) do {} while (0) 38static inline void numa_set_node(int cpu, int node) { }
39static inline void numa_clear_node(int cpu) { }
40static inline void numa_add_cpu(int cpu, int node) { }
41static inline void numa_remove_cpu(int cpu) { }
43#endif 42#endif
44 43
45#endif 44#endif
diff --git a/include/asm-x86/pda.h b/include/asm-x86/pda.h
index 101fb9e11954..b34e9a7cc80b 100644
--- a/include/asm-x86/pda.h
+++ b/include/asm-x86/pda.h
@@ -22,6 +22,8 @@ struct x8664_pda {
22 offset 40!!! */ 22 offset 40!!! */
23#endif 23#endif
24 char *irqstackptr; 24 char *irqstackptr;
25 short nodenumber; /* number of current node (32k max) */
26 short in_bootmem; /* pda lives in bootmem */
25 unsigned int __softirq_pending; 27 unsigned int __softirq_pending;
26 unsigned int __nmi_count; /* number of NMI on this CPUs */ 28 unsigned int __nmi_count; /* number of NMI on this CPUs */
27 short mmu_state; 29 short mmu_state;
@@ -37,8 +39,7 @@ struct x8664_pda {
37 unsigned irq_spurious_count; 39 unsigned irq_spurious_count;
38} ____cacheline_aligned_in_smp; 40} ____cacheline_aligned_in_smp;
39 41
40extern struct x8664_pda *_cpu_pda[]; 42extern struct x8664_pda **_cpu_pda;
41extern struct x8664_pda boot_cpu_pda[];
42extern void pda_init(int); 43extern void pda_init(int);
43 44
44#define cpu_pda(i) (_cpu_pda[i]) 45#define cpu_pda(i) (_cpu_pda[i])
diff --git a/include/asm-x86/percpu.h b/include/asm-x86/percpu.h
index 736fc3bb8e1e..912a3a17b9db 100644
--- a/include/asm-x86/percpu.h
+++ b/include/asm-x86/percpu.h
@@ -143,4 +143,50 @@ do { \
143#define x86_or_percpu(var, val) percpu_to_op("or", per_cpu__##var, val) 143#define x86_or_percpu(var, val) percpu_to_op("or", per_cpu__##var, val)
144#endif /* !__ASSEMBLY__ */ 144#endif /* !__ASSEMBLY__ */
145#endif /* !CONFIG_X86_64 */ 145#endif /* !CONFIG_X86_64 */
146
147#ifdef CONFIG_SMP
148
149/*
150 * Define the "EARLY_PER_CPU" macros. These are used for some per_cpu
151 * variables that are initialized and accessed before there are per_cpu
152 * areas allocated.
153 */
154
155#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \
156 DEFINE_PER_CPU(_type, _name) = _initvalue; \
157 __typeof__(_type) _name##_early_map[NR_CPUS] __initdata = \
158 { [0 ... NR_CPUS-1] = _initvalue }; \
159 __typeof__(_type) *_name##_early_ptr = _name##_early_map
160
161#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \
162 EXPORT_PER_CPU_SYMBOL(_name)
163
164#define DECLARE_EARLY_PER_CPU(_type, _name) \
165 DECLARE_PER_CPU(_type, _name); \
166 extern __typeof__(_type) *_name##_early_ptr; \
167 extern __typeof__(_type) _name##_early_map[]
168
169#define early_per_cpu_ptr(_name) (_name##_early_ptr)
170#define early_per_cpu_map(_name, _idx) (_name##_early_map[_idx])
171#define early_per_cpu(_name, _cpu) \
172 (early_per_cpu_ptr(_name) ? \
173 early_per_cpu_ptr(_name)[_cpu] : \
174 per_cpu(_name, _cpu))
175
176#else /* !CONFIG_SMP */
177#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \
178 DEFINE_PER_CPU(_type, _name) = _initvalue
179
180#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \
181 EXPORT_PER_CPU_SYMBOL(_name)
182
183#define DECLARE_EARLY_PER_CPU(_type, _name) \
184 DECLARE_PER_CPU(_type, _name)
185
186#define early_per_cpu(_name, _cpu) per_cpu(_name, _cpu)
187#define early_per_cpu_ptr(_name) NULL
188/* no early_per_cpu_map() */
189
190#endif /* !CONFIG_SMP */
191
146#endif /* _ASM_X86_PERCPU_H_ */ 192#endif /* _ASM_X86_PERCPU_H_ */
diff --git a/include/asm-x86/smp.h b/include/asm-x86/smp.h
index 514e52b95cef..fc1007321ef6 100644
--- a/include/asm-x86/smp.h
+++ b/include/asm-x86/smp.h
@@ -29,21 +29,12 @@ extern int smp_num_siblings;
29extern unsigned int num_processors; 29extern unsigned int num_processors;
30extern cpumask_t cpu_initialized; 30extern cpumask_t cpu_initialized;
31 31
32#ifdef CONFIG_SMP
33extern u16 x86_cpu_to_apicid_init[];
34extern u16 x86_bios_cpu_apicid_init[];
35extern void *x86_cpu_to_apicid_early_ptr;
36extern void *x86_bios_cpu_apicid_early_ptr;
37#else
38#define x86_cpu_to_apicid_early_ptr NULL
39#define x86_bios_cpu_apicid_early_ptr NULL
40#endif
41
42DECLARE_PER_CPU(cpumask_t, cpu_sibling_map); 32DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
43DECLARE_PER_CPU(cpumask_t, cpu_core_map); 33DECLARE_PER_CPU(cpumask_t, cpu_core_map);
44DECLARE_PER_CPU(u16, cpu_llc_id); 34DECLARE_PER_CPU(u16, cpu_llc_id);
45DECLARE_PER_CPU(u16, x86_cpu_to_apicid); 35
46DECLARE_PER_CPU(u16, x86_bios_cpu_apicid); 36DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid);
37DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
47 38
48/* Static state in head.S used to set up a CPU */ 39/* Static state in head.S used to set up a CPU */
49extern struct { 40extern struct {
diff --git a/include/asm-x86/topology.h b/include/asm-x86/topology.h
index dcf3f8131d6b..1f97758de4ab 100644
--- a/include/asm-x86/topology.h
+++ b/include/asm-x86/topology.h
@@ -35,79 +35,88 @@
35# endif 35# endif
36#endif 36#endif
37 37
38/* Node not present */
39#define NUMA_NO_NODE (-1)
40
38#ifdef CONFIG_NUMA 41#ifdef CONFIG_NUMA
39#include <linux/cpumask.h> 42#include <linux/cpumask.h>
40#include <asm/mpspec.h> 43#include <asm/mpspec.h>
41 44
42/* Mappings between logical cpu number and node number */
43#ifdef CONFIG_X86_32 45#ifdef CONFIG_X86_32
44extern int cpu_to_node_map[];
45#else
46/* Returns the number of the current Node. */
47#define numa_node_id() (early_cpu_to_node(raw_smp_processor_id()))
48#endif
49
50DECLARE_PER_CPU(int, x86_cpu_to_node_map);
51
52#ifdef CONFIG_SMP
53extern int x86_cpu_to_node_map_init[];
54extern void *x86_cpu_to_node_map_early_ptr;
55#else
56#define x86_cpu_to_node_map_early_ptr NULL
57#endif
58 46
47/* Mappings between node number and cpus on that node. */
59extern cpumask_t node_to_cpumask_map[]; 48extern cpumask_t node_to_cpumask_map[];
60 49
61#define NUMA_NO_NODE (-1) 50/* Mappings between logical cpu number and node number */
51extern int cpu_to_node_map[];
62 52
63/* Returns the number of the node containing CPU 'cpu' */ 53/* Returns the number of the node containing CPU 'cpu' */
64#ifdef CONFIG_X86_32
65#define early_cpu_to_node(cpu) cpu_to_node(cpu)
66static inline int cpu_to_node(int cpu) 54static inline int cpu_to_node(int cpu)
67{ 55{
68 return cpu_to_node_map[cpu]; 56 return cpu_to_node_map[cpu];
69} 57}
58#define early_cpu_to_node(cpu) cpu_to_node(cpu)
70 59
71#else /* CONFIG_X86_64 */ 60/* Returns a bitmask of CPUs on Node 'node'. */
72 61static inline cpumask_t node_to_cpumask(int node)
73#ifdef CONFIG_SMP
74static inline int early_cpu_to_node(int cpu)
75{ 62{
76 int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr; 63 return node_to_cpumask_map[node];
77
78 if (cpu_to_node_map)
79 return cpu_to_node_map[cpu];
80 else if (per_cpu_offset(cpu))
81 return per_cpu(x86_cpu_to_node_map, cpu);
82 else
83 return NUMA_NO_NODE;
84} 64}
85#else
86#define early_cpu_to_node(cpu) cpu_to_node(cpu)
87#endif
88 65
66#else /* CONFIG_X86_64 */
67
68/* Mappings between node number and cpus on that node. */
69extern cpumask_t *node_to_cpumask_map;
70
71/* Mappings between logical cpu number and node number */
72DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
73
74/* Returns the number of the current Node. */
75#define numa_node_id() read_pda(nodenumber)
76
77#ifdef CONFIG_DEBUG_PER_CPU_MAPS
78extern int cpu_to_node(int cpu);
79extern int early_cpu_to_node(int cpu);
80extern cpumask_t *_node_to_cpumask_ptr(int node);
81extern cpumask_t node_to_cpumask(int node);
82
83#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
84
85/* Returns the number of the node containing CPU 'cpu' */
89static inline int cpu_to_node(int cpu) 86static inline int cpu_to_node(int cpu)
90{ 87{
91#ifdef CONFIG_DEBUG_PER_CPU_MAPS
92 if (x86_cpu_to_node_map_early_ptr) {
93 printk("KERN_NOTICE cpu_to_node(%d): usage too early!\n",
94 (int)cpu);
95 dump_stack();
96 return ((int *)x86_cpu_to_node_map_early_ptr)[cpu];
97 }
98#endif
99 return per_cpu(x86_cpu_to_node_map, cpu); 88 return per_cpu(x86_cpu_to_node_map, cpu);
100} 89}
101 90
102#ifdef CONFIG_NUMA 91/* Same function but used if called before per_cpu areas are setup */
92static inline int early_cpu_to_node(int cpu)
93{
94 if (early_per_cpu_ptr(x86_cpu_to_node_map))
95 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
96
97 return per_cpu(x86_cpu_to_node_map, cpu);
98}
103 99
104/* Returns a pointer to the cpumask of CPUs on Node 'node'. */ 100/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
101static inline cpumask_t *_node_to_cpumask_ptr(int node)
102{
103 return &node_to_cpumask_map[node];
104}
105
106/* Returns a bitmask of CPUs on Node 'node'. */
107static inline cpumask_t node_to_cpumask(int node)
108{
109 return node_to_cpumask_map[node];
110}
111
112#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
113
114/* Replace default node_to_cpumask_ptr with optimized version */
105#define node_to_cpumask_ptr(v, node) \ 115#define node_to_cpumask_ptr(v, node) \
106 cpumask_t *v = &(node_to_cpumask_map[node]) 116 cpumask_t *v = _node_to_cpumask_ptr(node)
107 117
108#define node_to_cpumask_ptr_next(v, node) \ 118#define node_to_cpumask_ptr_next(v, node) \
109 v = &(node_to_cpumask_map[node]) 119 v = _node_to_cpumask_ptr(node)
110#endif
111 120
112#endif /* CONFIG_X86_64 */ 121#endif /* CONFIG_X86_64 */
113 122
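
The split between cpu_to_node() and early_cpu_to_node() in the hunk above exists because per_cpu() is unusable until setup_per_cpu_areas() has run, while boot code already needs node information. A minimal sketch of a boot-time caller, assuming only the early_cpu_to_node() interface shown here; the function name and the bootmem allocation are illustrative, not part of this patch:

#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/topology.h>

static void * __init early_alloc_for_cpu(int cpu, unsigned long size)
{
	/* Safe before the per-cpu areas exist: reads the boot-time
	 * cpu-to-node map while it is valid, the per-cpu copy afterwards. */
	int node = early_cpu_to_node(cpu);

	if (node < 0)		/* mapping not established yet */
		node = 0;

	return alloc_bootmem_pages_node(NODE_DATA(node), size);
}
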
@@ -117,20 +126,6 @@ static inline int cpu_to_node(int cpu)
117 */ 126 */
118#define parent_node(node) (node) 127#define parent_node(node) (node)
119 128
120/* Returns a bitmask of CPUs on Node 'node'. */
121static inline cpumask_t node_to_cpumask(int node)
122{
123 return node_to_cpumask_map[node];
124}
125
126/* Returns the number of the first CPU on Node 'node'. */
127static inline int node_to_first_cpu(int node)
128{
129 cpumask_t mask = node_to_cpumask(node);
130
131 return first_cpu(mask);
132}
133
134#define pcibus_to_node(bus) __pcibus_to_node(bus) 129#define pcibus_to_node(bus) __pcibus_to_node(bus)
135#define pcibus_to_cpumask(bus) __pcibus_to_cpumask(bus) 130#define pcibus_to_cpumask(bus) __pcibus_to_cpumask(bus)
136 131
@@ -180,12 +175,44 @@ extern int __node_distance(int, int);
180#define node_distance(a, b) __node_distance(a, b) 175#define node_distance(a, b) __node_distance(a, b)
181#endif 176#endif
182 177
183#else /* CONFIG_NUMA */ 178#else /* !CONFIG_NUMA */
184 179
180#define numa_node_id() 0
181#define cpu_to_node(cpu) 0
182#define early_cpu_to_node(cpu) 0
183
184static inline cpumask_t *_node_to_cpumask_ptr(int node)
185{
186 return &cpu_online_map;
187}
188static inline cpumask_t node_to_cpumask(int node)
189{
190 return cpu_online_map;
191}
192static inline int node_to_first_cpu(int node)
193{
194 return first_cpu(cpu_online_map);
195}
196
197/* Replace default node_to_cpumask_ptr with optimized version */
198#define node_to_cpumask_ptr(v, node) \
199 cpumask_t *v = _node_to_cpumask_ptr(node)
200
201#define node_to_cpumask_ptr_next(v, node) \
202 v = _node_to_cpumask_ptr(node)
185#endif 203#endif
186 204
187#include <asm-generic/topology.h> 205#include <asm-generic/topology.h>
188 206
207#ifdef CONFIG_NUMA
208/* Returns the number of the first CPU on Node 'node'. */
209static inline int node_to_first_cpu(int node)
210{
211 node_to_cpumask_ptr(mask, node);
212 return first_cpu(*mask);
213}
214#endif
215
189extern cpumask_t cpu_coregroup_map(int cpu); 216extern cpumask_t cpu_coregroup_map(int cpu);
190 217
191#ifdef ENABLE_TOPO_DEFINES 218#ifdef ENABLE_TOPO_DEFINES
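
The pointer-returning _node_to_cpumask_ptr()/node_to_cpumask_ptr() variants matter once NR_CPUS can be 4096: a cpumask_t is then 512 bytes, too large to keep copying onto the stack the way the by-value node_to_cpumask() does. A small sketch of the intended call pattern, assuming the macros from this hunk; the counting helper itself is hypothetical:

#include <linux/cpumask.h>
#include <linux/topology.h>

static int nr_online_cpus_on_node(int node)
{
	int cpu, count = 0;
	/* Declares 'mask' as a cpumask_t * into the node map, so no
	 * large cpumask copy lands on the stack. */
	node_to_cpumask_ptr(mask, node);

	for_each_cpu_mask(cpu, *mask)
		if (cpu_online(cpu))
			count++;
	return count;
}

The rewritten node_to_first_cpu() further down follows the same pattern: it obtains a pointer via node_to_cpumask_ptr() and hands *mask to first_cpu() instead of copying the mask first.
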
@@ -193,6 +220,9 @@ extern cpumask_t cpu_coregroup_map(int cpu);
193#define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) 220#define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id)
194#define topology_core_siblings(cpu) (per_cpu(cpu_core_map, cpu)) 221#define topology_core_siblings(cpu) (per_cpu(cpu_core_map, cpu))
195#define topology_thread_siblings(cpu) (per_cpu(cpu_sibling_map, cpu)) 222#define topology_thread_siblings(cpu) (per_cpu(cpu_sibling_map, cpu))
223
224/* indicates that pointers to the topology cpumask_t maps are valid */
225#define arch_provides_topology_pointers yes
196#endif 226#endif
197 227
198static inline void arch_fix_phys_package_id(int num, u32 slot) 228static inline void arch_fix_phys_package_id(int num, u32 slot)
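
Per the comment added in this hunk, arch_provides_topology_pointers tells drivers/base/topology.c that the cpumasks behind topology_core_siblings() and topology_thread_siblings() are stable per-CPU objects that may be referenced directly rather than copied. A tiny illustrative consumer of these macros; the helper is hypothetical and only meaningful where ENABLE_TOPO_DEFINES is set:

#include <linux/types.h>
#include <linux/topology.h>

/* True if CPUs 'a' and 'b' are hyperthread siblings on the same core. */
static bool cpus_share_core(int a, int b)
{
	return cpu_isset(b, topology_thread_siblings(a));
}
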
@@ -220,4 +250,4 @@ static inline void set_mp_bus_to_node(int busnum, int node)
220} 250}
221#endif 251#endif
222 252
223#endif 253#endif /* _ASM_X86_TOPOLOGY_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e4de460907c1..3d647b24041f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1026,6 +1026,7 @@ extern void mem_init(void);
1026extern void show_mem(void); 1026extern void show_mem(void);
1027extern void si_meminfo(struct sysinfo * val); 1027extern void si_meminfo(struct sysinfo * val);
1028extern void si_meminfo_node(struct sysinfo *val, int nid); 1028extern void si_meminfo_node(struct sysinfo *val, int nid);
1029extern int after_bootmem;
1029 1030
1030#ifdef CONFIG_NUMA 1031#ifdef CONFIG_NUMA
1031extern void setup_per_cpu_pageset(void); 1032extern void setup_per_cpu_pageset(void);
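
The new after_bootmem declaration exposes a flag that flips once the bootmem allocator has handed its pages over, so code that can run both very early and later can pick the right allocator. A common shape for such a helper, sketched with hypothetical names; this function is not part of the patch:

#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/gfp.h>
#include <linux/mm.h>

static unsigned long __init_refok alloc_pt_page(void)
{
	/* Once bootmem is retired its memory belongs to the page
	 * allocator, so later calls must go through it instead. */
	if (after_bootmem)
		return get_zeroed_page(GFP_ATOMIC);

	return (unsigned long)alloc_bootmem_pages(PAGE_SIZE);
}
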
diff --git a/kernel/sched.c b/kernel/sched.c
index 94ead43eda62..bcc22b569ee9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6538,9 +6538,9 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
6538 6538
6539 min_val = INT_MAX; 6539 min_val = INT_MAX;
6540 6540
6541 for (i = 0; i < MAX_NUMNODES; i++) { 6541 for (i = 0; i < nr_node_ids; i++) {
6542 /* Start at @node */ 6542 /* Start at @node */
6543 n = (node + i) % MAX_NUMNODES; 6543 n = (node + i) % nr_node_ids;
6544 6544
6545 if (!nr_cpus_node(n)) 6545 if (!nr_cpus_node(n))
6546 continue; 6546 continue;
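
Bounding these node loops by nr_node_ids rather than MAX_NUMNODES matters once MAXSMP raises the compile-time node limit: nr_node_ids is fixed at boot to one past the highest possible node id on the running machine, so a small box no longer scans, or sizes allocations for, the full MAX_NUMNODES range. A trivial sketch of the pattern, assuming only nr_node_ids from <linux/nodemask.h>; the visitor function is hypothetical:

#include <linux/nodemask.h>

static void for_each_possible_node_id(void (*fn)(int node))
{
	int n;

	/* nr_node_ids <= MAX_NUMNODES; unused high node ids are skipped. */
	for (n = 0; n < nr_node_ids; n++)
		fn(n);
}
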
@@ -6734,7 +6734,7 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
6734 if (!sched_group_nodes) 6734 if (!sched_group_nodes)
6735 continue; 6735 continue;
6736 6736
6737 for (i = 0; i < MAX_NUMNODES; i++) { 6737 for (i = 0; i < nr_node_ids; i++) {
6738 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 6738 struct sched_group *oldsg, *sg = sched_group_nodes[i];
6739 6739
6740 *nodemask = node_to_cpumask(i); 6740 *nodemask = node_to_cpumask(i);
@@ -6927,7 +6927,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
6927 /* 6927 /*
6928 * Allocate the per-node list of sched groups 6928 * Allocate the per-node list of sched groups
6929 */ 6929 */
6930 sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *), 6930 sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
6931 GFP_KERNEL); 6931 GFP_KERNEL);
6932 if (!sched_group_nodes) { 6932 if (!sched_group_nodes) {
6933 printk(KERN_WARNING "Can not alloc sched group node list\n"); 6933 printk(KERN_WARNING "Can not alloc sched group node list\n");
@@ -7066,7 +7066,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7066#endif 7066#endif
7067 7067
7068 /* Set up physical groups */ 7068 /* Set up physical groups */
7069 for (i = 0; i < MAX_NUMNODES; i++) { 7069 for (i = 0; i < nr_node_ids; i++) {
7070 SCHED_CPUMASK_VAR(nodemask, allmasks); 7070 SCHED_CPUMASK_VAR(nodemask, allmasks);
7071 SCHED_CPUMASK_VAR(send_covered, allmasks); 7071 SCHED_CPUMASK_VAR(send_covered, allmasks);
7072 7072
@@ -7090,7 +7090,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7090 send_covered, tmpmask); 7090 send_covered, tmpmask);
7091 } 7091 }
7092 7092
7093 for (i = 0; i < MAX_NUMNODES; i++) { 7093 for (i = 0; i < nr_node_ids; i++) {
7094 /* Set up node groups */ 7094 /* Set up node groups */
7095 struct sched_group *sg, *prev; 7095 struct sched_group *sg, *prev;
7096 SCHED_CPUMASK_VAR(nodemask, allmasks); 7096 SCHED_CPUMASK_VAR(nodemask, allmasks);
@@ -7129,9 +7129,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7129 cpus_or(*covered, *covered, *nodemask); 7129 cpus_or(*covered, *covered, *nodemask);
7130 prev = sg; 7130 prev = sg;
7131 7131
7132 for (j = 0; j < MAX_NUMNODES; j++) { 7132 for (j = 0; j < nr_node_ids; j++) {
7133 SCHED_CPUMASK_VAR(notcovered, allmasks); 7133 SCHED_CPUMASK_VAR(notcovered, allmasks);
7134 int n = (i + j) % MAX_NUMNODES; 7134 int n = (i + j) % nr_node_ids;
7135 node_to_cpumask_ptr(pnodemask, n); 7135 node_to_cpumask_ptr(pnodemask, n);
7136 7136
7137 cpus_complement(*notcovered, *covered); 7137 cpus_complement(*notcovered, *covered);
@@ -7184,7 +7184,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7184 } 7184 }
7185 7185
7186#ifdef CONFIG_NUMA 7186#ifdef CONFIG_NUMA
7187 for (i = 0; i < MAX_NUMNODES; i++) 7187 for (i = 0; i < nr_node_ids; i++)
7188 init_numa_sched_groups_power(sched_group_nodes[i]); 7188 init_numa_sched_groups_power(sched_group_nodes[i]);
7189 7189
7190 if (sd_allnodes) { 7190 if (sd_allnodes) {