diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2011-03-15 22:49:10 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-03-15 22:49:10 -0400 |
commit | 181f977d134a9f8e3f8839f42af655b045fc059e (patch) | |
tree | 5d9bb67c62ef1476c18ed350106a84c02f0dd8e4 /arch/x86/mm | |
parent | d5d42399bd7b66bd6b55363b311810504110c967 (diff) | |
parent | 25542c646afbf14c43fa7d2b443055cadb73b07a (diff) |
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (93 commits)
x86, tlb, UV: Do small micro-optimization for native_flush_tlb_others()
x86-64, NUMA: Don't call numa_set_distance() for all possible node combinations during emulation
x86-64, NUMA: Don't assume phys node 0 is always online in numa_emulation()
x86-64, NUMA: Clean up initmem_init()
x86-64, NUMA: Fix numa_emulation code with node0 without RAM
x86-64, NUMA: Revert NUMA affine page table allocation
x86: Work around old gas bug
x86-64, NUMA: Better explain numa_distance handling
x86-64, NUMA: Fix distance table handling
mm: Move early_node_map[] reverse scan helpers under HAVE_MEMBLOCK
x86-64, NUMA: Fix size of numa_distance array
x86: Rename e820_table_* to pgt_buf_*
bootmem: Move __alloc_memory_core_early() to nobootmem.c
bootmem: Move contig_page_data definition to bootmem.c/nobootmem.c
bootmem: Separate out CONFIG_NO_BOOTMEM code into nobootmem.c
x86-64, NUMA: Separate out numa_alloc_distance() from numa_set_distance()
x86-64, NUMA: Add proper function comments to global functions
x86-64, NUMA: Move NUMA emulation into numa_emulation.c
x86-64, NUMA: Prepare numa_emulation() for moving NUMA emulation into a separate file
x86-64, NUMA: Do not scan two times for setup_node_bootmem()
...
Fix up conflicts in arch/x86/kernel/smpboot.c
Diffstat (limited to 'arch/x86/mm')
-rw-r--r-- | arch/x86/mm/Makefile | 1 | ||||
-rw-r--r-- | arch/x86/mm/amdtopology_64.c | 142 | ||||
-rw-r--r-- | arch/x86/mm/init.c | 56 | ||||
-rw-r--r-- | arch/x86/mm/init_32.c | 11 | ||||
-rw-r--r-- | arch/x86/mm/init_64.c | 72 | ||||
-rw-r--r-- | arch/x86/mm/numa.c | 212 | ||||
-rw-r--r-- | arch/x86/mm/numa_32.c | 10 | ||||
-rw-r--r-- | arch/x86/mm/numa_64.c | 984 | ||||
-rw-r--r-- | arch/x86/mm/numa_emulation.c | 494 | ||||
-rw-r--r-- | arch/x86/mm/numa_internal.h | 31 | ||||
-rw-r--r-- | arch/x86/mm/srat_32.c | 6 | ||||
-rw-r--r-- | arch/x86/mm/srat_64.c | 367 | ||||
-rw-r--r-- | arch/x86/mm/tlb.c | 14 |
13 files changed, 1207 insertions, 1193 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 09df2f9a3d69..3e608edf9958 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile | |||
@@ -25,6 +25,7 @@ obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o | |||
25 | obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o | 25 | obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o |
26 | obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o | 26 | obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o |
27 | obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o | 27 | obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o |
28 | obj-$(CONFIG_NUMA_EMU) += numa_emulation.o | ||
28 | 29 | ||
29 | obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o | 30 | obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o |
30 | 31 | ||
diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c index f21962c435ed..0919c26820d4 100644 --- a/arch/x86/mm/amdtopology_64.c +++ b/arch/x86/mm/amdtopology_64.c | |||
@@ -26,9 +26,7 @@ | |||
26 | #include <asm/apic.h> | 26 | #include <asm/apic.h> |
27 | #include <asm/amd_nb.h> | 27 | #include <asm/amd_nb.h> |
28 | 28 | ||
29 | static struct bootnode __initdata nodes[8]; | ||
30 | static unsigned char __initdata nodeids[8]; | 29 | static unsigned char __initdata nodeids[8]; |
31 | static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE; | ||
32 | 30 | ||
33 | static __init int find_northbridge(void) | 31 | static __init int find_northbridge(void) |
34 | { | 32 | { |
@@ -51,7 +49,7 @@ static __init int find_northbridge(void) | |||
51 | return num; | 49 | return num; |
52 | } | 50 | } |
53 | 51 | ||
54 | return -1; | 52 | return -ENOENT; |
55 | } | 53 | } |
56 | 54 | ||
57 | static __init void early_get_boot_cpu_id(void) | 55 | static __init void early_get_boot_cpu_id(void) |
@@ -69,17 +67,18 @@ static __init void early_get_boot_cpu_id(void) | |||
69 | #endif | 67 | #endif |
70 | } | 68 | } |
71 | 69 | ||
72 | int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) | 70 | int __init amd_numa_init(void) |
73 | { | 71 | { |
74 | unsigned long start = PFN_PHYS(start_pfn); | 72 | unsigned long start = PFN_PHYS(0); |
75 | unsigned long end = PFN_PHYS(end_pfn); | 73 | unsigned long end = PFN_PHYS(max_pfn); |
76 | unsigned numnodes; | 74 | unsigned numnodes; |
77 | unsigned long prevbase; | 75 | unsigned long prevbase; |
78 | int i, nb, found = 0; | 76 | int i, j, nb; |
79 | u32 nodeid, reg; | 77 | u32 nodeid, reg; |
78 | unsigned int bits, cores, apicid_base; | ||
80 | 79 | ||
81 | if (!early_pci_allowed()) | 80 | if (!early_pci_allowed()) |
82 | return -1; | 81 | return -EINVAL; |
83 | 82 | ||
84 | nb = find_northbridge(); | 83 | nb = find_northbridge(); |
85 | if (nb < 0) | 84 | if (nb < 0) |
@@ -90,7 +89,7 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) | |||
90 | reg = read_pci_config(0, nb, 0, 0x60); | 89 | reg = read_pci_config(0, nb, 0, 0x60); |
91 | numnodes = ((reg >> 4) & 0xF) + 1; | 90 | numnodes = ((reg >> 4) & 0xF) + 1; |
92 | if (numnodes <= 1) | 91 | if (numnodes <= 1) |
93 | return -1; | 92 | return -ENOENT; |
94 | 93 | ||
95 | pr_info("Number of physical nodes %d\n", numnodes); | 94 | pr_info("Number of physical nodes %d\n", numnodes); |
96 | 95 | ||
@@ -121,9 +120,9 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) | |||
121 | if ((base >> 8) & 3 || (limit >> 8) & 3) { | 120 | if ((base >> 8) & 3 || (limit >> 8) & 3) { |
122 | pr_err("Node %d using interleaving mode %lx/%lx\n", | 121 | pr_err("Node %d using interleaving mode %lx/%lx\n", |
123 | nodeid, (base >> 8) & 3, (limit >> 8) & 3); | 122 | nodeid, (base >> 8) & 3, (limit >> 8) & 3); |
124 | return -1; | 123 | return -EINVAL; |
125 | } | 124 | } |
126 | if (node_isset(nodeid, nodes_parsed)) { | 125 | if (node_isset(nodeid, numa_nodes_parsed)) { |
127 | pr_info("Node %d already present, skipping\n", | 126 | pr_info("Node %d already present, skipping\n", |
128 | nodeid); | 127 | nodeid); |
129 | continue; | 128 | continue; |
@@ -160,117 +159,28 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) | |||
160 | if (prevbase > base) { | 159 | if (prevbase > base) { |
161 | pr_err("Node map not sorted %lx,%lx\n", | 160 | pr_err("Node map not sorted %lx,%lx\n", |
162 | prevbase, base); | 161 | prevbase, base); |
163 | return -1; | 162 | return -EINVAL; |
164 | } | 163 | } |
165 | 164 | ||
166 | pr_info("Node %d MemBase %016lx Limit %016lx\n", | 165 | pr_info("Node %d MemBase %016lx Limit %016lx\n", |
167 | nodeid, base, limit); | 166 | nodeid, base, limit); |
168 | 167 | ||
169 | found++; | ||
170 | |||
171 | nodes[nodeid].start = base; | ||
172 | nodes[nodeid].end = limit; | ||
173 | |||
174 | prevbase = base; | 168 | prevbase = base; |
175 | 169 | numa_add_memblk(nodeid, base, limit); | |
176 | node_set(nodeid, nodes_parsed); | 170 | node_set(nodeid, numa_nodes_parsed); |
177 | } | ||
178 | |||
179 | if (!found) | ||
180 | return -1; | ||
181 | return 0; | ||
182 | } | ||
183 | |||
184 | #ifdef CONFIG_NUMA_EMU | ||
185 | static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = { | ||
186 | [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE | ||
187 | }; | ||
188 | |||
189 | void __init amd_get_nodes(struct bootnode *physnodes) | ||
190 | { | ||
191 | int i; | ||
192 | |||
193 | for_each_node_mask(i, nodes_parsed) { | ||
194 | physnodes[i].start = nodes[i].start; | ||
195 | physnodes[i].end = nodes[i].end; | ||
196 | } | 171 | } |
197 | } | ||
198 | |||
199 | static int __init find_node_by_addr(unsigned long addr) | ||
200 | { | ||
201 | int ret = NUMA_NO_NODE; | ||
202 | int i; | ||
203 | |||
204 | for (i = 0; i < 8; i++) | ||
205 | if (addr >= nodes[i].start && addr < nodes[i].end) { | ||
206 | ret = i; | ||
207 | break; | ||
208 | } | ||
209 | return ret; | ||
210 | } | ||
211 | 172 | ||
212 | /* | 173 | if (!nodes_weight(numa_nodes_parsed)) |
213 | * For NUMA emulation, fake proximity domain (_PXM) to node id mappings must be | 174 | return -ENOENT; |
214 | * setup to represent the physical topology but reflect the emulated | ||
215 | * environment. For each emulated node, the real node which it appears on is | ||
216 | * found and a fake pxm to nid mapping is created which mirrors the actual | ||
217 | * locality. node_distance() then represents the correct distances between | ||
218 | * emulated nodes by using the fake acpi mappings to pxms. | ||
219 | */ | ||
220 | void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes) | ||
221 | { | ||
222 | unsigned int bits; | ||
223 | unsigned int cores; | ||
224 | unsigned int apicid_base = 0; | ||
225 | int i; | ||
226 | 175 | ||
176 | /* | ||
177 | * We seem to have valid NUMA configuration. Map apicids to nodes | ||
178 | * using the coreid bits from early_identify_cpu. | ||
179 | */ | ||
227 | bits = boot_cpu_data.x86_coreid_bits; | 180 | bits = boot_cpu_data.x86_coreid_bits; |
228 | cores = 1 << bits; | 181 | cores = 1 << bits; |
229 | early_get_boot_cpu_id(); | ||
230 | if (boot_cpu_physical_apicid > 0) | ||
231 | apicid_base = boot_cpu_physical_apicid; | ||
232 | |||
233 | for (i = 0; i < nr_nodes; i++) { | ||
234 | int index; | ||
235 | int nid; | ||
236 | int j; | ||
237 | |||
238 | nid = find_node_by_addr(nodes[i].start); | ||
239 | if (nid == NUMA_NO_NODE) | ||
240 | continue; | ||
241 | |||
242 | index = nodeids[nid] << bits; | ||
243 | if (fake_apicid_to_node[index + apicid_base] == NUMA_NO_NODE) | ||
244 | for (j = apicid_base; j < cores + apicid_base; j++) | ||
245 | fake_apicid_to_node[index + j] = i; | ||
246 | #ifdef CONFIG_ACPI_NUMA | ||
247 | __acpi_map_pxm_to_node(nid, i); | ||
248 | #endif | ||
249 | } | ||
250 | memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node)); | ||
251 | } | ||
252 | #endif /* CONFIG_NUMA_EMU */ | ||
253 | |||
254 | int __init amd_scan_nodes(void) | ||
255 | { | ||
256 | unsigned int bits; | ||
257 | unsigned int cores; | ||
258 | unsigned int apicid_base; | ||
259 | int i; | ||
260 | |||
261 | BUG_ON(nodes_empty(nodes_parsed)); | ||
262 | node_possible_map = nodes_parsed; | ||
263 | memnode_shift = compute_hash_shift(nodes, 8, NULL); | ||
264 | if (memnode_shift < 0) { | ||
265 | pr_err("No NUMA node hash function found. Contact maintainer\n"); | ||
266 | return -1; | ||
267 | } | ||
268 | pr_info("Using node hash shift of %d\n", memnode_shift); | ||
269 | |||
270 | /* use the coreid bits from early_identify_cpu */ | ||
271 | bits = boot_cpu_data.x86_coreid_bits; | ||
272 | cores = (1<<bits); | ||
273 | apicid_base = 0; | 182 | apicid_base = 0; |
183 | |||
274 | /* get the APIC ID of the BSP early for systems with apicid lifting */ | 184 | /* get the APIC ID of the BSP early for systems with apicid lifting */ |
275 | early_get_boot_cpu_id(); | 185 | early_get_boot_cpu_id(); |
276 | if (boot_cpu_physical_apicid > 0) { | 186 | if (boot_cpu_physical_apicid > 0) { |
@@ -278,17 +188,9 @@ int __init amd_scan_nodes(void) | |||
278 | apicid_base = boot_cpu_physical_apicid; | 188 | apicid_base = boot_cpu_physical_apicid; |
279 | } | 189 | } |
280 | 190 | ||
281 | for_each_node_mask(i, node_possible_map) { | 191 | for_each_node_mask(i, numa_nodes_parsed) |
282 | int j; | ||
283 | |||
284 | memblock_x86_register_active_regions(i, | ||
285 | nodes[i].start >> PAGE_SHIFT, | ||
286 | nodes[i].end >> PAGE_SHIFT); | ||
287 | for (j = apicid_base; j < cores + apicid_base; j++) | 192 | for (j = apicid_base; j < cores + apicid_base; j++) |
288 | apicid_to_node[(i << bits) + j] = i; | 193 | set_apicid_to_node((i << bits) + j, i); |
289 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); | ||
290 | } | ||
291 | 194 | ||
292 | numa_init_array(); | ||
293 | return 0; | 195 | return 0; |
294 | } | 196 | } |
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 947f42abe820..286d289b039b 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c | |||
@@ -18,9 +18,9 @@ | |||
18 | 18 | ||
19 | DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); | 19 | DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); |
20 | 20 | ||
21 | unsigned long __initdata e820_table_start; | 21 | unsigned long __initdata pgt_buf_start; |
22 | unsigned long __meminitdata e820_table_end; | 22 | unsigned long __meminitdata pgt_buf_end; |
23 | unsigned long __meminitdata e820_table_top; | 23 | unsigned long __meminitdata pgt_buf_top; |
24 | 24 | ||
25 | int after_bootmem; | 25 | int after_bootmem; |
26 | 26 | ||
@@ -33,7 +33,7 @@ int direct_gbpages | |||
33 | static void __init find_early_table_space(unsigned long end, int use_pse, | 33 | static void __init find_early_table_space(unsigned long end, int use_pse, |
34 | int use_gbpages) | 34 | int use_gbpages) |
35 | { | 35 | { |
36 | unsigned long puds, pmds, ptes, tables, start; | 36 | unsigned long puds, pmds, ptes, tables, start = 0, good_end = end; |
37 | phys_addr_t base; | 37 | phys_addr_t base; |
38 | 38 | ||
39 | puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; | 39 | puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; |
@@ -65,29 +65,20 @@ static void __init find_early_table_space(unsigned long end, int use_pse, | |||
65 | #ifdef CONFIG_X86_32 | 65 | #ifdef CONFIG_X86_32 |
66 | /* for fixmap */ | 66 | /* for fixmap */ |
67 | tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); | 67 | tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); |
68 | #endif | ||
69 | 68 | ||
70 | /* | 69 | good_end = max_pfn_mapped << PAGE_SHIFT; |
71 | * RED-PEN putting page tables only on node 0 could | ||
72 | * cause a hotspot and fill up ZONE_DMA. The page tables | ||
73 | * need roughly 0.5KB per GB. | ||
74 | */ | ||
75 | #ifdef CONFIG_X86_32 | ||
76 | start = 0x7000; | ||
77 | #else | ||
78 | start = 0x8000; | ||
79 | #endif | 70 | #endif |
80 | base = memblock_find_in_range(start, max_pfn_mapped<<PAGE_SHIFT, | 71 | |
81 | tables, PAGE_SIZE); | 72 | base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); |
82 | if (base == MEMBLOCK_ERROR) | 73 | if (base == MEMBLOCK_ERROR) |
83 | panic("Cannot find space for the kernel page tables"); | 74 | panic("Cannot find space for the kernel page tables"); |
84 | 75 | ||
85 | e820_table_start = base >> PAGE_SHIFT; | 76 | pgt_buf_start = base >> PAGE_SHIFT; |
86 | e820_table_end = e820_table_start; | 77 | pgt_buf_end = pgt_buf_start; |
87 | e820_table_top = e820_table_start + (tables >> PAGE_SHIFT); | 78 | pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); |
88 | 79 | ||
89 | printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", | 80 | printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", |
90 | end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT); | 81 | end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT); |
91 | } | 82 | } |
92 | 83 | ||
93 | struct map_range { | 84 | struct map_range { |
@@ -279,30 +270,11 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, | |||
279 | load_cr3(swapper_pg_dir); | 270 | load_cr3(swapper_pg_dir); |
280 | #endif | 271 | #endif |
281 | 272 | ||
282 | #ifdef CONFIG_X86_64 | ||
283 | if (!after_bootmem && !start) { | ||
284 | pud_t *pud; | ||
285 | pmd_t *pmd; | ||
286 | |||
287 | mmu_cr4_features = read_cr4(); | ||
288 | |||
289 | /* | ||
290 | * _brk_end cannot change anymore, but it and _end may be | ||
291 | * located on different 2M pages. cleanup_highmap(), however, | ||
292 | * can only consider _end when it runs, so destroy any | ||
293 | * mappings beyond _brk_end here. | ||
294 | */ | ||
295 | pud = pud_offset(pgd_offset_k(_brk_end), _brk_end); | ||
296 | pmd = pmd_offset(pud, _brk_end - 1); | ||
297 | while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1)) | ||
298 | pmd_clear(pmd); | ||
299 | } | ||
300 | #endif | ||
301 | __flush_tlb_all(); | 273 | __flush_tlb_all(); |
302 | 274 | ||
303 | if (!after_bootmem && e820_table_end > e820_table_start) | 275 | if (!after_bootmem && pgt_buf_end > pgt_buf_start) |
304 | memblock_x86_reserve_range(e820_table_start << PAGE_SHIFT, | 276 | memblock_x86_reserve_range(pgt_buf_start << PAGE_SHIFT, |
305 | e820_table_end << PAGE_SHIFT, "PGTABLE"); | 277 | pgt_buf_end << PAGE_SHIFT, "PGTABLE"); |
306 | 278 | ||
307 | if (!after_bootmem) | 279 | if (!after_bootmem) |
308 | early_memtest(start, end); | 280 | early_memtest(start, end); |
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index c821074b7f0b..73ad7ebd6e9c 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c | |||
@@ -62,10 +62,10 @@ bool __read_mostly __vmalloc_start_set = false; | |||
62 | 62 | ||
63 | static __init void *alloc_low_page(void) | 63 | static __init void *alloc_low_page(void) |
64 | { | 64 | { |
65 | unsigned long pfn = e820_table_end++; | 65 | unsigned long pfn = pgt_buf_end++; |
66 | void *adr; | 66 | void *adr; |
67 | 67 | ||
68 | if (pfn >= e820_table_top) | 68 | if (pfn >= pgt_buf_top) |
69 | panic("alloc_low_page: ran out of memory"); | 69 | panic("alloc_low_page: ran out of memory"); |
70 | 70 | ||
71 | adr = __va(pfn * PAGE_SIZE); | 71 | adr = __va(pfn * PAGE_SIZE); |
@@ -163,8 +163,8 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, | |||
163 | if (pmd_idx_kmap_begin != pmd_idx_kmap_end | 163 | if (pmd_idx_kmap_begin != pmd_idx_kmap_end |
164 | && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin | 164 | && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin |
165 | && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end | 165 | && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end |
166 | && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start | 166 | && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start |
167 | || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) { | 167 | || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) { |
168 | pte_t *newpte; | 168 | pte_t *newpte; |
169 | int i; | 169 | int i; |
170 | 170 | ||
@@ -644,8 +644,7 @@ void __init find_low_pfn_range(void) | |||
644 | } | 644 | } |
645 | 645 | ||
646 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 646 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
647 | void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, | 647 | void __init initmem_init(void) |
648 | int acpi, int k8) | ||
649 | { | 648 | { |
650 | #ifdef CONFIG_HIGHMEM | 649 | #ifdef CONFIG_HIGHMEM |
651 | highstart_pfn = highend_pfn = max_pfn; | 650 | highstart_pfn = highend_pfn = max_pfn; |
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index c14a5422e152..a08a62cb136e 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c | |||
@@ -314,7 +314,7 @@ void __init cleanup_highmap(void) | |||
314 | 314 | ||
315 | static __ref void *alloc_low_page(unsigned long *phys) | 315 | static __ref void *alloc_low_page(unsigned long *phys) |
316 | { | 316 | { |
317 | unsigned long pfn = e820_table_end++; | 317 | unsigned long pfn = pgt_buf_end++; |
318 | void *adr; | 318 | void *adr; |
319 | 319 | ||
320 | if (after_bootmem) { | 320 | if (after_bootmem) { |
@@ -324,7 +324,7 @@ static __ref void *alloc_low_page(unsigned long *phys) | |||
324 | return adr; | 324 | return adr; |
325 | } | 325 | } |
326 | 326 | ||
327 | if (pfn >= e820_table_top) | 327 | if (pfn >= pgt_buf_top) |
328 | panic("alloc_low_page: ran out of memory"); | 328 | panic("alloc_low_page: ran out of memory"); |
329 | 329 | ||
330 | adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); | 330 | adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); |
@@ -333,12 +333,28 @@ static __ref void *alloc_low_page(unsigned long *phys) | |||
333 | return adr; | 333 | return adr; |
334 | } | 334 | } |
335 | 335 | ||
336 | static __ref void *map_low_page(void *virt) | ||
337 | { | ||
338 | void *adr; | ||
339 | unsigned long phys, left; | ||
340 | |||
341 | if (after_bootmem) | ||
342 | return virt; | ||
343 | |||
344 | phys = __pa(virt); | ||
345 | left = phys & (PAGE_SIZE - 1); | ||
346 | adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE); | ||
347 | adr = (void *)(((unsigned long)adr) | left); | ||
348 | |||
349 | return adr; | ||
350 | } | ||
351 | |||
336 | static __ref void unmap_low_page(void *adr) | 352 | static __ref void unmap_low_page(void *adr) |
337 | { | 353 | { |
338 | if (after_bootmem) | 354 | if (after_bootmem) |
339 | return; | 355 | return; |
340 | 356 | ||
341 | early_iounmap(adr, PAGE_SIZE); | 357 | early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE); |
342 | } | 358 | } |
343 | 359 | ||
344 | static unsigned long __meminit | 360 | static unsigned long __meminit |
@@ -386,15 +402,6 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, | |||
386 | } | 402 | } |
387 | 403 | ||
388 | static unsigned long __meminit | 404 | static unsigned long __meminit |
389 | phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end, | ||
390 | pgprot_t prot) | ||
391 | { | ||
392 | pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd); | ||
393 | |||
394 | return phys_pte_init(pte, address, end, prot); | ||
395 | } | ||
396 | |||
397 | static unsigned long __meminit | ||
398 | phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, | 405 | phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, |
399 | unsigned long page_size_mask, pgprot_t prot) | 406 | unsigned long page_size_mask, pgprot_t prot) |
400 | { | 407 | { |
@@ -420,8 +427,10 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, | |||
420 | if (pmd_val(*pmd)) { | 427 | if (pmd_val(*pmd)) { |
421 | if (!pmd_large(*pmd)) { | 428 | if (!pmd_large(*pmd)) { |
422 | spin_lock(&init_mm.page_table_lock); | 429 | spin_lock(&init_mm.page_table_lock); |
423 | last_map_addr = phys_pte_update(pmd, address, | 430 | pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd)); |
431 | last_map_addr = phys_pte_init(pte, address, | ||
424 | end, prot); | 432 | end, prot); |
433 | unmap_low_page(pte); | ||
425 | spin_unlock(&init_mm.page_table_lock); | 434 | spin_unlock(&init_mm.page_table_lock); |
426 | continue; | 435 | continue; |
427 | } | 436 | } |
@@ -468,18 +477,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, | |||
468 | } | 477 | } |
469 | 478 | ||
470 | static unsigned long __meminit | 479 | static unsigned long __meminit |
471 | phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end, | ||
472 | unsigned long page_size_mask, pgprot_t prot) | ||
473 | { | ||
474 | pmd_t *pmd = pmd_offset(pud, 0); | ||
475 | unsigned long last_map_addr; | ||
476 | |||
477 | last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot); | ||
478 | __flush_tlb_all(); | ||
479 | return last_map_addr; | ||
480 | } | ||
481 | |||
482 | static unsigned long __meminit | ||
483 | phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, | 480 | phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, |
484 | unsigned long page_size_mask) | 481 | unsigned long page_size_mask) |
485 | { | 482 | { |
@@ -504,8 +501,11 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, | |||
504 | 501 | ||
505 | if (pud_val(*pud)) { | 502 | if (pud_val(*pud)) { |
506 | if (!pud_large(*pud)) { | 503 | if (!pud_large(*pud)) { |
507 | last_map_addr = phys_pmd_update(pud, addr, end, | 504 | pmd = map_low_page(pmd_offset(pud, 0)); |
505 | last_map_addr = phys_pmd_init(pmd, addr, end, | ||
508 | page_size_mask, prot); | 506 | page_size_mask, prot); |
507 | unmap_low_page(pmd); | ||
508 | __flush_tlb_all(); | ||
509 | continue; | 509 | continue; |
510 | } | 510 | } |
511 | /* | 511 | /* |
@@ -553,17 +553,6 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, | |||
553 | return last_map_addr; | 553 | return last_map_addr; |
554 | } | 554 | } |
555 | 555 | ||
556 | static unsigned long __meminit | ||
557 | phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end, | ||
558 | unsigned long page_size_mask) | ||
559 | { | ||
560 | pud_t *pud; | ||
561 | |||
562 | pud = (pud_t *)pgd_page_vaddr(*pgd); | ||
563 | |||
564 | return phys_pud_init(pud, addr, end, page_size_mask); | ||
565 | } | ||
566 | |||
567 | unsigned long __meminit | 556 | unsigned long __meminit |
568 | kernel_physical_mapping_init(unsigned long start, | 557 | kernel_physical_mapping_init(unsigned long start, |
569 | unsigned long end, | 558 | unsigned long end, |
@@ -587,8 +576,10 @@ kernel_physical_mapping_init(unsigned long start, | |||
587 | next = end; | 576 | next = end; |
588 | 577 | ||
589 | if (pgd_val(*pgd)) { | 578 | if (pgd_val(*pgd)) { |
590 | last_map_addr = phys_pud_update(pgd, __pa(start), | 579 | pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd)); |
580 | last_map_addr = phys_pud_init(pud, __pa(start), | ||
591 | __pa(end), page_size_mask); | 581 | __pa(end), page_size_mask); |
582 | unmap_low_page(pud); | ||
592 | continue; | 583 | continue; |
593 | } | 584 | } |
594 | 585 | ||
@@ -612,10 +603,9 @@ kernel_physical_mapping_init(unsigned long start, | |||
612 | } | 603 | } |
613 | 604 | ||
614 | #ifndef CONFIG_NUMA | 605 | #ifndef CONFIG_NUMA |
615 | void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, | 606 | void __init initmem_init(void) |
616 | int acpi, int k8) | ||
617 | { | 607 | { |
618 | memblock_x86_register_active_regions(0, start_pfn, end_pfn); | 608 | memblock_x86_register_active_regions(0, 0, max_pfn); |
619 | } | 609 | } |
620 | #endif | 610 | #endif |
621 | 611 | ||
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index ebf6d7887a38..9559d360fde7 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c | |||
@@ -26,12 +26,50 @@ static __init int numa_setup(char *opt) | |||
26 | early_param("numa", numa_setup); | 26 | early_param("numa", numa_setup); |
27 | 27 | ||
28 | /* | 28 | /* |
29 | * Which logical CPUs are on which nodes | 29 | * apicid, cpu, node mappings |
30 | */ | 30 | */ |
31 | s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { | ||
32 | [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE | ||
33 | }; | ||
34 | |||
31 | cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; | 35 | cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; |
32 | EXPORT_SYMBOL(node_to_cpumask_map); | 36 | EXPORT_SYMBOL(node_to_cpumask_map); |
33 | 37 | ||
34 | /* | 38 | /* |
39 | * Map cpu index to node index | ||
40 | */ | ||
41 | DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); | ||
42 | EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); | ||
43 | |||
44 | void __cpuinit numa_set_node(int cpu, int node) | ||
45 | { | ||
46 | int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); | ||
47 | |||
48 | /* early setting, no percpu area yet */ | ||
49 | if (cpu_to_node_map) { | ||
50 | cpu_to_node_map[cpu] = node; | ||
51 | return; | ||
52 | } | ||
53 | |||
54 | #ifdef CONFIG_DEBUG_PER_CPU_MAPS | ||
55 | if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) { | ||
56 | printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); | ||
57 | dump_stack(); | ||
58 | return; | ||
59 | } | ||
60 | #endif | ||
61 | per_cpu(x86_cpu_to_node_map, cpu) = node; | ||
62 | |||
63 | if (node != NUMA_NO_NODE) | ||
64 | set_cpu_numa_node(cpu, node); | ||
65 | } | ||
66 | |||
67 | void __cpuinit numa_clear_node(int cpu) | ||
68 | { | ||
69 | numa_set_node(cpu, NUMA_NO_NODE); | ||
70 | } | ||
71 | |||
72 | /* | ||
35 | * Allocate node_to_cpumask_map based on number of available nodes | 73 | * Allocate node_to_cpumask_map based on number of available nodes |
36 | * Requires node_possible_map to be valid. | 74 | * Requires node_possible_map to be valid. |
37 | * | 75 | * |
@@ -57,7 +95,174 @@ void __init setup_node_to_cpumask_map(void) | |||
57 | pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); | 95 | pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); |
58 | } | 96 | } |
59 | 97 | ||
60 | #ifdef CONFIG_DEBUG_PER_CPU_MAPS | 98 | /* |
99 | * There are unfortunately some poorly designed mainboards around that | ||
100 | * only connect memory to a single CPU. This breaks the 1:1 cpu->node | ||
101 | * mapping. To avoid this fill in the mapping for all possible CPUs, | ||
102 | * as the number of CPUs is not known yet. We round robin the existing | ||
103 | * nodes. | ||
104 | */ | ||
105 | void __init numa_init_array(void) | ||
106 | { | ||
107 | int rr, i; | ||
108 | |||
109 | rr = first_node(node_online_map); | ||
110 | for (i = 0; i < nr_cpu_ids; i++) { | ||
111 | if (early_cpu_to_node(i) != NUMA_NO_NODE) | ||
112 | continue; | ||
113 | numa_set_node(i, rr); | ||
114 | rr = next_node(rr, node_online_map); | ||
115 | if (rr == MAX_NUMNODES) | ||
116 | rr = first_node(node_online_map); | ||
117 | } | ||
118 | } | ||
119 | |||
120 | static __init int find_near_online_node(int node) | ||
121 | { | ||
122 | int n, val; | ||
123 | int min_val = INT_MAX; | ||
124 | int best_node = -1; | ||
125 | |||
126 | for_each_online_node(n) { | ||
127 | val = node_distance(node, n); | ||
128 | |||
129 | if (val < min_val) { | ||
130 | min_val = val; | ||
131 | best_node = n; | ||
132 | } | ||
133 | } | ||
134 | |||
135 | return best_node; | ||
136 | } | ||
137 | |||
138 | /* | ||
139 | * Setup early cpu_to_node. | ||
140 | * | ||
141 | * Populate cpu_to_node[] only if x86_cpu_to_apicid[], | ||
142 | * and apicid_to_node[] tables have valid entries for a CPU. | ||
143 | * This means we skip cpu_to_node[] initialisation for NUMA | ||
144 | * emulation and faking node case (when running a kernel compiled | ||
145 | * for NUMA on a non NUMA box), which is OK as cpu_to_node[] | ||
146 | * is already initialized in a round robin manner at numa_init_array, | ||
147 | * prior to this call, and this initialization is good enough | ||
148 | * for the fake NUMA cases. | ||
149 | * | ||
150 | * Called before the per_cpu areas are setup. | ||
151 | */ | ||
152 | void __init init_cpu_to_node(void) | ||
153 | { | ||
154 | int cpu; | ||
155 | u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); | ||
156 | |||
157 | BUG_ON(cpu_to_apicid == NULL); | ||
158 | |||
159 | for_each_possible_cpu(cpu) { | ||
160 | int node = numa_cpu_node(cpu); | ||
161 | |||
162 | if (node == NUMA_NO_NODE) | ||
163 | continue; | ||
164 | if (!node_online(node)) | ||
165 | node = find_near_online_node(node); | ||
166 | numa_set_node(cpu, node); | ||
167 | } | ||
168 | } | ||
169 | |||
170 | #ifndef CONFIG_DEBUG_PER_CPU_MAPS | ||
171 | |||
172 | # ifndef CONFIG_NUMA_EMU | ||
173 | void __cpuinit numa_add_cpu(int cpu) | ||
174 | { | ||
175 | cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); | ||
176 | } | ||
177 | |||
178 | void __cpuinit numa_remove_cpu(int cpu) | ||
179 | { | ||
180 | cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); | ||
181 | } | ||
182 | # endif /* !CONFIG_NUMA_EMU */ | ||
183 | |||
184 | #else /* !CONFIG_DEBUG_PER_CPU_MAPS */ | ||
185 | |||
186 | int __cpu_to_node(int cpu) | ||
187 | { | ||
188 | if (early_per_cpu_ptr(x86_cpu_to_node_map)) { | ||
189 | printk(KERN_WARNING | ||
190 | "cpu_to_node(%d): usage too early!\n", cpu); | ||
191 | dump_stack(); | ||
192 | return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; | ||
193 | } | ||
194 | return per_cpu(x86_cpu_to_node_map, cpu); | ||
195 | } | ||
196 | EXPORT_SYMBOL(__cpu_to_node); | ||
197 | |||
198 | /* | ||
199 | * Same function as cpu_to_node() but used if called before the | ||
200 | * per_cpu areas are setup. | ||
201 | */ | ||
202 | int early_cpu_to_node(int cpu) | ||
203 | { | ||
204 | if (early_per_cpu_ptr(x86_cpu_to_node_map)) | ||
205 | return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; | ||
206 | |||
207 | if (!cpu_possible(cpu)) { | ||
208 | printk(KERN_WARNING | ||
209 | "early_cpu_to_node(%d): no per_cpu area!\n", cpu); | ||
210 | dump_stack(); | ||
211 | return NUMA_NO_NODE; | ||
212 | } | ||
213 | return per_cpu(x86_cpu_to_node_map, cpu); | ||
214 | } | ||
215 | |||
216 | struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable) | ||
217 | { | ||
218 | int node = early_cpu_to_node(cpu); | ||
219 | struct cpumask *mask; | ||
220 | char buf[64]; | ||
221 | |||
222 | if (node == NUMA_NO_NODE) { | ||
223 | /* early_cpu_to_node() already emits a warning and trace */ | ||
224 | return NULL; | ||
225 | } | ||
226 | mask = node_to_cpumask_map[node]; | ||
227 | if (!mask) { | ||
228 | pr_err("node_to_cpumask_map[%i] NULL\n", node); | ||
229 | dump_stack(); | ||
230 | return NULL; | ||
231 | } | ||
232 | |||
233 | cpulist_scnprintf(buf, sizeof(buf), mask); | ||
234 | printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", | ||
235 | enable ? "numa_add_cpu" : "numa_remove_cpu", | ||
236 | cpu, node, buf); | ||
237 | return mask; | ||
238 | } | ||
239 | |||
240 | # ifndef CONFIG_NUMA_EMU | ||
241 | static void __cpuinit numa_set_cpumask(int cpu, int enable) | ||
242 | { | ||
243 | struct cpumask *mask; | ||
244 | |||
245 | mask = debug_cpumask_set_cpu(cpu, enable); | ||
246 | if (!mask) | ||
247 | return; | ||
248 | |||
249 | if (enable) | ||
250 | cpumask_set_cpu(cpu, mask); | ||
251 | else | ||
252 | cpumask_clear_cpu(cpu, mask); | ||
253 | } | ||
254 | |||
255 | void __cpuinit numa_add_cpu(int cpu) | ||
256 | { | ||
257 | numa_set_cpumask(cpu, 1); | ||
258 | } | ||
259 | |||
260 | void __cpuinit numa_remove_cpu(int cpu) | ||
261 | { | ||
262 | numa_set_cpumask(cpu, 0); | ||
263 | } | ||
264 | # endif /* !CONFIG_NUMA_EMU */ | ||
265 | |||
61 | /* | 266 | /* |
62 | * Returns a pointer to the bitmask of CPUs on Node 'node'. | 267 | * Returns a pointer to the bitmask of CPUs on Node 'node'. |
63 | */ | 268 | */ |
@@ -80,4 +285,5 @@ const struct cpumask *cpumask_of_node(int node) | |||
80 | return node_to_cpumask_map[node]; | 285 | return node_to_cpumask_map[node]; |
81 | } | 286 | } |
82 | EXPORT_SYMBOL(cpumask_of_node); | 287 | EXPORT_SYMBOL(cpumask_of_node); |
83 | #endif | 288 | |
289 | #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ | ||
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 84a3e4c9f277..bde3906420df 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c | |||
@@ -110,6 +110,12 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); | |||
110 | 110 | ||
111 | static unsigned long kva_start_pfn; | 111 | static unsigned long kva_start_pfn; |
112 | static unsigned long kva_pages; | 112 | static unsigned long kva_pages; |
113 | |||
114 | int __cpuinit numa_cpu_node(int cpu) | ||
115 | { | ||
116 | return apic->x86_32_numa_cpu_node(cpu); | ||
117 | } | ||
118 | |||
113 | /* | 119 | /* |
114 | * FLAT - support for basic PC memory model with discontig enabled, essentially | 120 | * FLAT - support for basic PC memory model with discontig enabled, essentially |
115 | * a single node with all available processors in it with a flat | 121 | * a single node with all available processors in it with a flat |
@@ -346,8 +352,7 @@ static void init_remap_allocator(int nid) | |||
346 | (ulong) node_remap_end_vaddr[nid]); | 352 | (ulong) node_remap_end_vaddr[nid]); |
347 | } | 353 | } |
348 | 354 | ||
349 | void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, | 355 | void __init initmem_init(void) |
350 | int acpi, int k8) | ||
351 | { | 356 | { |
352 | int nid; | 357 | int nid; |
353 | long kva_target_pfn; | 358 | long kva_target_pfn; |
@@ -361,6 +366,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, | |||
361 | */ | 366 | */ |
362 | 367 | ||
363 | get_memcfg_numa(); | 368 | get_memcfg_numa(); |
369 | numa_init_array(); | ||
364 | 370 | ||
365 | kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE); | 371 | kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE); |
366 | 372 | ||
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 1337c51b07d7..9ec0f209a6a4 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c | |||
@@ -13,31 +13,30 @@ | |||
13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | #include <linux/nodemask.h> | 14 | #include <linux/nodemask.h> |
15 | #include <linux/sched.h> | 15 | #include <linux/sched.h> |
16 | #include <linux/acpi.h> | ||
16 | 17 | ||
17 | #include <asm/e820.h> | 18 | #include <asm/e820.h> |
18 | #include <asm/proto.h> | 19 | #include <asm/proto.h> |
19 | #include <asm/dma.h> | 20 | #include <asm/dma.h> |
20 | #include <asm/numa.h> | ||
21 | #include <asm/acpi.h> | 21 | #include <asm/acpi.h> |
22 | #include <asm/amd_nb.h> | 22 | #include <asm/amd_nb.h> |
23 | 23 | ||
24 | #include "numa_internal.h" | ||
25 | |||
24 | struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; | 26 | struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; |
25 | EXPORT_SYMBOL(node_data); | 27 | EXPORT_SYMBOL(node_data); |
26 | 28 | ||
27 | struct memnode memnode; | 29 | nodemask_t numa_nodes_parsed __initdata; |
28 | 30 | ||
29 | s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { | 31 | struct memnode memnode; |
30 | [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE | ||
31 | }; | ||
32 | 32 | ||
33 | static unsigned long __initdata nodemap_addr; | 33 | static unsigned long __initdata nodemap_addr; |
34 | static unsigned long __initdata nodemap_size; | 34 | static unsigned long __initdata nodemap_size; |
35 | 35 | ||
36 | /* | 36 | static struct numa_meminfo numa_meminfo __initdata; |
37 | * Map cpu index to node index | 37 | |
38 | */ | 38 | static int numa_distance_cnt; |
39 | DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); | 39 | static u8 *numa_distance; |
40 | EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); | ||
41 | 40 | ||
42 | /* | 41 | /* |
43 | * Given a shift value, try to populate memnodemap[] | 42 | * Given a shift value, try to populate memnodemap[] |
@@ -46,16 +45,15 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); | |||
46 | * 0 if memnodemap[] too small (or shift too small) | 45 | * 0 if memnodemap[] too small (or shift too small) |
47 | * -1 if node overlap or lost ram (shift too big) | 46 | * -1 if node overlap or lost ram (shift too big) |
48 | */ | 47 | */ |
49 | static int __init populate_memnodemap(const struct bootnode *nodes, | 48 | static int __init populate_memnodemap(const struct numa_meminfo *mi, int shift) |
50 | int numnodes, int shift, int *nodeids) | ||
51 | { | 49 | { |
52 | unsigned long addr, end; | 50 | unsigned long addr, end; |
53 | int i, res = -1; | 51 | int i, res = -1; |
54 | 52 | ||
55 | memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize); | 53 | memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize); |
56 | for (i = 0; i < numnodes; i++) { | 54 | for (i = 0; i < mi->nr_blks; i++) { |
57 | addr = nodes[i].start; | 55 | addr = mi->blk[i].start; |
58 | end = nodes[i].end; | 56 | end = mi->blk[i].end; |
59 | if (addr >= end) | 57 | if (addr >= end) |
60 | continue; | 58 | continue; |
61 | if ((end >> shift) >= memnodemapsize) | 59 | if ((end >> shift) >= memnodemapsize) |
@@ -63,12 +61,7 @@ static int __init populate_memnodemap(const struct bootnode *nodes, | |||
63 | do { | 61 | do { |
64 | if (memnodemap[addr >> shift] != NUMA_NO_NODE) | 62 | if (memnodemap[addr >> shift] != NUMA_NO_NODE) |
65 | return -1; | 63 | return -1; |
66 | 64 | memnodemap[addr >> shift] = mi->blk[i].nid; | |
67 | if (!nodeids) | ||
68 | memnodemap[addr >> shift] = i; | ||
69 | else | ||
70 | memnodemap[addr >> shift] = nodeids[i]; | ||
71 | |||
72 | addr += (1UL << shift); | 65 | addr += (1UL << shift); |
73 | } while (addr < end); | 66 | } while (addr < end); |
74 | res = 1; | 67 | res = 1; |
@@ -86,7 +79,7 @@ static int __init allocate_cachealigned_memnodemap(void) | |||
86 | 79 | ||
87 | addr = 0x8000; | 80 | addr = 0x8000; |
88 | nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); | 81 | nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); |
89 | nodemap_addr = memblock_find_in_range(addr, max_pfn<<PAGE_SHIFT, | 82 | nodemap_addr = memblock_find_in_range(addr, get_max_mapped(), |
90 | nodemap_size, L1_CACHE_BYTES); | 83 | nodemap_size, L1_CACHE_BYTES); |
91 | if (nodemap_addr == MEMBLOCK_ERROR) { | 84 | if (nodemap_addr == MEMBLOCK_ERROR) { |
92 | printk(KERN_ERR | 85 | printk(KERN_ERR |
@@ -106,16 +99,15 @@ static int __init allocate_cachealigned_memnodemap(void) | |||
106 | * The LSB of all start and end addresses in the node map is the value of the | 99 | * The LSB of all start and end addresses in the node map is the value of the |
107 | * maximum possible shift. | 100 | * maximum possible shift. |
108 | */ | 101 | */ |
109 | static int __init extract_lsb_from_nodes(const struct bootnode *nodes, | 102 | static int __init extract_lsb_from_nodes(const struct numa_meminfo *mi) |
110 | int numnodes) | ||
111 | { | 103 | { |
112 | int i, nodes_used = 0; | 104 | int i, nodes_used = 0; |
113 | unsigned long start, end; | 105 | unsigned long start, end; |
114 | unsigned long bitfield = 0, memtop = 0; | 106 | unsigned long bitfield = 0, memtop = 0; |
115 | 107 | ||
116 | for (i = 0; i < numnodes; i++) { | 108 | for (i = 0; i < mi->nr_blks; i++) { |
117 | start = nodes[i].start; | 109 | start = mi->blk[i].start; |
118 | end = nodes[i].end; | 110 | end = mi->blk[i].end; |
119 | if (start >= end) | 111 | if (start >= end) |
120 | continue; | 112 | continue; |
121 | bitfield |= start; | 113 | bitfield |= start; |
@@ -131,18 +123,17 @@ static int __init extract_lsb_from_nodes(const struct bootnode *nodes, | |||
131 | return i; | 123 | return i; |
132 | } | 124 | } |
133 | 125 | ||
134 | int __init compute_hash_shift(struct bootnode *nodes, int numnodes, | 126 | static int __init compute_hash_shift(const struct numa_meminfo *mi) |
135 | int *nodeids) | ||
136 | { | 127 | { |
137 | int shift; | 128 | int shift; |
138 | 129 | ||
139 | shift = extract_lsb_from_nodes(nodes, numnodes); | 130 | shift = extract_lsb_from_nodes(mi); |
140 | if (allocate_cachealigned_memnodemap()) | 131 | if (allocate_cachealigned_memnodemap()) |
141 | return -1; | 132 | return -1; |
142 | printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", | 133 | printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", |
143 | shift); | 134 | shift); |
144 | 135 | ||
145 | if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) { | 136 | if (populate_memnodemap(mi, shift) != 1) { |
146 | printk(KERN_INFO "Your memory is not aligned you need to " | 137 | printk(KERN_INFO "Your memory is not aligned you need to " |
147 | "rebuild your kernel with a bigger NODEMAPSIZE " | 138 | "rebuild your kernel with a bigger NODEMAPSIZE " |
148 | "shift=%d\n", shift); | 139 | "shift=%d\n", shift); |
@@ -188,6 +179,63 @@ static void * __init early_node_mem(int nodeid, unsigned long start, | |||
188 | return NULL; | 179 | return NULL; |
189 | } | 180 | } |
190 | 181 | ||
182 | static int __init numa_add_memblk_to(int nid, u64 start, u64 end, | ||
183 | struct numa_meminfo *mi) | ||
184 | { | ||
185 | /* ignore zero length blks */ | ||
186 | if (start == end) | ||
187 | return 0; | ||
188 | |||
189 | /* whine about and ignore invalid blks */ | ||
190 | if (start > end || nid < 0 || nid >= MAX_NUMNODES) { | ||
191 | pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n", | ||
192 | nid, start, end); | ||
193 | return 0; | ||
194 | } | ||
195 | |||
196 | if (mi->nr_blks >= NR_NODE_MEMBLKS) { | ||
197 | pr_err("NUMA: too many memblk ranges\n"); | ||
198 | return -EINVAL; | ||
199 | } | ||
200 | |||
201 | mi->blk[mi->nr_blks].start = start; | ||
202 | mi->blk[mi->nr_blks].end = end; | ||
203 | mi->blk[mi->nr_blks].nid = nid; | ||
204 | mi->nr_blks++; | ||
205 | return 0; | ||
206 | } | ||
207 | |||
208 | /** | ||
209 | * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo | ||
210 | * @idx: Index of memblk to remove | ||
211 | * @mi: numa_meminfo to remove memblk from | ||
212 | * | ||
213 | * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and | ||
214 | * decrementing @mi->nr_blks. | ||
215 | */ | ||
216 | void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) | ||
217 | { | ||
218 | mi->nr_blks--; | ||
219 | memmove(&mi->blk[idx], &mi->blk[idx + 1], | ||
220 | (mi->nr_blks - idx) * sizeof(mi->blk[0])); | ||
221 | } | ||
222 | |||
223 | /** | ||
224 | * numa_add_memblk - Add one numa_memblk to numa_meminfo | ||
225 | * @nid: NUMA node ID of the new memblk | ||
226 | * @start: Start address of the new memblk | ||
227 | * @end: End address of the new memblk | ||
228 | * | ||
229 | * Add a new memblk to the default numa_meminfo. | ||
230 | * | ||
231 | * RETURNS: | ||
232 | * 0 on success, -errno on failure. | ||
233 | */ | ||
234 | int __init numa_add_memblk(int nid, u64 start, u64 end) | ||
235 | { | ||
236 | return numa_add_memblk_to(nid, start, end, &numa_meminfo); | ||
237 | } | ||
238 | |||
191 | /* Initialize bootmem allocator for a node */ | 239 | /* Initialize bootmem allocator for a node */ |
192 | void __init | 240 | void __init |
193 | setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) | 241 | setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) |
@@ -234,692 +282,386 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) | |||
234 | node_set_online(nodeid); | 282 | node_set_online(nodeid); |
235 | } | 283 | } |
236 | 284 | ||
237 | /* | 285 | /** |
238 | * There are unfortunately some poorly designed mainboards around that | 286 | * numa_cleanup_meminfo - Cleanup a numa_meminfo |
239 | * only connect memory to a single CPU. This breaks the 1:1 cpu->node | 287 | * @mi: numa_meminfo to clean up |
240 | * mapping. To avoid this fill in the mapping for all possible CPUs, | 288 | * |
241 | * as the number of CPUs is not known yet. We round robin the existing | 289 | * Sanitize @mi by merging and removing unnecessary memblks. Also check for |
242 | * nodes. | 290 | * conflicts and clear unused memblks. |
291 | * | ||
292 | * RETURNS: | ||
293 | * 0 on success, -errno on failure. | ||
243 | */ | 294 | */ |
244 | void __init numa_init_array(void) | 295 | int __init numa_cleanup_meminfo(struct numa_meminfo *mi) |
245 | { | 296 | { |
246 | int rr, i; | 297 | const u64 low = 0; |
298 | const u64 high = (u64)max_pfn << PAGE_SHIFT; | ||
299 | int i, j, k; | ||
247 | 300 | ||
248 | rr = first_node(node_online_map); | 301 | for (i = 0; i < mi->nr_blks; i++) { |
249 | for (i = 0; i < nr_cpu_ids; i++) { | 302 | struct numa_memblk *bi = &mi->blk[i]; |
250 | if (early_cpu_to_node(i) != NUMA_NO_NODE) | ||
251 | continue; | ||
252 | numa_set_node(i, rr); | ||
253 | rr = next_node(rr, node_online_map); | ||
254 | if (rr == MAX_NUMNODES) | ||
255 | rr = first_node(node_online_map); | ||
256 | } | ||
257 | } | ||
258 | |||
259 | #ifdef CONFIG_NUMA_EMU | ||
260 | /* Numa emulation */ | ||
261 | static struct bootnode nodes[MAX_NUMNODES] __initdata; | ||
262 | static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata; | ||
263 | static char *cmdline __initdata; | ||
264 | 303 | ||
265 | void __init numa_emu_cmdline(char *str) | 304 | /* make sure all blocks are inside the limits */ |
266 | { | 305 | bi->start = max(bi->start, low); |
267 | cmdline = str; | 306 | bi->end = min(bi->end, high); |
268 | } | ||
269 | 307 | ||
270 | static int __init setup_physnodes(unsigned long start, unsigned long end, | 308 | /* and there's no empty block */ |
271 | int acpi, int amd) | 309 | if (bi->start == bi->end) { |
272 | { | 310 | numa_remove_memblk_from(i--, mi); |
273 | int ret = 0; | ||
274 | int i; | ||
275 | |||
276 | memset(physnodes, 0, sizeof(physnodes)); | ||
277 | #ifdef CONFIG_ACPI_NUMA | ||
278 | if (acpi) | ||
279 | acpi_get_nodes(physnodes, start, end); | ||
280 | #endif | ||
281 | #ifdef CONFIG_AMD_NUMA | ||
282 | if (amd) | ||
283 | amd_get_nodes(physnodes); | ||
284 | #endif | ||
285 | /* | ||
286 | * Basic sanity checking on the physical node map: there may be errors | ||
287 | * if the SRAT or AMD code incorrectly reported the topology or the mem= | ||
288 | * kernel parameter is used. | ||
289 | */ | ||
290 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
291 | if (physnodes[i].start == physnodes[i].end) | ||
292 | continue; | ||
293 | if (physnodes[i].start > end) { | ||
294 | physnodes[i].end = physnodes[i].start; | ||
295 | continue; | ||
296 | } | ||
297 | if (physnodes[i].end < start) { | ||
298 | physnodes[i].start = physnodes[i].end; | ||
299 | continue; | 311 | continue; |
300 | } | 312 | } |
301 | if (physnodes[i].start < start) | ||
302 | physnodes[i].start = start; | ||
303 | if (physnodes[i].end > end) | ||
304 | physnodes[i].end = end; | ||
305 | ret++; | ||
306 | } | ||
307 | 313 | ||
308 | /* | 314 | for (j = i + 1; j < mi->nr_blks; j++) { |
309 | * If no physical topology was detected, a single node is faked to cover | 315 | struct numa_memblk *bj = &mi->blk[j]; |
310 | * the entire address space. | 316 | unsigned long start, end; |
311 | */ | ||
312 | if (!ret) { | ||
313 | physnodes[ret].start = start; | ||
314 | physnodes[ret].end = end; | ||
315 | ret = 1; | ||
316 | } | ||
317 | return ret; | ||
318 | } | ||
319 | |||
320 | static void __init fake_physnodes(int acpi, int amd, int nr_nodes) | ||
321 | { | ||
322 | int i; | ||
323 | |||
324 | BUG_ON(acpi && amd); | ||
325 | #ifdef CONFIG_ACPI_NUMA | ||
326 | if (acpi) | ||
327 | acpi_fake_nodes(nodes, nr_nodes); | ||
328 | #endif | ||
329 | #ifdef CONFIG_AMD_NUMA | ||
330 | if (amd) | ||
331 | amd_fake_nodes(nodes, nr_nodes); | ||
332 | #endif | ||
333 | if (!acpi && !amd) | ||
334 | for (i = 0; i < nr_cpu_ids; i++) | ||
335 | numa_set_node(i, 0); | ||
336 | } | ||
337 | |||
338 | /* | ||
339 | * Setups up nid to range from addr to addr + size. If the end | ||
340 | * boundary is greater than max_addr, then max_addr is used instead. | ||
341 | * The return value is 0 if there is additional memory left for | ||
342 | * allocation past addr and -1 otherwise. addr is adjusted to be at | ||
343 | * the end of the node. | ||
344 | */ | ||
345 | static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr) | ||
346 | { | ||
347 | int ret = 0; | ||
348 | nodes[nid].start = *addr; | ||
349 | *addr += size; | ||
350 | if (*addr >= max_addr) { | ||
351 | *addr = max_addr; | ||
352 | ret = -1; | ||
353 | } | ||
354 | nodes[nid].end = *addr; | ||
355 | node_set(nid, node_possible_map); | ||
356 | printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, | ||
357 | nodes[nid].start, nodes[nid].end, | ||
358 | (nodes[nid].end - nodes[nid].start) >> 20); | ||
359 | return ret; | ||
360 | } | ||
361 | |||
362 | /* | ||
363 | * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr | ||
364 | * to max_addr. The return value is the number of nodes allocated. | ||
365 | */ | ||
366 | static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes) | ||
367 | { | ||
368 | nodemask_t physnode_mask = NODE_MASK_NONE; | ||
369 | u64 size; | ||
370 | int big; | ||
371 | int ret = 0; | ||
372 | int i; | ||
373 | |||
374 | if (nr_nodes <= 0) | ||
375 | return -1; | ||
376 | if (nr_nodes > MAX_NUMNODES) { | ||
377 | pr_info("numa=fake=%d too large, reducing to %d\n", | ||
378 | nr_nodes, MAX_NUMNODES); | ||
379 | nr_nodes = MAX_NUMNODES; | ||
380 | } | ||
381 | |||
382 | size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes; | ||
383 | /* | ||
384 | * Calculate the number of big nodes that can be allocated as a result | ||
385 | * of consolidating the remainder. | ||
386 | */ | ||
387 | big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / | ||
388 | FAKE_NODE_MIN_SIZE; | ||
389 | |||
390 | size &= FAKE_NODE_MIN_HASH_MASK; | ||
391 | if (!size) { | ||
392 | pr_err("Not enough memory for each node. " | ||
393 | "NUMA emulation disabled.\n"); | ||
394 | return -1; | ||
395 | } | ||
396 | |||
397 | for (i = 0; i < MAX_NUMNODES; i++) | ||
398 | if (physnodes[i].start != physnodes[i].end) | ||
399 | node_set(i, physnode_mask); | ||
400 | |||
401 | /* | ||
402 | * Continue to fill physical nodes with fake nodes until there is no | ||
403 | * memory left on any of them. | ||
404 | */ | ||
405 | while (nodes_weight(physnode_mask)) { | ||
406 | for_each_node_mask(i, physnode_mask) { | ||
407 | u64 end = physnodes[i].start + size; | ||
408 | u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); | ||
409 | |||
410 | if (ret < big) | ||
411 | end += FAKE_NODE_MIN_SIZE; | ||
412 | 317 | ||
413 | /* | 318 | /* |
414 | * Continue to add memory to this fake node if its | 319 | * See whether there are overlapping blocks. Whine |
415 | * non-reserved memory is less than the per-node size. | 320 | * about but allow overlaps of the same nid. They |
321 | * will be merged below. | ||
416 | */ | 322 | */ |
417 | while (end - physnodes[i].start - | 323 | if (bi->end > bj->start && bi->start < bj->end) { |
418 | memblock_x86_hole_size(physnodes[i].start, end) < size) { | 324 | if (bi->nid != bj->nid) { |
419 | end += FAKE_NODE_MIN_SIZE; | 325 | pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n", |
420 | if (end > physnodes[i].end) { | 326 | bi->nid, bi->start, bi->end, |
421 | end = physnodes[i].end; | 327 | bj->nid, bj->start, bj->end); |
422 | break; | 328 | return -EINVAL; |
423 | } | 329 | } |
330 | pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n", | ||
331 | bi->nid, bi->start, bi->end, | ||
332 | bj->start, bj->end); | ||
424 | } | 333 | } |
425 | 334 | ||
426 | /* | 335 | /* |
427 | * If there won't be at least FAKE_NODE_MIN_SIZE of | 336 | * Join together blocks on the same node, holes |
428 | * non-reserved memory in ZONE_DMA32 for the next node, | 337 | * between which don't overlap with memory on other |
429 | * this one must extend to the boundary. | 338 | * nodes. |
430 | */ | ||
431 | if (end < dma32_end && dma32_end - end - | ||
432 | memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) | ||
433 | end = dma32_end; | ||
434 | |||
435 | /* | ||
436 | * If there won't be enough non-reserved memory for the | ||
437 | * next node, this one must extend to the end of the | ||
438 | * physical node. | ||
439 | */ | 339 | */ |
440 | if (physnodes[i].end - end - | 340 | if (bi->nid != bj->nid) |
441 | memblock_x86_hole_size(end, physnodes[i].end) < size) | 341 | continue; |
442 | end = physnodes[i].end; | 342 | start = max(min(bi->start, bj->start), low); |
443 | 343 | end = min(max(bi->end, bj->end), high); | |
444 | /* | 344 | for (k = 0; k < mi->nr_blks; k++) { |
445 | * Avoid allocating more nodes than requested, which can | 345 | struct numa_memblk *bk = &mi->blk[k]; |
446 | * happen as a result of rounding down each node's size | 346 | |
447 | * to FAKE_NODE_MIN_SIZE. | 347 | if (bi->nid == bk->nid) |
448 | */ | 348 | continue; |
449 | if (nodes_weight(physnode_mask) + ret >= nr_nodes) | 349 | if (start < bk->end && end > bk->start) |
450 | end = physnodes[i].end; | 350 | break; |
451 | 351 | } | |
452 | if (setup_node_range(ret++, &physnodes[i].start, | 352 | if (k < mi->nr_blks) |
453 | end - physnodes[i].start, | 353 | continue; |
454 | physnodes[i].end) < 0) | 354 | printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n", |
455 | node_clear(i, physnode_mask); | 355 | bi->nid, bi->start, bi->end, bj->start, bj->end, |
356 | start, end); | ||
357 | bi->start = start; | ||
358 | bi->end = end; | ||
359 | numa_remove_memblk_from(j--, mi); | ||
456 | } | 360 | } |
457 | } | 361 | } |
458 | return ret; | ||
459 | } | ||
460 | |||
461 | /* | ||
462 | * Returns the end address of a node so that there is at least `size' amount of | ||
463 | * non-reserved memory or `max_addr' is reached. | ||
464 | */ | ||
465 | static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) | ||
466 | { | ||
467 | u64 end = start + size; | ||
468 | 362 | ||
469 | while (end - start - memblock_x86_hole_size(start, end) < size) { | 363 | for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) { |
470 | end += FAKE_NODE_MIN_SIZE; | 364 | mi->blk[i].start = mi->blk[i].end = 0; |
471 | if (end > max_addr) { | 365 | mi->blk[i].nid = NUMA_NO_NODE; |
472 | end = max_addr; | ||
473 | break; | ||
474 | } | ||
475 | } | 366 | } |
476 | return end; | 367 | |
368 | return 0; | ||
477 | } | 369 | } |
478 | 370 | ||
479 | /* | 371 | /* |
480 | * Sets up fake nodes of `size' interleaved over physical nodes ranging from | 372 | * Set nodes, which have memory in @mi, in *@nodemask. |
481 | * `addr' to `max_addr'. The return value is the number of nodes allocated. | ||
482 | */ | 373 | */ |
483 | static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) | 374 | static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, |
375 | const struct numa_meminfo *mi) | ||
484 | { | 376 | { |
485 | nodemask_t physnode_mask = NODE_MASK_NONE; | ||
486 | u64 min_size; | ||
487 | int ret = 0; | ||
488 | int i; | 377 | int i; |
489 | 378 | ||
490 | if (!size) | 379 | for (i = 0; i < ARRAY_SIZE(mi->blk); i++) |
491 | return -1; | 380 | if (mi->blk[i].start != mi->blk[i].end && |
492 | /* | 381 | mi->blk[i].nid != NUMA_NO_NODE) |
493 | * The limit on emulated nodes is MAX_NUMNODES, so the size per node is | 382 | node_set(mi->blk[i].nid, *nodemask); |
494 | * increased accordingly if the requested size is too small. This | 383 | } |
495 | * creates a uniform distribution of node sizes across the entire | ||
496 | * machine (but not necessarily over physical nodes). | ||
497 | */ | ||
498 | min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / | ||
499 | MAX_NUMNODES; | ||
500 | min_size = max(min_size, FAKE_NODE_MIN_SIZE); | ||
501 | if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) | ||
502 | min_size = (min_size + FAKE_NODE_MIN_SIZE) & | ||
503 | FAKE_NODE_MIN_HASH_MASK; | ||
504 | if (size < min_size) { | ||
505 | pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", | ||
506 | size >> 20, min_size >> 20); | ||
507 | size = min_size; | ||
508 | } | ||
509 | size &= FAKE_NODE_MIN_HASH_MASK; | ||
510 | |||
511 | for (i = 0; i < MAX_NUMNODES; i++) | ||
512 | if (physnodes[i].start != physnodes[i].end) | ||
513 | node_set(i, physnode_mask); | ||
514 | /* | ||
515 | * Fill physical nodes with fake nodes of size until there is no memory | ||
516 | * left on any of them. | ||
517 | */ | ||
518 | while (nodes_weight(physnode_mask)) { | ||
519 | for_each_node_mask(i, physnode_mask) { | ||
520 | u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT; | ||
521 | u64 end; | ||
522 | |||
523 | end = find_end_of_node(physnodes[i].start, | ||
524 | physnodes[i].end, size); | ||
525 | /* | ||
526 | * If there won't be at least FAKE_NODE_MIN_SIZE of | ||
527 | * non-reserved memory in ZONE_DMA32 for the next node, | ||
528 | * this one must extend to the boundary. | ||
529 | */ | ||
530 | if (end < dma32_end && dma32_end - end - | ||
531 | memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) | ||
532 | end = dma32_end; | ||
533 | 384 | ||
534 | /* | 385 | /** |
535 | * If there won't be enough non-reserved memory for the | 386 | * numa_reset_distance - Reset NUMA distance table |
536 | * next node, this one must extend to the end of the | 387 | * |
537 | * physical node. | 388 | * The current table is freed. The next numa_set_distance() call will |
538 | */ | 389 | * create a new one. |
539 | if (physnodes[i].end - end - | 390 | */ |
540 | memblock_x86_hole_size(end, physnodes[i].end) < size) | 391 | void __init numa_reset_distance(void) |
541 | end = physnodes[i].end; | 392 | { |
393 | size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]); | ||
542 | 394 | ||
543 | /* | 395 | /* numa_distance could be 1LU marking allocation failure, test cnt */ |
544 | * Setup the fake node that will be allocated as bootmem | 396 | if (numa_distance_cnt) |
545 | * later. If setup_node_range() returns non-zero, there | 397 | memblock_x86_free_range(__pa(numa_distance), |
546 | * is no more memory available on this physical node. | 398 | __pa(numa_distance) + size); |
547 | */ | 399 | numa_distance_cnt = 0; |
548 | if (setup_node_range(ret++, &physnodes[i].start, | 400 | numa_distance = NULL; /* enable table creation */ |
549 | end - physnodes[i].start, | ||
550 | physnodes[i].end) < 0) | ||
551 | node_clear(i, physnode_mask); | ||
552 | } | ||
553 | } | ||
554 | return ret; | ||
555 | } | 401 | } |
556 | 402 | ||
557 | /* | 403 | static int __init numa_alloc_distance(void) |
558 | * Sets up the system RAM area from start_pfn to last_pfn according to the | ||
559 | * numa=fake command-line option. | ||
560 | */ | ||
561 | static int __init numa_emulation(unsigned long start_pfn, | ||
562 | unsigned long last_pfn, int acpi, int amd) | ||
563 | { | 404 | { |
564 | u64 addr = start_pfn << PAGE_SHIFT; | 405 | nodemask_t nodes_parsed; |
565 | u64 max_addr = last_pfn << PAGE_SHIFT; | 406 | size_t size; |
566 | int num_nodes; | 407 | int i, j, cnt = 0; |
567 | int i; | 408 | u64 phys; |
568 | 409 | ||
569 | /* | 410 | /* size the new table and allocate it */ |
570 | * If the numa=fake command-line contains a 'M' or 'G', it represents | 411 | nodes_parsed = numa_nodes_parsed; |
571 | * the fixed node size. Otherwise, if it is just a single number N, | 412 | numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo); |
572 | * split the system RAM into N fake nodes. | ||
573 | */ | ||
574 | if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) { | ||
575 | u64 size; | ||
576 | 413 | ||
577 | size = memparse(cmdline, &cmdline); | 414 | for_each_node_mask(i, nodes_parsed) |
578 | num_nodes = split_nodes_size_interleave(addr, max_addr, size); | 415 | cnt = i; |
579 | } else { | 416 | cnt++; |
580 | unsigned long n; | 417 | size = cnt * cnt * sizeof(numa_distance[0]); |
581 | 418 | ||
582 | n = simple_strtoul(cmdline, NULL, 0); | 419 | phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT, |
583 | num_nodes = split_nodes_interleave(addr, max_addr, n); | 420 | size, PAGE_SIZE); |
421 | if (phys == MEMBLOCK_ERROR) { | ||
422 | pr_warning("NUMA: Warning: can't allocate distance table!\n"); | ||
423 | /* don't retry until explicitly reset */ | ||
424 | numa_distance = (void *)1LU; | ||
425 | return -ENOMEM; | ||
584 | } | 426 | } |
427 | memblock_x86_reserve_range(phys, phys + size, "NUMA DIST"); | ||
585 | 428 | ||
586 | if (num_nodes < 0) | 429 | numa_distance = __va(phys); |
587 | return num_nodes; | 430 | numa_distance_cnt = cnt; |
588 | memnode_shift = compute_hash_shift(nodes, num_nodes, NULL); | 431 | |
589 | if (memnode_shift < 0) { | 432 | /* fill with the default distances */ |
590 | memnode_shift = 0; | 433 | for (i = 0; i < cnt; i++) |
591 | printk(KERN_ERR "No NUMA hash function found. NUMA emulation " | 434 | for (j = 0; j < cnt; j++) |
592 | "disabled.\n"); | 435 | numa_distance[i * cnt + j] = i == j ? |
593 | return -1; | 436 | LOCAL_DISTANCE : REMOTE_DISTANCE; |
594 | } | 437 | printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt); |
595 | 438 | ||
596 | /* | ||
597 | * We need to vacate all active ranges that may have been registered for | ||
598 | * the e820 memory map. | ||
599 | */ | ||
600 | remove_all_active_ranges(); | ||
601 | for_each_node_mask(i, node_possible_map) { | ||
602 | memblock_x86_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, | ||
603 | nodes[i].end >> PAGE_SHIFT); | ||
604 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); | ||
605 | } | ||
606 | setup_physnodes(addr, max_addr, acpi, amd); | ||
607 | fake_physnodes(acpi, amd, num_nodes); | ||
608 | numa_init_array(); | ||
609 | return 0; | 439 | return 0; |
610 | } | 440 | } |
611 | #endif /* CONFIG_NUMA_EMU */ | ||
612 | 441 | ||
613 | void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, | 442 | /** |
614 | int acpi, int amd) | 443 | * numa_set_distance - Set NUMA distance from one NUMA node to another |
444 | * @from: the 'from' node to set distance | ||
445 | * @to: the 'to' node to set distance | ||
446 | * @distance: NUMA distance | ||
447 | * | ||
448 | * Set the distance from node @from to @to to @distance. If distance table | ||
449 | * doesn't exist, one which is large enough to accommodate all the currently | ||
450 | * known nodes will be created. | ||
451 | * | ||
452 | * If such table cannot be allocated, a warning is printed and further | ||
453 | * calls are ignored until the distance table is reset with | ||
454 | * numa_reset_distance(). | ||
455 | * | ||
456 | * If @from or @to is higher than the highest known node at the time of | ||
457 | * table creation or @distance doesn't make sense, the call is ignored. | ||
458 | * This is to allow simplification of specific NUMA config implementations. | ||
459 | */ | ||
460 | void __init numa_set_distance(int from, int to, int distance) | ||
615 | { | 461 | { |
616 | int i; | 462 | if (!numa_distance && numa_alloc_distance() < 0) |
617 | |||
618 | nodes_clear(node_possible_map); | ||
619 | nodes_clear(node_online_map); | ||
620 | |||
621 | #ifdef CONFIG_NUMA_EMU | ||
622 | setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT, | ||
623 | acpi, amd); | ||
624 | if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd)) | ||
625 | return; | 463 | return; |
626 | setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT, | ||
627 | acpi, amd); | ||
628 | nodes_clear(node_possible_map); | ||
629 | nodes_clear(node_online_map); | ||
630 | #endif | ||
631 | 464 | ||
632 | #ifdef CONFIG_ACPI_NUMA | 465 | if (from >= numa_distance_cnt || to >= numa_distance_cnt) { |
633 | if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, | 466 | printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n", |
634 | last_pfn << PAGE_SHIFT)) | 467 | from, to, distance); |
635 | return; | 468 | return; |
636 | nodes_clear(node_possible_map); | 469 | } |
637 | nodes_clear(node_online_map); | ||
638 | #endif | ||
639 | 470 | ||
640 | #ifdef CONFIG_AMD_NUMA | 471 | if ((u8)distance != distance || |
641 | if (!numa_off && amd && !amd_scan_nodes()) | 472 | (from == to && distance != LOCAL_DISTANCE)) { |
473 | pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n", | ||
474 | from, to, distance); | ||
642 | return; | 475 | return; |
643 | nodes_clear(node_possible_map); | 476 | } |
644 | nodes_clear(node_online_map); | ||
645 | #endif | ||
646 | printk(KERN_INFO "%s\n", | ||
647 | numa_off ? "NUMA turned off" : "No NUMA configuration found"); | ||
648 | 477 | ||
649 | printk(KERN_INFO "Faking a node at %016lx-%016lx\n", | 478 | numa_distance[from * numa_distance_cnt + to] = distance; |
650 | start_pfn << PAGE_SHIFT, | ||
651 | last_pfn << PAGE_SHIFT); | ||
652 | /* setup dummy node covering all memory */ | ||
653 | memnode_shift = 63; | ||
654 | memnodemap = memnode.embedded_map; | ||
655 | memnodemap[0] = 0; | ||
656 | node_set_online(0); | ||
657 | node_set(0, node_possible_map); | ||
658 | for (i = 0; i < nr_cpu_ids; i++) | ||
659 | numa_set_node(i, 0); | ||
660 | memblock_x86_register_active_regions(0, start_pfn, last_pfn); | ||
661 | setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT); | ||
662 | } | 479 | } |
663 | 480 | ||
664 | unsigned long __init numa_free_all_bootmem(void) | 481 | int __node_distance(int from, int to) |
665 | { | 482 | { |
666 | unsigned long pages = 0; | 483 | if (from >= numa_distance_cnt || to >= numa_distance_cnt) |
667 | int i; | 484 | return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE; |
485 | return numa_distance[from * numa_distance_cnt + to]; | ||
486 | } | ||
487 | EXPORT_SYMBOL(__node_distance); | ||
668 | 488 | ||
669 | for_each_online_node(i) | 489 | /* |
670 | pages += free_all_bootmem_node(NODE_DATA(i)); | 490 | * Sanity check to catch more bad NUMA configurations (they are amazingly |
491 | * common). Make sure the nodes cover all memory. | ||
492 | */ | ||
493 | static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) | ||
494 | { | ||
495 | unsigned long numaram, e820ram; | ||
496 | int i; | ||
671 | 497 | ||
672 | pages += free_all_memory_core_early(MAX_NUMNODES); | 498 | numaram = 0; |
499 | for (i = 0; i < mi->nr_blks; i++) { | ||
500 | unsigned long s = mi->blk[i].start >> PAGE_SHIFT; | ||
501 | unsigned long e = mi->blk[i].end >> PAGE_SHIFT; | ||
502 | numaram += e - s; | ||
503 | numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e); | ||
504 | if ((long)numaram < 0) | ||
505 | numaram = 0; | ||
506 | } | ||
673 | 507 | ||
674 | return pages; | 508 | e820ram = max_pfn - (memblock_x86_hole_size(0, |
509 | max_pfn << PAGE_SHIFT) >> PAGE_SHIFT); | ||
510 | /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ | ||
511 | if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) { | ||
512 | printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n", | ||
513 | (numaram << PAGE_SHIFT) >> 20, | ||
514 | (e820ram << PAGE_SHIFT) >> 20); | ||
515 | return false; | ||
516 | } | ||
517 | return true; | ||
675 | } | 518 | } |
676 | 519 | ||
677 | #ifdef CONFIG_NUMA | 520 | static int __init numa_register_memblks(struct numa_meminfo *mi) |
678 | |||
679 | static __init int find_near_online_node(int node) | ||
680 | { | 521 | { |
681 | int n, val; | 522 | int i, nid; |
682 | int min_val = INT_MAX; | ||
683 | int best_node = -1; | ||
684 | 523 | ||
685 | for_each_online_node(n) { | 524 | /* Account for nodes with cpus and no memory */ |
686 | val = node_distance(node, n); | 525 | node_possible_map = numa_nodes_parsed; |
526 | numa_nodemask_from_meminfo(&node_possible_map, mi); | ||
527 | if (WARN_ON(nodes_empty(node_possible_map))) | ||
528 | return -EINVAL; | ||
687 | 529 | ||
688 | if (val < min_val) { | 530 | memnode_shift = compute_hash_shift(mi); |
689 | min_val = val; | 531 | if (memnode_shift < 0) { |
690 | best_node = n; | 532 | printk(KERN_ERR "NUMA: No NUMA node hash function found. Contact maintainer\n"); |
533 | return -EINVAL; | ||
534 | } | ||
535 | |||
536 | for (i = 0; i < mi->nr_blks; i++) | ||
537 | memblock_x86_register_active_regions(mi->blk[i].nid, | ||
538 | mi->blk[i].start >> PAGE_SHIFT, | ||
539 | mi->blk[i].end >> PAGE_SHIFT); | ||
540 | |||
541 | /* for out of order entries */ | ||
542 | sort_node_map(); | ||
543 | if (!numa_meminfo_cover_memory(mi)) | ||
544 | return -EINVAL; | ||
545 | |||
546 | /* Finally register nodes. */ | ||
547 | for_each_node_mask(nid, node_possible_map) { | ||
548 | u64 start = (u64)max_pfn << PAGE_SHIFT; | ||
549 | u64 end = 0; | ||
550 | |||
551 | for (i = 0; i < mi->nr_blks; i++) { | ||
552 | if (nid != mi->blk[i].nid) | ||
553 | continue; | ||
554 | start = min(mi->blk[i].start, start); | ||
555 | end = max(mi->blk[i].end, end); | ||
691 | } | 556 | } |
557 | |||
558 | if (start < end) | ||
559 | setup_node_bootmem(nid, start, end); | ||
692 | } | 560 | } |
693 | 561 | ||
694 | return best_node; | 562 | return 0; |
695 | } | 563 | } |
696 | 564 | ||
697 | /* | 565 | /** |
698 | * Setup early cpu_to_node. | 566 | * dummy_numa_init - Fallback dummy NUMA init |
699 | * | 567 | * |
700 | * Populate cpu_to_node[] only if x86_cpu_to_apicid[], | 568 | * Used if there's no underlying NUMA architecture, NUMA initialization |
701 | * and apicid_to_node[] tables have valid entries for a CPU. | 569 | * fails, or NUMA is disabled on the command line. |
702 | * This means we skip cpu_to_node[] initialisation for NUMA | ||
703 | * emulation and faking node case (when running a kernel compiled | ||
704 | * for NUMA on a non NUMA box), which is OK as cpu_to_node[] | ||
705 | * is already initialized in a round robin manner at numa_init_array, | ||
706 | * prior to this call, and this initialization is good enough | ||
707 | * for the fake NUMA cases. | ||
708 | * | 570 | * |
709 | * Called before the per_cpu areas are setup. | 571 | * Must online at least one node and add memory blocks that cover all |
572 | * allowed memory. This function must not fail. | ||
710 | */ | 573 | */ |
711 | void __init init_cpu_to_node(void) | 574 | static int __init dummy_numa_init(void) |
712 | { | 575 | { |
713 | int cpu; | 576 | printk(KERN_INFO "%s\n", |
714 | u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); | 577 | numa_off ? "NUMA turned off" : "No NUMA configuration found"); |
715 | 578 | printk(KERN_INFO "Faking a node at %016lx-%016lx\n", | |
716 | BUG_ON(cpu_to_apicid == NULL); | 579 | 0LU, max_pfn << PAGE_SHIFT); |
717 | 580 | ||
718 | for_each_possible_cpu(cpu) { | 581 | node_set(0, numa_nodes_parsed); |
719 | int node; | 582 | numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT); |
720 | u16 apicid = cpu_to_apicid[cpu]; | ||
721 | 583 | ||
722 | if (apicid == BAD_APICID) | 584 | return 0; |
723 | continue; | ||
724 | node = apicid_to_node[apicid]; | ||
725 | if (node == NUMA_NO_NODE) | ||
726 | continue; | ||
727 | if (!node_online(node)) | ||
728 | node = find_near_online_node(node); | ||
729 | numa_set_node(cpu, node); | ||
730 | } | ||
731 | } | 585 | } |
732 | #endif | ||
733 | 586 | ||
734 | 587 | static int __init numa_init(int (*init_func)(void)) | |
735 | void __cpuinit numa_set_node(int cpu, int node) | ||
736 | { | 588 | { |
737 | int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); | 589 | int i; |
738 | 590 | int ret; | |
739 | /* early setting, no percpu area yet */ | ||
740 | if (cpu_to_node_map) { | ||
741 | cpu_to_node_map[cpu] = node; | ||
742 | return; | ||
743 | } | ||
744 | |||
745 | #ifdef CONFIG_DEBUG_PER_CPU_MAPS | ||
746 | if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) { | ||
747 | printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); | ||
748 | dump_stack(); | ||
749 | return; | ||
750 | } | ||
751 | #endif | ||
752 | per_cpu(x86_cpu_to_node_map, cpu) = node; | ||
753 | 591 | ||
754 | if (node != NUMA_NO_NODE) | 592 | for (i = 0; i < MAX_LOCAL_APIC; i++) |
755 | set_cpu_numa_node(cpu, node); | 593 | set_apicid_to_node(i, NUMA_NO_NODE); |
756 | } | ||
757 | 594 | ||
758 | void __cpuinit numa_clear_node(int cpu) | 595 | nodes_clear(numa_nodes_parsed); |
759 | { | 596 | nodes_clear(node_possible_map); |
760 | numa_set_node(cpu, NUMA_NO_NODE); | 597 | nodes_clear(node_online_map); |
761 | } | 598 | memset(&numa_meminfo, 0, sizeof(numa_meminfo)); |
762 | 599 | remove_all_active_ranges(); | |
763 | #ifndef CONFIG_DEBUG_PER_CPU_MAPS | 600 | numa_reset_distance(); |
764 | 601 | ||
765 | #ifndef CONFIG_NUMA_EMU | 602 | ret = init_func(); |
766 | void __cpuinit numa_add_cpu(int cpu) | 603 | if (ret < 0) |
767 | { | 604 | return ret; |
768 | cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); | 605 | ret = numa_cleanup_meminfo(&numa_meminfo); |
769 | } | 606 | if (ret < 0) |
607 | return ret; | ||
770 | 608 | ||
771 | void __cpuinit numa_remove_cpu(int cpu) | 609 | numa_emulation(&numa_meminfo, numa_distance_cnt); |
772 | { | ||
773 | cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); | ||
774 | } | ||
775 | #else | ||
776 | void __cpuinit numa_add_cpu(int cpu) | ||
777 | { | ||
778 | unsigned long addr; | ||
779 | u16 apicid; | ||
780 | int physnid; | ||
781 | int nid = NUMA_NO_NODE; | ||
782 | 610 | ||
783 | nid = early_cpu_to_node(cpu); | 611 | ret = numa_register_memblks(&numa_meminfo); |
784 | BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); | 612 | if (ret < 0) |
613 | return ret; | ||
785 | 614 | ||
786 | /* | 615 | for (i = 0; i < nr_cpu_ids; i++) { |
787 | * Use the starting address of the emulated node to find which physical | 616 | int nid = early_cpu_to_node(i); |
788 | * node it is allocated on. | ||
789 | */ | ||
790 | addr = node_start_pfn(nid) << PAGE_SHIFT; | ||
791 | for (physnid = 0; physnid < MAX_NUMNODES; physnid++) | ||
792 | if (addr >= physnodes[physnid].start && | ||
793 | addr < physnodes[physnid].end) | ||
794 | break; | ||
795 | 617 | ||
796 | /* | 618 | if (nid == NUMA_NO_NODE) |
797 | * Map the cpu to each emulated node that is allocated on the physical | 619 | continue; |
798 | * node of the cpu's apic id. | 620 | if (!node_online(nid)) |
799 | */ | 621 | numa_clear_node(i); |
800 | for_each_online_node(nid) { | ||
801 | addr = node_start_pfn(nid) << PAGE_SHIFT; | ||
802 | if (addr >= physnodes[physnid].start && | ||
803 | addr < physnodes[physnid].end) | ||
804 | cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); | ||
805 | } | 622 | } |
623 | numa_init_array(); | ||
624 | return 0; | ||
806 | } | 625 | } |
807 | 626 | ||
808 | void __cpuinit numa_remove_cpu(int cpu) | 627 | void __init initmem_init(void) |
809 | { | 628 | { |
810 | int i; | 629 | int ret; |
811 | 630 | ||
812 | for_each_online_node(i) | 631 | if (!numa_off) { |
813 | cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); | 632 | #ifdef CONFIG_ACPI_NUMA |
814 | } | 633 | ret = numa_init(x86_acpi_numa_init); |
815 | #endif /* !CONFIG_NUMA_EMU */ | 634 | if (!ret) |
816 | 635 | return; | |
817 | #else /* CONFIG_DEBUG_PER_CPU_MAPS */ | 636 | #endif |
818 | static struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable) | 637 | #ifdef CONFIG_AMD_NUMA |
819 | { | 638 | ret = numa_init(amd_numa_init); |
820 | int node = early_cpu_to_node(cpu); | 639 | if (!ret) |
821 | struct cpumask *mask; | 640 | return; |
822 | char buf[64]; | 641 | #endif |
823 | |||
824 | mask = node_to_cpumask_map[node]; | ||
825 | if (!mask) { | ||
826 | pr_err("node_to_cpumask_map[%i] NULL\n", node); | ||
827 | dump_stack(); | ||
828 | return NULL; | ||
829 | } | 642 | } |
830 | 643 | ||
831 | cpulist_scnprintf(buf, sizeof(buf), mask); | 644 | numa_init(dummy_numa_init); |
832 | printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", | ||
833 | enable ? "numa_add_cpu" : "numa_remove_cpu", | ||
834 | cpu, node, buf); | ||
835 | return mask; | ||
836 | } | 645 | } |
837 | 646 | ||
838 | /* | 647 | unsigned long __init numa_free_all_bootmem(void) |
839 | * --------- debug versions of the numa functions --------- | ||
840 | */ | ||
841 | #ifndef CONFIG_NUMA_EMU | ||
842 | static void __cpuinit numa_set_cpumask(int cpu, int enable) | ||
843 | { | ||
844 | struct cpumask *mask; | ||
845 | |||
846 | mask = debug_cpumask_set_cpu(cpu, enable); | ||
847 | if (!mask) | ||
848 | return; | ||
849 | |||
850 | if (enable) | ||
851 | cpumask_set_cpu(cpu, mask); | ||
852 | else | ||
853 | cpumask_clear_cpu(cpu, mask); | ||
854 | } | ||
855 | #else | ||
856 | static void __cpuinit numa_set_cpumask(int cpu, int enable) | ||
857 | { | 648 | { |
858 | int node = early_cpu_to_node(cpu); | 649 | unsigned long pages = 0; |
859 | struct cpumask *mask; | ||
860 | int i; | 650 | int i; |
861 | 651 | ||
862 | for_each_online_node(i) { | 652 | for_each_online_node(i) |
863 | unsigned long addr; | 653 | pages += free_all_bootmem_node(NODE_DATA(i)); |
864 | |||
865 | addr = node_start_pfn(i) << PAGE_SHIFT; | ||
866 | if (addr < physnodes[node].start || | ||
867 | addr >= physnodes[node].end) | ||
868 | continue; | ||
869 | mask = debug_cpumask_set_cpu(cpu, enable); | ||
870 | if (!mask) | ||
871 | return; | ||
872 | |||
873 | if (enable) | ||
874 | cpumask_set_cpu(cpu, mask); | ||
875 | else | ||
876 | cpumask_clear_cpu(cpu, mask); | ||
877 | } | ||
878 | } | ||
879 | #endif /* CONFIG_NUMA_EMU */ | ||
880 | 654 | ||
881 | void __cpuinit numa_add_cpu(int cpu) | 655 | pages += free_all_memory_core_early(MAX_NUMNODES); |
882 | { | ||
883 | numa_set_cpumask(cpu, 1); | ||
884 | } | ||
885 | 656 | ||
886 | void __cpuinit numa_remove_cpu(int cpu) | 657 | return pages; |
887 | { | ||
888 | numa_set_cpumask(cpu, 0); | ||
889 | } | 658 | } |
890 | 659 | ||
891 | int __cpu_to_node(int cpu) | 660 | int __cpuinit numa_cpu_node(int cpu) |
892 | { | 661 | { |
893 | if (early_per_cpu_ptr(x86_cpu_to_node_map)) { | 662 | int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); |
894 | printk(KERN_WARNING | ||
895 | "cpu_to_node(%d): usage too early!\n", cpu); | ||
896 | dump_stack(); | ||
897 | return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; | ||
898 | } | ||
899 | return per_cpu(x86_cpu_to_node_map, cpu); | ||
900 | } | ||
901 | EXPORT_SYMBOL(__cpu_to_node); | ||
902 | 663 | ||
903 | /* | 664 | if (apicid != BAD_APICID) |
904 | * Same function as cpu_to_node() but used if called before the | 665 | return __apicid_to_node[apicid]; |
905 | * per_cpu areas are setup. | 666 | return NUMA_NO_NODE; |
906 | */ | ||
907 | int early_cpu_to_node(int cpu) | ||
908 | { | ||
909 | if (early_per_cpu_ptr(x86_cpu_to_node_map)) | ||
910 | return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; | ||
911 | |||
912 | if (!cpu_possible(cpu)) { | ||
913 | printk(KERN_WARNING | ||
914 | "early_cpu_to_node(%d): no per_cpu area!\n", cpu); | ||
915 | dump_stack(); | ||
916 | return NUMA_NO_NODE; | ||
917 | } | ||
918 | return per_cpu(x86_cpu_to_node_map, cpu); | ||
919 | } | 667 | } |
920 | |||
921 | /* | ||
922 | * --------- end of debug versions of the numa functions --------- | ||
923 | */ | ||
924 | |||
925 | #endif /* CONFIG_DEBUG_PER_CPU_MAPS */ | ||
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c new file mode 100644 index 000000000000..ad091e4cff17 --- /dev/null +++ b/arch/x86/mm/numa_emulation.c | |||
@@ -0,0 +1,494 @@ | |||
1 | /* | ||
2 | * NUMA emulation | ||
3 | */ | ||
4 | #include <linux/kernel.h> | ||
5 | #include <linux/errno.h> | ||
6 | #include <linux/topology.h> | ||
7 | #include <linux/memblock.h> | ||
8 | #include <asm/dma.h> | ||
9 | |||
10 | #include "numa_internal.h" | ||
11 | |||
/*
 * Emulated node id -> physical node id.  Filled in by numa_emulation()
 * (identity mapping when emulation is off or fails) and read by
 * numa_add_cpu() / numa_set_cpumask().
 */
12 | static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata; | ||
/* Emulation argument string stored by numa_emu_cmdline(); numa_emulation() skips emulation while it is NULL. */
13 | static char *emu_cmdline __initdata; | ||
14 | |||
15 | void __init numa_emu_cmdline(char *str) | ||
16 | { | ||
17 | emu_cmdline = str; | ||
18 | } | ||
19 | |||
20 | static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi) | ||
21 | { | ||
22 | int i; | ||
23 | |||
24 | for (i = 0; i < mi->nr_blks; i++) | ||
25 | if (mi->blk[i].nid == nid) | ||
26 | return i; | ||
27 | return -ENOENT; | ||
28 | } | ||
29 | |||
30 | /* | ||
31 | * Sets up nid to range from @start to @end. The return value is -errno if | ||
32 | * something went wrong, 0 otherwise. | ||
33 | */ | ||
34 | static int __init emu_setup_memblk(struct numa_meminfo *ei, | ||
35 | struct numa_meminfo *pi, | ||
36 | int nid, int phys_blk, u64 size) | ||
37 | { | ||
38 | struct numa_memblk *eb = &ei->blk[ei->nr_blks]; | ||
39 | struct numa_memblk *pb = &pi->blk[phys_blk]; | ||
40 | |||
41 | if (ei->nr_blks >= NR_NODE_MEMBLKS) { | ||
42 | pr_err("NUMA: Too many emulated memblks, failing emulation\n"); | ||
43 | return -EINVAL; | ||
44 | } | ||
45 | |||
46 | ei->nr_blks++; | ||
47 | eb->start = pb->start; | ||
48 | eb->end = pb->start + size; | ||
49 | eb->nid = nid; | ||
50 | |||
51 | if (emu_nid_to_phys[nid] == NUMA_NO_NODE) | ||
52 | emu_nid_to_phys[nid] = pb->nid; | ||
53 | |||
54 | pb->start += size; | ||
55 | if (pb->start >= pb->end) { | ||
56 | WARN_ON_ONCE(pb->start > pb->end); | ||
57 | numa_remove_memblk_from(phys_blk, pi); | ||
58 | } | ||
59 | |||
60 | printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, | ||
61 | eb->start, eb->end, (eb->end - eb->start) >> 20); | ||
62 | return 0; | ||
63 | } | ||
64 | |||
65 | /* | ||
66 | * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr | ||
67 | * to max_addr. The return value is the number of nodes allocated. | ||
68 | */ | ||
69 | static int __init split_nodes_interleave(struct numa_meminfo *ei, | ||
70 | struct numa_meminfo *pi, | ||
71 | u64 addr, u64 max_addr, int nr_nodes) | ||
72 | { | ||
73 | nodemask_t physnode_mask = NODE_MASK_NONE; | ||
74 | u64 size; | ||
75 | int big; | ||
76 | int nid = 0; | ||
77 | int i, ret; | ||
78 | |||
79 | if (nr_nodes <= 0) | ||
80 | return -1; | ||
81 | if (nr_nodes > MAX_NUMNODES) { | ||
82 | pr_info("numa=fake=%d too large, reducing to %d\n", | ||
83 | nr_nodes, MAX_NUMNODES); | ||
84 | nr_nodes = MAX_NUMNODES; | ||
85 | } | ||
86 | |||
87 | size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes; | ||
88 | /* | ||
89 | * Calculate the number of big nodes that can be allocated as a result | ||
90 | * of consolidating the remainder. | ||
91 | */ | ||
92 | big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / | ||
93 | FAKE_NODE_MIN_SIZE; | ||
94 | |||
95 | size &= FAKE_NODE_MIN_HASH_MASK; | ||
96 | if (!size) { | ||
97 | pr_err("Not enough memory for each node. " | ||
98 | "NUMA emulation disabled.\n"); | ||
99 | return -1; | ||
100 | } | ||
101 | |||
102 | for (i = 0; i < pi->nr_blks; i++) | ||
103 | node_set(pi->blk[i].nid, physnode_mask); | ||
104 | |||
105 | /* | ||
106 | * Continue to fill physical nodes with fake nodes until there is no | ||
107 | * memory left on any of them. | ||
108 | */ | ||
109 | while (nodes_weight(physnode_mask)) { | ||
110 | for_each_node_mask(i, physnode_mask) { | ||
111 | u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); | ||
112 | u64 start, limit, end; | ||
113 | int phys_blk; | ||
114 | |||
115 | phys_blk = emu_find_memblk_by_nid(i, pi); | ||
116 | if (phys_blk < 0) { | ||
117 | node_clear(i, physnode_mask); | ||
118 | continue; | ||
119 | } | ||
120 | start = pi->blk[phys_blk].start; | ||
121 | limit = pi->blk[phys_blk].end; | ||
122 | end = start + size; | ||
123 | |||
124 | if (nid < big) | ||
125 | end += FAKE_NODE_MIN_SIZE; | ||
126 | |||
127 | /* | ||
128 | * Continue to add memory to this fake node if its | ||
129 | * non-reserved memory is less than the per-node size. | ||
130 | */ | ||
131 | while (end - start - | ||
132 | memblock_x86_hole_size(start, end) < size) { | ||
133 | end += FAKE_NODE_MIN_SIZE; | ||
134 | if (end > limit) { | ||
135 | end = limit; | ||
136 | break; | ||
137 | } | ||
138 | } | ||
139 | |||
140 | /* | ||
141 | * If there won't be at least FAKE_NODE_MIN_SIZE of | ||
142 | * non-reserved memory in ZONE_DMA32 for the next node, | ||
143 | * this one must extend to the boundary. | ||
144 | */ | ||
145 | if (end < dma32_end && dma32_end - end - | ||
146 | memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) | ||
147 | end = dma32_end; | ||
148 | |||
149 | /* | ||
150 | * If there won't be enough non-reserved memory for the | ||
151 | * next node, this one must extend to the end of the | ||
152 | * physical node. | ||
153 | */ | ||
154 | if (limit - end - | ||
155 | memblock_x86_hole_size(end, limit) < size) | ||
156 | end = limit; | ||
157 | |||
158 | ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes, | ||
159 | phys_blk, | ||
160 | min(end, limit) - start); | ||
161 | if (ret < 0) | ||
162 | return ret; | ||
163 | } | ||
164 | } | ||
165 | return 0; | ||
166 | } | ||
167 | |||
168 | /* | ||
169 | * Returns the end address of a node so that there is at least `size' amount of | ||
170 | * non-reserved memory or `max_addr' is reached. | ||
171 | */ | ||
172 | static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) | ||
173 | { | ||
174 | u64 end = start + size; | ||
175 | |||
176 | while (end - start - memblock_x86_hole_size(start, end) < size) { | ||
177 | end += FAKE_NODE_MIN_SIZE; | ||
178 | if (end > max_addr) { | ||
179 | end = max_addr; | ||
180 | break; | ||
181 | } | ||
182 | } | ||
183 | return end; | ||
184 | } | ||
185 | |||
186 | /* | ||
187 | * Sets up fake nodes of `size' interleaved over physical nodes ranging from | ||
188 | * `addr' to `max_addr'. The return value is the number of nodes allocated. | ||
189 | */ | ||
190 | static int __init split_nodes_size_interleave(struct numa_meminfo *ei, | ||
191 | struct numa_meminfo *pi, | ||
192 | u64 addr, u64 max_addr, u64 size) | ||
193 | { | ||
194 | nodemask_t physnode_mask = NODE_MASK_NONE; | ||
195 | u64 min_size; | ||
196 | int nid = 0; | ||
197 | int i, ret; | ||
198 | |||
199 | if (!size) | ||
200 | return -1; | ||
201 | /* | ||
202 | * The limit on emulated nodes is MAX_NUMNODES, so the size per node is | ||
203 | * increased accordingly if the requested size is too small. This | ||
204 | * creates a uniform distribution of node sizes across the entire | ||
205 | * machine (but not necessarily over physical nodes). | ||
206 | */ | ||
207 | min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / | ||
208 | MAX_NUMNODES; | ||
209 | min_size = max(min_size, FAKE_NODE_MIN_SIZE); | ||
210 | if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) | ||
211 | min_size = (min_size + FAKE_NODE_MIN_SIZE) & | ||
212 | FAKE_NODE_MIN_HASH_MASK; | ||
213 | if (size < min_size) { | ||
214 | pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", | ||
215 | size >> 20, min_size >> 20); | ||
216 | size = min_size; | ||
217 | } | ||
218 | size &= FAKE_NODE_MIN_HASH_MASK; | ||
219 | |||
220 | for (i = 0; i < pi->nr_blks; i++) | ||
221 | node_set(pi->blk[i].nid, physnode_mask); | ||
222 | |||
223 | /* | ||
224 | * Fill physical nodes with fake nodes of size until there is no memory | ||
225 | * left on any of them. | ||
226 | */ | ||
227 | while (nodes_weight(physnode_mask)) { | ||
228 | for_each_node_mask(i, physnode_mask) { | ||
229 | u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT; | ||
230 | u64 start, limit, end; | ||
231 | int phys_blk; | ||
232 | |||
233 | phys_blk = emu_find_memblk_by_nid(i, pi); | ||
234 | if (phys_blk < 0) { | ||
235 | node_clear(i, physnode_mask); | ||
236 | continue; | ||
237 | } | ||
238 | start = pi->blk[phys_blk].start; | ||
239 | limit = pi->blk[phys_blk].end; | ||
240 | |||
241 | end = find_end_of_node(start, limit, size); | ||
242 | /* | ||
243 | * If there won't be at least FAKE_NODE_MIN_SIZE of | ||
244 | * non-reserved memory in ZONE_DMA32 for the next node, | ||
245 | * this one must extend to the boundary. | ||
246 | */ | ||
247 | if (end < dma32_end && dma32_end - end - | ||
248 | memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) | ||
249 | end = dma32_end; | ||
250 | |||
251 | /* | ||
252 | * If there won't be enough non-reserved memory for the | ||
253 | * next node, this one must extend to the end of the | ||
254 | * physical node. | ||
255 | */ | ||
256 | if (limit - end - | ||
257 | memblock_x86_hole_size(end, limit) < size) | ||
258 | end = limit; | ||
259 | |||
260 | ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES, | ||
261 | phys_blk, | ||
262 | min(end, limit) - start); | ||
263 | if (ret < 0) | ||
264 | return ret; | ||
265 | } | ||
266 | } | ||
267 | return 0; | ||
268 | } | ||
269 | |||
270 | /** | ||
271 | * numa_emulation - Emulate NUMA nodes | ||
272 | * @numa_meminfo: NUMA configuration to massage | ||
273 | * @numa_dist_cnt: The size of the physical NUMA distance table | ||
274 | * | ||
275 | * Emulate NUMA nodes according to the numa=fake kernel parameter. | ||
276 | * @numa_meminfo contains the physical memory configuration and is modified | ||
277 | * to reflect the emulated configuration on success. @numa_dist_cnt is | ||
278 | * used to determine the size of the physical distance table. | ||
279 | * | ||
280 | * On success, the following modifications are made. | ||
281 | * | ||
282 | * - @numa_meminfo is updated to reflect the emulated nodes. | ||
283 | * | ||
284 | * - __apicid_to_node[] is updated such that APIC IDs are mapped to the | ||
285 | * emulated nodes. | ||
286 | * | ||
287 | * - NUMA distance table is rebuilt to represent distances between emulated | ||
288 | * nodes. The distances are determined considering how emulated nodes | ||
289 | * are mapped to physical nodes and match the actual distances. | ||
290 | * | ||
291 | * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical | ||
292 | * nodes. This is used by numa_add_cpu() and numa_remove_cpu(). | ||
293 | * | ||
294 | * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with | ||
295 | * identity mapping and no other modification is made. | ||
296 | */ | ||
297 | void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) | ||
298 | { | ||
299 | static struct numa_meminfo ei __initdata; | ||
300 | static struct numa_meminfo pi __initdata; | ||
301 | const u64 max_addr = max_pfn << PAGE_SHIFT; | ||
302 | u8 *phys_dist = NULL; | ||
303 | size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); | ||
304 | int max_emu_nid, dfl_phys_nid; | ||
305 | int i, j, ret; | ||
306 | |||
307 | if (!emu_cmdline) | ||
308 | goto no_emu; | ||
309 | |||
310 | memset(&ei, 0, sizeof(ei)); | ||
311 | pi = *numa_meminfo; | ||
312 | |||
313 | for (i = 0; i < MAX_NUMNODES; i++) | ||
314 | emu_nid_to_phys[i] = NUMA_NO_NODE; | ||
315 | |||
316 | /* | ||
317 | * If the numa=fake command-line contains a 'M' or 'G', it represents | ||
318 | * the fixed node size. Otherwise, if it is just a single number N, | ||
319 | * split the system RAM into N fake nodes. | ||
320 | */ | ||
321 | if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) { | ||
322 | u64 size; | ||
323 | |||
324 | size = memparse(emu_cmdline, &emu_cmdline); | ||
325 | ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size); | ||
326 | } else { | ||
327 | unsigned long n; | ||
328 | |||
329 | n = simple_strtoul(emu_cmdline, NULL, 0); | ||
330 | ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n); | ||
331 | } | ||
332 | |||
333 | if (ret < 0) | ||
334 | goto no_emu; | ||
335 | |||
336 | if (numa_cleanup_meminfo(&ei) < 0) { | ||
337 | pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n"); | ||
338 | goto no_emu; | ||
339 | } | ||
340 | |||
341 | /* copy the physical distance table */ | ||
342 | if (numa_dist_cnt) { | ||
343 | u64 phys; | ||
344 | |||
345 | phys = memblock_find_in_range(0, | ||
346 | (u64)max_pfn_mapped << PAGE_SHIFT, | ||
347 | phys_size, PAGE_SIZE); | ||
348 | if (phys == MEMBLOCK_ERROR) { | ||
349 | pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); | ||
350 | goto no_emu; | ||
351 | } | ||
352 | memblock_x86_reserve_range(phys, phys + phys_size, "TMP NUMA DIST"); | ||
353 | phys_dist = __va(phys); | ||
354 | |||
355 | for (i = 0; i < numa_dist_cnt; i++) | ||
356 | for (j = 0; j < numa_dist_cnt; j++) | ||
357 | phys_dist[i * numa_dist_cnt + j] = | ||
358 | node_distance(i, j); | ||
359 | } | ||
360 | |||
361 | /* | ||
362 | * Determine the max emulated nid and the default phys nid to use | ||
363 | * for unmapped nodes. | ||
364 | */ | ||
365 | max_emu_nid = 0; | ||
366 | dfl_phys_nid = NUMA_NO_NODE; | ||
367 | for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { | ||
368 | if (emu_nid_to_phys[i] != NUMA_NO_NODE) { | ||
369 | max_emu_nid = i; | ||
370 | if (dfl_phys_nid == NUMA_NO_NODE) | ||
371 | dfl_phys_nid = emu_nid_to_phys[i]; | ||
372 | } | ||
373 | } | ||
374 | if (dfl_phys_nid == NUMA_NO_NODE) { | ||
375 | pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n"); | ||
376 | goto no_emu; | ||
377 | } | ||
378 | |||
379 | /* commit */ | ||
380 | *numa_meminfo = ei; | ||
381 | |||
382 | /* | ||
383 | * Transform __apicid_to_node table to use emulated nids by | ||
384 | * reverse-mapping phys_nid. The maps should always exist but fall | ||
385 | * back to zero just in case. | ||
386 | */ | ||
387 | for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) { | ||
388 | if (__apicid_to_node[i] == NUMA_NO_NODE) | ||
389 | continue; | ||
390 | for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++) | ||
391 | if (__apicid_to_node[i] == emu_nid_to_phys[j]) | ||
392 | break; | ||
393 | __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0; | ||
394 | } | ||
395 | |||
396 | /* make sure all emulated nodes are mapped to a physical node */ | ||
397 | for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) | ||
398 | if (emu_nid_to_phys[i] == NUMA_NO_NODE) | ||
399 | emu_nid_to_phys[i] = dfl_phys_nid; | ||
400 | |||
401 | /* transform distance table */ | ||
402 | numa_reset_distance(); | ||
403 | for (i = 0; i < max_emu_nid + 1; i++) { | ||
404 | for (j = 0; j < max_emu_nid + 1; j++) { | ||
405 | int physi = emu_nid_to_phys[i]; | ||
406 | int physj = emu_nid_to_phys[j]; | ||
407 | int dist; | ||
408 | |||
409 | if (physi >= numa_dist_cnt || physj >= numa_dist_cnt) | ||
410 | dist = physi == physj ? | ||
411 | LOCAL_DISTANCE : REMOTE_DISTANCE; | ||
412 | else | ||
413 | dist = phys_dist[physi * numa_dist_cnt + physj]; | ||
414 | |||
415 | numa_set_distance(i, j, dist); | ||
416 | } | ||
417 | } | ||
418 | |||
419 | /* free the copied physical distance table */ | ||
420 | if (phys_dist) | ||
421 | memblock_x86_free_range(__pa(phys_dist), __pa(phys_dist) + phys_size); | ||
422 | return; | ||
423 | |||
424 | no_emu: | ||
425 | /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */ | ||
426 | for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) | ||
427 | emu_nid_to_phys[i] = i; | ||
428 | } | ||
429 | |||
430 | #ifndef CONFIG_DEBUG_PER_CPU_MAPS | ||
431 | void __cpuinit numa_add_cpu(int cpu) | ||
432 | { | ||
433 | int physnid, nid; | ||
434 | |||
435 | nid = early_cpu_to_node(cpu); | ||
436 | BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); | ||
437 | |||
438 | physnid = emu_nid_to_phys[nid]; | ||
439 | |||
440 | /* | ||
441 | * Map the cpu to each emulated node that is allocated on the physical | ||
442 | * node of the cpu's apic id. | ||
443 | */ | ||
444 | for_each_online_node(nid) | ||
445 | if (emu_nid_to_phys[nid] == physnid) | ||
446 | cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); | ||
447 | } | ||
448 | |||
449 | void __cpuinit numa_remove_cpu(int cpu) | ||
450 | { | ||
451 | int i; | ||
452 | |||
453 | for_each_online_node(i) | ||
454 | cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); | ||
455 | } | ||
456 | #else /* !CONFIG_DEBUG_PER_CPU_MAPS */ | ||
457 | static void __cpuinit numa_set_cpumask(int cpu, int enable) | ||
458 | { | ||
459 | struct cpumask *mask; | ||
460 | int nid, physnid, i; | ||
461 | |||
462 | nid = early_cpu_to_node(cpu); | ||
463 | if (nid == NUMA_NO_NODE) { | ||
464 | /* early_cpu_to_node() already emits a warning and trace */ | ||
465 | return; | ||
466 | } | ||
467 | |||
468 | physnid = emu_nid_to_phys[nid]; | ||
469 | |||
470 | for_each_online_node(i) { | ||
471 | if (emu_nid_to_phys[nid] != physnid) | ||
472 | continue; | ||
473 | |||
474 | mask = debug_cpumask_set_cpu(cpu, enable); | ||
475 | if (!mask) | ||
476 | return; | ||
477 | |||
478 | if (enable) | ||
479 | cpumask_set_cpu(cpu, mask); | ||
480 | else | ||
481 | cpumask_clear_cpu(cpu, mask); | ||
482 | } | ||
483 | } | ||
484 | |||
485 | void __cpuinit numa_add_cpu(int cpu) | ||
486 | { | ||
487 | numa_set_cpumask(cpu, 1); | ||
488 | } | ||
489 | |||
490 | void __cpuinit numa_remove_cpu(int cpu) | ||
491 | { | ||
492 | numa_set_cpumask(cpu, 0); | ||
493 | } | ||
494 | #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ | ||
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h new file mode 100644 index 000000000000..ef2d97377d7c --- /dev/null +++ b/arch/x86/mm/numa_internal.h | |||
@@ -0,0 +1,31 @@ | |||
1 | #ifndef __X86_MM_NUMA_INTERNAL_H | ||
2 | #define __X86_MM_NUMA_INTERNAL_H | ||
3 | |||
4 | #include <linux/types.h> | ||
5 | #include <asm/numa.h> | ||
6 | |||
/*
 * One memory block: a start/end pair of 64-bit values plus the node id
 * (nid) attached to it.
 * NOTE(review): whether 'end' is inclusive or exclusive is not visible
 * here -- confirm against the users of this struct.
 */
7 | struct numa_memblk { | ||
8 | u64 start; | ||
9 | u64 end; | ||
10 | int nid; | ||
11 | }; | ||
12 | |||
/*
 * A fixed-capacity table of memory blocks: blk[] has room for
 * NR_NODE_MEMBLKS entries.
 * NOTE(review): nr_blks is presumably the number of blk[] entries in
 * use -- confirm against the code that fills the table.
 */
13 | struct numa_meminfo { | ||
14 | int nr_blks; | ||
15 | struct numa_memblk blk[NR_NODE_MEMBLKS]; | ||
16 | }; | ||
17 | |||
/*
 * Declarations only; the definitions are not in this header.  All three
 * are marked __init.
 * NOTE(review): going by the names, these remove blk[idx] from @mi, tidy
 * up @mi (int result), and reset the node distance data -- confirm
 * against the definitions.
 */
18 | void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi); | ||
19 | int __init numa_cleanup_meminfo(struct numa_meminfo *mi); | ||
20 | void __init numa_reset_distance(void); | ||
21 | |||
/*
 * numa_emulation() is a real __init function only when CONFIG_NUMA_EMU is
 * set.  Otherwise it is an empty static inline stub with the same
 * parameters, so callers can invoke it without their own #ifdef.
 */
22 | #ifdef CONFIG_NUMA_EMU | ||
23 | void __init numa_emulation(struct numa_meminfo *numa_meminfo, | ||
24 | int numa_dist_cnt); | ||
25 | #else | ||
26 | static inline void numa_emulation(struct numa_meminfo *numa_meminfo, | ||
27 | int numa_dist_cnt) | ||
28 | { } | ||
29 | #endif | ||
30 | |||
31 | #endif /* __X86_MM_NUMA_INTERNAL_H */ | ||
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c index ae96e7b8051d..48651c6f657d 100644 --- a/arch/x86/mm/srat_32.c +++ b/arch/x86/mm/srat_32.c | |||
@@ -57,7 +57,7 @@ struct node_memory_chunk_s { | |||
57 | static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS]; | 57 | static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS]; |
58 | 58 | ||
59 | static int __initdata num_memory_chunks; /* total number of memory chunks */ | 59 | static int __initdata num_memory_chunks; /* total number of memory chunks */ |
60 | static u8 __initdata apicid_to_pxm[MAX_APICID]; | 60 | static u8 __initdata apicid_to_pxm[MAX_LOCAL_APIC]; |
61 | 61 | ||
62 | int acpi_numa __initdata; | 62 | int acpi_numa __initdata; |
63 | 63 | ||
@@ -254,8 +254,8 @@ int __init get_memcfg_from_srat(void) | |||
254 | printk(KERN_DEBUG "Number of memory chunks in system = %d\n", | 254 | printk(KERN_DEBUG "Number of memory chunks in system = %d\n", |
255 | num_memory_chunks); | 255 | num_memory_chunks); |
256 | 256 | ||
257 | for (i = 0; i < MAX_APICID; i++) | 257 | for (i = 0; i < MAX_LOCAL_APIC; i++) |
258 | apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]); | 258 | set_apicid_to_node(i, pxm_to_node(apicid_to_pxm[i])); |
259 | 259 | ||
260 | for (j = 0; j < num_memory_chunks; j++){ | 260 | for (j = 0; j < num_memory_chunks; j++){ |
261 | struct node_memory_chunk_s * chunk = &node_memory_chunk[j]; | 261 | struct node_memory_chunk_s * chunk = &node_memory_chunk[j]; |
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 603d285d1daa..8e9d3394f6d4 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c | |||
@@ -26,88 +26,34 @@ | |||
26 | 26 | ||
27 | int acpi_numa __initdata; | 27 | int acpi_numa __initdata; |
28 | 28 | ||
29 | static struct acpi_table_slit *acpi_slit; | ||
30 | |||
31 | static nodemask_t nodes_parsed __initdata; | ||
32 | static nodemask_t cpu_nodes_parsed __initdata; | ||
33 | static struct bootnode nodes[MAX_NUMNODES] __initdata; | ||
34 | static struct bootnode nodes_add[MAX_NUMNODES]; | 29 | static struct bootnode nodes_add[MAX_NUMNODES]; |
35 | 30 | ||
36 | static int num_node_memblks __initdata; | ||
37 | static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata; | ||
38 | static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata; | ||
39 | |||
40 | static __init int setup_node(int pxm) | 31 | static __init int setup_node(int pxm) |
41 | { | 32 | { |
42 | return acpi_map_pxm_to_node(pxm); | 33 | return acpi_map_pxm_to_node(pxm); |
43 | } | 34 | } |
44 | 35 | ||
45 | static __init int conflicting_memblks(unsigned long start, unsigned long end) | ||
46 | { | ||
47 | int i; | ||
48 | for (i = 0; i < num_node_memblks; i++) { | ||
49 | struct bootnode *nd = &node_memblk_range[i]; | ||
50 | if (nd->start == nd->end) | ||
51 | continue; | ||
52 | if (nd->end > start && nd->start < end) | ||
53 | return memblk_nodeid[i]; | ||
54 | if (nd->end == end && nd->start == start) | ||
55 | return memblk_nodeid[i]; | ||
56 | } | ||
57 | return -1; | ||
58 | } | ||
59 | |||
60 | static __init void cutoff_node(int i, unsigned long start, unsigned long end) | ||
61 | { | ||
62 | struct bootnode *nd = &nodes[i]; | ||
63 | |||
64 | if (nd->start < start) { | ||
65 | nd->start = start; | ||
66 | if (nd->end < nd->start) | ||
67 | nd->start = nd->end; | ||
68 | } | ||
69 | if (nd->end > end) { | ||
70 | nd->end = end; | ||
71 | if (nd->start > nd->end) | ||
72 | nd->start = nd->end; | ||
73 | } | ||
74 | } | ||
75 | |||
76 | static __init void bad_srat(void) | 36 | static __init void bad_srat(void) |
77 | { | 37 | { |
78 | int i; | ||
79 | printk(KERN_ERR "SRAT: SRAT not used.\n"); | 38 | printk(KERN_ERR "SRAT: SRAT not used.\n"); |
80 | acpi_numa = -1; | 39 | acpi_numa = -1; |
81 | for (i = 0; i < MAX_LOCAL_APIC; i++) | 40 | memset(nodes_add, 0, sizeof(nodes_add)); |
82 | apicid_to_node[i] = NUMA_NO_NODE; | ||
83 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
84 | nodes[i].start = nodes[i].end = 0; | ||
85 | nodes_add[i].start = nodes_add[i].end = 0; | ||
86 | } | ||
87 | remove_all_active_ranges(); | ||
88 | } | 41 | } |
89 | 42 | ||
90 | static __init inline int srat_disabled(void) | 43 | static __init inline int srat_disabled(void) |
91 | { | 44 | { |
92 | return numa_off || acpi_numa < 0; | 45 | return acpi_numa < 0; |
93 | } | 46 | } |
94 | 47 | ||
95 | /* Callback for SLIT parsing */ | 48 | /* Callback for SLIT parsing */ |
96 | void __init acpi_numa_slit_init(struct acpi_table_slit *slit) | 49 | void __init acpi_numa_slit_init(struct acpi_table_slit *slit) |
97 | { | 50 | { |
98 | unsigned length; | 51 | int i, j; |
99 | unsigned long phys; | ||
100 | |||
101 | length = slit->header.length; | ||
102 | phys = memblock_find_in_range(0, max_pfn_mapped<<PAGE_SHIFT, length, | ||
103 | PAGE_SIZE); | ||
104 | |||
105 | if (phys == MEMBLOCK_ERROR) | ||
106 | panic(" Can not save slit!\n"); | ||
107 | 52 | ||
108 | acpi_slit = __va(phys); | 53 | for (i = 0; i < slit->locality_count; i++) |
109 | memcpy(acpi_slit, slit, length); | 54 | for (j = 0; j < slit->locality_count; j++) |
110 | memblock_x86_reserve_range(phys, phys + length, "ACPI SLIT"); | 55 | numa_set_distance(pxm_to_node(i), pxm_to_node(j), |
56 | slit->entry[slit->locality_count * i + j]); | ||
111 | } | 57 | } |
112 | 58 | ||
113 | /* Callback for Proximity Domain -> x2APIC mapping */ | 59 | /* Callback for Proximity Domain -> x2APIC mapping */ |
@@ -138,8 +84,8 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) | |||
138 | printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); | 84 | printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); |
139 | return; | 85 | return; |
140 | } | 86 | } |
141 | apicid_to_node[apic_id] = node; | 87 | set_apicid_to_node(apic_id, node); |
142 | node_set(node, cpu_nodes_parsed); | 88 | node_set(node, numa_nodes_parsed); |
143 | acpi_numa = 1; | 89 | acpi_numa = 1; |
144 | printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", | 90 | printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", |
145 | pxm, apic_id, node); | 91 | pxm, apic_id, node); |
@@ -178,8 +124,8 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) | |||
178 | return; | 124 | return; |
179 | } | 125 | } |
180 | 126 | ||
181 | apicid_to_node[apic_id] = node; | 127 | set_apicid_to_node(apic_id, node); |
182 | node_set(node, cpu_nodes_parsed); | 128 | node_set(node, numa_nodes_parsed); |
183 | acpi_numa = 1; | 129 | acpi_numa = 1; |
184 | printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", | 130 | printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", |
185 | pxm, apic_id, node); | 131 | pxm, apic_id, node); |
@@ -241,7 +187,7 @@ update_nodes_add(int node, unsigned long start, unsigned long end) | |||
241 | } | 187 | } |
242 | 188 | ||
243 | if (changed) { | 189 | if (changed) { |
244 | node_set(node, cpu_nodes_parsed); | 190 | node_set(node, numa_nodes_parsed); |
245 | printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", | 191 | printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", |
246 | nd->start, nd->end); | 192 | nd->start, nd->end); |
247 | } | 193 | } |
@@ -251,10 +197,8 @@ update_nodes_add(int node, unsigned long start, unsigned long end) | |||
251 | void __init | 197 | void __init |
252 | acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) | 198 | acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) |
253 | { | 199 | { |
254 | struct bootnode *nd, oldnode; | ||
255 | unsigned long start, end; | 200 | unsigned long start, end; |
256 | int node, pxm; | 201 | int node, pxm; |
257 | int i; | ||
258 | 202 | ||
259 | if (srat_disabled()) | 203 | if (srat_disabled()) |
260 | return; | 204 | return; |
@@ -276,300 +220,31 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) | |||
276 | bad_srat(); | 220 | bad_srat(); |
277 | return; | 221 | return; |
278 | } | 222 | } |
279 | i = conflicting_memblks(start, end); | 223 | |
280 | if (i == node) { | 224 | if (numa_add_memblk(node, start, end) < 0) { |
281 | printk(KERN_WARNING | ||
282 | "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n", | ||
283 | pxm, start, end, nodes[i].start, nodes[i].end); | ||
284 | } else if (i >= 0) { | ||
285 | printk(KERN_ERR | ||
286 | "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n", | ||
287 | pxm, start, end, node_to_pxm(i), | ||
288 | nodes[i].start, nodes[i].end); | ||
289 | bad_srat(); | 225 | bad_srat(); |
290 | return; | 226 | return; |
291 | } | 227 | } |
292 | nd = &nodes[node]; | ||
293 | oldnode = *nd; | ||
294 | if (!node_test_and_set(node, nodes_parsed)) { | ||
295 | nd->start = start; | ||
296 | nd->end = end; | ||
297 | } else { | ||
298 | if (start < nd->start) | ||
299 | nd->start = start; | ||
300 | if (nd->end < end) | ||
301 | nd->end = end; | ||
302 | } | ||
303 | 228 | ||
304 | printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, | 229 | printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, |
305 | start, end); | 230 | start, end); |
306 | 231 | ||
307 | if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) { | 232 | if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) |
308 | update_nodes_add(node, start, end); | 233 | update_nodes_add(node, start, end); |
309 | /* restore nodes[node] */ | ||
310 | *nd = oldnode; | ||
311 | if ((nd->start | nd->end) == 0) | ||
312 | node_clear(node, nodes_parsed); | ||
313 | } | ||
314 | |||
315 | node_memblk_range[num_node_memblks].start = start; | ||
316 | node_memblk_range[num_node_memblks].end = end; | ||
317 | memblk_nodeid[num_node_memblks] = node; | ||
318 | num_node_memblks++; | ||
319 | } | ||
320 | |||
321 | /* Sanity check to catch more bad SRATs (they are amazingly common). | ||
322 | Make sure the PXMs cover all memory. */ | ||
323 | static int __init nodes_cover_memory(const struct bootnode *nodes) | ||
324 | { | ||
325 | int i; | ||
326 | unsigned long pxmram, e820ram; | ||
327 | |||
328 | pxmram = 0; | ||
329 | for_each_node_mask(i, nodes_parsed) { | ||
330 | unsigned long s = nodes[i].start >> PAGE_SHIFT; | ||
331 | unsigned long e = nodes[i].end >> PAGE_SHIFT; | ||
332 | pxmram += e - s; | ||
333 | pxmram -= __absent_pages_in_range(i, s, e); | ||
334 | if ((long)pxmram < 0) | ||
335 | pxmram = 0; | ||
336 | } | ||
337 | |||
338 | e820ram = max_pfn - (memblock_x86_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT); | ||
339 | /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ | ||
340 | if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) { | ||
341 | printk(KERN_ERR | ||
342 | "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n", | ||
343 | (pxmram << PAGE_SHIFT) >> 20, | ||
344 | (e820ram << PAGE_SHIFT) >> 20); | ||
345 | return 0; | ||
346 | } | ||
347 | return 1; | ||
348 | } | 234 | } |
349 | 235 | ||
350 | void __init acpi_numa_arch_fixup(void) {} | 236 | void __init acpi_numa_arch_fixup(void) {} |
351 | 237 | ||
352 | #ifdef CONFIG_NUMA_EMU | 238 | int __init x86_acpi_numa_init(void) |
353 | void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start, | ||
354 | unsigned long end) | ||
355 | { | ||
356 | int i; | ||
357 | |||
358 | for_each_node_mask(i, nodes_parsed) { | ||
359 | cutoff_node(i, start, end); | ||
360 | physnodes[i].start = nodes[i].start; | ||
361 | physnodes[i].end = nodes[i].end; | ||
362 | } | ||
363 | } | ||
364 | #endif /* CONFIG_NUMA_EMU */ | ||
365 | |||
366 | /* Use the information discovered above to actually set up the nodes. */ | ||
367 | int __init acpi_scan_nodes(unsigned long start, unsigned long end) | ||
368 | { | 239 | { |
369 | int i; | 240 | int ret; |
370 | |||
371 | if (acpi_numa <= 0) | ||
372 | return -1; | ||
373 | |||
374 | /* First clean up the node list */ | ||
375 | for (i = 0; i < MAX_NUMNODES; i++) | ||
376 | cutoff_node(i, start, end); | ||
377 | |||
378 | /* | ||
379 | * Join together blocks on the same node, holes between | ||
380 | * which don't overlap with memory on other nodes. | ||
381 | */ | ||
382 | for (i = 0; i < num_node_memblks; ++i) { | ||
383 | int j, k; | ||
384 | |||
385 | for (j = i + 1; j < num_node_memblks; ++j) { | ||
386 | unsigned long start, end; | ||
387 | |||
388 | if (memblk_nodeid[i] != memblk_nodeid[j]) | ||
389 | continue; | ||
390 | start = min(node_memblk_range[i].end, | ||
391 | node_memblk_range[j].end); | ||
392 | end = max(node_memblk_range[i].start, | ||
393 | node_memblk_range[j].start); | ||
394 | for (k = 0; k < num_node_memblks; ++k) { | ||
395 | if (memblk_nodeid[i] == memblk_nodeid[k]) | ||
396 | continue; | ||
397 | if (start < node_memblk_range[k].end && | ||
398 | end > node_memblk_range[k].start) | ||
399 | break; | ||
400 | } | ||
401 | if (k < num_node_memblks) | ||
402 | continue; | ||
403 | start = min(node_memblk_range[i].start, | ||
404 | node_memblk_range[j].start); | ||
405 | end = max(node_memblk_range[i].end, | ||
406 | node_memblk_range[j].end); | ||
407 | printk(KERN_INFO "SRAT: Node %d " | ||
408 | "[%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n", | ||
409 | memblk_nodeid[i], | ||
410 | node_memblk_range[i].start, | ||
411 | node_memblk_range[i].end, | ||
412 | node_memblk_range[j].start, | ||
413 | node_memblk_range[j].end, | ||
414 | start, end); | ||
415 | node_memblk_range[i].start = start; | ||
416 | node_memblk_range[i].end = end; | ||
417 | k = --num_node_memblks - j; | ||
418 | memmove(memblk_nodeid + j, memblk_nodeid + j+1, | ||
419 | k * sizeof(*memblk_nodeid)); | ||
420 | memmove(node_memblk_range + j, node_memblk_range + j+1, | ||
421 | k * sizeof(*node_memblk_range)); | ||
422 | --j; | ||
423 | } | ||
424 | } | ||
425 | |||
426 | memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks, | ||
427 | memblk_nodeid); | ||
428 | if (memnode_shift < 0) { | ||
429 | printk(KERN_ERR | ||
430 | "SRAT: No NUMA node hash function found. Contact maintainer\n"); | ||
431 | bad_srat(); | ||
432 | return -1; | ||
433 | } | ||
434 | |||
435 | for (i = 0; i < num_node_memblks; i++) | ||
436 | memblock_x86_register_active_regions(memblk_nodeid[i], | ||
437 | node_memblk_range[i].start >> PAGE_SHIFT, | ||
438 | node_memblk_range[i].end >> PAGE_SHIFT); | ||
439 | |||
440 | /* for out of order entries in SRAT */ | ||
441 | sort_node_map(); | ||
442 | if (!nodes_cover_memory(nodes)) { | ||
443 | bad_srat(); | ||
444 | return -1; | ||
445 | } | ||
446 | 241 | ||
447 | /* Account for nodes with cpus and no memory */ | 242 | ret = acpi_numa_init(); |
448 | nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed); | 243 | if (ret < 0) |
449 | 244 | return ret; | |
450 | /* Finally register nodes */ | 245 | return srat_disabled() ? -EINVAL : 0; |
451 | for_each_node_mask(i, node_possible_map) | ||
452 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); | ||
453 | /* Try again in case setup_node_bootmem missed one due | ||
454 | to missing bootmem */ | ||
455 | for_each_node_mask(i, node_possible_map) | ||
456 | if (!node_online(i)) | ||
457 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); | ||
458 | |||
459 | for (i = 0; i < nr_cpu_ids; i++) { | ||
460 | int node = early_cpu_to_node(i); | ||
461 | |||
462 | if (node == NUMA_NO_NODE) | ||
463 | continue; | ||
464 | if (!node_online(node)) | ||
465 | numa_clear_node(i); | ||
466 | } | ||
467 | numa_init_array(); | ||
468 | return 0; | ||
469 | } | ||
470 | |||
471 | #ifdef CONFIG_NUMA_EMU | ||
472 | static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = { | ||
473 | [0 ... MAX_NUMNODES-1] = PXM_INVAL | ||
474 | }; | ||
475 | static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = { | ||
476 | [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE | ||
477 | }; | ||
478 | static int __init find_node_by_addr(unsigned long addr) | ||
479 | { | ||
480 | int ret = NUMA_NO_NODE; | ||
481 | int i; | ||
482 | |||
483 | for_each_node_mask(i, nodes_parsed) { | ||
484 | /* | ||
485 | * Find the real node that this emulated node appears on. For | ||
486 | * the sake of simplicity, we only use a real node's starting | ||
487 | * address to determine which emulated node it appears on. | ||
488 | */ | ||
489 | if (addr >= nodes[i].start && addr < nodes[i].end) { | ||
490 | ret = i; | ||
491 | break; | ||
492 | } | ||
493 | } | ||
494 | return ret; | ||
495 | } | 246 | } |
496 | 247 | ||
497 | /* | ||
498 | * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID | ||
499 | * mappings that respect the real ACPI topology but reflect our emulated | ||
500 | * environment. For each emulated node, we find which real node it appears on | ||
501 | * and create PXM to NID mappings for those fake nodes which mirror that | ||
502 | * locality. SLIT will now represent the correct distances between emulated | ||
503 | * nodes as a result of the real topology. | ||
504 | */ | ||
505 | void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) | ||
506 | { | ||
507 | int i, j; | ||
508 | |||
509 | for (i = 0; i < num_nodes; i++) { | ||
510 | int nid, pxm; | ||
511 | |||
512 | nid = find_node_by_addr(fake_nodes[i].start); | ||
513 | if (nid == NUMA_NO_NODE) | ||
514 | continue; | ||
515 | pxm = node_to_pxm(nid); | ||
516 | if (pxm == PXM_INVAL) | ||
517 | continue; | ||
518 | fake_node_to_pxm_map[i] = pxm; | ||
519 | /* | ||
520 | * For each apicid_to_node mapping that exists for this real | ||
521 | * node, it must now point to the fake node ID. | ||
522 | */ | ||
523 | for (j = 0; j < MAX_LOCAL_APIC; j++) | ||
524 | if (apicid_to_node[j] == nid && | ||
525 | fake_apicid_to_node[j] == NUMA_NO_NODE) | ||
526 | fake_apicid_to_node[j] = i; | ||
527 | } | ||
528 | |||
529 | /* | ||
530 | * If there are apicid-to-node mappings for physical nodes that do not | ||
531 | * have a corresponding emulated node, it should default to a guaranteed | ||
532 | * value. | ||
533 | */ | ||
534 | for (i = 0; i < MAX_LOCAL_APIC; i++) | ||
535 | if (apicid_to_node[i] != NUMA_NO_NODE && | ||
536 | fake_apicid_to_node[i] == NUMA_NO_NODE) | ||
537 | fake_apicid_to_node[i] = 0; | ||
538 | |||
539 | for (i = 0; i < num_nodes; i++) | ||
540 | __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i); | ||
541 | memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node)); | ||
542 | |||
543 | nodes_clear(nodes_parsed); | ||
544 | for (i = 0; i < num_nodes; i++) | ||
545 | if (fake_nodes[i].start != fake_nodes[i].end) | ||
546 | node_set(i, nodes_parsed); | ||
547 | } | ||
548 | |||
549 | static int null_slit_node_compare(int a, int b) | ||
550 | { | ||
551 | return node_to_pxm(a) == node_to_pxm(b); | ||
552 | } | ||
553 | #else | ||
554 | static int null_slit_node_compare(int a, int b) | ||
555 | { | ||
556 | return a == b; | ||
557 | } | ||
558 | #endif /* CONFIG_NUMA_EMU */ | ||
559 | |||
560 | int __node_distance(int a, int b) | ||
561 | { | ||
562 | int index; | ||
563 | |||
564 | if (!acpi_slit) | ||
565 | return null_slit_node_compare(a, b) ? LOCAL_DISTANCE : | ||
566 | REMOTE_DISTANCE; | ||
567 | index = acpi_slit->locality_count * node_to_pxm(a); | ||
568 | return acpi_slit->entry[index + node_to_pxm(b)]; | ||
569 | } | ||
570 | |||
571 | EXPORT_SYMBOL(__node_distance); | ||
572 | |||
573 | #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY) | 248 | #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY) |
574 | int memory_add_physaddr_to_nid(u64 start) | 249 | int memory_add_physaddr_to_nid(u64 start) |
575 | { | 250 | { |
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 6acc724d5d8f..d6c0418c3e47 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c | |||
@@ -179,12 +179,8 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, | |||
179 | sender = this_cpu_read(tlb_vector_offset); | 179 | sender = this_cpu_read(tlb_vector_offset); |
180 | f = &flush_state[sender]; | 180 | f = &flush_state[sender]; |
181 | 181 | ||
182 | /* | 182 | if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS) |
183 | * Could avoid this lock when | 183 | raw_spin_lock(&f->tlbstate_lock); |
184 | * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is | ||
185 | * probably not worth checking this for a cache-hot lock. | ||
186 | */ | ||
187 | raw_spin_lock(&f->tlbstate_lock); | ||
188 | 184 | ||
189 | f->flush_mm = mm; | 185 | f->flush_mm = mm; |
190 | f->flush_va = va; | 186 | f->flush_va = va; |
@@ -202,7 +198,8 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, | |||
202 | 198 | ||
203 | f->flush_mm = NULL; | 199 | f->flush_mm = NULL; |
204 | f->flush_va = 0; | 200 | f->flush_va = 0; |
205 | raw_spin_unlock(&f->tlbstate_lock); | 201 | if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS) |
202 | raw_spin_unlock(&f->tlbstate_lock); | ||
206 | } | 203 | } |
207 | 204 | ||
208 | void native_flush_tlb_others(const struct cpumask *cpumask, | 205 | void native_flush_tlb_others(const struct cpumask *cpumask, |
@@ -211,11 +208,10 @@ void native_flush_tlb_others(const struct cpumask *cpumask, | |||
211 | if (is_uv_system()) { | 208 | if (is_uv_system()) { |
212 | unsigned int cpu; | 209 | unsigned int cpu; |
213 | 210 | ||
214 | cpu = get_cpu(); | 211 | cpu = smp_processor_id(); |
215 | cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu); | 212 | cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu); |
216 | if (cpumask) | 213 | if (cpumask) |
217 | flush_tlb_others_ipi(cpumask, mm, va); | 214 | flush_tlb_others_ipi(cpumask, mm, va); |
218 | put_cpu(); | ||
219 | return; | 215 | return; |
220 | } | 216 | } |
221 | flush_tlb_others_ipi(cpumask, mm, va); | 217 | flush_tlb_others_ipi(cpumask, mm, va); |