diff options
author | Tejun Heo <tj@kernel.org> | 2011-05-02 08:18:53 -0400 |
---|---|---|
committer | Tejun Heo <tj@kernel.org> | 2011-05-02 08:18:53 -0400 |
commit | 5acd91ab837c9d066af7345aea6462dc55695db7 (patch) | |
tree | 73101072880ccdde1de57f69030806729556c30b /arch/x86 | |
parent | b0d310801a4c1f95b44357e4ebc22a9903e3bf3d (diff) |
x86-32, NUMA: Replace srat_32.c with srat.c
The SRAT support implementations in srat_32.c and srat.c are generally
similar; however, there are some differences.
First of all, the 64bit implementation supports more types of SRAT
entries. 64bit supports x2apic, affinity, memory and SLIT. 32bit
only supports processor and memory.
Most other differences stem from different initialization protocols
employed by 64bit and 32bit NUMA init paths.
On 64bit,
* Mappings among PXM, node and apicid are directly done in each SRAT
entry callback.
* Memory affinity information is passed to numa_add_memblk() which
takes care of all interfacing with NUMA init.
* Doesn't directly initialize NUMA configurations. All the
information is recorded in numa_nodes_parsed and memblks.
On 32bit,
* Checks numa_off.
* Things go through one more level of indirection via private tables
but eventually end up initializing the same mappings.
* node_start/end_pfn[] are initialized and
memblock_x86_register_active_regions() is called for each memory
chunk.
* node_set_online() is called for each online node.
* sort_node_map() is called.
There are also other minor differences in sanity checking and messages
but taking the 64bit version should be good enough.
This patch drops the 32bit specific implementation and makes the 64bit
implementation common for both 32 and 64bit.
The init protocol differences are dealt with in two places - the
numa_add_memblk() shim added in the previous patch and new temporary
numa_32.c:get_memcfg_from_srat() which wraps invocation of
x86_acpi_numa_init().
The shim numa_add_memblk() handles the following.
* node_start/end_pfn[] initialization.
* node_set_online() for memory nodes.
* Invocation of memblock_x86_register_active_regions().
The shim get_memcfg_from_srat() handles the following.
* numa_off check.
* node_set_online() for CPU nodes.
* sort_node_map() invocation.
* Clearing of numa_nodes_parsed and active_ranges on failure.
The shims are temporary and will be removed as the generic NUMA init
path in 32bit is replaced with the 64bit one.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Diffstat (limited to 'arch/x86')
-rw-r--r-- | arch/x86/include/asm/mmzone_32.h | 2 | ||||
-rw-r--r-- | arch/x86/include/asm/srat.h | 39 | ||||
-rw-r--r-- | arch/x86/mm/Makefile | 5 | ||||
-rw-r--r-- | arch/x86/mm/numa_32.c | 23 | ||||
-rw-r--r-- | arch/x86/mm/srat_32.c | 281 |
5 files changed, 24 insertions, 326 deletions
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h index 73e5745aef34..5e83a416eca8 100644 --- a/arch/x86/include/asm/mmzone_32.h +++ b/arch/x86/include/asm/mmzone_32.h | |||
@@ -13,8 +13,6 @@ extern struct pglist_data *node_data[]; | |||
13 | #define NODE_DATA(nid) (node_data[nid]) | 13 | #define NODE_DATA(nid) (node_data[nid]) |
14 | 14 | ||
15 | #include <asm/numaq.h> | 15 | #include <asm/numaq.h> |
16 | /* summit or generic arch */ | ||
17 | #include <asm/srat.h> | ||
18 | 16 | ||
19 | extern void resume_map_numa_kva(pgd_t *pgd); | 17 | extern void resume_map_numa_kva(pgd_t *pgd); |
20 | 18 | ||
diff --git a/arch/x86/include/asm/srat.h b/arch/x86/include/asm/srat.h deleted file mode 100644 index b508d639d1a7..000000000000 --- a/arch/x86/include/asm/srat.h +++ /dev/null | |||
@@ -1,39 +0,0 @@ | |||
1 | /* | ||
2 | * Some of the code in this file has been gleaned from the 64 bit | ||
3 | * discontigmem support code base. | ||
4 | * | ||
5 | * Copyright (C) 2002, IBM Corp. | ||
6 | * | ||
7 | * All rights reserved. | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or modify | ||
10 | * it under the terms of the GNU General Public License as published by | ||
11 | * the Free Software Foundation; either version 2 of the License, or | ||
12 | * (at your option) any later version. | ||
13 | * | ||
14 | * This program is distributed in the hope that it will be useful, but | ||
15 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
16 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
17 | * NON INFRINGEMENT. See the GNU General Public License for more | ||
18 | * details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public License | ||
21 | * along with this program; if not, write to the Free Software | ||
22 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | * | ||
24 | * Send feedback to Pat Gaughen <gone@us.ibm.com> | ||
25 | */ | ||
26 | |||
27 | #ifndef _ASM_X86_SRAT_H | ||
28 | #define _ASM_X86_SRAT_H | ||
29 | |||
30 | #ifdef CONFIG_ACPI_NUMA | ||
31 | extern int get_memcfg_from_srat(void); | ||
32 | #else | ||
33 | static inline int get_memcfg_from_srat(void) | ||
34 | { | ||
35 | return 0; | ||
36 | } | ||
37 | #endif | ||
38 | |||
39 | #endif /* _ASM_X86_SRAT_H */ | ||
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 37e7043362a1..62997be33072 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile | |||
@@ -24,10 +24,7 @@ obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o | |||
24 | 24 | ||
25 | obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o | 25 | obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o |
26 | obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o | 26 | obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o |
27 | ifeq ($(CONFIG_ACPI_NUMA),y) | 27 | obj-$(CONFIG_ACPI_NUMA) += srat.o |
28 | obj-$(CONFIG_X86_64) += srat.o | ||
29 | obj-$(CONFIG_X86_32) += srat_32.o | ||
30 | endif | ||
31 | obj-$(CONFIG_NUMA_EMU) += numa_emulation.o | 28 | obj-$(CONFIG_NUMA_EMU) += numa_emulation.o |
32 | 29 | ||
33 | obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o | 30 | obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o |
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index d0369a56f843..8641239a0667 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c | |||
@@ -332,6 +332,29 @@ static __init void init_alloc_remap(int nid) | |||
332 | nid, node_pa, node_pa + size, remap_va, remap_va + size); | 332 | nid, node_pa, node_pa + size, remap_va, remap_va + size); |
333 | } | 333 | } |
334 | 334 | ||
335 | static int get_memcfg_from_srat(void) | ||
336 | { | ||
337 | #ifdef CONFIG_ACPI_NUMA | ||
338 | int nid; | ||
339 | |||
340 | if (numa_off) | ||
341 | return 0; | ||
342 | |||
343 | if (x86_acpi_numa_init() < 0) { | ||
344 | nodes_clear(numa_nodes_parsed); | ||
345 | remove_all_active_ranges(); | ||
346 | return 0; | ||
347 | } | ||
348 | |||
349 | for_each_node_mask(nid, numa_nodes_parsed) | ||
350 | node_set_online(nid); | ||
351 | sort_node_map(); | ||
352 | return 1; | ||
353 | #else | ||
354 | return 0; | ||
355 | #endif | ||
356 | } | ||
357 | |||
335 | static void get_memcfg_numa(void) | 358 | static void get_memcfg_numa(void) |
336 | { | 359 | { |
337 | if (get_memcfg_numaq()) | 360 | if (get_memcfg_numaq()) |
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c deleted file mode 100644 index 6b9bfd78bc35..000000000000 --- a/arch/x86/mm/srat_32.c +++ /dev/null | |||
@@ -1,281 +0,0 @@ | |||
1 | /* | ||
2 | * Some of the code in this file has been gleaned from the 64 bit | ||
3 | * discontigmem support code base. | ||
4 | * | ||
5 | * Copyright (C) 2002, IBM Corp. | ||
6 | * | ||
7 | * All rights reserved. | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or modify | ||
10 | * it under the terms of the GNU General Public License as published by | ||
11 | * the Free Software Foundation; either version 2 of the License, or | ||
12 | * (at your option) any later version. | ||
13 | * | ||
14 | * This program is distributed in the hope that it will be useful, but | ||
15 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
16 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
17 | * NON INFRINGEMENT. See the GNU General Public License for more | ||
18 | * details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public License | ||
21 | * along with this program; if not, write to the Free Software | ||
22 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | * | ||
24 | * Send feedback to Pat Gaughen <gone@us.ibm.com> | ||
25 | */ | ||
26 | #include <linux/mm.h> | ||
27 | #include <linux/bootmem.h> | ||
28 | #include <linux/memblock.h> | ||
29 | #include <linux/mmzone.h> | ||
30 | #include <linux/acpi.h> | ||
31 | #include <linux/nodemask.h> | ||
32 | #include <asm/srat.h> | ||
33 | #include <asm/topology.h> | ||
34 | #include <asm/smp.h> | ||
35 | #include <asm/e820.h> | ||
36 | |||
37 | /* | ||
38 | * proximity macros and definitions | ||
39 | */ | ||
40 | #define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */ | ||
41 | #define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */ | ||
42 | #define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit)) | ||
43 | #define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit))) | ||
44 | /* bitmap length; _PXM is at most 255 */ | ||
45 | #define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) | ||
46 | static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */ | ||
47 | |||
48 | #define MAX_CHUNKS_PER_NODE 3 | ||
49 | #define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES) | ||
50 | struct node_memory_chunk_s { | ||
51 | unsigned long start_pfn; | ||
52 | unsigned long end_pfn; | ||
53 | u8 pxm; // proximity domain of node | ||
54 | u8 nid; // which cnode contains this chunk? | ||
55 | u8 bank; // which mem bank on this node | ||
56 | }; | ||
57 | static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS]; | ||
58 | |||
59 | static int __initdata num_memory_chunks; /* total number of memory chunks */ | ||
60 | static u8 __initdata apicid_to_pxm[MAX_LOCAL_APIC]; | ||
61 | |||
62 | int acpi_numa __initdata; | ||
63 | |||
64 | static __init void bad_srat(void) | ||
65 | { | ||
66 | printk(KERN_ERR "SRAT: SRAT not used.\n"); | ||
67 | acpi_numa = -1; | ||
68 | num_memory_chunks = 0; | ||
69 | } | ||
70 | |||
71 | static __init inline int srat_disabled(void) | ||
72 | { | ||
73 | return numa_off || acpi_numa < 0; | ||
74 | } | ||
75 | |||
76 | /* Identify CPU proximity domains */ | ||
77 | void __init | ||
78 | acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity) | ||
79 | { | ||
80 | if (srat_disabled()) | ||
81 | return; | ||
82 | if (cpu_affinity->header.length != | ||
83 | sizeof(struct acpi_srat_cpu_affinity)) { | ||
84 | bad_srat(); | ||
85 | return; | ||
86 | } | ||
87 | |||
88 | if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0) | ||
89 | return; /* empty entry */ | ||
90 | |||
91 | /* mark this node as "seen" in node bitmap */ | ||
92 | BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo); | ||
93 | |||
94 | /* don't need to check apic_id here, because it is always 8 bits */ | ||
95 | apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo; | ||
96 | |||
97 | printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n", | ||
98 | cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo); | ||
99 | } | ||
100 | |||
101 | /* | ||
102 | * Identify memory proximity domains and hot-remove capabilities. | ||
103 | * Fill node memory chunk list structure. | ||
104 | */ | ||
105 | void __init | ||
106 | acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity) | ||
107 | { | ||
108 | unsigned long long paddr, size; | ||
109 | unsigned long start_pfn, end_pfn; | ||
110 | u8 pxm; | ||
111 | struct node_memory_chunk_s *p, *q, *pend; | ||
112 | |||
113 | if (srat_disabled()) | ||
114 | return; | ||
115 | if (memory_affinity->header.length != | ||
116 | sizeof(struct acpi_srat_mem_affinity)) { | ||
117 | bad_srat(); | ||
118 | return; | ||
119 | } | ||
120 | |||
121 | if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0) | ||
122 | return; /* empty entry */ | ||
123 | |||
124 | pxm = memory_affinity->proximity_domain & 0xff; | ||
125 | |||
126 | /* mark this node as "seen" in node bitmap */ | ||
127 | BMAP_SET(pxm_bitmap, pxm); | ||
128 | |||
129 | /* calculate info for memory chunk structure */ | ||
130 | paddr = memory_affinity->base_address; | ||
131 | size = memory_affinity->length; | ||
132 | |||
133 | start_pfn = paddr >> PAGE_SHIFT; | ||
134 | end_pfn = (paddr + size) >> PAGE_SHIFT; | ||
135 | |||
136 | |||
137 | if (num_memory_chunks >= MAXCHUNKS) { | ||
138 | printk(KERN_WARNING "Too many mem chunks in SRAT." | ||
139 | " Ignoring %lld MBytes at %llx\n", | ||
140 | size/(1024*1024), paddr); | ||
141 | return; | ||
142 | } | ||
143 | |||
144 | /* Insertion sort based on base address */ | ||
145 | pend = &node_memory_chunk[num_memory_chunks]; | ||
146 | for (p = &node_memory_chunk[0]; p < pend; p++) { | ||
147 | if (start_pfn < p->start_pfn) | ||
148 | break; | ||
149 | } | ||
150 | if (p < pend) { | ||
151 | for (q = pend; q >= p; q--) | ||
152 | *(q + 1) = *q; | ||
153 | } | ||
154 | p->start_pfn = start_pfn; | ||
155 | p->end_pfn = end_pfn; | ||
156 | p->pxm = pxm; | ||
157 | |||
158 | num_memory_chunks++; | ||
159 | |||
160 | printk(KERN_DEBUG "Memory range %08lx to %08lx" | ||
161 | " in proximity domain %02x %s\n", | ||
162 | start_pfn, end_pfn, | ||
163 | pxm, | ||
164 | ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ? | ||
165 | "enabled and removable" : "enabled" ) ); | ||
166 | } | ||
167 | |||
168 | /* Callback for SLIT parsing */ | ||
169 | void __init acpi_numa_slit_init(struct acpi_table_slit *slit) | ||
170 | { | ||
171 | } | ||
172 | |||
173 | void acpi_numa_arch_fixup(void) | ||
174 | { | ||
175 | } | ||
176 | /* | ||
177 | * The SRAT table always lists ascending addresses, so can always | ||
178 | * assume that the first "start" address that you see is the real | ||
179 | * start of the node, and that the current "end" address is after | ||
180 | * the previous one. | ||
181 | */ | ||
182 | static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk) | ||
183 | { | ||
184 | /* | ||
185 | * Only add present memory as told by the e820. | ||
186 | * There is no guarantee from the SRAT that the memory it | ||
187 | * enumerates is present at boot time because it represents | ||
188 | * *possible* memory hotplug areas the same as normal RAM. | ||
189 | */ | ||
190 | if (memory_chunk->start_pfn >= max_pfn) { | ||
191 | printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n", | ||
192 | memory_chunk->start_pfn, memory_chunk->end_pfn); | ||
193 | return -1; | ||
194 | } | ||
195 | if (memory_chunk->nid != nid) | ||
196 | return -1; | ||
197 | |||
198 | if (!node_has_online_mem(nid)) | ||
199 | node_start_pfn[nid] = memory_chunk->start_pfn; | ||
200 | |||
201 | if (node_start_pfn[nid] > memory_chunk->start_pfn) | ||
202 | node_start_pfn[nid] = memory_chunk->start_pfn; | ||
203 | |||
204 | if (node_end_pfn[nid] < memory_chunk->end_pfn) | ||
205 | node_end_pfn[nid] = memory_chunk->end_pfn; | ||
206 | |||
207 | return 0; | ||
208 | } | ||
209 | |||
210 | int __init get_memcfg_from_srat(void) | ||
211 | { | ||
212 | int i, j; | ||
213 | |||
214 | if (srat_disabled()) | ||
215 | goto out_fail; | ||
216 | |||
217 | if (acpi_numa_init() < 0) | ||
218 | goto out_fail; | ||
219 | |||
220 | if (num_memory_chunks == 0) { | ||
221 | printk(KERN_DEBUG | ||
222 | "could not find any ACPI SRAT memory areas.\n"); | ||
223 | goto out_fail; | ||
224 | } | ||
225 | |||
226 | /* Calculate total number of nodes in system from PXM bitmap and create | ||
227 | * a set of sequential node IDs starting at zero. (ACPI doesn't seem | ||
228 | * to specify the range of _PXM values.) | ||
229 | */ | ||
230 | /* | ||
231 | * MCD - we no longer HAVE to number nodes sequentially. PXM domain | ||
232 | * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically | ||
233 | * 32, so we will continue numbering them in this manner until MAX_NUMNODES | ||
234 | * approaches MAX_PXM_DOMAINS for i386. | ||
235 | */ | ||
236 | nodes_clear(node_online_map); | ||
237 | for (i = 0; i < MAX_PXM_DOMAINS; i++) { | ||
238 | if (BMAP_TEST(pxm_bitmap, i)) { | ||
239 | int nid = acpi_map_pxm_to_node(i); | ||
240 | node_set_online(nid); | ||
241 | } | ||
242 | } | ||
243 | BUG_ON(num_online_nodes() == 0); | ||
244 | |||
245 | /* set cnode id in memory chunk structure */ | ||
246 | for (i = 0; i < num_memory_chunks; i++) | ||
247 | node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm); | ||
248 | |||
249 | printk(KERN_DEBUG "pxm bitmap: "); | ||
250 | for (i = 0; i < sizeof(pxm_bitmap); i++) { | ||
251 | printk(KERN_CONT "%02x ", pxm_bitmap[i]); | ||
252 | } | ||
253 | printk(KERN_CONT "\n"); | ||
254 | printk(KERN_DEBUG "Number of logical nodes in system = %d\n", | ||
255 | num_online_nodes()); | ||
256 | printk(KERN_DEBUG "Number of memory chunks in system = %d\n", | ||
257 | num_memory_chunks); | ||
258 | |||
259 | for (i = 0; i < MAX_LOCAL_APIC; i++) | ||
260 | set_apicid_to_node(i, pxm_to_node(apicid_to_pxm[i])); | ||
261 | |||
262 | for (j = 0; j < num_memory_chunks; j++){ | ||
263 | struct node_memory_chunk_s * chunk = &node_memory_chunk[j]; | ||
264 | printk(KERN_DEBUG | ||
265 | "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", | ||
266 | j, chunk->nid, chunk->start_pfn, chunk->end_pfn); | ||
267 | if (node_read_chunk(chunk->nid, chunk)) | ||
268 | continue; | ||
269 | |||
270 | memblock_x86_register_active_regions(chunk->nid, chunk->start_pfn, | ||
271 | min(chunk->end_pfn, max_pfn)); | ||
272 | } | ||
273 | /* for out of order entries in SRAT */ | ||
274 | sort_node_map(); | ||
275 | |||
276 | return 1; | ||
277 | out_fail: | ||
278 | printk(KERN_DEBUG "failed to get NUMA memory information from SRAT" | ||
279 | " table\n"); | ||
280 | return 0; | ||
281 | } | ||