diff options
Diffstat (limited to 'arch/x86_64/mm/numa.c')
-rw-r--r-- | arch/x86_64/mm/numa.c | 294 |
1 files changed, 294 insertions, 0 deletions
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c new file mode 100644 index 000000000000..fd9f25d7a6c4 --- /dev/null +++ b/arch/x86_64/mm/numa.c | |||
@@ -0,0 +1,294 @@ | |||
1 | /* | ||
2 | * Generic VM initialization for x86-64 NUMA setups. | ||
3 | * Copyright 2002,2003 Andi Kleen, SuSE Labs. | ||
4 | */ | ||
5 | #include <linux/kernel.h> | ||
6 | #include <linux/mm.h> | ||
7 | #include <linux/string.h> | ||
8 | #include <linux/init.h> | ||
9 | #include <linux/bootmem.h> | ||
10 | #include <linux/mmzone.h> | ||
11 | #include <linux/ctype.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/nodemask.h> | ||
14 | |||
15 | #include <asm/e820.h> | ||
16 | #include <asm/proto.h> | ||
17 | #include <asm/dma.h> | ||
18 | #include <asm/numa.h> | ||
19 | #include <asm/acpi.h> | ||
20 | |||
21 | #ifndef Dprintk | ||
22 | #define Dprintk(x...) | ||
23 | #endif | ||
24 | |||
25 | struct pglist_data *node_data[MAX_NUMNODES]; | ||
26 | bootmem_data_t plat_node_bdata[MAX_NUMNODES]; | ||
27 | |||
28 | int memnode_shift; | ||
29 | u8 memnodemap[NODEMAPSIZE]; | ||
30 | |||
31 | unsigned char cpu_to_node[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE }; | ||
32 | cpumask_t node_to_cpumask[MAX_NUMNODES]; | ||
33 | |||
34 | int numa_off __initdata; | ||
35 | |||
36 | int __init compute_hash_shift(struct node *nodes, int numnodes) | ||
37 | { | ||
38 | int i; | ||
39 | int shift = 24; | ||
40 | u64 addr; | ||
41 | |||
42 | /* When in doubt use brute force. */ | ||
43 | while (shift < 48) { | ||
44 | memset(memnodemap,0xff,sizeof(*memnodemap) * NODEMAPSIZE); | ||
45 | for (i = 0; i < numnodes; i++) { | ||
46 | if (nodes[i].start == nodes[i].end) | ||
47 | continue; | ||
48 | for (addr = nodes[i].start; | ||
49 | addr < nodes[i].end; | ||
50 | addr += (1UL << shift)) { | ||
51 | if (memnodemap[addr >> shift] != 0xff && | ||
52 | memnodemap[addr >> shift] != i) { | ||
53 | printk(KERN_INFO | ||
54 | "node %d shift %d addr %Lx conflict %d\n", | ||
55 | i, shift, addr, memnodemap[addr>>shift]); | ||
56 | goto next; | ||
57 | } | ||
58 | memnodemap[addr >> shift] = i; | ||
59 | } | ||
60 | } | ||
61 | return shift; | ||
62 | next: | ||
63 | shift++; | ||
64 | } | ||
65 | memset(memnodemap,0,sizeof(*memnodemap) * NODEMAPSIZE); | ||
66 | return -1; | ||
67 | } | ||
68 | |||
69 | /* Initialize bootmem allocator for a node */ | ||
70 | void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) | ||
71 | { | ||
72 | unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; | ||
73 | unsigned long nodedata_phys; | ||
74 | const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); | ||
75 | |||
76 | start = round_up(start, ZONE_ALIGN); | ||
77 | |||
78 | printk("Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end); | ||
79 | |||
80 | start_pfn = start >> PAGE_SHIFT; | ||
81 | end_pfn = end >> PAGE_SHIFT; | ||
82 | |||
83 | nodedata_phys = find_e820_area(start, end, pgdat_size); | ||
84 | if (nodedata_phys == -1L) | ||
85 | panic("Cannot find memory pgdat in node %d\n", nodeid); | ||
86 | |||
87 | Dprintk("nodedata_phys %lx\n", nodedata_phys); | ||
88 | |||
89 | node_data[nodeid] = phys_to_virt(nodedata_phys); | ||
90 | memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); | ||
91 | NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; | ||
92 | NODE_DATA(nodeid)->node_start_pfn = start_pfn; | ||
93 | NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; | ||
94 | |||
95 | /* Find a place for the bootmem map */ | ||
96 | bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); | ||
97 | bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); | ||
98 | bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT); | ||
99 | if (bootmap_start == -1L) | ||
100 | panic("Not enough continuous space for bootmap on node %d", nodeid); | ||
101 | Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); | ||
102 | |||
103 | bootmap_size = init_bootmem_node(NODE_DATA(nodeid), | ||
104 | bootmap_start >> PAGE_SHIFT, | ||
105 | start_pfn, end_pfn); | ||
106 | |||
107 | e820_bootmem_free(NODE_DATA(nodeid), start, end); | ||
108 | |||
109 | reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); | ||
110 | reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT); | ||
111 | node_set_online(nodeid); | ||
112 | } | ||
113 | |||
114 | /* Initialize final allocator for a zone */ | ||
115 | void __init setup_node_zones(int nodeid) | ||
116 | { | ||
117 | unsigned long start_pfn, end_pfn; | ||
118 | unsigned long zones[MAX_NR_ZONES]; | ||
119 | unsigned long dma_end_pfn; | ||
120 | |||
121 | memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES); | ||
122 | |||
123 | start_pfn = node_start_pfn(nodeid); | ||
124 | end_pfn = node_end_pfn(nodeid); | ||
125 | |||
126 | Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn); | ||
127 | |||
128 | /* All nodes > 0 have a zero length zone DMA */ | ||
129 | dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT; | ||
130 | if (start_pfn < dma_end_pfn) { | ||
131 | zones[ZONE_DMA] = dma_end_pfn - start_pfn; | ||
132 | zones[ZONE_NORMAL] = end_pfn - dma_end_pfn; | ||
133 | } else { | ||
134 | zones[ZONE_NORMAL] = end_pfn - start_pfn; | ||
135 | } | ||
136 | |||
137 | free_area_init_node(nodeid, NODE_DATA(nodeid), zones, | ||
138 | start_pfn, NULL); | ||
139 | } | ||
140 | |||
141 | void __init numa_init_array(void) | ||
142 | { | ||
143 | int rr, i; | ||
144 | /* There are unfortunately some poorly designed mainboards around | ||
145 | that only connect memory to a single CPU. This breaks the 1:1 cpu->node | ||
146 | mapping. To avoid this fill in the mapping for all possible | ||
147 | CPUs, as the number of CPUs is not known yet. | ||
148 | We round robin the existing nodes. */ | ||
149 | rr = 0; | ||
150 | for (i = 0; i < NR_CPUS; i++) { | ||
151 | if (cpu_to_node[i] != NUMA_NO_NODE) | ||
152 | continue; | ||
153 | rr = next_node(rr, node_online_map); | ||
154 | if (rr == MAX_NUMNODES) | ||
155 | rr = first_node(node_online_map); | ||
156 | cpu_to_node[i] = rr; | ||
157 | rr++; | ||
158 | } | ||
159 | |||
160 | set_bit(0, &node_to_cpumask[cpu_to_node(0)]); | ||
161 | } | ||
162 | |||
#ifdef CONFIG_NUMA_EMU
/* Number of nodes to emulate, set by the "fake=" option in numa_setup(); 0 disables emulation. */
int numa_fake __initdata = 0;

/*
 * Numa emulation: split the memory between start_pfn and end_pfn into
 * numa_fake fake nodes.
 *
 * The caller only invokes this with numa_fake != 0, which the division below
 * relies on.  Marked __init because it reads __initdata and calls __init
 * functions.
 *
 * Returns 0 when the fake nodes were set up, -1 when no hash shift could be
 * found for them (memnode_shift is reset to 0 in that case).
 */
static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;
	struct node nodes[MAX_NUMNODES];
	unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;

	/* Kludge needed for the hash function: make the node size a power of two */
	if (hweight64(sz) > 1) {
		unsigned long x = 1;
		while ((x << 1) < sz)
			x <<= 1;
		if (x < sz/2)
			printk("Numa emulation unbalanced. Complain to maintainer\n");
		sz = x;
	}

	memset(&nodes,0,sizeof(nodes));
	for (i = 0; i < numa_fake; i++) {
		nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
		/* The last node takes whatever memory is left. */
		if (i == numa_fake-1)
			sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
		nodes[i].end = nodes[i].start + sz;
		/* NOTE(review): all but the last node end one byte short of the next node's start. */
		if (i != numa_fake-1)
			nodes[i].end--;
		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
		       i,
		       nodes[i].start, nodes[i].end,
		       (nodes[i].end - nodes[i].start) >> 20);
		node_set_online(i);
	}
	memnode_shift = compute_hash_shift(nodes, numa_fake);
	if (memnode_shift < 0) {
		memnode_shift = 0;
		printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
		return -1;
	}
	for_each_online_node(i)
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	numa_init_array();
	return 0;
}
#endif
209 | |||
210 | void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) | ||
211 | { | ||
212 | int i; | ||
213 | |||
214 | #ifdef CONFIG_NUMA_EMU | ||
215 | if (numa_fake && !numa_emulation(start_pfn, end_pfn)) | ||
216 | return; | ||
217 | #endif | ||
218 | |||
219 | #ifdef CONFIG_ACPI_NUMA | ||
220 | if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, | ||
221 | end_pfn << PAGE_SHIFT)) | ||
222 | return; | ||
223 | #endif | ||
224 | |||
225 | #ifdef CONFIG_K8_NUMA | ||
226 | if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT)) | ||
227 | return; | ||
228 | #endif | ||
229 | printk(KERN_INFO "%s\n", | ||
230 | numa_off ? "NUMA turned off" : "No NUMA configuration found"); | ||
231 | |||
232 | printk(KERN_INFO "Faking a node at %016lx-%016lx\n", | ||
233 | start_pfn << PAGE_SHIFT, | ||
234 | end_pfn << PAGE_SHIFT); | ||
235 | /* setup dummy node covering all memory */ | ||
236 | memnode_shift = 63; | ||
237 | memnodemap[0] = 0; | ||
238 | nodes_clear(node_online_map); | ||
239 | node_set_online(0); | ||
240 | for (i = 0; i < NR_CPUS; i++) | ||
241 | cpu_to_node[i] = 0; | ||
242 | node_to_cpumask[0] = cpumask_of_cpu(0); | ||
243 | setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); | ||
244 | } | ||
245 | |||
246 | __init void numa_add_cpu(int cpu) | ||
247 | { | ||
248 | /* BP is initialized elsewhere */ | ||
249 | if (cpu) | ||
250 | set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]); | ||
251 | } | ||
252 | |||
253 | unsigned long __init numa_free_all_bootmem(void) | ||
254 | { | ||
255 | int i; | ||
256 | unsigned long pages = 0; | ||
257 | for_each_online_node(i) { | ||
258 | pages += free_all_bootmem_node(NODE_DATA(i)); | ||
259 | } | ||
260 | return pages; | ||
261 | } | ||
262 | |||
263 | void __init paging_init(void) | ||
264 | { | ||
265 | int i; | ||
266 | for_each_online_node(i) { | ||
267 | setup_node_zones(i); | ||
268 | } | ||
269 | } | ||
270 | |||
271 | /* [numa=off] */ | ||
272 | __init int numa_setup(char *opt) | ||
273 | { | ||
274 | if (!strncmp(opt,"off",3)) | ||
275 | numa_off = 1; | ||
276 | #ifdef CONFIG_NUMA_EMU | ||
277 | if(!strncmp(opt, "fake=", 5)) { | ||
278 | numa_fake = simple_strtoul(opt+5,NULL,0); ; | ||
279 | if (numa_fake >= MAX_NUMNODES) | ||
280 | numa_fake = MAX_NUMNODES; | ||
281 | } | ||
282 | #endif | ||
283 | #ifdef CONFIG_ACPI_NUMA | ||
284 | if (!strncmp(opt,"noacpi",6)) | ||
285 | acpi_numa = -1; | ||
286 | #endif | ||
287 | return 1; | ||
288 | } | ||
289 | |||
/* Symbols made available to modules: the CPU/node maps ... */
EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);
/* ... the address -> node hash ... */
EXPORT_SYMBOL(memnode_shift);
EXPORT_SYMBOL(memnodemap);
/* ... and the per-node data pointers. */
EXPORT_SYMBOL(node_data);