diff options
-rw-r--r-- | Documentation/x86_64/boot-options.txt | 5 | ||||
-rw-r--r-- | arch/x86_64/mm/init.c | 3 | ||||
-rw-r--r-- | arch/x86_64/mm/numa.c | 5 | ||||
-rw-r--r-- | arch/x86_64/mm/srat.c | 164 | ||||
-rw-r--r-- | include/asm-x86_64/numa.h | 2 |
5 files changed, 171 insertions, 8 deletions
diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt index 1921353259ae..f2cd6ef53ff3 100644 --- a/Documentation/x86_64/boot-options.txt +++ b/Documentation/x86_64/boot-options.txt | |||
@@ -151,6 +151,11 @@ NUMA | |||
151 | 151 | ||
152 | numa=fake=X Fake X nodes and ignore NUMA setup of the actual machine. | 152 | numa=fake=X Fake X nodes and ignore NUMA setup of the actual machine. |
153 | 153 | ||
154 | numa=hotadd=percent | ||
155 | Only allow hotadd memory to preallocate page structures up to | |
156 | the given percentage of the already available memory. | |
157 | numa=hotadd=0 will disable hotadd memory. | ||
158 | |||
154 | ACPI | 159 | ACPI |
155 | 160 | ||
156 | acpi=off Don't enable ACPI | 161 | acpi=off Don't enable ACPI |
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 492161168402..dff870534199 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c | |||
@@ -530,8 +530,7 @@ int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages) | |||
530 | unsigned long pfn; | 530 | unsigned long pfn; |
531 | unsigned long total = 0, mem = 0; | 531 | unsigned long total = 0, mem = 0; |
532 | for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) { | 532 | for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) { |
533 | unsigned long addr = pfn << PAGE_SHIFT; | 533 | if (pfn_valid(pfn)) { |
534 | if (pfn_valid(pfn) && e820_mapped(addr, addr+1, E820_RAM)) { | ||
535 | online_page(pfn_to_page(pfn)); | 534 | online_page(pfn_to_page(pfn)); |
536 | err = 0; | 535 | err = 0; |
537 | mem++; | 536 | mem++; |
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 4be82d6e2b48..779132af29a7 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c | |||
@@ -142,6 +142,9 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en | |||
142 | 142 | ||
143 | reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); | 143 | reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); |
144 | reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT); | 144 | reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT); |
145 | #ifdef CONFIG_ACPI_NUMA | ||
146 | srat_reserve_add_area(nodeid); | ||
147 | #endif | ||
145 | node_set_online(nodeid); | 148 | node_set_online(nodeid); |
146 | } | 149 | } |
147 | 150 | ||
@@ -335,6 +338,8 @@ __init int numa_setup(char *opt) | |||
335 | #ifdef CONFIG_ACPI_NUMA | 338 | #ifdef CONFIG_ACPI_NUMA |
336 | if (!strncmp(opt,"noacpi",6)) | 339 | if (!strncmp(opt,"noacpi",6)) |
337 | acpi_numa = -1; | 340 | acpi_numa = -1; |
341 | if (!strncmp(opt,"hotadd=", 7)) | ||
342 | hotadd_percent = simple_strtoul(opt+7, NULL, 10); | ||
338 | #endif | 343 | #endif |
339 | return 1; | 344 | return 1; |
340 | } | 345 | } |
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c index 2eb879590dc4..443875eb15a2 100644 --- a/arch/x86_64/mm/srat.c +++ b/arch/x86_64/mm/srat.c | |||
@@ -15,15 +15,26 @@ | |||
15 | #include <linux/bitmap.h> | 15 | #include <linux/bitmap.h> |
16 | #include <linux/module.h> | 16 | #include <linux/module.h> |
17 | #include <linux/topology.h> | 17 | #include <linux/topology.h> |
18 | #include <linux/bootmem.h> | ||
19 | #include <linux/mm.h> | ||
18 | #include <asm/proto.h> | 20 | #include <asm/proto.h> |
19 | #include <asm/numa.h> | 21 | #include <asm/numa.h> |
20 | #include <asm/e820.h> | 22 | #include <asm/e820.h> |
21 | 23 | ||
24 | #if (defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \ | ||
25 | defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)) \ | ||
26 | && !defined(CONFIG_MEMORY_HOTPLUG) | ||
27 | #define RESERVE_HOTADD 1 | ||
28 | #endif | ||
29 | |||
22 | static struct acpi_table_slit *acpi_slit; | 30 | static struct acpi_table_slit *acpi_slit; |
23 | 31 | ||
24 | static nodemask_t nodes_parsed __initdata; | 32 | static nodemask_t nodes_parsed __initdata; |
25 | static nodemask_t nodes_found __initdata; | 33 | static nodemask_t nodes_found __initdata; |
26 | static struct bootnode nodes[MAX_NUMNODES] __initdata; | 34 | static struct bootnode nodes[MAX_NUMNODES] __initdata; |
35 | static struct bootnode nodes_add[MAX_NUMNODES] __initdata; | ||
36 | static int found_add_area __initdata; | ||
37 | int hotadd_percent __initdata = 10; | ||
27 | static u8 pxm2node[256] = { [0 ... 255] = 0xff }; | 38 | static u8 pxm2node[256] = { [0 ... 255] = 0xff }; |
28 | 39 | ||
29 | /* Too small nodes confuse the VM badly. Usually they result | 40 | /* Too small nodes confuse the VM badly. Usually they result |
@@ -71,6 +82,10 @@ static __init int conflicting_nodes(unsigned long start, unsigned long end) | |||
71 | static __init void cutoff_node(int i, unsigned long start, unsigned long end) | 82 | static __init void cutoff_node(int i, unsigned long start, unsigned long end) |
72 | { | 83 | { |
73 | struct bootnode *nd = &nodes[i]; | 84 | struct bootnode *nd = &nodes[i]; |
85 | |||
86 | if (found_add_area) | ||
87 | return; | ||
88 | |||
74 | if (nd->start < start) { | 89 | if (nd->start < start) { |
75 | nd->start = start; | 90 | nd->start = start; |
76 | if (nd->end < nd->start) | 91 | if (nd->end < nd->start) |
@@ -90,6 +105,8 @@ static __init void bad_srat(void) | |||
90 | acpi_numa = -1; | 105 | acpi_numa = -1; |
91 | for (i = 0; i < MAX_LOCAL_APIC; i++) | 106 | for (i = 0; i < MAX_LOCAL_APIC; i++) |
92 | apicid_to_node[i] = NUMA_NO_NODE; | 107 | apicid_to_node[i] = NUMA_NO_NODE; |
108 | for (i = 0; i < MAX_NUMNODES; i++) | ||
109 | nodes_add[i].start = nodes[i].end = 0; | ||
93 | } | 110 | } |
94 | 111 | ||
95 | static __init inline int srat_disabled(void) | 112 | static __init inline int srat_disabled(void) |
@@ -155,11 +172,114 @@ acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa) | |||
155 | pxm, pa->apic_id, node); | 172 | pxm, pa->apic_id, node); |
156 | } | 173 | } |
157 | 174 | ||
175 | #ifdef RESERVE_HOTADD | ||
176 | /* | ||
177 | * Protect against too large hotadd areas that would fill up memory. | ||
178 | */ | ||
179 | static int hotadd_enough_memory(struct bootnode *nd) | ||
180 | { | ||
181 | static unsigned long allocated; | ||
182 | static unsigned long last_area_end; | ||
183 | unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT; | ||
184 | long mem = pages * sizeof(struct page); | ||
185 | unsigned long addr; | ||
186 | unsigned long allowed; | ||
187 | unsigned long oldpages = pages; | ||
188 | |||
189 | if (mem < 0) | ||
190 | return 0; | ||
191 | allowed = (end_pfn - e820_hole_size(0, end_pfn)) * PAGE_SIZE; | ||
192 | allowed = (allowed / 100) * hotadd_percent; | ||
193 | if (allocated + mem > allowed) { | ||
194 | /* Give them at least part of their hotadd memory upto hotadd_percent | ||
195 | It would be better to spread the limit out | ||
196 | over multiple hotplug areas, but that is too complicated | ||
197 | right now */ | ||
198 | if (allocated >= allowed) | ||
199 | return 0; | ||
200 | pages = (allowed - allocated + mem) / sizeof(struct page); | ||
201 | mem = pages * sizeof(struct page); | ||
202 | nd->end = nd->start + pages*PAGE_SIZE; | ||
203 | } | ||
204 | /* Not completely fool proof, but a good sanity check */ | ||
205 | addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem); | ||
206 | if (addr == -1UL) | ||
207 | return 0; | ||
208 | if (pages != oldpages) | ||
209 | printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n", | ||
210 | pages << PAGE_SHIFT); | ||
211 | last_area_end = addr + mem; | ||
212 | allocated += mem; | ||
213 | return 1; | ||
214 | } | ||
215 | |||
216 | /* | ||
217 | * It is fine to add this area to the nodes data it will be used later | ||
218 | * This code supports one contigious hot add area per node. | ||
219 | */ | ||
220 | static int reserve_hotadd(int node, unsigned long start, unsigned long end) | ||
221 | { | ||
222 | unsigned long s_pfn = start >> PAGE_SHIFT; | ||
223 | unsigned long e_pfn = end >> PAGE_SHIFT; | ||
224 | int changed = 0; | ||
225 | struct bootnode *nd = &nodes_add[node]; | ||
226 | |||
227 | /* I had some trouble with strange memory hotadd regions breaking | ||
228 | the boot. Be very strict here and reject anything unexpected. | ||
229 | If you want working memory hotadd write correct SRATs. | ||
230 | |||
231 | The node size check is a basic sanity check to guard against | ||
232 | mistakes */ | ||
233 | if ((signed long)(end - start) < NODE_MIN_SIZE) { | ||
234 | printk(KERN_ERR "SRAT: Hotplug area too small\n"); | ||
235 | return -1; | ||
236 | } | ||
237 | |||
238 | /* This check might be a bit too strict, but I'm keeping it for now. */ | ||
239 | if (e820_hole_size(s_pfn, e_pfn) != e_pfn - s_pfn) { | ||
240 | printk(KERN_ERR "SRAT: Hotplug area has existing memory\n"); | ||
241 | return -1; | ||
242 | } | ||
243 | |||
244 | if (!hotadd_enough_memory(&nodes_add[node])) { | ||
245 | printk(KERN_ERR "SRAT: Hotplug area too large\n"); | ||
246 | return -1; | ||
247 | } | ||
248 | |||
249 | /* Looks good */ | ||
250 | |||
251 | found_add_area = 1; | ||
252 | if (nd->start == nd->end) { | ||
253 | nd->start = start; | ||
254 | nd->end = end; | ||
255 | changed = 1; | ||
256 | } else { | ||
257 | if (nd->start == end) { | ||
258 | nd->start = start; | ||
259 | changed = 1; | ||
260 | } | ||
261 | if (nd->end == start) { | ||
262 | nd->end = end; | ||
263 | changed = 1; | ||
264 | } | ||
265 | if (!changed) | ||
266 | printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n"); | ||
267 | } | ||
268 | |||
269 | if ((nd->end >> PAGE_SHIFT) > end_pfn) | ||
270 | end_pfn = nd->end >> PAGE_SHIFT; | ||
271 | |||
272 | if (changed) | ||
273 | printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end); | ||
274 | return 0; | ||
275 | } | ||
276 | #endif | ||
277 | |||
158 | /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ | 278 | /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ |
159 | void __init | 279 | void __init |
160 | acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) | 280 | acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) |
161 | { | 281 | { |
162 | struct bootnode *nd; | 282 | struct bootnode *nd, oldnode; |
163 | unsigned long start, end; | 283 | unsigned long start, end; |
164 | int node, pxm; | 284 | int node, pxm; |
165 | int i; | 285 | int i; |
@@ -172,6 +292,8 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) | |||
172 | } | 292 | } |
173 | if (ma->flags.enabled == 0) | 293 | if (ma->flags.enabled == 0) |
174 | return; | 294 | return; |
295 | if (ma->flags.hot_pluggable && hotadd_percent == 0) | ||
296 | return; | ||
175 | start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32); | 297 | start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32); |
176 | end = start + (ma->length_lo | ((u64)ma->length_hi << 32)); | 298 | end = start + (ma->length_lo | ((u64)ma->length_hi << 32)); |
177 | pxm = ma->proximity_domain; | 299 | pxm = ma->proximity_domain; |
@@ -181,10 +303,6 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) | |||
181 | bad_srat(); | 303 | bad_srat(); |
182 | return; | 304 | return; |
183 | } | 305 | } |
184 | /* It is fine to add this area to the nodes data it will be used later*/ | ||
185 | if (ma->flags.hot_pluggable == 1) | ||
186 | printk(KERN_INFO "SRAT: hot plug zone found %lx - %lx \n", | ||
187 | start, end); | ||
188 | i = conflicting_nodes(start, end); | 306 | i = conflicting_nodes(start, end); |
189 | if (i == node) { | 307 | if (i == node) { |
190 | printk(KERN_WARNING | 308 | printk(KERN_WARNING |
@@ -199,6 +317,7 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) | |||
199 | return; | 317 | return; |
200 | } | 318 | } |
201 | nd = &nodes[node]; | 319 | nd = &nodes[node]; |
320 | oldnode = *nd; | ||
202 | if (!node_test_and_set(node, nodes_parsed)) { | 321 | if (!node_test_and_set(node, nodes_parsed)) { |
203 | nd->start = start; | 322 | nd->start = start; |
204 | nd->end = end; | 323 | nd->end = end; |
@@ -208,8 +327,19 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) | |||
208 | if (nd->end < end) | 327 | if (nd->end < end) |
209 | nd->end = end; | 328 | nd->end = end; |
210 | } | 329 | } |
330 | |||
211 | printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, | 331 | printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, |
212 | nd->start, nd->end); | 332 | nd->start, nd->end); |
333 | |||
334 | #ifdef RESERVE_HOTADD | ||
335 | if (ma->flags.hot_pluggable && reserve_hotadd(node, start, end) < 0) { | ||
336 | /* Ignore hotadd region. Undo damage */ | ||
337 | printk(KERN_NOTICE "SRAT: Hotplug region ignored\n"); | ||
338 | *nd = oldnode; | ||
339 | if ((nd->start | nd->end) == 0) | ||
340 | node_clear(node, nodes_parsed); | ||
341 | } | ||
342 | #endif | ||
213 | } | 343 | } |
214 | 344 | ||
215 | /* Sanity check to catch more bad SRATs (they are amazingly common). | 345 | /* Sanity check to catch more bad SRATs (they are amazingly common). |
@@ -225,6 +355,9 @@ static int nodes_cover_memory(void) | |||
225 | unsigned long e = nodes[i].end >> PAGE_SHIFT; | 355 | unsigned long e = nodes[i].end >> PAGE_SHIFT; |
226 | pxmram += e - s; | 356 | pxmram += e - s; |
227 | pxmram -= e820_hole_size(s, e); | 357 | pxmram -= e820_hole_size(s, e); |
358 | pxmram -= nodes_add[i].end - nodes_add[i].start; | ||
359 | if ((long)pxmram < 0) | ||
360 | pxmram = 0; | ||
228 | } | 361 | } |
229 | 362 | ||
230 | e820ram = end_pfn - e820_hole_size(0, end_pfn); | 363 | e820ram = end_pfn - e820_hole_size(0, end_pfn); |
@@ -258,7 +391,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) | |||
258 | 391 | ||
259 | /* First clean up the node list */ | 392 | /* First clean up the node list */ |
260 | for (i = 0; i < MAX_NUMNODES; i++) { | 393 | for (i = 0; i < MAX_NUMNODES; i++) { |
261 | cutoff_node(i, start, end); | 394 | cutoff_node(i, start, end); |
262 | if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) | 395 | if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) |
263 | unparse_node(i); | 396 | unparse_node(i); |
264 | } | 397 | } |
@@ -303,6 +436,25 @@ static int node_to_pxm(int n) | |||
303 | return 0; | 436 | return 0; |
304 | } | 437 | } |
305 | 438 | ||
439 | void __init srat_reserve_add_area(int nodeid) | ||
440 | { | ||
441 | if (found_add_area && nodes_add[nodeid].end) { | ||
442 | u64 total_mb; | ||
443 | |||
444 | printk(KERN_INFO "SRAT: Reserving hot-add memory space " | ||
445 | "for node %d at %Lx-%Lx\n", | ||
446 | nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end); | ||
447 | total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start) | ||
448 | >> PAGE_SHIFT; | ||
449 | total_mb *= sizeof(struct page); | ||
450 | total_mb >>= 20; | ||
451 | printk(KERN_INFO "SRAT: This will cost you %Lu MB of " | ||
452 | "pre-allocated memory.\n", (unsigned long long)total_mb); | ||
453 | reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start, | ||
454 | nodes_add[nodeid].end - nodes_add[nodeid].start); | ||
455 | } | ||
456 | } | ||
457 | |||
306 | int __node_distance(int a, int b) | 458 | int __node_distance(int a, int b) |
307 | { | 459 | { |
308 | int index; | 460 | int index; |
diff --git a/include/asm-x86_64/numa.h b/include/asm-x86_64/numa.h index f6cbb4cbb5a3..f0ba4d984bdf 100644 --- a/include/asm-x86_64/numa.h +++ b/include/asm-x86_64/numa.h | |||
@@ -18,6 +18,8 @@ extern void numa_init_array(void); | |||
18 | extern int numa_off; | 18 | extern int numa_off; |
19 | 19 | ||
20 | extern void numa_set_node(int cpu, int node); | 20 | extern void numa_set_node(int cpu, int node); |
21 | extern void srat_reserve_add_area(int nodeid); | ||
22 | extern int hotadd_percent; | ||
21 | 23 | ||
22 | extern unsigned char apicid_to_node[256]; | 24 | extern unsigned char apicid_to_node[256]; |
23 | #ifdef CONFIG_NUMA | 25 | #ifdef CONFIG_NUMA |