aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Andi Kleen <ak@suse.de> 2006-04-07 13:49:18 -0400
committer: Linus Torvalds <torvalds@g5.osdl.org> 2006-04-09 14:53:16 -0400
commit: 68a3a7feb08f960095072f28ec20f7900793c506 (patch)
tree: 1d458822fd4414997b6f12ad8a915f785d7e730f
parent: 9d99aaa31f5994d1923c3713ce9144c4c42332e1 (diff)
[PATCH] x86_64: Reserve SRAT hotadd memory on x86-64
From: Keith Mannthey, Andi Kleen

Implement memory hotadd without sparsemem. The memory in the SRAT hotadd area is just preserved instead and can be activated later.

There are a few restrictions:
- Only one continuous hotadd area allowed per node

The main problem is dealing with the many buggy SRAT tables that are out there. The strategy here is to reject anything suspicious.

Originally from Keith Mannthey, with several hacks and changes by AK and also contributions from Andrew Morton

[ TBD: Problems pointed out by KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>:

1) Goto's rebuild_zonelist patch will not work if CONFIG_MEMORY_HOTPLUG=n. Rebuilding the zonelist is necessary when the system has only memory < 4G at boot and hot-adds memory > 4G. Because x86_64 has DMA32, ZONE_NORMAL is not included in the zonelist at boot time if the system doesn't have memory > 4G at boot.
[AK: should just force the higher zones at boot time when SRAT tells us]

2) zone and node's spanned_pages and present_pages are not incremented. They should be.

For example, our server (ia64/Fujitsu PrimeQuest) can equip memory from 4G to 1T (maybe 2T in future), and SRAT will *always* say we have possible 1T memory. (Microsoft requires "write all possible memory in SRAT") When we reserve memmap for possible 1T memory, Linux will not work well in the minimum 4G configuration ;)
[AK: needs limiting to 5-10% of max memory]
]

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--  Documentation/x86_64/boot-options.txt    5
-rw-r--r--  arch/x86_64/mm/init.c                    3
-rw-r--r--  arch/x86_64/mm/numa.c                    5
-rw-r--r--  arch/x86_64/mm/srat.c                  164
-rw-r--r--  include/asm-x86_64/numa.h                2
5 files changed, 171 insertions, 8 deletions
diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt
index 1921353259ae..f2cd6ef53ff3 100644
--- a/Documentation/x86_64/boot-options.txt
+++ b/Documentation/x86_64/boot-options.txt
@@ -151,6 +151,11 @@ NUMA
151 151
152 numa=fake=X Fake X nodes and ignore NUMA setup of the actual machine. 152 numa=fake=X Fake X nodes and ignore NUMA setup of the actual machine.
153 153
154 numa=hotadd=percent
155 Only allow hotadd memory to preallocate page structures upto
156 percent of already available memory.
157 numa=hotadd=0 will disable hotadd memory.
158
154ACPI 159ACPI
155 160
156 acpi=off Don't enable ACPI 161 acpi=off Don't enable ACPI
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 492161168402..dff870534199 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -530,8 +530,7 @@ int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
530 unsigned long pfn; 530 unsigned long pfn;
531 unsigned long total = 0, mem = 0; 531 unsigned long total = 0, mem = 0;
532 for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) { 532 for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
533 unsigned long addr = pfn << PAGE_SHIFT; 533 if (pfn_valid(pfn)) {
534 if (pfn_valid(pfn) && e820_mapped(addr, addr+1, E820_RAM)) {
535 online_page(pfn_to_page(pfn)); 534 online_page(pfn_to_page(pfn));
536 err = 0; 535 err = 0;
537 mem++; 536 mem++;
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 4be82d6e2b48..779132af29a7 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -142,6 +142,9 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
142 142
143 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); 143 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
144 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT); 144 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
145#ifdef CONFIG_ACPI_NUMA
146 srat_reserve_add_area(nodeid);
147#endif
145 node_set_online(nodeid); 148 node_set_online(nodeid);
146} 149}
147 150
@@ -335,6 +338,8 @@ __init int numa_setup(char *opt)
335#ifdef CONFIG_ACPI_NUMA 338#ifdef CONFIG_ACPI_NUMA
336 if (!strncmp(opt,"noacpi",6)) 339 if (!strncmp(opt,"noacpi",6))
337 acpi_numa = -1; 340 acpi_numa = -1;
341 if (!strncmp(opt,"hotadd=", 7))
342 hotadd_percent = simple_strtoul(opt+7, NULL, 10);
338#endif 343#endif
339 return 1; 344 return 1;
340} 345}
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c
index 2eb879590dc4..443875eb15a2 100644
--- a/arch/x86_64/mm/srat.c
+++ b/arch/x86_64/mm/srat.c
@@ -15,15 +15,26 @@
15#include <linux/bitmap.h> 15#include <linux/bitmap.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/topology.h> 17#include <linux/topology.h>
18#include <linux/bootmem.h>
19#include <linux/mm.h>
18#include <asm/proto.h> 20#include <asm/proto.h>
19#include <asm/numa.h> 21#include <asm/numa.h>
20#include <asm/e820.h> 22#include <asm/e820.h>
21 23
24#if (defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \
25 defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)) \
26 && !defined(CONFIG_MEMORY_HOTPLUG)
27#define RESERVE_HOTADD 1
28#endif
29
22static struct acpi_table_slit *acpi_slit; 30static struct acpi_table_slit *acpi_slit;
23 31
24static nodemask_t nodes_parsed __initdata; 32static nodemask_t nodes_parsed __initdata;
25static nodemask_t nodes_found __initdata; 33static nodemask_t nodes_found __initdata;
26static struct bootnode nodes[MAX_NUMNODES] __initdata; 34static struct bootnode nodes[MAX_NUMNODES] __initdata;
35static struct bootnode nodes_add[MAX_NUMNODES] __initdata;
36static int found_add_area __initdata;
37int hotadd_percent __initdata = 10;
27static u8 pxm2node[256] = { [0 ... 255] = 0xff }; 38static u8 pxm2node[256] = { [0 ... 255] = 0xff };
28 39
29/* Too small nodes confuse the VM badly. Usually they result 40/* Too small nodes confuse the VM badly. Usually they result
@@ -71,6 +82,10 @@ static __init int conflicting_nodes(unsigned long start, unsigned long end)
71static __init void cutoff_node(int i, unsigned long start, unsigned long end) 82static __init void cutoff_node(int i, unsigned long start, unsigned long end)
72{ 83{
73 struct bootnode *nd = &nodes[i]; 84 struct bootnode *nd = &nodes[i];
85
86 if (found_add_area)
87 return;
88
74 if (nd->start < start) { 89 if (nd->start < start) {
75 nd->start = start; 90 nd->start = start;
76 if (nd->end < nd->start) 91 if (nd->end < nd->start)
@@ -90,6 +105,8 @@ static __init void bad_srat(void)
90 acpi_numa = -1; 105 acpi_numa = -1;
91 for (i = 0; i < MAX_LOCAL_APIC; i++) 106 for (i = 0; i < MAX_LOCAL_APIC; i++)
92 apicid_to_node[i] = NUMA_NO_NODE; 107 apicid_to_node[i] = NUMA_NO_NODE;
108 for (i = 0; i < MAX_NUMNODES; i++)
109 nodes_add[i].start = nodes[i].end = 0;
93} 110}
94 111
95static __init inline int srat_disabled(void) 112static __init inline int srat_disabled(void)
@@ -155,11 +172,114 @@ acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa)
155 pxm, pa->apic_id, node); 172 pxm, pa->apic_id, node);
156} 173}
157 174
175#ifdef RESERVE_HOTADD
176/*
177 * Protect against too large hotadd areas that would fill up memory.
178 */
179static int hotadd_enough_memory(struct bootnode *nd)
180{
181 static unsigned long allocated;
182 static unsigned long last_area_end;
183 unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT;
184 long mem = pages * sizeof(struct page);
185 unsigned long addr;
186 unsigned long allowed;
187 unsigned long oldpages = pages;
188
189 if (mem < 0)
190 return 0;
191 allowed = (end_pfn - e820_hole_size(0, end_pfn)) * PAGE_SIZE;
192 allowed = (allowed / 100) * hotadd_percent;
193 if (allocated + mem > allowed) {
194 /* Give them at least part of their hotadd memory upto hotadd_percent
195 It would be better to spread the limit out
196 over multiple hotplug areas, but that is too complicated
197 right now */
198 if (allocated >= allowed)
199 return 0;
200 pages = (allowed - allocated + mem) / sizeof(struct page);
201 mem = pages * sizeof(struct page);
202 nd->end = nd->start + pages*PAGE_SIZE;
203 }
204 /* Not completely fool proof, but a good sanity check */
205 addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem);
206 if (addr == -1UL)
207 return 0;
208 if (pages != oldpages)
209 printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n",
210 pages << PAGE_SHIFT);
211 last_area_end = addr + mem;
212 allocated += mem;
213 return 1;
214}
215
216/*
217 * It is fine to add this area to the nodes data it will be used later
218 * This code supports one contigious hot add area per node.
219 */
220static int reserve_hotadd(int node, unsigned long start, unsigned long end)
221{
222 unsigned long s_pfn = start >> PAGE_SHIFT;
223 unsigned long e_pfn = end >> PAGE_SHIFT;
224 int changed = 0;
225 struct bootnode *nd = &nodes_add[node];
226
227 /* I had some trouble with strange memory hotadd regions breaking
228 the boot. Be very strict here and reject anything unexpected.
229 If you want working memory hotadd write correct SRATs.
230
231 The node size check is a basic sanity check to guard against
232 mistakes */
233 if ((signed long)(end - start) < NODE_MIN_SIZE) {
234 printk(KERN_ERR "SRAT: Hotplug area too small\n");
235 return -1;
236 }
237
238 /* This check might be a bit too strict, but I'm keeping it for now. */
239 if (e820_hole_size(s_pfn, e_pfn) != e_pfn - s_pfn) {
240 printk(KERN_ERR "SRAT: Hotplug area has existing memory\n");
241 return -1;
242 }
243
244 if (!hotadd_enough_memory(&nodes_add[node])) {
245 printk(KERN_ERR "SRAT: Hotplug area too large\n");
246 return -1;
247 }
248
249 /* Looks good */
250
251 found_add_area = 1;
252 if (nd->start == nd->end) {
253 nd->start = start;
254 nd->end = end;
255 changed = 1;
256 } else {
257 if (nd->start == end) {
258 nd->start = start;
259 changed = 1;
260 }
261 if (nd->end == start) {
262 nd->end = end;
263 changed = 1;
264 }
265 if (!changed)
266 printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
267 }
268
269 if ((nd->end >> PAGE_SHIFT) > end_pfn)
270 end_pfn = nd->end >> PAGE_SHIFT;
271
272 if (changed)
273 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
274 return 0;
275}
276#endif
277
158/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ 278/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
159void __init 279void __init
160acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) 280acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
161{ 281{
162 struct bootnode *nd; 282 struct bootnode *nd, oldnode;
163 unsigned long start, end; 283 unsigned long start, end;
164 int node, pxm; 284 int node, pxm;
165 int i; 285 int i;
@@ -172,6 +292,8 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
172 } 292 }
173 if (ma->flags.enabled == 0) 293 if (ma->flags.enabled == 0)
174 return; 294 return;
295 if (ma->flags.hot_pluggable && hotadd_percent == 0)
296 return;
175 start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32); 297 start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32);
176 end = start + (ma->length_lo | ((u64)ma->length_hi << 32)); 298 end = start + (ma->length_lo | ((u64)ma->length_hi << 32));
177 pxm = ma->proximity_domain; 299 pxm = ma->proximity_domain;
@@ -181,10 +303,6 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
181 bad_srat(); 303 bad_srat();
182 return; 304 return;
183 } 305 }
184 /* It is fine to add this area to the nodes data it will be used later*/
185 if (ma->flags.hot_pluggable == 1)
186 printk(KERN_INFO "SRAT: hot plug zone found %lx - %lx \n",
187 start, end);
188 i = conflicting_nodes(start, end); 306 i = conflicting_nodes(start, end);
189 if (i == node) { 307 if (i == node) {
190 printk(KERN_WARNING 308 printk(KERN_WARNING
@@ -199,6 +317,7 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
199 return; 317 return;
200 } 318 }
201 nd = &nodes[node]; 319 nd = &nodes[node];
320 oldnode = *nd;
202 if (!node_test_and_set(node, nodes_parsed)) { 321 if (!node_test_and_set(node, nodes_parsed)) {
203 nd->start = start; 322 nd->start = start;
204 nd->end = end; 323 nd->end = end;
@@ -208,8 +327,19 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
208 if (nd->end < end) 327 if (nd->end < end)
209 nd->end = end; 328 nd->end = end;
210 } 329 }
330
211 printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, 331 printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
212 nd->start, nd->end); 332 nd->start, nd->end);
333
334#ifdef RESERVE_HOTADD
335 if (ma->flags.hot_pluggable && reserve_hotadd(node, start, end) < 0) {
336 /* Ignore hotadd region. Undo damage */
337 printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
338 *nd = oldnode;
339 if ((nd->start | nd->end) == 0)
340 node_clear(node, nodes_parsed);
341 }
342#endif
213} 343}
214 344
215/* Sanity check to catch more bad SRATs (they are amazingly common). 345/* Sanity check to catch more bad SRATs (they are amazingly common).
@@ -225,6 +355,9 @@ static int nodes_cover_memory(void)
225 unsigned long e = nodes[i].end >> PAGE_SHIFT; 355 unsigned long e = nodes[i].end >> PAGE_SHIFT;
226 pxmram += e - s; 356 pxmram += e - s;
227 pxmram -= e820_hole_size(s, e); 357 pxmram -= e820_hole_size(s, e);
358 pxmram -= nodes_add[i].end - nodes_add[i].start;
359 if ((long)pxmram < 0)
360 pxmram = 0;
228 } 361 }
229 362
230 e820ram = end_pfn - e820_hole_size(0, end_pfn); 363 e820ram = end_pfn - e820_hole_size(0, end_pfn);
@@ -258,7 +391,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
258 391
259 /* First clean up the node list */ 392 /* First clean up the node list */
260 for (i = 0; i < MAX_NUMNODES; i++) { 393 for (i = 0; i < MAX_NUMNODES; i++) {
261 cutoff_node(i, start, end); 394 cutoff_node(i, start, end);
262 if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) 395 if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE)
263 unparse_node(i); 396 unparse_node(i);
264 } 397 }
@@ -303,6 +436,25 @@ static int node_to_pxm(int n)
303 return 0; 436 return 0;
304} 437}
305 438
439void __init srat_reserve_add_area(int nodeid)
440{
441 if (found_add_area && nodes_add[nodeid].end) {
442 u64 total_mb;
443
444 printk(KERN_INFO "SRAT: Reserving hot-add memory space "
445 "for node %d at %Lx-%Lx\n",
446 nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
447 total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
448 >> PAGE_SHIFT;
449 total_mb *= sizeof(struct page);
450 total_mb >>= 20;
451 printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
452 "pre-allocated memory.\n", (unsigned long long)total_mb);
453 reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
454 nodes_add[nodeid].end - nodes_add[nodeid].start);
455 }
456}
457
306int __node_distance(int a, int b) 458int __node_distance(int a, int b)
307{ 459{
308 int index; 460 int index;
diff --git a/include/asm-x86_64/numa.h b/include/asm-x86_64/numa.h
index f6cbb4cbb5a3..f0ba4d984bdf 100644
--- a/include/asm-x86_64/numa.h
+++ b/include/asm-x86_64/numa.h
@@ -18,6 +18,8 @@ extern void numa_init_array(void);
18extern int numa_off; 18extern int numa_off;
19 19
20extern void numa_set_node(int cpu, int node); 20extern void numa_set_node(int cpu, int node);
21extern void srat_reserve_add_area(int nodeid);
22extern int hotadd_percent;
21 23
22extern unsigned char apicid_to_node[256]; 24extern unsigned char apicid_to_node[256];
23#ifdef CONFIG_NUMA 25#ifdef CONFIG_NUMA