aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86_64/mm
diff options
context:
space:
mode:
authorJeff Garzik <jeff@garzik.org>2006-04-12 16:54:43 -0400
committerJeff Garzik <jeff@garzik.org>2006-04-12 16:54:43 -0400
commita890b15c0990cc8d686edcc85f5fccde71ad5ce9 (patch)
tree73162355b58283a2531f13fbbf663809f95c1483 /arch/x86_64/mm
parent79fa1b677be3a985cc66b9218a4dd09818f1051b (diff)
parent26ec634c31a11a003040e10b4d650495158632fd (diff)
Merge branch 'upstream'
Diffstat (limited to 'arch/x86_64/mm')
-rw-r--r--arch/x86_64/mm/init.c37
-rw-r--r--arch/x86_64/mm/numa.c46
-rw-r--r--arch/x86_64/mm/srat.c170
3 files changed, 235 insertions, 18 deletions
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index e5f7f1c34462..4ba34e95d835 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -305,7 +305,7 @@ static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned
305 if (paddr >= end) 305 if (paddr >= end)
306 break; 306 break;
307 307
308 if (!after_bootmem && !e820_mapped(paddr, paddr+PUD_SIZE, 0)) { 308 if (!after_bootmem && !e820_any_mapped(paddr, paddr+PUD_SIZE, 0)) {
309 set_pud(pud, __pud(0)); 309 set_pud(pud, __pud(0));
310 continue; 310 continue;
311 } 311 }
@@ -507,9 +507,8 @@ void __init clear_kernel_mapping(unsigned long address, unsigned long size)
507 507
508/* 508/*
509 * Memory hotplug specific functions 509 * Memory hotplug specific functions
510 * These are only for non-NUMA machines right now.
511 */ 510 */
512#ifdef CONFIG_MEMORY_HOTPLUG 511#if defined(CONFIG_ACPI_HOTPLUG_MEMORY) || defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)
513 512
514void online_page(struct page *page) 513void online_page(struct page *page)
515{ 514{
@@ -520,6 +519,38 @@ void online_page(struct page *page)
520 num_physpages++; 519 num_physpages++;
521} 520}
522 521
522#ifndef CONFIG_MEMORY_HOTPLUG
523/*
524 * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
525 * just online the pages.
526 */
527int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
528{
529 int err = -EIO;
530 unsigned long pfn;
531 unsigned long total = 0, mem = 0;
532 for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
533 if (pfn_valid(pfn)) {
534 online_page(pfn_to_page(pfn));
535 err = 0;
536 mem++;
537 }
538 total++;
539 }
540 if (!err) {
541 z->spanned_pages += total;
542 z->present_pages += mem;
543 z->zone_pgdat->node_spanned_pages += total;
544 z->zone_pgdat->node_present_pages += mem;
545 }
546 return err;
547}
548#endif
549
550/*
551 * Memory is added always to NORMAL zone. This means you will never get
552 * additional DMA/DMA32 memory.
553 */
523int add_memory(u64 start, u64 size) 554int add_memory(u64 start, u64 size)
524{ 555{
525 struct pglist_data *pgdat = NODE_DATA(0); 556 struct pglist_data *pgdat = NODE_DATA(0);
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 4be82d6e2b48..cc02573a3271 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -100,11 +100,30 @@ int early_pfn_to_nid(unsigned long pfn)
100} 100}
101#endif 101#endif
102 102
103static void * __init
104early_node_mem(int nodeid, unsigned long start, unsigned long end,
105 unsigned long size)
106{
107 unsigned long mem = find_e820_area(start, end, size);
108 void *ptr;
109 if (mem != -1L)
110 return __va(mem);
111 ptr = __alloc_bootmem_nopanic(size,
112 SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS));
113 if (ptr == 0) {
114 printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
115 size, nodeid);
116 return NULL;
117 }
118 return ptr;
119}
120
103/* Initialize bootmem allocator for a node */ 121/* Initialize bootmem allocator for a node */
104void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) 122void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
105{ 123{
106 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; 124 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
107 unsigned long nodedata_phys; 125 unsigned long nodedata_phys;
126 void *bootmap;
108 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); 127 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
109 128
110 start = round_up(start, ZONE_ALIGN); 129 start = round_up(start, ZONE_ALIGN);
@@ -114,13 +133,11 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
114 start_pfn = start >> PAGE_SHIFT; 133 start_pfn = start >> PAGE_SHIFT;
115 end_pfn = end >> PAGE_SHIFT; 134 end_pfn = end >> PAGE_SHIFT;
116 135
117 nodedata_phys = find_e820_area(start, end, pgdat_size); 136 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size);
118 if (nodedata_phys == -1L) 137 if (node_data[nodeid] == NULL)
119 panic("Cannot find memory pgdat in node %d\n", nodeid); 138 return;
120 139 nodedata_phys = __pa(node_data[nodeid]);
121 Dprintk("nodedata_phys %lx\n", nodedata_phys);
122 140
123 node_data[nodeid] = phys_to_virt(nodedata_phys);
124 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); 141 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
125 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; 142 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
126 NODE_DATA(nodeid)->node_start_pfn = start_pfn; 143 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
@@ -129,9 +146,15 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
129 /* Find a place for the bootmem map */ 146 /* Find a place for the bootmem map */
130 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); 147 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
131 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); 148 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
132 bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT); 149 bootmap = early_node_mem(nodeid, bootmap_start, end,
133 if (bootmap_start == -1L) 150 bootmap_pages<<PAGE_SHIFT);
134 panic("Not enough continuous space for bootmap on node %d", nodeid); 151 if (bootmap == NULL) {
152 if (nodedata_phys < start || nodedata_phys >= end)
153 free_bootmem((unsigned long)node_data[nodeid],pgdat_size);
154 node_data[nodeid] = NULL;
155 return;
156 }
157 bootmap_start = __pa(bootmap);
135 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); 158 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
136 159
137 bootmap_size = init_bootmem_node(NODE_DATA(nodeid), 160 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
@@ -142,6 +165,9 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
142 165
143 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); 166 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
144 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT); 167 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
168#ifdef CONFIG_ACPI_NUMA
169 srat_reserve_add_area(nodeid);
170#endif
145 node_set_online(nodeid); 171 node_set_online(nodeid);
146} 172}
147 173
@@ -335,6 +361,8 @@ __init int numa_setup(char *opt)
335#ifdef CONFIG_ACPI_NUMA 361#ifdef CONFIG_ACPI_NUMA
336 if (!strncmp(opt,"noacpi",6)) 362 if (!strncmp(opt,"noacpi",6))
337 acpi_numa = -1; 363 acpi_numa = -1;
364 if (!strncmp(opt,"hotadd=", 7))
365 hotadd_percent = simple_strtoul(opt+7, NULL, 10);
338#endif 366#endif
339 return 1; 367 return 1;
340} 368}
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c
index 2eb879590dc4..15ae9fcd65a7 100644
--- a/arch/x86_64/mm/srat.c
+++ b/arch/x86_64/mm/srat.c
@@ -15,15 +15,26 @@
15#include <linux/bitmap.h> 15#include <linux/bitmap.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/topology.h> 17#include <linux/topology.h>
18#include <linux/bootmem.h>
19#include <linux/mm.h>
18#include <asm/proto.h> 20#include <asm/proto.h>
19#include <asm/numa.h> 21#include <asm/numa.h>
20#include <asm/e820.h> 22#include <asm/e820.h>
21 23
24#if (defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \
25 defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)) \
26 && !defined(CONFIG_MEMORY_HOTPLUG)
27#define RESERVE_HOTADD 1
28#endif
29
22static struct acpi_table_slit *acpi_slit; 30static struct acpi_table_slit *acpi_slit;
23 31
24static nodemask_t nodes_parsed __initdata; 32static nodemask_t nodes_parsed __initdata;
25static nodemask_t nodes_found __initdata; 33static nodemask_t nodes_found __initdata;
26static struct bootnode nodes[MAX_NUMNODES] __initdata; 34static struct bootnode nodes[MAX_NUMNODES] __initdata;
35static struct bootnode nodes_add[MAX_NUMNODES] __initdata;
36static int found_add_area __initdata;
37int hotadd_percent __initdata = 10;
27static u8 pxm2node[256] = { [0 ... 255] = 0xff }; 38static u8 pxm2node[256] = { [0 ... 255] = 0xff };
28 39
29/* Too small nodes confuse the VM badly. Usually they result 40/* Too small nodes confuse the VM badly. Usually they result
@@ -71,6 +82,10 @@ static __init int conflicting_nodes(unsigned long start, unsigned long end)
71static __init void cutoff_node(int i, unsigned long start, unsigned long end) 82static __init void cutoff_node(int i, unsigned long start, unsigned long end)
72{ 83{
73 struct bootnode *nd = &nodes[i]; 84 struct bootnode *nd = &nodes[i];
85
86 if (found_add_area)
87 return;
88
74 if (nd->start < start) { 89 if (nd->start < start) {
75 nd->start = start; 90 nd->start = start;
76 if (nd->end < nd->start) 91 if (nd->end < nd->start)
@@ -90,6 +105,8 @@ static __init void bad_srat(void)
90 acpi_numa = -1; 105 acpi_numa = -1;
91 for (i = 0; i < MAX_LOCAL_APIC; i++) 106 for (i = 0; i < MAX_LOCAL_APIC; i++)
92 apicid_to_node[i] = NUMA_NO_NODE; 107 apicid_to_node[i] = NUMA_NO_NODE;
108 for (i = 0; i < MAX_NUMNODES; i++)
109 nodes_add[i].start = nodes[i].end = 0;
93} 110}
94 111
95static __init inline int srat_disabled(void) 112static __init inline int srat_disabled(void)
@@ -155,11 +172,114 @@ acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa)
155 pxm, pa->apic_id, node); 172 pxm, pa->apic_id, node);
156} 173}
157 174
175#ifdef RESERVE_HOTADD
176/*
177 * Protect against too large hotadd areas that would fill up memory.
178 */
179static int hotadd_enough_memory(struct bootnode *nd)
180{
181 static unsigned long allocated;
182 static unsigned long last_area_end;
183 unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT;
184 long mem = pages * sizeof(struct page);
185 unsigned long addr;
186 unsigned long allowed;
187 unsigned long oldpages = pages;
188
189 if (mem < 0)
190 return 0;
191 allowed = (end_pfn - e820_hole_size(0, end_pfn)) * PAGE_SIZE;
192 allowed = (allowed / 100) * hotadd_percent;
193 if (allocated + mem > allowed) {
194 /* Give them at least part of their hotadd memory upto hotadd_percent
195 It would be better to spread the limit out
196 over multiple hotplug areas, but that is too complicated
197 right now */
198 if (allocated >= allowed)
199 return 0;
200 pages = (allowed - allocated + mem) / sizeof(struct page);
201 mem = pages * sizeof(struct page);
202 nd->end = nd->start + pages*PAGE_SIZE;
203 }
204 /* Not completely fool proof, but a good sanity check */
205 addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem);
206 if (addr == -1UL)
207 return 0;
208 if (pages != oldpages)
209 printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n",
210 pages << PAGE_SHIFT);
211 last_area_end = addr + mem;
212 allocated += mem;
213 return 1;
214}
215
216/*
217 * It is fine to add this area to the nodes data it will be used later
218 * This code supports one contigious hot add area per node.
219 */
220static int reserve_hotadd(int node, unsigned long start, unsigned long end)
221{
222 unsigned long s_pfn = start >> PAGE_SHIFT;
223 unsigned long e_pfn = end >> PAGE_SHIFT;
224 int changed = 0;
225 struct bootnode *nd = &nodes_add[node];
226
227 /* I had some trouble with strange memory hotadd regions breaking
228 the boot. Be very strict here and reject anything unexpected.
229 If you want working memory hotadd write correct SRATs.
230
231 The node size check is a basic sanity check to guard against
232 mistakes */
233 if ((signed long)(end - start) < NODE_MIN_SIZE) {
234 printk(KERN_ERR "SRAT: Hotplug area too small\n");
235 return -1;
236 }
237
238 /* This check might be a bit too strict, but I'm keeping it for now. */
239 if (e820_hole_size(s_pfn, e_pfn) != e_pfn - s_pfn) {
240 printk(KERN_ERR "SRAT: Hotplug area has existing memory\n");
241 return -1;
242 }
243
244 if (!hotadd_enough_memory(&nodes_add[node])) {
245 printk(KERN_ERR "SRAT: Hotplug area too large\n");
246 return -1;
247 }
248
249 /* Looks good */
250
251 found_add_area = 1;
252 if (nd->start == nd->end) {
253 nd->start = start;
254 nd->end = end;
255 changed = 1;
256 } else {
257 if (nd->start == end) {
258 nd->start = start;
259 changed = 1;
260 }
261 if (nd->end == start) {
262 nd->end = end;
263 changed = 1;
264 }
265 if (!changed)
266 printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
267 }
268
269 if ((nd->end >> PAGE_SHIFT) > end_pfn)
270 end_pfn = nd->end >> PAGE_SHIFT;
271
272 if (changed)
273 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
274 return 0;
275}
276#endif
277
158/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ 278/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
159void __init 279void __init
160acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) 280acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
161{ 281{
162 struct bootnode *nd; 282 struct bootnode *nd, oldnode;
163 unsigned long start, end; 283 unsigned long start, end;
164 int node, pxm; 284 int node, pxm;
165 int i; 285 int i;
@@ -172,6 +292,8 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
172 } 292 }
173 if (ma->flags.enabled == 0) 293 if (ma->flags.enabled == 0)
174 return; 294 return;
295 if (ma->flags.hot_pluggable && hotadd_percent == 0)
296 return;
175 start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32); 297 start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32);
176 end = start + (ma->length_lo | ((u64)ma->length_hi << 32)); 298 end = start + (ma->length_lo | ((u64)ma->length_hi << 32));
177 pxm = ma->proximity_domain; 299 pxm = ma->proximity_domain;
@@ -181,10 +303,6 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
181 bad_srat(); 303 bad_srat();
182 return; 304 return;
183 } 305 }
184 /* It is fine to add this area to the nodes data it will be used later*/
185 if (ma->flags.hot_pluggable == 1)
186 printk(KERN_INFO "SRAT: hot plug zone found %lx - %lx \n",
187 start, end);
188 i = conflicting_nodes(start, end); 306 i = conflicting_nodes(start, end);
189 if (i == node) { 307 if (i == node) {
190 printk(KERN_WARNING 308 printk(KERN_WARNING
@@ -199,6 +317,7 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
199 return; 317 return;
200 } 318 }
201 nd = &nodes[node]; 319 nd = &nodes[node];
320 oldnode = *nd;
202 if (!node_test_and_set(node, nodes_parsed)) { 321 if (!node_test_and_set(node, nodes_parsed)) {
203 nd->start = start; 322 nd->start = start;
204 nd->end = end; 323 nd->end = end;
@@ -208,8 +327,19 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
208 if (nd->end < end) 327 if (nd->end < end)
209 nd->end = end; 328 nd->end = end;
210 } 329 }
330
211 printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, 331 printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
212 nd->start, nd->end); 332 nd->start, nd->end);
333
334#ifdef RESERVE_HOTADD
335 if (ma->flags.hot_pluggable && reserve_hotadd(node, start, end) < 0) {
336 /* Ignore hotadd region. Undo damage */
337 printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
338 *nd = oldnode;
339 if ((nd->start | nd->end) == 0)
340 node_clear(node, nodes_parsed);
341 }
342#endif
213} 343}
214 344
215/* Sanity check to catch more bad SRATs (they are amazingly common). 345/* Sanity check to catch more bad SRATs (they are amazingly common).
@@ -225,6 +355,9 @@ static int nodes_cover_memory(void)
225 unsigned long e = nodes[i].end >> PAGE_SHIFT; 355 unsigned long e = nodes[i].end >> PAGE_SHIFT;
226 pxmram += e - s; 356 pxmram += e - s;
227 pxmram -= e820_hole_size(s, e); 357 pxmram -= e820_hole_size(s, e);
358 pxmram -= nodes_add[i].end - nodes_add[i].start;
359 if ((long)pxmram < 0)
360 pxmram = 0;
228 } 361 }
229 362
230 e820ram = end_pfn - e820_hole_size(0, end_pfn); 363 e820ram = end_pfn - e820_hole_size(0, end_pfn);
@@ -258,7 +391,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
258 391
259 /* First clean up the node list */ 392 /* First clean up the node list */
260 for (i = 0; i < MAX_NUMNODES; i++) { 393 for (i = 0; i < MAX_NUMNODES; i++) {
261 cutoff_node(i, start, end); 394 cutoff_node(i, start, end);
262 if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) 395 if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE)
263 unparse_node(i); 396 unparse_node(i);
264 } 397 }
@@ -282,6 +415,12 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
282 /* Finally register nodes */ 415 /* Finally register nodes */
283 for_each_node_mask(i, nodes_parsed) 416 for_each_node_mask(i, nodes_parsed)
284 setup_node_bootmem(i, nodes[i].start, nodes[i].end); 417 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
418 /* Try again in case setup_node_bootmem missed one due
419 to missing bootmem */
420 for_each_node_mask(i, nodes_parsed)
421 if (!node_online(i))
422 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
423
285 for (i = 0; i < NR_CPUS; i++) { 424 for (i = 0; i < NR_CPUS; i++) {
286 if (cpu_to_node[i] == NUMA_NO_NODE) 425 if (cpu_to_node[i] == NUMA_NO_NODE)
287 continue; 426 continue;
@@ -303,6 +442,25 @@ static int node_to_pxm(int n)
303 return 0; 442 return 0;
304} 443}
305 444
445void __init srat_reserve_add_area(int nodeid)
446{
447 if (found_add_area && nodes_add[nodeid].end) {
448 u64 total_mb;
449
450 printk(KERN_INFO "SRAT: Reserving hot-add memory space "
451 "for node %d at %Lx-%Lx\n",
452 nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
453 total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
454 >> PAGE_SHIFT;
455 total_mb *= sizeof(struct page);
456 total_mb >>= 20;
457 printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
458 "pre-allocated memory.\n", (unsigned long long)total_mb);
459 reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
460 nodes_add[nodeid].end - nodes_add[nodeid].start);
461 }
462}
463
306int __node_distance(int a, int b) 464int __node_distance(int a, int b)
307{ 465{
308 int index; 466 int index;