diff options
Diffstat (limited to 'arch/x86_64/mm')
-rw-r--r-- | arch/x86_64/mm/numa.c | 110 |
1 files changed, 96 insertions, 14 deletions
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 1ec16ea97519..d3f747dd61d3 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c | |||
@@ -272,31 +272,113 @@ void __init numa_init_array(void) | |||
272 | } | 272 | } |
273 | 273 | ||
274 | #ifdef CONFIG_NUMA_EMU | 274 | #ifdef CONFIG_NUMA_EMU |
275 | /* Numa emulation */ | ||
275 | int numa_fake __initdata = 0; | 276 | int numa_fake __initdata = 0; |
276 | 277 | ||
277 | /* Numa emulation */ | 278 | /* |
279 | * This function is used to find out if the start and end correspond to | ||
280 | * different zones. | ||
281 | */ | ||
282 | int zone_cross_over(unsigned long start, unsigned long end) | ||
283 | { | ||
284 | if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) && | ||
285 | (end >= (MAX_DMA32_PFN << PAGE_SHIFT))) | ||
286 | return 1; | ||
287 | return 0; | ||
288 | } | ||
289 | |||
278 | static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) | 290 | static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) |
279 | { | 291 | { |
280 | int i; | 292 | int i, big; |
281 | struct bootnode nodes[MAX_NUMNODES]; | 293 | struct bootnode nodes[MAX_NUMNODES]; |
282 | unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake; | 294 | unsigned long sz, old_sz; |
295 | unsigned long hole_size; | ||
296 | unsigned long start, end; | ||
297 | unsigned long max_addr = (end_pfn << PAGE_SHIFT); | ||
298 | |||
299 | start = (start_pfn << PAGE_SHIFT); | ||
300 | hole_size = e820_hole_size(start, max_addr); | ||
301 | sz = (max_addr - start - hole_size) / numa_fake; | ||
283 | 302 | ||
284 | /* Kludge needed for the hash function */ | 303 | /* Kludge needed for the hash function */ |
285 | if (hweight64(sz) > 1) { | ||
286 | unsigned long x = 1; | ||
287 | while ((x << 1) < sz) | ||
288 | x <<= 1; | ||
289 | if (x < sz/2) | ||
290 | printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n"); | ||
291 | sz = x; | ||
292 | } | ||
293 | 304 | ||
305 | old_sz = sz; | ||
306 | /* | ||
307 | * Round down to the nearest FAKE_NODE_MIN_SIZE. | ||
308 | */ | ||
309 | sz &= FAKE_NODE_MIN_HASH_MASK; | ||
310 | |||
311 | /* | ||
312 | * We ensure that each node is at least 64MB big. Smaller than this | ||
313 | * size can cause VM hiccups. | ||
314 | */ | ||
315 | if (sz == 0) { | ||
316 | printk(KERN_INFO "Not enough memory for %d nodes. Reducing " | ||
317 | "the number of nodes\n", numa_fake); | ||
318 | numa_fake = (max_addr - start - hole_size) / FAKE_NODE_MIN_SIZE; | ||
319 | printk(KERN_INFO "Number of fake nodes will be = %d\n", | ||
320 | numa_fake); | ||
321 | sz = FAKE_NODE_MIN_SIZE; | ||
322 | } | ||
323 | /* | ||
324 | * Find out how many nodes can get an extra NODE_MIN_SIZE granule. | ||
325 | * This logic ensures the extra memory gets distributed among as many | ||
326 | * nodes as possible (as compared to one single node getting all that | ||
327 | * extra memory. | ||
328 | */ | ||
329 | big = ((old_sz - sz) * numa_fake) / FAKE_NODE_MIN_SIZE; | ||
330 | printk(KERN_INFO "Fake node Size: %luMB hole_size: %luMB big nodes: " | ||
331 | "%d\n", | ||
332 | (sz >> 20), (hole_size >> 20), big); | ||
294 | memset(&nodes,0,sizeof(nodes)); | 333 | memset(&nodes,0,sizeof(nodes)); |
334 | end = start; | ||
295 | for (i = 0; i < numa_fake; i++) { | 335 | for (i = 0; i < numa_fake; i++) { |
296 | nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz; | 336 | /* |
337 | * In case we are not able to allocate enough memory for all | ||
338 | * the nodes, we reduce the number of fake nodes. | ||
339 | */ | ||
340 | if (end >= max_addr) { | ||
341 | numa_fake = i - 1; | ||
342 | break; | ||
343 | } | ||
344 | start = nodes[i].start = end; | ||
345 | /* | ||
346 | * Final node can have all the remaining memory. | ||
347 | */ | ||
297 | if (i == numa_fake-1) | 348 | if (i == numa_fake-1) |
298 | sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start; | 349 | sz = max_addr - start; |
299 | nodes[i].end = nodes[i].start + sz; | 350 | end = nodes[i].start + sz; |
351 | /* | ||
352 | * Fir "big" number of nodes get extra granule. | ||
353 | */ | ||
354 | if (i < big) | ||
355 | end += FAKE_NODE_MIN_SIZE; | ||
356 | /* | ||
357 | * Iterate over the range to ensure that this node gets at | ||
358 | * least sz amount of RAM (excluding holes) | ||
359 | */ | ||
360 | while ((end - start - e820_hole_size(start, end)) < sz) { | ||
361 | end += FAKE_NODE_MIN_SIZE; | ||
362 | if (end >= max_addr) | ||
363 | break; | ||
364 | } | ||
365 | /* | ||
366 | * Look at the next node to make sure there is some real memory | ||
367 | * to map. Bad things happen when the only memory present | ||
368 | * in a zone on a fake node is IO hole. | ||
369 | */ | ||
370 | while (e820_hole_size(end, end + FAKE_NODE_MIN_SIZE) > 0) { | ||
371 | if (zone_cross_over(start, end + sz)) { | ||
372 | end = (MAX_DMA32_PFN << PAGE_SHIFT); | ||
373 | break; | ||
374 | } | ||
375 | if (end >= max_addr) | ||
376 | break; | ||
377 | end += FAKE_NODE_MIN_SIZE; | ||
378 | } | ||
379 | if (end > max_addr) | ||
380 | end = max_addr; | ||
381 | nodes[i].end = end; | ||
300 | printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", | 382 | printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", |
301 | i, | 383 | i, |
302 | nodes[i].start, nodes[i].end, | 384 | nodes[i].start, nodes[i].end, |