-rw-r--r--  arch/x86_64/kernel/e820.c   |  31
-rw-r--r--  arch/x86_64/mm/numa.c       | 110
-rw-r--r--  include/asm-x86_64/e820.h   |   1
-rw-r--r--  include/asm-x86_64/mmzone.h |   5
4 files changed, 133 insertions(+), 14 deletions(-)
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index 9d67955bbc31..4651fd22b213 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -191,6 +191,37 @@ unsigned long __init e820_end_of_ram(void)
 }
 
 /*
+ * Find the hole size in the range.
+ */
+unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
+{
+	unsigned long ram = 0;
+	int i;
+
+	for (i = 0; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i];
+		unsigned long last, addr;
+
+		if (ei->type != E820_RAM ||
+		    ei->addr+ei->size <= start ||
+		    ei->addr >= end)
+			continue;
+
+		addr = round_up(ei->addr, PAGE_SIZE);
+		if (addr < start)
+			addr = start;
+
+		last = round_down(ei->addr + ei->size, PAGE_SIZE);
+		if (last >= end)
+			last = end;
+
+		if (last > addr)
+			ram += last - addr;
+	}
+	return ((end - start) - ram);
+}
+
+/*
  * Mark e820 reserved areas as busy for the resource manager.
  */
 void __init e820_reserve_resources(void)
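For readers who want to see what e820_hole_size() computes without booting a kernel, here is a minimal user-space sketch of the same accounting, assuming a 64-bit build. The three-entry map, the 3.5GB-4GB PCI hole and the 2GB-6GB query range are invented example values; only the loop body mirrors the function added above.

/*
 * Stand-alone sketch of the hole accounting; map contents and the query
 * range below are made-up example values, not taken from the patch.
 */
#include <stdio.h>

#define PAGE_SIZE		4096UL
#define round_up(x, y)		((((x) + (y) - 1) / (y)) * (y))
#define round_down(x, y)	(((x) / (y)) * (y))

struct fake_entry { unsigned long addr, size; int is_ram; };

static struct fake_entry map[] = {
	{ 0x000000000UL, 0xe0000000UL, 1 },	/* 0     - 3.5GB: RAM  */
	{ 0x0e0000000UL, 0x20000000UL, 0 },	/* 3.5GB - 4GB:   hole */
	{ 0x100000000UL, 0x80000000UL, 1 },	/* 4GB   - 6GB:   RAM  */
};

static unsigned long hole_size(unsigned long start, unsigned long end)
{
	unsigned long ram = 0;
	unsigned int i;

	for (i = 0; i < sizeof(map) / sizeof(map[0]); i++) {
		unsigned long addr, last;

		if (!map[i].is_ram ||
		    map[i].addr + map[i].size <= start ||
		    map[i].addr >= end)
			continue;

		/* Clip the RAM entry to the queried, page-aligned range. */
		addr = round_up(map[i].addr, PAGE_SIZE);
		if (addr < start)
			addr = start;
		last = round_down(map[i].addr + map[i].size, PAGE_SIZE);
		if (last >= end)
			last = end;
		if (last > addr)
			ram += last - addr;
	}
	/* Whatever is not usable RAM in the range counts as hole. */
	return (end - start) - ram;
}

int main(void)
{
	/* Query 2GB-6GB: the 512MB PCI hole should be reported. */
	printf("hole: %luMB\n", hole_size(0x80000000UL, 0x180000000UL) >> 20);
	return 0;
}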
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 1ec16ea97519..d3f747dd61d3 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -272,31 +272,113 @@ void __init numa_init_array(void)
 }
 
 #ifdef CONFIG_NUMA_EMU
+/* Numa emulation */
 int numa_fake __initdata = 0;
 
-/* Numa emulation */
+/*
+ * This function is used to find out if the start and end correspond to
+ * different zones.
+ */
+int zone_cross_over(unsigned long start, unsigned long end)
+{
+	if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) &&
+			(end >= (MAX_DMA32_PFN << PAGE_SHIFT)))
+		return 1;
+	return 0;
+}
+
 static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
 {
-	int i;
+	int i, big;
 	struct bootnode nodes[MAX_NUMNODES];
-	unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
+	unsigned long sz, old_sz;
+	unsigned long hole_size;
+	unsigned long start, end;
+	unsigned long max_addr = (end_pfn << PAGE_SHIFT);
+
+	start = (start_pfn << PAGE_SHIFT);
+	hole_size = e820_hole_size(start, max_addr);
+	sz = (max_addr - start - hole_size) / numa_fake;
 
 	/* Kludge needed for the hash function */
-	if (hweight64(sz) > 1) {
-		unsigned long x = 1;
-		while ((x << 1) < sz)
-			x <<= 1;
-		if (x < sz/2)
-			printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
-		sz = x;
-	}
 
+	old_sz = sz;
+	/*
+	 * Round down to the nearest FAKE_NODE_MIN_SIZE.
+	 */
+	sz &= FAKE_NODE_MIN_HASH_MASK;
+
+	/*
+	 * We ensure that each node is at least 64MB big. Smaller than this
+	 * size can cause VM hiccups.
+	 */
+	if (sz == 0) {
+		printk(KERN_INFO "Not enough memory for %d nodes. Reducing "
+				"the number of nodes\n", numa_fake);
+		numa_fake = (max_addr - start - hole_size) / FAKE_NODE_MIN_SIZE;
+		printk(KERN_INFO "Number of fake nodes will be = %d\n",
+				numa_fake);
+		sz = FAKE_NODE_MIN_SIZE;
+	}
+	/*
+	 * Find out how many nodes can get an extra FAKE_NODE_MIN_SIZE granule.
+	 * This logic ensures the extra memory gets distributed among as many
+	 * nodes as possible (as compared to one single node getting all that
+	 * extra memory).
+	 */
+	big = ((old_sz - sz) * numa_fake) / FAKE_NODE_MIN_SIZE;
+	printk(KERN_INFO "Fake node Size: %luMB hole_size: %luMB big nodes: "
+			"%d\n",
+			(sz >> 20), (hole_size >> 20), big);
 	memset(&nodes,0,sizeof(nodes));
+	end = start;
 	for (i = 0; i < numa_fake; i++) {
-		nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
+		/*
+		 * In case we are not able to allocate enough memory for all
+		 * the nodes, we reduce the number of fake nodes.
+		 */
+		if (end >= max_addr) {
+			numa_fake = i - 1;
+			break;
+		}
+		start = nodes[i].start = end;
+		/*
+		 * Final node can have all the remaining memory.
+		 */
 		if (i == numa_fake-1)
-			sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
-		nodes[i].end = nodes[i].start + sz;
+			sz = max_addr - start;
+		end = nodes[i].start + sz;
+		/*
+		 * The first "big" nodes each get an extra granule.
+		 */
+		if (i < big)
+			end += FAKE_NODE_MIN_SIZE;
+		/*
+		 * Iterate over the range to ensure that this node gets at
+		 * least sz amount of RAM (excluding holes).
+		 */
+		while ((end - start - e820_hole_size(start, end)) < sz) {
+			end += FAKE_NODE_MIN_SIZE;
+			if (end >= max_addr)
+				break;
+		}
+		/*
+		 * Look at the next node to make sure there is some real memory
+		 * to map. Bad things happen when the only memory present
+		 * in a zone on a fake node is IO hole.
+		 */
+		while (e820_hole_size(end, end + FAKE_NODE_MIN_SIZE) > 0) {
+			if (zone_cross_over(start, end + sz)) {
+				end = (MAX_DMA32_PFN << PAGE_SHIFT);
+				break;
+			}
+			if (end >= max_addr)
+				break;
+			end += FAKE_NODE_MIN_SIZE;
+		}
+		if (end > max_addr)
+			end = max_addr;
+		nodes[i].end = end;
 		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
 		       i,
 		       nodes[i].start, nodes[i].end,
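To see what the sizing arithmetic above produces before the per-node loop runs, here is a small user-space sketch. The inputs (3968MB of usable RAM, numa_fake = 3) are made-up examples; the real loop additionally grows nodes across I/O holes and lets the final node absorb everything up to max_addr, which is not modelled here.

/*
 * Sketch of the node sizing math only; the 3968MB figure and
 * numa_fake = 3 are invented example inputs.
 */
#include <stdio.h>

#define FAKE_NODE_MIN_SIZE	(64UL * 1024 * 1024)
#define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1UL))

int main(void)
{
	unsigned long usable = 3968UL << 20;	/* RAM minus holes      */
	unsigned long sz, old_sz;
	int numa_fake = 3, big, i;

	old_sz = sz = usable / numa_fake;	/* ~1322MB per node     */
	sz &= FAKE_NODE_MIN_HASH_MASK;		/* round down to 1280MB */

	/* Number of nodes that can take one extra 64MB granule of the slack. */
	big = ((old_sz - sz) * numa_fake) / FAKE_NODE_MIN_SIZE;

	for (i = 0; i < numa_fake; i++)
		printf("node %d: %luMB\n", i,
		       (sz + (i < big ? FAKE_NODE_MIN_SIZE : 0)) >> 20);
	return 0;
}

With these inputs sz rounds down from roughly 1322MB to 1280MB and big works out to 1, so node 0 is sized at 1344MB while the other two stay at 1280MB; in the patch the last node would then also pick up whatever remains below max_addr.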
diff --git a/include/asm-x86_64/e820.h b/include/asm-x86_64/e820.h
index 855fb4a454b6..6216fa3f2802 100644
--- a/include/asm-x86_64/e820.h
+++ b/include/asm-x86_64/e820.h
@@ -46,6 +46,7 @@ extern void e820_mark_nosave_regions(void);
 extern void e820_print_map(char *who);
 extern int e820_any_mapped(unsigned long start, unsigned long end, unsigned type);
 extern int e820_all_mapped(unsigned long start, unsigned long end, unsigned type);
+extern unsigned long e820_hole_size(unsigned long start, unsigned long end);
 
 extern void e820_setup_gap(void);
 extern void e820_register_active_regions(int nid,
diff --git a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h
index 39ef106986eb..fb558fb1d211 100644
--- a/include/asm-x86_64/mmzone.h
+++ b/include/asm-x86_64/mmzone.h
@@ -47,5 +47,10 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
 extern int pfn_valid(unsigned long pfn);
 #endif
 
+#ifdef CONFIG_NUMA_EMU
+#define FAKE_NODE_MIN_SIZE	(64*1024*1024)
+#define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1ul))
+#endif
+
 #endif
 #endif
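FAKE_NODE_MIN_HASH_MASK is simply the complement of (64MB - 1), so ANDing a size with it rounds that size down to a 64MB boundary. A tiny self-contained check, where the 200MB and 128MB values are arbitrary examples:

#include <assert.h>

#define FAKE_NODE_MIN_SIZE	(64UL * 1024 * 1024)
#define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1UL))

int main(void)
{
	/* 200MB rounds down to 192MB (3 x 64MB); exact multiples pass through. */
	assert(((200UL << 20) & FAKE_NODE_MIN_HASH_MASK) == (192UL << 20));
	assert(((128UL << 20) & FAKE_NODE_MIN_HASH_MASK) == (128UL << 20));
	return 0;
}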