aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--arch/x86_64/kernel/e820.c31
-rw-r--r--arch/x86_64/mm/numa.c110
-rw-r--r--include/asm-x86_64/e820.h1
-rw-r--r--include/asm-x86_64/mmzone.h5
4 files changed, 133 insertions, 14 deletions
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index 9d67955bbc31..4651fd22b213 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -191,6 +191,37 @@ unsigned long __init e820_end_of_ram(void)
191} 191}
192 192
193/* 193/*
194 * Find the hole size in the range.
195 */
196unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
197{
198 unsigned long ram = 0;
199 int i;
200
201 for (i = 0; i < e820.nr_map; i++) {
202 struct e820entry *ei = &e820.map[i];
203 unsigned long last, addr;
204
205 if (ei->type != E820_RAM ||
206 ei->addr+ei->size <= start ||
207 ei->addr >= end)
208 continue;
209
210 addr = round_up(ei->addr, PAGE_SIZE);
211 if (addr < start)
212 addr = start;
213
214 last = round_down(ei->addr + ei->size, PAGE_SIZE);
215 if (last >= end)
216 last = end;
217
218 if (last > addr)
219 ram += last - addr;
220 }
221 return ((end - start) - ram);
222}
223
224/*
194 * Mark e820 reserved areas as busy for the resource manager. 225 * Mark e820 reserved areas as busy for the resource manager.
195 */ 226 */
196void __init e820_reserve_resources(void) 227void __init e820_reserve_resources(void)
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 1ec16ea97519..d3f747dd61d3 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -272,31 +272,113 @@ void __init numa_init_array(void)
272} 272}
273 273
274#ifdef CONFIG_NUMA_EMU 274#ifdef CONFIG_NUMA_EMU
275/* Numa emulation */
275int numa_fake __initdata = 0; 276int numa_fake __initdata = 0;
276 277
277/* Numa emulation */ 278/*
279 * This function is used to find out if the start and end correspond to
280 * different zones.
281 */
282int zone_cross_over(unsigned long start, unsigned long end)
283{
284 if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) &&
285 (end >= (MAX_DMA32_PFN << PAGE_SHIFT)))
286 return 1;
287 return 0;
288}
289
278static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) 290static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
279{ 291{
280 int i; 292 int i, big;
281 struct bootnode nodes[MAX_NUMNODES]; 293 struct bootnode nodes[MAX_NUMNODES];
282 unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake; 294 unsigned long sz, old_sz;
295 unsigned long hole_size;
296 unsigned long start, end;
297 unsigned long max_addr = (end_pfn << PAGE_SHIFT);
298
299 start = (start_pfn << PAGE_SHIFT);
300 hole_size = e820_hole_size(start, max_addr);
301 sz = (max_addr - start - hole_size) / numa_fake;
283 302
284 /* Kludge needed for the hash function */ 303 /* Kludge needed for the hash function */
285 if (hweight64(sz) > 1) {
286 unsigned long x = 1;
287 while ((x << 1) < sz)
288 x <<= 1;
289 if (x < sz/2)
290 printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
291 sz = x;
292 }
293 304
305 old_sz = sz;
306 /*
307 * Round down to the nearest FAKE_NODE_MIN_SIZE.
308 */
309 sz &= FAKE_NODE_MIN_HASH_MASK;
310
311 /*
312 * We ensure that each node is at least 64MB big. Smaller than this
313 * size can cause VM hiccups.
314 */
315 if (sz == 0) {
316 printk(KERN_INFO "Not enough memory for %d nodes. Reducing "
317 "the number of nodes\n", numa_fake);
318 numa_fake = (max_addr - start - hole_size) / FAKE_NODE_MIN_SIZE;
319 printk(KERN_INFO "Number of fake nodes will be = %d\n",
320 numa_fake);
321 sz = FAKE_NODE_MIN_SIZE;
322 }
323 /*
324 * Find out how many nodes can get an extra FAKE_NODE_MIN_SIZE granule.
325 * This logic ensures the extra memory gets distributed among as many
326 * nodes as possible (as compared to one single node getting all that
327 * extra memory).
328 */
329 big = ((old_sz - sz) * numa_fake) / FAKE_NODE_MIN_SIZE;
330 printk(KERN_INFO "Fake node Size: %luMB hole_size: %luMB big nodes: "
331 "%d\n",
332 (sz >> 20), (hole_size >> 20), big);
294 memset(&nodes,0,sizeof(nodes)); 333 memset(&nodes,0,sizeof(nodes));
334 end = start;
295 for (i = 0; i < numa_fake; i++) { 335 for (i = 0; i < numa_fake; i++) {
296 nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz; 336 /*
337 * In case we are not able to allocate enough memory for all
338 * the nodes, we reduce the number of fake nodes.
339 */
340 if (end >= max_addr) {
341 numa_fake = i - 1;
342 break;
343 }
344 start = nodes[i].start = end;
345 /*
346 * Final node can have all the remaining memory.
347 */
297 if (i == numa_fake-1) 348 if (i == numa_fake-1)
298 sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start; 349 sz = max_addr - start;
299 nodes[i].end = nodes[i].start + sz; 350 end = nodes[i].start + sz;
351 /*
352 * The first "big" nodes each get an extra granule.
353 */
354 if (i < big)
355 end += FAKE_NODE_MIN_SIZE;
356 /*
357 * Iterate over the range to ensure that this node gets at
358 * least sz amount of RAM (excluding holes)
359 */
360 while ((end - start - e820_hole_size(start, end)) < sz) {
361 end += FAKE_NODE_MIN_SIZE;
362 if (end >= max_addr)
363 break;
364 }
365 /*
366 * Look at the next node to make sure there is some real memory
367 * to map. Bad things happen when the only memory present
368 * in a zone on a fake node is IO hole.
369 */
370 while (e820_hole_size(end, end + FAKE_NODE_MIN_SIZE) > 0) {
371 if (zone_cross_over(start, end + sz)) {
372 end = (MAX_DMA32_PFN << PAGE_SHIFT);
373 break;
374 }
375 if (end >= max_addr)
376 break;
377 end += FAKE_NODE_MIN_SIZE;
378 }
379 if (end > max_addr)
380 end = max_addr;
381 nodes[i].end = end;
300 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", 382 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
301 i, 383 i,
302 nodes[i].start, nodes[i].end, 384 nodes[i].start, nodes[i].end,
diff --git a/include/asm-x86_64/e820.h b/include/asm-x86_64/e820.h
index 855fb4a454b6..6216fa3f2802 100644
--- a/include/asm-x86_64/e820.h
+++ b/include/asm-x86_64/e820.h
@@ -46,6 +46,7 @@ extern void e820_mark_nosave_regions(void);
46extern void e820_print_map(char *who); 46extern void e820_print_map(char *who);
47extern int e820_any_mapped(unsigned long start, unsigned long end, unsigned type); 47extern int e820_any_mapped(unsigned long start, unsigned long end, unsigned type);
48extern int e820_all_mapped(unsigned long start, unsigned long end, unsigned type); 48extern int e820_all_mapped(unsigned long start, unsigned long end, unsigned type);
49extern unsigned long e820_hole_size(unsigned long start, unsigned long end);
49 50
50extern void e820_setup_gap(void); 51extern void e820_setup_gap(void);
51extern void e820_register_active_regions(int nid, 52extern void e820_register_active_regions(int nid,
diff --git a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h
index 39ef106986eb..fb558fb1d211 100644
--- a/include/asm-x86_64/mmzone.h
+++ b/include/asm-x86_64/mmzone.h
@@ -47,5 +47,10 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
47extern int pfn_valid(unsigned long pfn); 47extern int pfn_valid(unsigned long pfn);
48#endif 48#endif
49 49
#ifdef CONFIG_NUMA_EMU
/* Size granule, in bytes (64MB), used when sizing emulated NUMA nodes. */
#define FAKE_NODE_MIN_SIZE	(64 << 20)
/* AND-ing with this mask rounds a value down to a FAKE_NODE_MIN_SIZE multiple. */
#define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1UL))
#endif
54
50#endif 55#endif
51#endif 56#endif