Diffstat (limited to 'arch/x86_64/mm/numa.c')

 -rw-r--r--  arch/x86_64/mm/numa.c | 306 ++++++++++++++++++++-----------
 1 file changed, 199 insertions(+), 107 deletions(-)
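This change replaces the old numa=fake=N emulation, which could only carve memory into N equal pieces, with a small command-line grammar for asymmetric fake-node layouts. Reading the parser in the first hunk below: digits accumulate a node size in megabytes, '*' turns the pending number into a repeat coefficient, and ',' (or the end of the string) commits the pending nodes, while a trailing "*size" fills whatever RAM remains with size-MB chunks. As a worked example derived from that code rather than from any documentation, numa=fake=2*512,1024,*128 should produce two 512 MB nodes, one 1024 MB node, and the remainder in 128 MB chunks, with every size rounded down to the 64 MB FAKE_NODE_MIN_SIZE granule. The plain numeric form still works and now goes through split_nodes_equally(), which consolidates sub-granule leftovers into "big" nodes that get one extra granule each: on a hole-free 4 GB machine, numa=fake=5 works out to three 832 MB nodes, a 768 MB fourth node, and a final 832 MB node that absorbs the rounding slack. A runnable user-space sketch of the parser follows the first hunk.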
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 41b8fb069924..51548947ad3b 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -273,125 +273,213 @@ void __init numa_init_array(void)
 
 #ifdef CONFIG_NUMA_EMU
 /* Numa emulation */
-int numa_fake __initdata = 0;
+#define E820_ADDR_HOLE_SIZE(start, end) \
+	(e820_hole_size((start) >> PAGE_SHIFT, (end) >> PAGE_SHIFT) << \
+	 PAGE_SHIFT)
+char *cmdline __initdata;
 
 /*
- * This function is used to find out if the start and end correspond to
- * different zones.
+ * Setups up nid to range from addr to addr + size.  If the end boundary is
+ * greater than max_addr, then max_addr is used instead.  The return value is 0
+ * if there is additional memory left for allocation past addr and -1 otherwise.
+ * addr is adjusted to be at the end of the node.
  */
-int zone_cross_over(unsigned long start, unsigned long end)
+static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
+				   u64 size, u64 max_addr)
 {
-	if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) &&
-	    (end >= (MAX_DMA32_PFN << PAGE_SHIFT)))
-		return 1;
-	return 0;
+	int ret = 0;
+	nodes[nid].start = *addr;
+	*addr += size;
+	if (*addr >= max_addr) {
+		*addr = max_addr;
+		ret = -1;
+	}
+	nodes[nid].end = *addr;
+	node_set(nid, node_possible_map);
+	printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
+	       nodes[nid].start, nodes[nid].end,
+	       (nodes[nid].end - nodes[nid].start) >> 20);
+	return ret;
 }
 
-static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
+/*
+ * Splits num_nodes nodes up equally starting at node_start.  The return value
+ * is the number of nodes split up and addr is adjusted to be at the end of the
+ * last node allocated.
+ */
+static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
+				      u64 max_addr, int node_start,
+				      int num_nodes)
 {
-	int i, big;
-	struct bootnode nodes[MAX_NUMNODES];
-	unsigned long sz, old_sz;
-	unsigned long hole_size;
-	unsigned long start, end;
-	unsigned long max_addr = (end_pfn << PAGE_SHIFT);
-
-	start = (start_pfn << PAGE_SHIFT);
-	hole_size = e820_hole_size(start, max_addr);
-	sz = (max_addr - start - hole_size) / numa_fake;
-
-	/* Kludge needed for the hash function */
-
-	old_sz = sz;
-	/*
-	 * Round down to the nearest FAKE_NODE_MIN_SIZE.
-	 */
-	sz &= FAKE_NODE_MIN_HASH_MASK;
+	unsigned int big;
+	u64 size;
+	int i;
 
+	if (num_nodes <= 0)
+		return -1;
+	if (num_nodes > MAX_NUMNODES)
+		num_nodes = MAX_NUMNODES;
+	size = (max_addr - *addr - E820_ADDR_HOLE_SIZE(*addr, max_addr)) /
+	       num_nodes;
 	/*
-	 * We ensure that each node is at least 64MB big. Smaller than this
-	 * size can cause VM hiccups.
+	 * Calculate the number of big nodes that can be allocated as a result
+	 * of consolidating the leftovers.
 	 */
-	if (sz == 0) {
-		printk(KERN_INFO "Not enough memory for %d nodes. Reducing "
-		       "the number of nodes\n", numa_fake);
-		numa_fake = (max_addr - start - hole_size) / FAKE_NODE_MIN_SIZE;
-		printk(KERN_INFO "Number of fake nodes will be = %d\n",
-		       numa_fake);
-		sz = FAKE_NODE_MIN_SIZE;
+	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
+	      FAKE_NODE_MIN_SIZE;
+
+	/* Round down to nearest FAKE_NODE_MIN_SIZE. */
+	size &= FAKE_NODE_MIN_HASH_MASK;
+	if (!size) {
+		printk(KERN_ERR "Not enough memory for each node.  "
+		       "NUMA emulation disabled.\n");
+		return -1;
 	}
-	/*
-	 * Find out how many nodes can get an extra NODE_MIN_SIZE granule.
-	 * This logic ensures the extra memory gets distributed among as many
-	 * nodes as possible (as compared to one single node getting all that
-	 * extra memory.
-	 */
-	big = ((old_sz - sz) * numa_fake) / FAKE_NODE_MIN_SIZE;
-	printk(KERN_INFO "Fake node Size: %luMB hole_size: %luMB big nodes: "
-	       "%d\n",
-	       (sz >> 20), (hole_size >> 20), big);
-	memset(&nodes,0,sizeof(nodes));
-	end = start;
-	for (i = 0; i < numa_fake; i++) {
-		/*
-		 * In case we are not able to allocate enough memory for all
-		 * the nodes, we reduce the number of fake nodes.
-		 */
-		if (end >= max_addr) {
-			numa_fake = i - 1;
-			break;
-		}
-		start = nodes[i].start = end;
-		/*
-		 * Final node can have all the remaining memory.
-		 */
-		if (i == numa_fake-1)
-			sz = max_addr - start;
-		end = nodes[i].start + sz;
-		/*
-		 * Fir "big" number of nodes get extra granule.
-		 */
+
+	for (i = node_start; i < num_nodes + node_start; i++) {
+		u64 end = *addr + size;
 		if (i < big)
 			end += FAKE_NODE_MIN_SIZE;
 		/*
-		 * Iterate over the range to ensure that this node gets at
-		 * least sz amount of RAM (excluding holes)
+		 * The final node can have the remaining system RAM.  Other
+		 * nodes receive roughly the same amount of available pages.
 		 */
-		while ((end - start - e820_hole_size(start, end)) < sz) {
-			end += FAKE_NODE_MIN_SIZE;
-			if (end >= max_addr)
-				break;
+		if (i == num_nodes + node_start - 1)
+			end = max_addr;
+		else
+			while (end - *addr - E820_ADDR_HOLE_SIZE(*addr, end) <
+			       size) {
+				end += FAKE_NODE_MIN_SIZE;
+				if (end > max_addr) {
+					end = max_addr;
+					break;
+				}
+			}
+		if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
+			break;
+	}
+	return i - node_start + 1;
+}
+
+/*
+ * Splits the remaining system RAM into chunks of size.  The remaining memory is
+ * always assigned to a final node and can be asymmetric.  Returns the number of
+ * nodes split.
+ */
+static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
+				      u64 max_addr, int node_start, u64 size)
+{
+	int i = node_start;
+	size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
+	while (!setup_node_range(i++, nodes, addr, size, max_addr))
+		;
+	return i - node_start;
+}
+
+/*
+ * Sets up the system RAM area from start_pfn to end_pfn according to the
+ * numa=fake command-line option.
+ */
+static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
+{
+	struct bootnode nodes[MAX_NUMNODES];
+	u64 addr = start_pfn << PAGE_SHIFT;
+	u64 max_addr = end_pfn << PAGE_SHIFT;
+	int num_nodes = 0;
+	int coeff_flag;
+	int coeff = -1;
+	int num = 0;
+	u64 size;
+	int i;
+
+	memset(&nodes, 0, sizeof(nodes));
+	/*
+	 * If the numa=fake command-line is just a single number N, split the
+	 * system RAM into N fake nodes.
+	 */
+	if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
+		num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0,
+						simple_strtol(cmdline, NULL, 0));
+		if (num_nodes < 0)
+			return num_nodes;
+		goto out;
+	}
+
+	/* Parse the command line. */
+	for (coeff_flag = 0; ; cmdline++) {
+		if (*cmdline && isdigit(*cmdline)) {
+			num = num * 10 + *cmdline - '0';
+			continue;
 		}
-		/*
-		 * Look at the next node to make sure there is some real memory
-		 * to map. Bad things happen when the only memory present
-		 * in a zone on a fake node is IO hole.
-		 */
-		while (e820_hole_size(end, end + FAKE_NODE_MIN_SIZE) > 0) {
-			if (zone_cross_over(start, end + sz)) {
-				end = (MAX_DMA32_PFN << PAGE_SHIFT);
+		if (*cmdline == '*') {
+			if (num > 0)
+				coeff = num;
+			coeff_flag = 1;
+		}
+		if (!*cmdline || *cmdline == ',') {
+			if (!coeff_flag)
+				coeff = 1;
+			/*
+			 * Round down to the nearest FAKE_NODE_MIN_SIZE.
+			 * Command-line coefficients are in megabytes.
+			 */
+			size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
+			if (size)
+				for (i = 0; i < coeff; i++, num_nodes++)
+					if (setup_node_range(num_nodes, nodes,
+						&addr, size, max_addr) < 0)
+						goto done;
+			if (!*cmdline)
 				break;
-			}
-			if (end >= max_addr)
+			coeff_flag = 0;
+			coeff = -1;
+		}
+		num = 0;
+	}
+done:
+	if (!num_nodes)
+		return -1;
+	/* Fill remainder of system RAM, if appropriate. */
+	if (addr < max_addr) {
+		if (coeff_flag && coeff < 0) {
+			/* Split remaining nodes into num-sized chunks */
+			num_nodes += split_nodes_by_size(nodes, &addr, max_addr,
+							 num_nodes, num);
+			goto out;
+		}
+		switch (*(cmdline - 1)) {
+		case '*':
+			/* Split remaining nodes into coeff chunks */
+			if (coeff <= 0)
 				break;
-		end += FAKE_NODE_MIN_SIZE;
+			num_nodes += split_nodes_equally(nodes, &addr, max_addr,
+							 num_nodes, coeff);
+			break;
+		case ',':
+			/* Do not allocate remaining system RAM */
+			break;
+		default:
+			/* Give one final node */
+			setup_node_range(num_nodes, nodes, &addr,
+					 max_addr - addr, max_addr);
+			num_nodes++;
 		}
-		if (end > max_addr)
-			end = max_addr;
-		nodes[i].end = end;
-		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
-		       i,
-		       nodes[i].start, nodes[i].end,
-		       (nodes[i].end - nodes[i].start) >> 20);
-		node_set_online(i);
+	}
+out:
+	memnode_shift = compute_hash_shift(nodes, num_nodes);
+	if (memnode_shift < 0) {
+		memnode_shift = 0;
+		printk(KERN_ERR "No NUMA hash function found.  NUMA emulation "
+		       "disabled.\n");
+		return -1;
 	}
-	memnode_shift = compute_hash_shift(nodes, numa_fake);
-	if (memnode_shift < 0) {
-		memnode_shift = 0;
-		printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
-		return -1;
-	}
-	for_each_online_node(i) {
+
+	/*
+	 * We need to vacate all active ranges that may have been registered by
+	 * SRAT.
+	 */
+	remove_all_active_ranges();
+	for_each_node_mask(i, node_possible_map) {
 		e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
 						nodes[i].end >> PAGE_SHIFT);
 		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
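The new splitting logic is self-contained enough to exercise outside the kernel. Below is a minimal user-space sketch, not kernel code: it assumes a hole-free memory map (every E820_ADDR_HOLE_SIZE term is zero, so the hole-compensation loop drops out and split_nodes_equally() is omitted along with the equal-split paths), hard-codes the 64 MB granule, adds bounds guards the kernel gets from its callers, and turns the printk() into printf(). The file name and the harness around main() are invented for illustration.

/*
 * fake_numa_sim.c - user-space sketch of the numa=fake= grammar added here.
 * Hypothetical harness, NOT kernel code: it assumes a hole-free memory map
 * (every E820_ADDR_HOLE_SIZE term is zero), hard-codes the 64 MB granule,
 * and omits the equal-split paths (numa=fake=N and a trailing "N*").
 * Build/run: cc -o fake_numa_sim fake_numa_sim.c && ./fake_numa_sim "2*512,1024,*128" 4096
 */
#include <ctype.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define FAKE_NODE_MIN_SIZE	(64ULL << 20)
#define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1))
#define MAX_NUMNODES		64

struct bootnode { uint64_t start, end; };

/* Claim [*addr, *addr + size) for nid, clamped to max_addr; -1 = exhausted. */
static int setup_node_range(int nid, struct bootnode *nodes, uint64_t *addr,
			    uint64_t size, uint64_t max_addr)
{
	int ret = 0;
	nodes[nid].start = *addr;
	*addr += size;
	if (*addr >= max_addr) {
		*addr = max_addr;
		ret = -1;
	}
	nodes[nid].end = *addr;
	printf("Faking node %d at %016llx-%016llx (%lluMB)\n", nid,
	       (unsigned long long)nodes[nid].start,
	       (unsigned long long)nodes[nid].end,
	       (unsigned long long)((nodes[nid].end - nodes[nid].start) >> 20));
	return ret;
}

/* Trailing "*size": fill the remainder with fixed size-MB chunks. */
static int split_nodes_by_size(struct bootnode *nodes, uint64_t *addr,
			       uint64_t max_addr, int node_start, uint64_t size)
{
	int i = node_start;
	size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
	if (!size)		/* guard: sub-granule sizes round down to zero */
		return 0;
	while (i < MAX_NUMNODES &&	/* bounds guard for the harness */
	       !setup_node_range(i++, nodes, addr, size, max_addr))
		;
	return i - node_start;
}

int main(int argc, char **argv)
{
	struct bootnode nodes[MAX_NUMNODES] = { { 0, 0 } };
	const char *cmdline = argc > 1 ? argv[1] : "2*512,1024,*128";
	uint64_t max_addr = (argc > 2 ? strtoull(argv[2], NULL, 0) : 4096) << 20;
	uint64_t addr = 0, size;
	int num_nodes = 0, coeff_flag, coeff = -1, num = 0, i;

	for (coeff_flag = 0; ; cmdline++) {	/* same grammar as the kernel */
		if (*cmdline && isdigit((unsigned char)*cmdline)) {
			num = num * 10 + *cmdline - '0';
			continue;
		}
		if (*cmdline == '*') {
			if (num > 0)
				coeff = num;
			coeff_flag = 1;
		}
		if (!*cmdline || *cmdline == ',') {
			if (!coeff_flag)
				coeff = 1;
			size = ((uint64_t)num << 20) & FAKE_NODE_MIN_HASH_MASK;
			if (size)
				for (i = 0; i < coeff &&
				     num_nodes < MAX_NUMNODES; i++, num_nodes++)
					if (setup_node_range(num_nodes, nodes,
					    &addr, size, max_addr) < 0)
						goto done;
			if (!*cmdline)
				break;
			coeff_flag = 0;
			coeff = -1;
		}
		num = 0;
	}
done:	/* "*size" fills chunks; "," leaves the rest unallocated; otherwise
	 * one final node takes whatever remains, as in the kernel's switch. */
	if (num_nodes && addr < max_addr) {
		if (coeff_flag && coeff < 0)
			num_nodes += split_nodes_by_size(nodes, &addr, max_addr,
							 num_nodes, num);
		else if (*(cmdline - 1) != ',' && num_nodes < MAX_NUMNODES)
			setup_node_range(num_nodes++, nodes, &addr,
					 max_addr - addr, max_addr);
	}
	printf("%d nodes total\n", num_nodes);
	return num_nodes > 0 ? 0 : 1;
}

Run with the defaults, the sketch prints the same "Faking node ..." lines the kernel would log: nodes 0-1 at 512 MB, node 2 at 1024 MB, then sixteen 128 MB nodes covering the remaining 2 GB of the simulated 4 GB machine, for "19 nodes total".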
@@ -399,26 +487,32 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
 	numa_init_array();
 	return 0;
 }
-#endif
+#undef E820_ADDR_HOLE_SIZE
+#endif /* CONFIG_NUMA_EMU */
 
 void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 {
 	int i;
 
+	nodes_clear(node_possible_map);
+
 #ifdef CONFIG_NUMA_EMU
-	if (numa_fake && !numa_emulation(start_pfn, end_pfn))
+	if (cmdline && !numa_emulation(start_pfn, end_pfn))
 		return;
+	nodes_clear(node_possible_map);
 #endif
 
 #ifdef CONFIG_ACPI_NUMA
 	if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
 					  end_pfn << PAGE_SHIFT))
 		return;
+	nodes_clear(node_possible_map);
 #endif
 
 #ifdef CONFIG_K8_NUMA
 	if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
 		return;
+	nodes_clear(node_possible_map);
 #endif
 	printk(KERN_INFO "%s\n",
 	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
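This hunk adds the node_possible_map bookkeeping around the fallback chain in numa_initmem_init(): the map is cleared before any discovery method runs, setup_node_range() sets a bit for every fake node it creates, and the map is cleared again whenever a method falls through to the next one (a method that succeeds returns immediately). That keeps a half-populated map from a failed NUMA-emulation attempt from leaking into the ACPI SRAT, K8, or flat-memory fallbacks that follow; the ACPI and K8 scanners are expected to populate the map on their own success paths.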
@@ -432,6 +526,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 	memnodemap[0] = 0;
 	nodes_clear(node_online_map);
 	node_set_online(0);
+	node_set(0, node_possible_map);
 	for (i = 0; i < NR_CPUS; i++)
 		numa_set_node(i, 0);
 	node_to_cpumask[0] = cpumask_of_cpu(0);
@@ -486,11 +581,8 @@ static __init int numa_setup(char *opt)
 	if (!strncmp(opt,"off",3))
 		numa_off = 1;
 #ifdef CONFIG_NUMA_EMU
-	if(!strncmp(opt, "fake=", 5)) {
-		numa_fake = simple_strtoul(opt+5,NULL,0); ;
-		if (numa_fake >= MAX_NUMNODES)
-			numa_fake = MAX_NUMNODES;
-	}
+	if (!strncmp(opt, "fake=", 5))
+		cmdline = opt + 5;
 #endif
 #ifdef CONFIG_ACPI_NUMA
 	if (!strncmp(opt,"noacpi",6))
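With this final hunk, numa_setup() no longer interprets the fake-node count itself: it only records the string after "fake=" and defers all parsing to numa_emulation() at init time; the MAX_NUMNODES clamp that used to live here has moved into split_nodes_equally(). Existing setups keep working (numa=fake=4 still means four equal nodes), while the richer forms, for example numa=fake=2*512,*256 for two 512 MB nodes with the rest in 256 MB chunks per the parser in the first hunk, come for free from the deferred parsing.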