diff options
author | KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> | 2011-04-15 07:39:01 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2011-04-15 14:28:19 -0400 |
commit | 7d6b46707f2491a94f4bd3b4329d2d7f809e9368 (patch) | |
tree | e5a5085c170c1dd673788a9f48c96cf13102be93 /arch | |
parent | 9d90e49da57fe73a2f35334fdd2fb60dbf3933ed (diff) |
x86, NUMA: Fix fakenuma boot failure
Currently, numa=fake boot parameter is broken. If it's used,
kernel may panic due to devide by zero error depending on CPU
configuration
Call Trace:
[<ffffffff8104ad4c>] find_busiest_group+0x38c/0xd30
[<ffffffff81086aff>] ? local_clock+0x6f/0x80
[<ffffffff81050533>] load_balance+0xa3/0x600
[<ffffffff81050f53>] idle_balance+0xf3/0x180
[<ffffffff81550092>] schedule+0x722/0x7d0
[<ffffffff81550538>] ? wait_for_common+0x128/0x190
[<ffffffff81550a65>] schedule_timeout+0x265/0x320
[<ffffffff81095815>] ? lock_release_holdtime+0x35/0x1a0
[<ffffffff81550538>] ? wait_for_common+0x128/0x190
[<ffffffff8109bb6c>] ? __lock_release+0x9c/0x1d0
[<ffffffff815534e0>] ? _raw_spin_unlock_irq+0x30/0x40
[<ffffffff815534e0>] ? _raw_spin_unlock_irq+0x30/0x40
[<ffffffff81550540>] wait_for_common+0x130/0x190
[<ffffffff81051920>] ? try_to_wake_up+0x510/0x510
[<ffffffff8155067d>] wait_for_completion+0x1d/0x20
[<ffffffff8107f36c>] kthread_create_on_node+0xac/0x150
[<ffffffff81077bb0>] ? process_scheduled_works+0x40/0x40
[<ffffffff8155045f>] ? wait_for_common+0x4f/0x190
[<ffffffff8107a283>] __alloc_workqueue_key+0x1a3/0x590
[<ffffffff81e0cce2>] cpuset_init_smp+0x6b/0x7b
[<ffffffff81df3d07>] kernel_init+0xc3/0x182
[<ffffffff8155d5e4>] kernel_thread_helper+0x4/0x10
[<ffffffff81553cd4>] ? retint_restore_args+0x13/0x13
[<ffffffff81df3c44>] ? start_kernel+0x400/0x400
[<ffffffff8155d5e0>] ? gs_change+0x13/0x13
The divede by zero is caused by the following line,
group->cpu_power==0:
kernel/sched_fair.c::update_sg_lb_stats()
/* Adjust by relative CPU power of the group */
sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
This regression was caused by commit e23bba6044 ("x86-64, NUMA: Unify
emulated distance mapping") because it changes cpu -> node
mapping in the process of dropping fake_physnodes().
old) all cpus are assinged node 0
now) cpus are assigned round robin
(the logic is implemented by numa_init_array())
Note: The change in behavior only happens if the system doesn't
have neither ACPI SRAT table nor AMD northbridge NUMA
information.
Round robin assignment doesn't work because init_numa_sched_groups_power()
assumes all logical cpus in the same physical cpu share the same node
(then it only accounts for group_first_cpu()), and the simple round robin
breaks the above assumption.
Thus, this patch implements a reassignment of node-ids if buggy firmware
or numa emulation makes wrong cpu node map. Tt enforce all logical cpus
in the same physical cpu share the same node.
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shaohui Zheng <shaohui.zheng@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: H. Peter Anvin <hpa@linux.intel.com>
Link: http://lkml.kernel.org/r/20110415203928.1303.A69D9226@jp.fujitsu.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch')
-rw-r--r-- | arch/x86/kernel/smpboot.c | 23 |
1 files changed, 23 insertions, 0 deletions
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c2871d3c71b6..8ed8908cc9f7 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c | |||
@@ -312,6 +312,26 @@ void __cpuinit smp_store_cpu_info(int id) | |||
312 | identify_secondary_cpu(c); | 312 | identify_secondary_cpu(c); |
313 | } | 313 | } |
314 | 314 | ||
315 | static void __cpuinit check_cpu_siblings_on_same_node(int cpu1, int cpu2) | ||
316 | { | ||
317 | int node1 = early_cpu_to_node(cpu1); | ||
318 | int node2 = early_cpu_to_node(cpu2); | ||
319 | |||
320 | /* | ||
321 | * Our CPU scheduler assumes all logical cpus in the same physical cpu | ||
322 | * share the same node. But, buggy ACPI or NUMA emulation might assign | ||
323 | * them to different node. Fix it. | ||
324 | */ | ||
325 | if (node1 != node2) { | ||
326 | pr_warning("CPU %d in node %d and CPU %d in node %d are in the same physical CPU. forcing same node %d\n", | ||
327 | cpu1, node1, cpu2, node2, node2); | ||
328 | |||
329 | numa_remove_cpu(cpu1); | ||
330 | numa_set_node(cpu1, node2); | ||
331 | numa_add_cpu(cpu1); | ||
332 | } | ||
333 | } | ||
334 | |||
315 | static void __cpuinit link_thread_siblings(int cpu1, int cpu2) | 335 | static void __cpuinit link_thread_siblings(int cpu1, int cpu2) |
316 | { | 336 | { |
317 | cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2)); | 337 | cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2)); |
@@ -320,6 +340,7 @@ static void __cpuinit link_thread_siblings(int cpu1, int cpu2) | |||
320 | cpumask_set_cpu(cpu2, cpu_core_mask(cpu1)); | 340 | cpumask_set_cpu(cpu2, cpu_core_mask(cpu1)); |
321 | cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2)); | 341 | cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2)); |
322 | cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1)); | 342 | cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1)); |
343 | check_cpu_siblings_on_same_node(cpu1, cpu2); | ||
323 | } | 344 | } |
324 | 345 | ||
325 | 346 | ||
@@ -361,10 +382,12 @@ void __cpuinit set_cpu_sibling_map(int cpu) | |||
361 | per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { | 382 | per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { |
362 | cpumask_set_cpu(i, cpu_llc_shared_mask(cpu)); | 383 | cpumask_set_cpu(i, cpu_llc_shared_mask(cpu)); |
363 | cpumask_set_cpu(cpu, cpu_llc_shared_mask(i)); | 384 | cpumask_set_cpu(cpu, cpu_llc_shared_mask(i)); |
385 | check_cpu_siblings_on_same_node(cpu, i); | ||
364 | } | 386 | } |
365 | if (c->phys_proc_id == cpu_data(i).phys_proc_id) { | 387 | if (c->phys_proc_id == cpu_data(i).phys_proc_id) { |
366 | cpumask_set_cpu(i, cpu_core_mask(cpu)); | 388 | cpumask_set_cpu(i, cpu_core_mask(cpu)); |
367 | cpumask_set_cpu(cpu, cpu_core_mask(i)); | 389 | cpumask_set_cpu(cpu, cpu_core_mask(i)); |
390 | check_cpu_siblings_on_same_node(cpu, i); | ||
368 | /* | 391 | /* |
369 | * Does this new cpu bringup a new core? | 392 | * Does this new cpu bringup a new core? |
370 | */ | 393 | */ |