x86, NUMA: Fix fakenuma boot failure

Currently, the numa=fake boot parameter is broken. If it's used, the kernel may panic due to a divide-by-zero error, depending on the CPU configuration. Call Trace: [<ffffffff8104ad4c>] find_busiest_group+0x38c/0xd30 [<ffffffff81086aff>] ? local_clock+0x6f/0x80 [<ffffffff81050533>] load_balance+0xa3/0x600 [<ffffffff81050f53>] idle_balance+0xf3/0x180 [<ffffffff81550092>] schedule+0x722/0x7d0 [<ffffffff81550538>] ? wait_for_common+0x128/0x190 [<ffffffff81550a65>] schedule_timeout+0x265/0x320 [<ffffffff81095815>] ? lock_release_holdtime+0x35/0x1a0 [<ffffffff81550538>] ? wait_for_common+0x128/0x190 [<ffffffff8109bb6c>] ? __lock_release+0x9c/0x1d0 [<ffffffff815534e0>] ? _raw_spin_unlock_irq+0x30/0x40 [<ffffffff815534e0>] ? _raw_spin_unlock_irq+0x30/0x40 [<ffffffff81550540>] wait_for_common+0x130/0x190 [<ffffffff81051920>] ? try_to_wake_up+0x510/0x510 [<ffffffff8155067d>] wait_for_completion+0x1d/0x20 [<ffffffff8107f36c>] kthread_create_on_node+0xac/0x150 [<ffffffff81077bb0>] ? process_scheduled_works+0x40/0x40 [<ffffffff8155045f>] ? wait_for_common+0x4f/0x190 [<ffffffff8107a283>] __alloc_workqueue_key+0x1a3/0x590 [<ffffffff81e0cce2>] cpuset_init_smp+0x6b/0x7b [<ffffffff81df3d07>] kernel_init+0xc3/0x182 [<ffffffff8155d5e4>] kernel_thread_helper+0x4/0x10 [<ffffffff81553cd4>] ? retint_restore_args+0x13/0x13 [<ffffffff81df3c44>] ? start_kernel+0x400/0x400 [<ffffffff8155d5e0>] ? gs_change+0x13/0x13 The divide by zero is caused by the following line, when group->cpu_power==0: kernel/sched_fair.c::update_sg_lb_stats() /* Adjust by relative CPU power of the group */ sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; This regression was caused by commit e23bba6044 ("x86-64, NUMA: Unify emulated distance mapping") because it changes the cpu -> node mapping in the process of dropping fake_physnodes(). 
old) all cpus are assigned node 0 now) cpus are assigned round robin (the logic is implemented by numa_init_array()) Note: The change in behavior only happens if the system has neither an ACPI SRAT table nor AMD northbridge NUMA information. Round robin assignment doesn't work because init_numa_sched_groups_power() assumes all logical cpus in the same physical cpu share the same node (then it only accounts for group_first_cpu()), and the simple round robin breaks the above assumption. Thus, this patch implements a reassignment of node-ids if buggy firmware or numa emulation produces a wrong cpu node map. It enforces that all logical cpus in the same physical cpu share the same node. Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Acked-by: Tejun Heo <tj@kernel.org> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Brian Gerst <brgerst@gmail.com> Cc: Cyrill Gorcunov <gorcunov@gmail.com> Cc: Shaohui Zheng <shaohui.zheng@intel.com> Cc: David Rientjes <rientjes@google.com> Cc: H. Peter Anvin <hpa@linux.intel.com> Link: http://lkml.kernel.org/r/20110415203928.1303.A69D9226@jp.fujitsu.com Signed-off-by: Ingo Molnar <mingo@elte.hu>
author: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> 2011-04-15 07:39:01 -0400
committer: Ingo Molnar <mingo@elte.hu> 2011-04-15 14:28:19 -0400
commit: 7d6b46707f2491a94f4bd3b4329d2d7f809e9368 (patch)
tree: e5a5085c170c1dd673788a9f48c96cf13102be93
parent: 9d90e49da57fe73a2f35334fdd2fb60dbf3933ed (diff)
1 files changed, 23 insertions, 0 deletions
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c2871d3c71b..8ed8908cc9f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -312,6 +312,26 @@ void __cpuinit smp_store_cpu_info(int id)
                identify_secondary_cpu(c);
 }
+static void __cpuinit check_cpu_siblings_on_same_node(int cpu1, int cpu2)
+{
+        int node1 = early_cpu_to_node(cpu1);
+        int node2 = early_cpu_to_node(cpu2);
+        /*
+         * Our CPU scheduler assumes all logical cpus in the same physical cpu
+         * share the same node. But, buggy ACPI or NUMA emulation might assign
+         * them to different node. Fix it.
+         */
+        if (node1 != node2) {
+                pr_warning("CPU %d in node %d and CPU %d in node %d are in the same physical CPU. forcing same node %d\n",
+                           cpu1, node1, cpu2, node2, node2);
+                numa_remove_cpu(cpu1);
+                numa_set_node(cpu1, node2);
+                numa_add_cpu(cpu1);
+        }
+}
 static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
 {
        cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2));
@@ -320,6 +340,7 @@ static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
        cpumask_set_cpu(cpu2, cpu_core_mask(cpu1));
        cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2));
        cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1));
+        check_cpu_siblings_on_same_node(cpu1, cpu2);
 }
@@ -361,10 +382,12 @@ void __cpuinit set_cpu_sibling_map(int cpu)
                    per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
                        cpumask_set_cpu(i, cpu_llc_shared_mask(cpu));
                        cpumask_set_cpu(cpu, cpu_llc_shared_mask(i));
+                        check_cpu_siblings_on_same_node(cpu, i);
                }
                if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
                        cpumask_set_cpu(i, cpu_core_mask(cpu));
                        cpumask_set_cpu(cpu, cpu_core_mask(i));
+                        check_cpu_siblings_on_same_node(cpu, i);
                        /*
                         *  Does this new cpu bringup a new core?
                         */
author	KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>	2011-04-15 07:39:01 -0400
committer	Ingo Molnar <mingo@elte.hu>	2011-04-15 14:28:19 -0400
commit	7d6b46707f2491a94f4bd3b4329d2d7f809e9368 (patch)
tree	e5a5085c170c1dd673788a9f48c96cf13102be93
parent	9d90e49da57fe73a2f35334fdd2fb60dbf3933ed (diff)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c2871d3c71b..8ed8908cc9f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c
@@ -312,6 +312,26 @@ void __cpuinit smp_store_cpu_info(int id)
312	identify_secondary_cpu(c);	312	identify_secondary_cpu(c);
313	}	313	}
314		314
		315	static void __cpuinit check_cpu_siblings_on_same_node(int cpu1, int cpu2)
		316	{
		317	int node1 = early_cpu_to_node(cpu1);
		318	int node2 = early_cpu_to_node(cpu2);
		319
		320	/*
		321	* Our CPU scheduler assumes all logical cpus in the same physical cpu
		322	* share the same node. But, buggy ACPI or NUMA emulation might assign
		323	* them to different node. Fix it.
		324	*/
		325	if (node1 != node2) {
		326	pr_warning("CPU %d in node %d and CPU %d in node %d are in the same physical CPU. forcing same node %d\n",
		327	cpu1, node1, cpu2, node2, node2);
		328
		329	numa_remove_cpu(cpu1);
		330	numa_set_node(cpu1, node2);
		331	numa_add_cpu(cpu1);
		332	}
		333	}
		334
315	static void __cpuinit link_thread_siblings(int cpu1, int cpu2)	335	static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
316	{	336	{
317	cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2));	337	cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2));
@@ -320,6 +340,7 @@ static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
320	cpumask_set_cpu(cpu2, cpu_core_mask(cpu1));	340	cpumask_set_cpu(cpu2, cpu_core_mask(cpu1));
321	cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2));	341	cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2));
322	cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1));	342	cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1));
		343	check_cpu_siblings_on_same_node(cpu1, cpu2);
323	}	344	}
324		345
325		346
@@ -361,10 +382,12 @@ void __cpuinit set_cpu_sibling_map(int cpu)
361	per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {	382	per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
362	cpumask_set_cpu(i, cpu_llc_shared_mask(cpu));	383	cpumask_set_cpu(i, cpu_llc_shared_mask(cpu));
363	cpumask_set_cpu(cpu, cpu_llc_shared_mask(i));	384	cpumask_set_cpu(cpu, cpu_llc_shared_mask(i));
		385	check_cpu_siblings_on_same_node(cpu, i);
364	}	386	}
365	if (c->phys_proc_id == cpu_data(i).phys_proc_id) {	387	if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
366	cpumask_set_cpu(i, cpu_core_mask(cpu));	388	cpumask_set_cpu(i, cpu_core_mask(cpu));
367	cpumask_set_cpu(cpu, cpu_core_mask(i));	389	cpumask_set_cpu(cpu, cpu_core_mask(i));
		390	check_cpu_siblings_on_same_node(cpu, i);
368	/*	391	/*
369	* Does this new cpu bringup a new core?	392	* Does this new cpu bringup a new core?
370	*/	393	*/