author     Cyrill Gorcunov <gorcunov@openvz.org>    2011-05-19 19:45:49 -0400
committer  Ingo Molnar <mingo@elte.hu>              2011-05-20 07:41:09 -0400
commit     9d0fa6c5f43f2d9c6966dcab7af96a717682fdec (patch)
tree       e3fdd94ba55748fbfc41ad5e38c32169638f2091 /arch/x86/kernel/apic
parent     a39d1f3f67f6a3d72b24f0d8bf9a295a27ea448e (diff)
x86, x2apic: Minimize IPI register writes using cluster groups
In x2apic cluster mode we can group IPI register writes by cluster
instead of sending an individual per-cpu destination message for
each target.
This reduces the number of APIC register writes and the number of
IPI messages (in the best case, with 16 cpus per cluster, by a
factor of 16).
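
To make the grouping concrete: in x2apic cluster mode a logical APIC
ID carries the cluster number in bits 31:16 and a one-hot cpu bit in
bits 15:0, so the destinations of up to 16 cpus in the same cluster
can be OR'd into a single ICR destination value. A minimal userspace
sketch of the encoding follows; the linear cpu-to-logical-ID mapping
used here is a simplifying assumption for illustration, not the
kernel's actual mapping.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

/* Hypothetical helper: build a cluster-mode logical ID for a cpu,
 * assuming cpus are numbered linearly into 16-wide clusters. */
static uint32_t logical_apicid(unsigned int cpu)
{
	return ((uint32_t)(cpu / 16) << 16) | (1u << (cpu % 16));
}

int main(void)
{
	uint32_t dest = 0;
	unsigned int cpu;

	/* cpus 32..35 all live in cluster 2 (32 / 16 == 2). */
	for (cpu = 32; cpu < 36; cpu++)
		dest |= logical_apicid(cpu);

	/* One combined destination instead of four register writes. */
	printf("dest = 0x%08" PRIx32 "\n", dest);	/* 0x0002000f */
	return 0;
}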
With this change, the cost of flush_tlb_others(), with the flush
TLB IPI sent from a cpu in socket-1 to all the logical cpus in
socket-2 (on a Westmere-EX system that has 20 logical cpus per
socket), is now 3x better than with the former 'send one-by-one'
algorithm.
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: steiner@sgi.com
Cc: yinghai@kernel.org
Link: http://lkml.kernel.org/r/20110519234637.512271057@sbsiddha-MOBL3.sc.intel.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86/kernel/apic')
-rw-r--r--    arch/x86/kernel/apic/x2apic_cluster.c    58
1 file changed, 44 insertions(+), 14 deletions(-)
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 4b2bb1381ffa..4dfe9363ff4e 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -5,6 +5,7 @@
 #include <linux/ctype.h>
 #include <linux/init.h>
 #include <linux/dmar.h>
+#include <linux/cpu.h>
 
 #include <asm/smp.h>
 #include <asm/apic.h>
@@ -12,6 +13,7 @@
 
 static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
 static DEFINE_PER_CPU(cpumask_var_t, cpus_in_cluster);
+static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
 
 static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
@@ -54,30 +56,52 @@ static inline u32 x2apic_cluster(int cpu)
 	return per_cpu(x86_cpu_to_logical_apicid, cpu) >> 16;
 }
 
-/*
- * for now, we send the IPI's one by one in the cpumask.
- * TBD: Based on the cpu mask, we can send the IPI's to the cluster group
- * at once. We have 16 cpu's in a cluster. This will minimize IPI register
- * writes.
- */
 static void
 __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
 {
-	unsigned long query_cpu;
-	unsigned long this_cpu;
+	struct cpumask *cpus_in_cluster_ptr;
+	struct cpumask *ipi_mask_ptr;
+	unsigned int cpu, this_cpu;
 	unsigned long flags;
+	u32 dest;
 
 	x2apic_wrmsr_fence();
 
 	local_irq_save(flags);
 
 	this_cpu = smp_processor_id();
-	for_each_cpu(query_cpu, mask) {
-		if (apic_dest == APIC_DEST_ALLBUT && query_cpu == this_cpu)
+
+	/*
+	 * We are going to modify the mask, so we need our own copy,
+	 * and we need to be sure it is manipulated with irqs off.
+	 */
+	ipi_mask_ptr = __raw_get_cpu_var(ipi_mask);
+	cpumask_copy(ipi_mask_ptr, mask);
+
+	/*
+	 * The idea is to send one IPI per cluster.
+	 */
+	for_each_cpu(cpu, ipi_mask_ptr) {
+		unsigned long i;
+
+		cpus_in_cluster_ptr = per_cpu(cpus_in_cluster, cpu);
+		dest = 0;
+
+		/* Collect cpus in cluster. */
+		for_each_cpu_and(i, ipi_mask_ptr, cpus_in_cluster_ptr) {
+			if (apic_dest == APIC_DEST_ALLINC || i != this_cpu)
+				dest |= per_cpu(x86_cpu_to_logical_apicid, i);
+		}
+
+		if (!dest)
 			continue;
-		__x2apic_send_IPI_dest(
-			per_cpu(x86_cpu_to_logical_apicid, query_cpu),
-			vector, apic->dest_logical);
+
+		__x2apic_send_IPI_dest(dest, vector, apic->dest_logical);
+		/*
+		 * Cluster sibling cpus should be discarded now so
+		 * that we do not send an IPI to them a second time.
+		 */
+		cpumask_andnot(ipi_mask_ptr, ipi_mask_ptr, cpus_in_cluster_ptr);
 	}
 
 	local_irq_restore(flags);
@@ -198,6 +222,10 @@ update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, this_cpu),
 					GFP_KERNEL)) {
 			err = -ENOMEM;
+		} else if (!zalloc_cpumask_var(&per_cpu(ipi_mask, this_cpu),
+					       GFP_KERNEL)) {
+			free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu));
+			err = -ENOMEM;
 		}
 		break;
 	case CPU_UP_CANCELED:
@@ -210,6 +238,7 @@ update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu)
 			__cpu_clear(cpu, per_cpu(cpus_in_cluster, this_cpu));
 		}
 		free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu));
+		free_cpumask_var(per_cpu(ipi_mask, this_cpu));
 		break;
 	}
 
@@ -225,8 +254,9 @@ static int x2apic_init_cpu_notifier(void)
 	int cpu = smp_processor_id();
 
 	zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL);
+	zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL);
 
-	BUG_ON(!per_cpu(cpus_in_cluster, cpu));
+	BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu));
 
 	__cpu_set(cpu, per_cpu(cpus_in_cluster, cpu));
 	register_hotcpu_notifier(&x2apic_cpu_notifier);
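
For reference, the per-cluster loop above boils down to the following
standalone sketch, with a 64-bit word standing in for struct cpumask
and send_ipi() as a hypothetical stand-in for the ICR write via
__x2apic_send_IPI_dest(); the linear cpu-to-logical-ID mapping is
again a simplifying assumption.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

/* Hypothetical stand-in for the ICR register write. */
static void send_ipi(uint32_t dest)
{
	printf("ICR write, dest = 0x%08" PRIx32 "\n", dest);
}

/* Simplified linear mapping into 16-wide clusters. */
static uint32_t logical_apicid(unsigned int cpu)
{
	return ((uint32_t)(cpu / 16) << 16) | (1u << (cpu % 16));
}

int main(void)
{
	uint64_t mask = 0x300000003ULL;	/* cpus 0, 1, 32, 33 */

	while (mask) {
		unsigned int cpu = __builtin_ctzll(mask);
		uint64_t cluster = 0xffffULL << (cpu / 16 * 16);
		uint32_t dest = 0;
		uint64_t m;

		/* Collect every requested cpu of this cluster ... */
		for (m = mask & cluster; m; m &= m - 1)
			dest |= logical_apicid(__builtin_ctzll(m));

		send_ipi(dest);	/* one write covers the whole cluster */

		/* ... and drop them so they are not sent a second time,
		 * mirroring the cpumask_andnot() in the patch. */
		mask &= ~cluster;
	}
	return 0;
}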