smp: make smp_call_function_many() use logic similar to smp_call_function_single()

I'm testing swapout workload in a two-socket Xeon machine. The workload has 10 threads, each thread sequentially accesses separate memory region. TLB flush overhead is very big in the workload. For each page, page reclaim need move it from active lru list and then unmap it. Both need a TLB flush. And this is a multthread workload, TLB flush happens in 10 CPUs. In X86, TLB flush uses generic smp_call)function. So this workload stress smp_call_function_many heavily. Without patch, perf shows: + 24.49% [k] generic_smp_call_function_interrupt - 21.72% [k] _raw_spin_lock - _raw_spin_lock + 79.80% __page_check_address + 6.42% generic_smp_call_function_interrupt + 3.31% get_swap_page + 2.37% free_pcppages_bulk + 1.75% handle_pte_fault + 1.54% put_super + 1.41% grab_super_passive + 1.36% __swap_duplicate + 0.68% blk_flush_plug_list + 0.62% swap_info_get + 6.55% [k] flush_tlb_func + 6.46% [k] smp_call_function_many + 5.09% [k] call_function_interrupt + 4.75% [k] default_send_IPI_mask_sequence_phys + 2.18% [k] find_next_bit swapout throughput is around 1300M/s. With the patch, perf shows: - 27.23% [k] _raw_spin_lock - _raw_spin_lock + 80.53% __page_check_address + 8.39% generic_smp_call_function_single_interrupt + 2.44% get_swap_page + 1.76% free_pcppages_bulk + 1.40% handle_pte_fault + 1.15% __swap_duplicate + 1.05% put_super + 0.98% grab_super_passive + 0.86% blk_flush_plug_list + 0.57% swap_info_get + 8.25% [k] default_send_IPI_mask_sequence_phys + 7.55% [k] call_function_interrupt + 7.47% [k] smp_call_function_many + 7.25% [k] flush_tlb_func + 3.81% [k] _raw_spin_lock_irqsave + 3.78% [k] generic_smp_call_function_single_interrupt swapout throughput is around 1400M/s. So there is around a 7% improvement, and total cpu utilization doesn't change. Without the patch, cfd_data is shared by all CPUs. generic_smp_call_function_interrupt does read/write cfd_data several times which will create a lot of cache ping-pong. With the patch, the data becomes per-cpu. The ping-pong is avoided. And from the perf data, this doesn't make call_single_queue lock contend. Next step is to remove generic_smp_call_function_interrupt() from arch code. Signed-off-by: Shaohua Li <shli@fusionio.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Ingo Molnar <mingo@elte.hu> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Jens Axboe <axboe@kernel.dk> Cc: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Shaohua Li <shli@kernel.org> 2013-02-21 19:43:03 -0500
committer: Linus Torvalds <torvalds@linux-foundation.org> 2013-02-21 20:22:20 -0500
commit: 9a46ad6d6df3b547d057c39db13f69d7170a99e9 (patch)
tree: 684a14d248639c2411c46f53dc7acba4a81c357c /kernel
parent: 6d1c7ccae946f5c959f5c9236813d9c33ae48537 (diff)
1 files changed, 30 insertions, 153 deletions
diff --git a/kernel/smp.c b/kernel/smp.c
index 69f38bd98b42..8e451f3ff51b 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -16,22 +16,12 @@
 #include "smpboot.h"
 #ifdef CONFIG_USE_GENERIC_SMP_HELPERS
-static struct {
-        struct list_head        queue;
-        raw_spinlock_t          lock;
-} call_function __cacheline_aligned_in_smp =
-        {
-                .queue          = LIST_HEAD_INIT(call_function.queue),
-                .lock           = __RAW_SPIN_LOCK_UNLOCKED(call_function.lock),
-        };
 enum {
        CSD_FLAG_LOCK           = 0x01,
 };
 struct call_function_data {
-        struct call_single_data csd;
+        struct call_single_data __percpu *csd;
-        atomic_t                refs;
        cpumask_var_t           cpumask;
        cpumask_var_t           cpumask_ipi;
 };
@@ -60,6 +50,11 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
                if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
                                cpu_to_node(cpu)))
                        return notifier_from_errno(-ENOMEM);
+                cfd->csd = alloc_percpu(struct call_single_data);
+                if (!cfd->csd) {
+                        free_cpumask_var(cfd->cpumask);
+                        return notifier_from_errno(-ENOMEM);
+                }
                break;
 #ifdef CONFIG_HOTPLUG_CPU
@@ -70,6 +65,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
        case CPU_DEAD_FROZEN:
                free_cpumask_var(cfd->cpumask);
                free_cpumask_var(cfd->cpumask_ipi);
+                free_percpu(cfd->csd);
                break;
 #endif
        };
@@ -171,85 +167,6 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
 }
 /*
- * Invoked by arch to handle an IPI for call function. Must be called with
- * interrupts disabled.
- */
-void generic_smp_call_function_interrupt(void)
-{
-        struct call_function_data *data;
-        int cpu = smp_processor_id();
-        /*
-         * Shouldn't receive this interrupt on a cpu that is not yet online.
-         */
-        WARN_ON_ONCE(!cpu_online(cpu));
-        /*
-         * Ensure entry is visible on call_function_queue after we have
-         * entered the IPI. See comment in smp_call_function_many.
-         * If we don't have this, then we may miss an entry on the list
-         * and never get another IPI to process it.
-         */
-        smp_mb();
-        /*
-         * It's ok to use list_for_each_rcu() here even though we may
-         * delete 'pos', since list_del_rcu() doesn't clear ->next
-         */
-        list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
-                int refs;
-                smp_call_func_t func;
-                /*
-                 * Since we walk the list without any locks, we might
-                 * see an entry that was completed, removed from the
-                 * list and is in the process of being reused.
-                 *
-                 * We must check that the cpu is in the cpumask before
-                 * checking the refs, and both must be set before
-                 * executing the callback on this cpu.
-                 */
-                if (!cpumask_test_cpu(cpu, data->cpumask))
-                        continue;
-                smp_rmb();
-                if (atomic_read(&data->refs) == 0)
-                        continue;
-                func = data->csd.func;          /* save for later warn */
-                func(data->csd.info);
-                /*
-                 * If the cpu mask is not still set then func enabled
-                 * interrupts (BUG), and this cpu took another smp call
-                 * function interrupt and executed func(info) twice
-                 * on this cpu.  That nested execution decremented refs.
-                 */
-                if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
-                        WARN(1, "%pf enabled interrupts and double executed\n", func);
-                        continue;
-                }
-                refs = atomic_dec_return(&data->refs);
-                WARN_ON(refs < 0);
-                if (refs)
-                        continue;
-                WARN_ON(!cpumask_empty(data->cpumask));
-                raw_spin_lock(&call_function.lock);
-                list_del_rcu(&data->csd.list);
-                raw_spin_unlock(&call_function.lock);
-                csd_unlock(&data->csd);
-        }
-}
-/*
 * Invoked by arch to handle an IPI for call function single. Must be
 * called from the arch with interrupts disabled.
 */
@@ -453,8 +370,7 @@ void smp_call_function_many(const struct cpumask *mask,
                            smp_call_func_t func, void *info, bool wait)
 {
        struct call_function_data *data;
-        unsigned long flags;
+        int cpu, next_cpu, this_cpu = smp_processor_id();
-        int refs, cpu, next_cpu, this_cpu = smp_processor_id();
        /*
         * Can deadlock when called with interrupts disabled.
@@ -486,50 +402,13 @@ void smp_call_function_many(const struct cpumask *mask,
        }
        data = &__get_cpu_var(cfd_data);
-        csd_lock(&data->csd);
-        /* This BUG_ON verifies our reuse assertions and can be removed */
-        BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
-        /*
-         * The global call function queue list add and delete are protected
-         * by a lock, but the list is traversed without any lock, relying
-         * on the rcu list add and delete to allow safe concurrent traversal.
-         * We reuse the call function data without waiting for any grace
-         * period after some other cpu removes it from the global queue.
-         * This means a cpu might find our data block as it is being
-         * filled out.
-         *
-         * We hold off the interrupt handler on the other cpu by
-         * ordering our writes to the cpu mask vs our setting of the
-         * refs counter.  We assert only the cpu owning the data block
-         * will set a bit in cpumask, and each bit will only be cleared
-         * by the subject cpu.  Each cpu must first find its bit is
-         * set and then check that refs is set indicating the element is
-         * ready to be processed, otherwise it must skip the entry.
-         *
-         * On the previous iteration refs was set to 0 by another cpu.
-         * To avoid the use of transitivity, set the counter to 0 here
-         * so the wmb will pair with the rmb in the interrupt handler.
-         */
-        atomic_set(&data->refs, 0);     /* convert 3rd to 1st party write */
-        data->csd.func = func;
-        data->csd.info = info;
-        /* Ensure 0 refs is visible before mask.  Also orders func and info */
-        smp_wmb();
-        /* We rely on the "and" being processed before the store */
        cpumask_and(data->cpumask, mask, cpu_online_mask);
        cpumask_clear_cpu(this_cpu, data->cpumask);
-        refs = cpumask_weight(data->cpumask);
        /* Some callers race with other cpus changing the passed mask */
-        if (unlikely(!refs)) {
+        if (unlikely(!cpumask_weight(data->cpumask)))
-                csd_unlock(&data->csd);
                return;
-        }
        /*
         * After we put an entry into the list, data->cpumask
@@ -537,34 +416,32 @@ void smp_call_function_many(const struct cpumask *mask,
         * a SMP function call, so data->cpumask will be zero.
         */
        cpumask_copy(data->cpumask_ipi, data->cpumask);
-        raw_spin_lock_irqsave(&call_function.lock, flags);
-        /*
-         * Place entry at the _HEAD_ of the list, so that any cpu still
-         * observing the entry in generic_smp_call_function_interrupt()
-         * will not miss any other list entries:
-         */
-        list_add_rcu(&data->csd.list, &call_function.queue);
-        /*
-         * We rely on the wmb() in list_add_rcu to complete our writes
-         * to the cpumask before this write to refs, which indicates
-         * data is on the list and is ready to be processed.
-         */
-        atomic_set(&data->refs, refs);
-        raw_spin_unlock_irqrestore(&call_function.lock, flags);
-        /*
+        for_each_cpu(cpu, data->cpumask) {
-         * Make the list addition visible before sending the ipi.
+                struct call_single_data *csd = per_cpu_ptr(data->csd, cpu);
-         * (IPIs must obey or appear to obey normal Linux cache
+                struct call_single_queue *dst =
-         * coherency rules -- see comment in generic_exec_single).
+                                        &per_cpu(call_single_queue, cpu);
-         */
+                unsigned long flags;
-        smp_mb();
+                csd_lock(csd);
+                csd->func = func;
+                csd->info = info;
+                raw_spin_lock_irqsave(&dst->lock, flags);
+                list_add_tail(&csd->list, &dst->list);
+                raw_spin_unlock_irqrestore(&dst->lock, flags);
+        }
        /* Send a message to all CPUs in the map */
        arch_send_call_function_ipi_mask(data->cpumask_ipi);
-        /* Optionally wait for the CPUs to complete */
+        if (wait) {
-        if (wait)
+                for_each_cpu(cpu, data->cpumask) {
-                csd_lock_wait(&data->csd);
+                        struct call_single_data *csd =
+                                        per_cpu_ptr(data->csd, cpu);
+                        csd_lock_wait(csd);
+                }
+        }
 }
 EXPORT_SYMBOL(smp_call_function_many);
author	Shaohua Li <shli@kernel.org>	2013-02-21 19:43:03 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-02-21 20:22:20 -0500
commit	9a46ad6d6df3b547d057c39db13f69d7170a99e9 (patch)
tree	684a14d248639c2411c46f53dc7acba4a81c357c /kernel
parent	6d1c7ccae946f5c959f5c9236813d9c33ae48537 (diff)

diff --git a/kernel/smp.c b/kernel/smp.c index 69f38bd98b42..8e451f3ff51b 100644 --- a/kernel/smp.c +++ b/kernel/smp.c
@@ -16,22 +16,12 @@
16	#include "smpboot.h"	16	#include "smpboot.h"
17		17
18	#ifdef CONFIG_USE_GENERIC_SMP_HELPERS	18	#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
19	static struct {
20	struct list_head queue;
21	raw_spinlock_t lock;
22	} call_function __cacheline_aligned_in_smp =
23	{
24	.queue = LIST_HEAD_INIT(call_function.queue),
25	.lock = __RAW_SPIN_LOCK_UNLOCKED(call_function.lock),
26	};
27
28	enum {	19	enum {
29	CSD_FLAG_LOCK = 0x01,	20	CSD_FLAG_LOCK = 0x01,
30	};	21	};
31		22
32	struct call_function_data {	23	struct call_function_data {
33	struct call_single_data csd;	24	struct call_single_data __percpu *csd;
34	atomic_t refs;
35	cpumask_var_t cpumask;	25	cpumask_var_t cpumask;
36	cpumask_var_t cpumask_ipi;	26	cpumask_var_t cpumask_ipi;
37	};	27	};
@@ -60,6 +50,11 @@ hotplug_cfd(struct notifier_block nfb, unsigned long action, void hcpu)
60	if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,	50	if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
61	cpu_to_node(cpu)))	51	cpu_to_node(cpu)))
62	return notifier_from_errno(-ENOMEM);	52	return notifier_from_errno(-ENOMEM);
		53	cfd->csd = alloc_percpu(struct call_single_data);
		54	if (!cfd->csd) {
		55	free_cpumask_var(cfd->cpumask);
		56	return notifier_from_errno(-ENOMEM);
		57	}
63	break;	58	break;
64		59
65	#ifdef CONFIG_HOTPLUG_CPU	60	#ifdef CONFIG_HOTPLUG_CPU
@@ -70,6 +65,7 @@ hotplug_cfd(struct notifier_block nfb, unsigned long action, void hcpu)
70	case CPU_DEAD_FROZEN:	65	case CPU_DEAD_FROZEN:
71	free_cpumask_var(cfd->cpumask);	66	free_cpumask_var(cfd->cpumask);
72	free_cpumask_var(cfd->cpumask_ipi);	67	free_cpumask_var(cfd->cpumask_ipi);
		68	free_percpu(cfd->csd);
73	break;	69	break;
74	#endif	70	#endif
75	};	71	};
@@ -171,85 +167,6 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
171	}	167	}
172		168
173	/*	169	/*
174	* Invoked by arch to handle an IPI for call function. Must be called with
175	* interrupts disabled.
176	*/
177	void generic_smp_call_function_interrupt(void)
178	{
179	struct call_function_data *data;
180	int cpu = smp_processor_id();
181
182	/*
183	* Shouldn't receive this interrupt on a cpu that is not yet online.
184	*/
185	WARN_ON_ONCE(!cpu_online(cpu));
186
187	/*
188	* Ensure entry is visible on call_function_queue after we have
189	* entered the IPI. See comment in smp_call_function_many.
190	* If we don't have this, then we may miss an entry on the list
191	* and never get another IPI to process it.
192	*/
193	smp_mb();
194
195	/*
196	* It's ok to use list_for_each_rcu() here even though we may
197	* delete 'pos', since list_del_rcu() doesn't clear ->next
198	*/
199	list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
200	int refs;
201	smp_call_func_t func;
202
203	/*
204	* Since we walk the list without any locks, we might
205	* see an entry that was completed, removed from the
206	* list and is in the process of being reused.
207	*
208	* We must check that the cpu is in the cpumask before
209	* checking the refs, and both must be set before
210	* executing the callback on this cpu.
211	*/
212
213	if (!cpumask_test_cpu(cpu, data->cpumask))
214	continue;
215
216	smp_rmb();
217
218	if (atomic_read(&data->refs) == 0)
219	continue;
220
221	func = data->csd.func; /* save for later warn */
222	func(data->csd.info);
223
224	/*
225	* If the cpu mask is not still set then func enabled
226	* interrupts (BUG), and this cpu took another smp call
227	* function interrupt and executed func(info) twice
228	* on this cpu. That nested execution decremented refs.
229	*/
230	if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
231	WARN(1, "%pf enabled interrupts and double executed\n", func);
232	continue;
233	}
234
235	refs = atomic_dec_return(&data->refs);
236	WARN_ON(refs < 0);
237
238	if (refs)
239	continue;
240
241	WARN_ON(!cpumask_empty(data->cpumask));
242
243	raw_spin_lock(&call_function.lock);
244	list_del_rcu(&data->csd.list);
245	raw_spin_unlock(&call_function.lock);
246
247	csd_unlock(&data->csd);
248	}
249
250	}
251
252	/*
253	* Invoked by arch to handle an IPI for call function single. Must be	170	* Invoked by arch to handle an IPI for call function single. Must be
254	* called from the arch with interrupts disabled.	171	* called from the arch with interrupts disabled.
255	*/	172	*/
@@ -453,8 +370,7 @@ void smp_call_function_many(const struct cpumask *mask,
453	smp_call_func_t func, void *info, bool wait)	370	smp_call_func_t func, void *info, bool wait)
454	{	371	{
455	struct call_function_data *data;	372	struct call_function_data *data;
456	unsigned long flags;	373	int cpu, next_cpu, this_cpu = smp_processor_id();
457	int refs, cpu, next_cpu, this_cpu = smp_processor_id();
458		374
459	/*	375	/*
460	* Can deadlock when called with interrupts disabled.	376	* Can deadlock when called with interrupts disabled.
@@ -486,50 +402,13 @@ void smp_call_function_many(const struct cpumask *mask,
486	}	402	}
487		403
488	data = &__get_cpu_var(cfd_data);	404	data = &__get_cpu_var(cfd_data);
489	csd_lock(&data->csd);
490
491	/* This BUG_ON verifies our reuse assertions and can be removed */
492	BUG_ON(atomic_read(&data->refs) \|\| !cpumask_empty(data->cpumask));
493
494	/*
495	* The global call function queue list add and delete are protected
496	* by a lock, but the list is traversed without any lock, relying
497	* on the rcu list add and delete to allow safe concurrent traversal.
498	* We reuse the call function data without waiting for any grace
499	* period after some other cpu removes it from the global queue.
500	* This means a cpu might find our data block as it is being
501	* filled out.
502	*
503	* We hold off the interrupt handler on the other cpu by
504	* ordering our writes to the cpu mask vs our setting of the
505	* refs counter. We assert only the cpu owning the data block
506	* will set a bit in cpumask, and each bit will only be cleared
507	* by the subject cpu. Each cpu must first find its bit is
508	* set and then check that refs is set indicating the element is
509	* ready to be processed, otherwise it must skip the entry.
510	*
511	* On the previous iteration refs was set to 0 by another cpu.
512	* To avoid the use of transitivity, set the counter to 0 here
513	* so the wmb will pair with the rmb in the interrupt handler.
514	*/
515	atomic_set(&data->refs, 0); /* convert 3rd to 1st party write */
516
517	data->csd.func = func;
518	data->csd.info = info;
519		405
520	/* Ensure 0 refs is visible before mask. Also orders func and info */
521	smp_wmb();
522
523	/* We rely on the "and" being processed before the store */
524	cpumask_and(data->cpumask, mask, cpu_online_mask);	406	cpumask_and(data->cpumask, mask, cpu_online_mask);
525	cpumask_clear_cpu(this_cpu, data->cpumask);	407	cpumask_clear_cpu(this_cpu, data->cpumask);
526	refs = cpumask_weight(data->cpumask);
527		408
528	/* Some callers race with other cpus changing the passed mask */	409	/* Some callers race with other cpus changing the passed mask */
529	if (unlikely(!refs)) {	410	if (unlikely(!cpumask_weight(data->cpumask)))
530	csd_unlock(&data->csd);
531	return;	411	return;
532	}
533		412
534	/*	413	/*
535	* After we put an entry into the list, data->cpumask	414	* After we put an entry into the list, data->cpumask
@@ -537,34 +416,32 @@ void smp_call_function_many(const struct cpumask *mask,
537	* a SMP function call, so data->cpumask will be zero.	416	* a SMP function call, so data->cpumask will be zero.
538	*/	417	*/
539	cpumask_copy(data->cpumask_ipi, data->cpumask);	418	cpumask_copy(data->cpumask_ipi, data->cpumask);
540	raw_spin_lock_irqsave(&call_function.lock, flags);
541	/*
542	* Place entry at the _HEAD_ of the list, so that any cpu still
543	* observing the entry in generic_smp_call_function_interrupt()
544	* will not miss any other list entries:
545	*/
546	list_add_rcu(&data->csd.list, &call_function.queue);
547	/*
548	* We rely on the wmb() in list_add_rcu to complete our writes
549	* to the cpumask before this write to refs, which indicates
550	* data is on the list and is ready to be processed.
551	*/
552	atomic_set(&data->refs, refs);
553	raw_spin_unlock_irqrestore(&call_function.lock, flags);
554		419
555	/*	420	for_each_cpu(cpu, data->cpumask) {
556	* Make the list addition visible before sending the ipi.	421	struct call_single_data *csd = per_cpu_ptr(data->csd, cpu);
557	* (IPIs must obey or appear to obey normal Linux cache	422	struct call_single_queue *dst =
558	* coherency rules -- see comment in generic_exec_single).	423	&per_cpu(call_single_queue, cpu);
559	*/	424	unsigned long flags;
560	smp_mb();	425
		426	csd_lock(csd);
		427	csd->func = func;
		428	csd->info = info;
		429
		430	raw_spin_lock_irqsave(&dst->lock, flags);
		431	list_add_tail(&csd->list, &dst->list);
		432	raw_spin_unlock_irqrestore(&dst->lock, flags);
		433	}
561		434
562	/* Send a message to all CPUs in the map */	435	/* Send a message to all CPUs in the map */
563	arch_send_call_function_ipi_mask(data->cpumask_ipi);	436	arch_send_call_function_ipi_mask(data->cpumask_ipi);
564		437
565	/* Optionally wait for the CPUs to complete */	438	if (wait) {
566	if (wait)	439	for_each_cpu(cpu, data->cpumask) {
567	csd_lock_wait(&data->csd);	440	struct call_single_data *csd =
		441	per_cpu_ptr(data->csd, cpu);
		442	csd_lock_wait(csd);
		443	}
		444	}
568	}	445	}
569	EXPORT_SYMBOL(smp_call_function_many);	446	EXPORT_SYMBOL(smp_call_function_many);
570		447