author	Shaohua Li <shli@kernel.org>	2013-02-21 19:43:03 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-02-21 20:22:20 -0500
commit	9a46ad6d6df3b547d057c39db13f69d7170a99e9 (patch)
tree	684a14d248639c2411c46f53dc7acba4a81c357c
parent	6d1c7ccae946f5c959f5c9236813d9c33ae48537 (diff)
smp: make smp_call_function_many() use logic similar to smp_call_function_single()
I'm testing a swapout workload on a two-socket Xeon machine. The workload has 10 threads; each thread sequentially accesses a separate memory region. TLB flush overhead is very big in this workload. For each page, page reclaim needs to move it off the active lru list and then unmap it, and both steps need a TLB flush. Since this is a multithreaded workload, TLB flushes happen on 10 CPUs. On x86, TLB flush uses the generic smp_call_function machinery, so this workload stresses smp_call_function_many() heavily.

Without the patch, perf shows:

+  24.49%  [k] generic_smp_call_function_interrupt
-  21.72%  [k] _raw_spin_lock
   - _raw_spin_lock
      + 79.80% __page_check_address
      +  6.42% generic_smp_call_function_interrupt
      +  3.31% get_swap_page
      +  2.37% free_pcppages_bulk
      +  1.75% handle_pte_fault
      +  1.54% put_super
      +  1.41% grab_super_passive
      +  1.36% __swap_duplicate
      +  0.68% blk_flush_plug_list
      +  0.62% swap_info_get
+   6.55%  [k] flush_tlb_func
+   6.46%  [k] smp_call_function_many
+   5.09%  [k] call_function_interrupt
+   4.75%  [k] default_send_IPI_mask_sequence_phys
+   2.18%  [k] find_next_bit

Swapout throughput is around 1300M/s.

With the patch, perf shows:

-  27.23%  [k] _raw_spin_lock
   - _raw_spin_lock
      + 80.53% __page_check_address
      +  8.39% generic_smp_call_function_single_interrupt
      +  2.44% get_swap_page
      +  1.76% free_pcppages_bulk
      +  1.40% handle_pte_fault
      +  1.15% __swap_duplicate
      +  1.05% put_super
      +  0.98% grab_super_passive
      +  0.86% blk_flush_plug_list
      +  0.57% swap_info_get
+   8.25%  [k] default_send_IPI_mask_sequence_phys
+   7.55%  [k] call_function_interrupt
+   7.47%  [k] smp_call_function_many
+   7.25%  [k] flush_tlb_func
+   3.81%  [k] _raw_spin_lock_irqsave
+   3.78%  [k] generic_smp_call_function_single_interrupt

Swapout throughput is around 1400M/s, so there is around a 7% improvement, and total cpu utilization doesn't change.

Without the patch, cfd_data is shared by all CPUs: generic_smp_call_function_interrupt() reads and writes cfd_data several times, which creates a lot of cache ping-pong. With the patch, the data becomes per-cpu and the ping-pong is avoided. And from the perf data, this doesn't make the call_single_queue lock contended.

The next step is to remove generic_smp_call_function_interrupt() from arch code.

Signed-off-by: Shaohua Li <shli@fusionio.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
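To make the data-structure change described above concrete, here is a condensed before/after sketch of struct call_function_data as the diff below reshapes it. The _old/_new suffixes exist only for side-by-side comparison (the kernel type is simply struct call_function_data), and this is kernel-internal C condensed from the diff, not a standalone program.

#include <linux/smp.h>		/* struct call_single_data */
#include <linux/cpumask.h>	/* cpumask_var_t */
#include <linux/atomic.h>	/* atomic_t */
#include <linux/percpu.h>	/* __percpu, alloc_percpu(), per_cpu_ptr() */

/*
 * Before: one block per sending CPU, but visible to every receiver
 * through the global call_function.queue and an atomic refcount, so
 * all receiving CPUs read and write the same cache lines (the
 * cache ping-pong the message describes).
 */
struct call_function_data_old {
	struct call_single_data	csd;
	atomic_t		refs;
	cpumask_var_t		cpumask;
	cpumask_var_t		cpumask_ipi;
};

/*
 * After: the sender keeps a per-cpu array of call_single_data and links
 * one element onto each target CPU's own call_single_queue, so each
 * receiver only ever touches its private entry.
 */
struct call_function_data_new {
	struct call_single_data	__percpu *csd;
	cpumask_var_t		cpumask;
	cpumask_var_t		cpumask_ipi;
};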
-rw-r--r--	include/linux/smp.h	3
-rw-r--r--	kernel/smp.c	183
2 files changed, 32 insertions, 154 deletions
diff --git a/include/linux/smp.h b/include/linux/smp.h
index dd6f06be3c9f..3e07a7df6478 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -89,7 +89,8 @@ void kick_all_cpus_sync(void);
 #ifdef CONFIG_USE_GENERIC_SMP_HELPERS
 void __init call_function_init(void);
 void generic_smp_call_function_single_interrupt(void);
-void generic_smp_call_function_interrupt(void);
+#define generic_smp_call_function_interrupt \
+	generic_smp_call_function_single_interrupt
 #else
 static inline void call_function_init(void) { }
 #endif
diff --git a/kernel/smp.c b/kernel/smp.c
index 69f38bd98b42..8e451f3ff51b 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -16,22 +16,12 @@
 #include "smpboot.h"
 
 #ifdef CONFIG_USE_GENERIC_SMP_HELPERS
-static struct {
-	struct list_head	queue;
-	raw_spinlock_t		lock;
-} call_function __cacheline_aligned_in_smp =
-	{
-		.queue		= LIST_HEAD_INIT(call_function.queue),
-		.lock		= __RAW_SPIN_LOCK_UNLOCKED(call_function.lock),
-	};
-
 enum {
 	CSD_FLAG_LOCK		= 0x01,
 };
 
 struct call_function_data {
-	struct call_single_data	csd;
-	atomic_t		refs;
+	struct call_single_data	__percpu *csd;
 	cpumask_var_t		cpumask;
 	cpumask_var_t		cpumask_ipi;
 };
@@ -60,6 +50,11 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
 				cpu_to_node(cpu)))
 			return notifier_from_errno(-ENOMEM);
+		cfd->csd = alloc_percpu(struct call_single_data);
+		if (!cfd->csd) {
+			free_cpumask_var(cfd->cpumask);
+			return notifier_from_errno(-ENOMEM);
+		}
 		break;
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -70,6 +65,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	case CPU_DEAD_FROZEN:
 		free_cpumask_var(cfd->cpumask);
 		free_cpumask_var(cfd->cpumask_ipi);
+		free_percpu(cfd->csd);
 		break;
 #endif
 	};
@@ -171,85 +167,6 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
 }
 
 /*
- * Invoked by arch to handle an IPI for call function. Must be called with
- * interrupts disabled.
- */
-void generic_smp_call_function_interrupt(void)
-{
-	struct call_function_data *data;
-	int cpu = smp_processor_id();
-
-	/*
-	 * Shouldn't receive this interrupt on a cpu that is not yet online.
-	 */
-	WARN_ON_ONCE(!cpu_online(cpu));
-
-	/*
-	 * Ensure entry is visible on call_function_queue after we have
-	 * entered the IPI. See comment in smp_call_function_many.
-	 * If we don't have this, then we may miss an entry on the list
-	 * and never get another IPI to process it.
-	 */
-	smp_mb();
-
-	/*
-	 * It's ok to use list_for_each_rcu() here even though we may
-	 * delete 'pos', since list_del_rcu() doesn't clear ->next
-	 */
-	list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
-		int refs;
-		smp_call_func_t func;
-
-		/*
-		 * Since we walk the list without any locks, we might
-		 * see an entry that was completed, removed from the
-		 * list and is in the process of being reused.
-		 *
-		 * We must check that the cpu is in the cpumask before
-		 * checking the refs, and both must be set before
-		 * executing the callback on this cpu.
-		 */
-
-		if (!cpumask_test_cpu(cpu, data->cpumask))
-			continue;
-
-		smp_rmb();
-
-		if (atomic_read(&data->refs) == 0)
-			continue;
-
-		func = data->csd.func;		/* save for later warn */
-		func(data->csd.info);
-
-		/*
-		 * If the cpu mask is not still set then func enabled
-		 * interrupts (BUG), and this cpu took another smp call
-		 * function interrupt and executed func(info) twice
-		 * on this cpu.  That nested execution decremented refs.
-		 */
-		if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
-			WARN(1, "%pf enabled interrupts and double executed\n", func);
-			continue;
-		}
-
-		refs = atomic_dec_return(&data->refs);
-		WARN_ON(refs < 0);
-
-		if (refs)
-			continue;
-
-		WARN_ON(!cpumask_empty(data->cpumask));
-
-		raw_spin_lock(&call_function.lock);
-		list_del_rcu(&data->csd.list);
-		raw_spin_unlock(&call_function.lock);
-
-		csd_unlock(&data->csd);
-	}
-
-}
-
-/*
  * Invoked by arch to handle an IPI for call function single. Must be
  * called from the arch with interrupts disabled.
  */
@@ -453,8 +370,7 @@ void smp_call_function_many(const struct cpumask *mask,
 			      smp_call_func_t func, void *info, bool wait)
 {
 	struct call_function_data *data;
-	unsigned long flags;
-	int refs, cpu, next_cpu, this_cpu = smp_processor_id();
+	int cpu, next_cpu, this_cpu = smp_processor_id();
 
 	/*
 	 * Can deadlock when called with interrupts disabled.
@@ -486,50 +402,13 @@ void smp_call_function_many(const struct cpumask *mask,
 	}
 
 	data = &__get_cpu_var(cfd_data);
-	csd_lock(&data->csd);
-
-	/* This BUG_ON verifies our reuse assertions and can be removed */
-	BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
-
-	/*
-	 * The global call function queue list add and delete are protected
-	 * by a lock, but the list is traversed without any lock, relying
-	 * on the rcu list add and delete to allow safe concurrent traversal.
-	 * We reuse the call function data without waiting for any grace
-	 * period after some other cpu removes it from the global queue.
-	 * This means a cpu might find our data block as it is being
-	 * filled out.
-	 *
-	 * We hold off the interrupt handler on the other cpu by
-	 * ordering our writes to the cpu mask vs our setting of the
-	 * refs counter.  We assert only the cpu owning the data block
-	 * will set a bit in cpumask, and each bit will only be cleared
-	 * by the subject cpu.  Each cpu must first find its bit is
-	 * set and then check that refs is set indicating the element is
-	 * ready to be processed, otherwise it must skip the entry.
-	 *
-	 * On the previous iteration refs was set to 0 by another cpu.
-	 * To avoid the use of transitivity, set the counter to 0 here
-	 * so the wmb will pair with the rmb in the interrupt handler.
-	 */
-	atomic_set(&data->refs, 0);	/* convert 3rd to 1st party write */
-
-	data->csd.func = func;
-	data->csd.info = info;
 
-	/* Ensure 0 refs is visible before mask.  Also orders func and info */
-	smp_wmb();
-
-	/* We rely on the "and" being processed before the store */
 	cpumask_and(data->cpumask, mask, cpu_online_mask);
 	cpumask_clear_cpu(this_cpu, data->cpumask);
-	refs = cpumask_weight(data->cpumask);
 
 	/* Some callers race with other cpus changing the passed mask */
-	if (unlikely(!refs)) {
-		csd_unlock(&data->csd);
+	if (unlikely(!cpumask_weight(data->cpumask)))
 		return;
-	}
 
 	/*
 	 * After we put an entry into the list, data->cpumask
@@ -537,34 +416,32 @@ void smp_call_function_many(const struct cpumask *mask,
 	 * a SMP function call, so data->cpumask will be zero.
 	 */
 	cpumask_copy(data->cpumask_ipi, data->cpumask);
-	raw_spin_lock_irqsave(&call_function.lock, flags);
-	/*
-	 * Place entry at the _HEAD_ of the list, so that any cpu still
-	 * observing the entry in generic_smp_call_function_interrupt()
-	 * will not miss any other list entries:
-	 */
-	list_add_rcu(&data->csd.list, &call_function.queue);
-	/*
-	 * We rely on the wmb() in list_add_rcu to complete our writes
-	 * to the cpumask before this write to refs, which indicates
-	 * data is on the list and is ready to be processed.
-	 */
-	atomic_set(&data->refs, refs);
-	raw_spin_unlock_irqrestore(&call_function.lock, flags);
 
-	/*
-	 * Make the list addition visible before sending the ipi.
-	 * (IPIs must obey or appear to obey normal Linux cache
-	 * coherency rules -- see comment in generic_exec_single).
-	 */
-	smp_mb();
+	for_each_cpu(cpu, data->cpumask) {
+		struct call_single_data *csd = per_cpu_ptr(data->csd, cpu);
+		struct call_single_queue *dst =
+					&per_cpu(call_single_queue, cpu);
+		unsigned long flags;
+
+		csd_lock(csd);
+		csd->func = func;
+		csd->info = info;
+
+		raw_spin_lock_irqsave(&dst->lock, flags);
+		list_add_tail(&csd->list, &dst->list);
+		raw_spin_unlock_irqrestore(&dst->lock, flags);
+	}
 
 	/* Send a message to all CPUs in the map */
 	arch_send_call_function_ipi_mask(data->cpumask_ipi);
 
-	/* Optionally wait for the CPUs to complete */
-	if (wait)
-		csd_lock_wait(&data->csd);
+	if (wait) {
+		for_each_cpu(cpu, data->cpumask) {
+			struct call_single_data *csd =
+					per_cpu_ptr(data->csd, cpu);
+			csd_lock_wait(csd);
+		}
+	}
 }
 EXPORT_SYMBOL(smp_call_function_many);
 
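For context, a hedged caller-side sketch of how the smp_call_function_many() API reworked above is typically used, with the signature visible in the hunk. The callback, counter, and helper names (remote_tick, remote_ticks, kick_other_cpus) are hypothetical and not part of this commit.

#include <linux/smp.h>		/* smp_call_function_many() */
#include <linux/cpumask.h>	/* cpu_online_mask */
#include <linux/atomic.h>	/* atomic_t, atomic_inc() */
#include <linux/preempt.h>	/* preempt_disable()/preempt_enable() */

static atomic_t remote_ticks = ATOMIC_INIT(0);	/* hypothetical counter */

/* Runs on each target CPU in IPI context with interrupts disabled. */
static void remote_tick(void *info)
{
	atomic_inc((atomic_t *)info);
}

/* Hypothetical helper: run remote_tick() on every other online CPU. */
static void kick_other_cpus(void)
{
	/* smp_call_function_many() must be called with preemption disabled. */
	preempt_disable();
	smp_call_function_many(cpu_online_mask, remote_tick, &remote_ticks,
			       true);	/* wait for all callbacks to finish */
	preempt_enable();
}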