about | summary | refs | log | tree | commit | diff | stats
path: root/kernel
diff options
context:
space:
mode:
author: Peter Zijlstra <peterz@infradead.org> 2014-06-04 13:31:18 -0400
committer: Ingo Molnar <mingo@kernel.org> 2014-06-05 06:09:53 -0400
commit: e3baac47f0e82c4be632f4f97215bb93bf16b342 (patch)
tree: cae0a8012654966d9c295f517661c77b2dab2f95 /kernel
parent: 67b9ca70c3030e832999e8d1cdba2984c7bb5bfc (diff)
sched/idle: Optimize try-to-wake-up IPI
[ This series reduces the number of IPIs on Andy's workload by something
  like 99%. It's down from many hundreds per second to very few.

  The basic idea behind this series is to make TIF_POLLING_NRFLAG be a
  reliable indication that the idle task is polling. Once that's done,
  the rest is reasonably straightforward. ]

When enqueueing tasks on remote LLC domains, we send an IPI to do the
work 'locally' and avoid bouncing all the cachelines over.

However, when the remote CPU is idle (and polling, say x86 mwait), we
don't need to send an IPI, we can simply kick the TIF word to wake it
up and have the 'idle' loop do the work.

So when _TIF_POLLING_NRFLAG is set, but _TIF_NEED_RESCHED is not (yet)
set, set _TIF_NEED_RESCHED and avoid sending the IPI.

Much-requested-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
[Edited by Andy Lutomirski, but this is mostly Peter Zijlstra's code.]
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: nicolas.pitre@linaro.org
Cc: daniel.lezcano@linaro.org
Cc: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: umgwanakikbuti@gmail.com
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/ce06f8b02e7e337be63e97597fc4b248d3aa6f9b.1401902905.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/sched/core.c 54
-rw-r--r-- kernel/sched/idle.c 10
-rw-r--r-- kernel/sched/sched.h 6
3 files changed, 61 insertions, 9 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6afbfeefddd6..60d4e05d64dd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -519,7 +519,7 @@ static inline void init_hrtick(void)
519 __old; \ 519 __old; \
520}) 520})
521 521
522#ifdef TIF_POLLING_NRFLAG 522#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
523/* 523/*
524 * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, 524 * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
525 * this avoids any races wrt polling state changes and thereby avoids 525 * this avoids any races wrt polling state changes and thereby avoids
@@ -530,12 +530,44 @@ static bool set_nr_and_not_polling(struct task_struct *p)
530 struct thread_info *ti = task_thread_info(p); 530 struct thread_info *ti = task_thread_info(p);
531 return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); 531 return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
532} 532}
533
534/*
535 * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
536 *
537 * If this returns true, then the idle task promises to call
538 * sched_ttwu_pending() and reschedule soon.
539 */
540static bool set_nr_if_polling(struct task_struct *p)
541{
542 struct thread_info *ti = task_thread_info(p);
543 typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
544
545 for (;;) {
546 if (!(val & _TIF_POLLING_NRFLAG))
547 return false;
548 if (val & _TIF_NEED_RESCHED)
549 return true;
550 old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
551 if (old == val)
552 break;
553 val = old;
554 }
555 return true;
556}
557
533#else 558#else
534static bool set_nr_and_not_polling(struct task_struct *p) 559static bool set_nr_and_not_polling(struct task_struct *p)
535{ 560{
536 set_tsk_need_resched(p); 561 set_tsk_need_resched(p);
537 return true; 562 return true;
538} 563}
564
565#ifdef CONFIG_SMP
566static bool set_nr_if_polling(struct task_struct *p)
567{
568 return false;
569}
570#endif
539#endif 571#endif
540 572
541/* 573/*
@@ -1490,13 +1522,17 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
1490} 1522}
1491 1523
1492#ifdef CONFIG_SMP 1524#ifdef CONFIG_SMP
1493static void sched_ttwu_pending(void) 1525void sched_ttwu_pending(void)
1494{ 1526{
1495 struct rq *rq = this_rq(); 1527 struct rq *rq = this_rq();
1496 struct llist_node *llist = llist_del_all(&rq->wake_list); 1528 struct llist_node *llist = llist_del_all(&rq->wake_list);
1497 struct task_struct *p; 1529 struct task_struct *p;
1530 unsigned long flags;
1498 1531
1499 raw_spin_lock(&rq->lock); 1532 if (!llist)
1533 return;
1534
1535 raw_spin_lock_irqsave(&rq->lock, flags);
1500 1536
1501 while (llist) { 1537 while (llist) {
1502 p = llist_entry(llist, struct task_struct, wake_entry); 1538 p = llist_entry(llist, struct task_struct, wake_entry);
@@ -1504,7 +1540,7 @@ static void sched_ttwu_pending(void)
1504 ttwu_do_activate(rq, p, 0); 1540 ttwu_do_activate(rq, p, 0);
1505 } 1541 }
1506 1542
1507 raw_spin_unlock(&rq->lock); 1543 raw_spin_unlock_irqrestore(&rq->lock, flags);
1508} 1544}
1509 1545
1510void scheduler_ipi(void) 1546void scheduler_ipi(void)
@@ -1550,8 +1586,14 @@ void scheduler_ipi(void)
1550 1586
1551static void ttwu_queue_remote(struct task_struct *p, int cpu) 1587static void ttwu_queue_remote(struct task_struct *p, int cpu)
1552{ 1588{
1553 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) 1589 struct rq *rq = cpu_rq(cpu);
1554 smp_send_reschedule(cpu); 1590
1591 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
1592 if (!set_nr_if_polling(rq->idle))
1593 smp_send_reschedule(cpu);
1594 else
1595 trace_sched_wake_idle_without_ipi(cpu);
1596 }
1555} 1597}
1556 1598
1557bool cpus_share_cache(int this_cpu, int that_cpu) 1599bool cpus_share_cache(int this_cpu, int that_cpu)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index fe4b24bf33ca..cf009fb0bc25 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -12,6 +12,8 @@
12 12
13#include <trace/events/power.h> 13#include <trace/events/power.h>
14 14
15#include "sched.h"
16
15static int __read_mostly cpu_idle_force_poll; 17static int __read_mostly cpu_idle_force_poll;
16 18
17void cpu_idle_poll_ctrl(bool enable) 19void cpu_idle_poll_ctrl(bool enable)
@@ -237,12 +239,14 @@ static void cpu_idle_loop(void)
237 __current_clr_polling(); 239 __current_clr_polling();
238 240
239 /* 241 /*
240 * We promise to reschedule if need_resched is set while 242 * We promise to call sched_ttwu_pending and reschedule
241 * polling is set. That means that clearing polling 243 * if need_resched is set while polling is set. That
242 * needs to be visible before rescheduling. 244 * means that clearing polling needs to be visible
245 * before doing these things.
243 */ 246 */
244 smp_mb__after_atomic(); 247 smp_mb__after_atomic();
245 248
249 sched_ttwu_pending();
246 schedule_preempt_disabled(); 250 schedule_preempt_disabled();
247 } 251 }
248} 252}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 956b8ca24893..2f8636199b83 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -670,6 +670,8 @@ extern int migrate_swap(struct task_struct *, struct task_struct *);
670 670
671#ifdef CONFIG_SMP 671#ifdef CONFIG_SMP
672 672
673extern void sched_ttwu_pending(void);
674
673#define rcu_dereference_check_sched_domain(p) \ 675#define rcu_dereference_check_sched_domain(p) \
674 rcu_dereference_check((p), \ 676 rcu_dereference_check((p), \
675 lockdep_is_held(&sched_domains_mutex)) 677 lockdep_is_held(&sched_domains_mutex))
@@ -787,6 +789,10 @@ static inline unsigned int group_first_cpu(struct sched_group *group)
787 789
788extern int group_balance_cpu(struct sched_group *sg); 790extern int group_balance_cpu(struct sched_group *sg);
789 791
792#else
793
794static inline void sched_ttwu_pending(void) { }
795
790#endif /* CONFIG_SMP */ 796#endif /* CONFIG_SMP */
791 797
792#include "stats.h" 798#include "stats.h"