author		Peter Zijlstra <peterz@infradead.org>	2016-11-29 02:03:05 -0500
committer	Rafael J. Wysocki <rafael.j.wysocki@intel.com>	2016-11-29 08:02:21 -0500
commit		c1de45ca831acee9b72c9320dde447edafadb43f
tree		bd4f7de61de5db4208a9701dfc3c5e9390665c12
parent		bb8313b603eb8fd52de48a079bfcd72dcab2ef1e
sched/idle: Add support for tasks that inject idle
Idle injection drivers such as the Intel powerclamp and ACPI PAD drivers use
realtime tasks to take control of a CPU and then inject idle. There are two
issues with this approach:

 1. Low efficiency: the injected idle task is treated as busy, so scheduler
    ticks do not stop during the injected idle period; the resulting unwanted
    wakeups can cost ~20% of the power savings.

 2. Idle accounting: injected idle time is presented to the user as busy time.

This patch addresses both issues by introducing a new PF_IDLE flag, which
allows any given task to be treated as an idle task while the flag is set.
Idle injection tasks can therefore run through the normal flow of NOHZ idle
enter/exit to get correct accounting, as well as tick stop when possible.

The implication is that the idle task is no longer limited to PID == 0.

Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
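For context, here is a minimal sketch (not part of this patch) of how an idle
injection driver could sit on top of the new interface: a SCHED_FIFO kernel
thread pinned to one CPU calls play_idle() for part of each period instead of
busy-spinning as an RT task. The names inject_idle_fn and start_idle_injection,
the priority, and the 25 ms / 75 ms split are hypothetical, and error handling
is omitted.

/*
 * Illustrative only: a pinned SCHED_FIFO kthread that injects 25 ms of
 * idle out of every 100 ms on one CPU via the new play_idle() API.
 */
#include <linux/cpu.h>		/* play_idle() */
#include <linux/delay.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static int inject_idle_fn(void *data)
{
	struct sched_param param = { .sched_priority = MAX_RT_PRIO / 2 };

	/*
	 * play_idle() WARNs unless the caller is a SCHED_FIFO kernel
	 * thread bound to a single CPU, so set the policy up front;
	 * kthread_create_on_cpu() below already took care of the binding.
	 */
	sched_setscheduler_nocheck(current, SCHED_FIFO, &param);

	while (!kthread_should_stop()) {
		play_idle(25);			/* idle, with PF_IDLE set */
		msleep_interruptible(75);	/* let normal work run */
	}
	return 0;
}

static struct task_struct *start_idle_injection(unsigned int cpu)
{
	struct task_struct *tsk;

	/* Binds the thread to @cpu and sets PF_NO_SETAFFINITY. */
	tsk = kthread_create_on_cpu(inject_idle_fn, NULL, cpu,
				    "idle_inject/%u");
	if (!IS_ERR(tsk))
		wake_up_process(tsk);
	return tsk;
}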
-rw-r--r--	include/linux/cpu.h	|   2
-rw-r--r--	include/linux/sched.h	|   3
-rw-r--r--	kernel/fork.c		|   2
-rw-r--r--	kernel/sched/core.c	|   1
-rw-r--r--	kernel/sched/idle.c	| 162
5 files changed, 107 insertions(+), 63 deletions(-)
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index b886dc17f2f3..ac0efae38072 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -245,6 +245,8 @@ void arch_cpu_idle_dead(void);
 int cpu_report_state(int cpu);
 int cpu_check_up_prepare(int cpu);
 void cpu_set_state_online(int cpu);
+void play_idle(unsigned long duration_ms);
+
 #ifdef CONFIG_HOTPLUG_CPU
 bool cpu_wait_death(unsigned int cpu, int seconds);
 bool cpu_report_death(void);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 348f51b0ec92..114c7fcb6af6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2254,6 +2254,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
 /*
  * Per process flags
  */
+#define PF_IDLE		0x00000002	/* I am an IDLE thread */
 #define PF_EXITING	0x00000004	/* getting shut down */
 #define PF_EXITPIDONE	0x00000008	/* pi exit done on shut down */
 #define PF_VCPU		0x00000010	/* I'm a virtual CPU */
@@ -2609,7 +2610,7 @@ extern struct task_struct *idle_task(int cpu);
  */
 static inline bool is_idle_task(const struct task_struct *p)
 {
-	return p->pid == 0;
+	return !!(p->flags & PF_IDLE);
 }
 extern struct task_struct *curr_task(int cpu);
 extern void ia64_set_curr_task(int cpu, struct task_struct *p);
diff --git a/kernel/fork.c b/kernel/fork.c
index 623259fc794d..5074b2f0827b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1537,7 +1537,7 @@ static __latent_entropy struct task_struct *copy_process(
 		goto bad_fork_cleanup_count;
 
 	delayacct_tsk_init(p);	/* Must remain after dup_task_struct() */
-	p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
+	p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE);
 	p->flags |= PF_FORKNOEXEC;
 	INIT_LIST_HEAD(&p->children);
 	INIT_LIST_HEAD(&p->sibling);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 94732d1ab00a..63b3a8a49884 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5285,6 +5285,7 @@ void init_idle(struct task_struct *idle, int cpu)
 	__sched_fork(0, idle);
 	idle->state = TASK_RUNNING;
 	idle->se.exec_start = sched_clock();
+	idle->flags |= PF_IDLE;
 
 	kasan_unpoison_task_stack(idle);
 
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 513e4dfeeae7..6a4bae0a649d 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -205,76 +205,65 @@ exit_idle:
  *
  * Called with polling cleared.
  */
-static void cpu_idle_loop(void)
+static void do_idle(void)
 {
-	int cpu = smp_processor_id();
-
-	while (1) {
-		/*
-		 * If the arch has a polling bit, we maintain an invariant:
-		 *
-		 * Our polling bit is clear if we're not scheduled (i.e. if
-		 * rq->curr != rq->idle). This means that, if rq->idle has
-		 * the polling bit set, then setting need_resched is
-		 * guaranteed to cause the cpu to reschedule.
-		 */
-
-		__current_set_polling();
-		quiet_vmstat();
-		tick_nohz_idle_enter();
-
-		while (!need_resched()) {
-			check_pgt_cache();
-			rmb();
-
-			if (cpu_is_offline(cpu)) {
-				cpuhp_report_idle_dead();
-				arch_cpu_idle_dead();
-			}
-
-			local_irq_disable();
-			arch_cpu_idle_enter();
-
-			/*
-			 * In poll mode we reenable interrupts and spin.
-			 *
-			 * Also if we detected in the wakeup from idle
-			 * path that the tick broadcast device expired
-			 * for us, we don't want to go deep idle as we
-			 * know that the IPI is going to arrive right
-			 * away
-			 */
-			if (cpu_idle_force_poll || tick_check_broadcast_expired())
-				cpu_idle_poll();
-			else
-				cpuidle_idle_call();
-
-			arch_cpu_idle_exit();
-		}
-
-		/*
-		 * Since we fell out of the loop above, we know
-		 * TIF_NEED_RESCHED must be set, propagate it into
-		 * PREEMPT_NEED_RESCHED.
-		 *
-		 * This is required because for polling idle loops we will
-		 * not have had an IPI to fold the state for us.
-		 */
-		preempt_set_need_resched();
-		tick_nohz_idle_exit();
-		__current_clr_polling();
-
-		/*
-		 * We promise to call sched_ttwu_pending and reschedule
-		 * if need_resched is set while polling is set. That
-		 * means that clearing polling needs to be visible
-		 * before doing these things.
-		 */
-		smp_mb__after_atomic();
-
-		sched_ttwu_pending();
-		schedule_preempt_disabled();
-	}
+	/*
+	 * If the arch has a polling bit, we maintain an invariant:
+	 *
+	 * Our polling bit is clear if we're not scheduled (i.e. if rq->curr !=
+	 * rq->idle). This means that, if rq->idle has the polling bit set,
+	 * then setting need_resched is guaranteed to cause the CPU to
+	 * reschedule.
+	 */
+
+	__current_set_polling();
+	tick_nohz_idle_enter();
+
+	while (!need_resched()) {
+		check_pgt_cache();
+		rmb();
+
+		if (cpu_is_offline(smp_processor_id())) {
+			cpuhp_report_idle_dead();
+			arch_cpu_idle_dead();
+		}
+
+		local_irq_disable();
+		arch_cpu_idle_enter();
+
+		/*
+		 * In poll mode we reenable interrupts and spin. Also if we
+		 * detected in the wakeup from idle path that the tick
+		 * broadcast device expired for us, we don't want to go deep
+		 * idle as we know that the IPI is going to arrive right away.
+		 */
+		if (cpu_idle_force_poll || tick_check_broadcast_expired())
+			cpu_idle_poll();
+		else
+			cpuidle_idle_call();
+		arch_cpu_idle_exit();
+	}
+
+	/*
+	 * Since we fell out of the loop above, we know TIF_NEED_RESCHED must
+	 * be set, propagate it into PREEMPT_NEED_RESCHED.
+	 *
+	 * This is required because for polling idle loops we will not have had
+	 * an IPI to fold the state for us.
+	 */
+	preempt_set_need_resched();
+	tick_nohz_idle_exit();
+	__current_clr_polling();
+
+	/*
+	 * We promise to call sched_ttwu_pending() and reschedule if
+	 * need_resched() is set while polling is set. That means that clearing
+	 * polling needs to be visible before doing these things.
+	 */
+	smp_mb__after_atomic();
+
+	sched_ttwu_pending();
+	schedule_preempt_disabled();
 }
 
 bool cpu_in_idle(unsigned long pc)
279 268
280bool cpu_in_idle(unsigned long pc) 269bool cpu_in_idle(unsigned long pc)
@@ -283,6 +272,56 @@ bool cpu_in_idle(unsigned long pc)
 	       pc < (unsigned long)__cpuidle_text_end;
 }
 
+struct idle_timer {
+	struct hrtimer timer;
+	int done;
+};
+
+static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer)
+{
+	struct idle_timer *it = container_of(timer, struct idle_timer, timer);
+
+	WRITE_ONCE(it->done, 1);
+	set_tsk_need_resched(current);
+
+	return HRTIMER_NORESTART;
+}
+
+void play_idle(unsigned long duration_ms)
+{
+	struct idle_timer it;
+
+	/*
+	 * Only FIFO tasks can disable the tick since they don't need the forced
+	 * preemption.
+	 */
+	WARN_ON_ONCE(current->policy != SCHED_FIFO);
+	WARN_ON_ONCE(current->nr_cpus_allowed != 1);
+	WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
+	WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY));
+	WARN_ON_ONCE(!duration_ms);
+
+	rcu_sleep_check();
+	preempt_disable();
+	current->flags |= PF_IDLE;
+	cpuidle_use_deepest_state(true);
+
+	it.done = 0;
+	hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	it.timer.function = idle_inject_timer_fn;
+	hrtimer_start(&it.timer, ms_to_ktime(duration_ms), HRTIMER_MODE_REL_PINNED);
+
+	while (!READ_ONCE(it.done))
+		do_idle();
+
+	cpuidle_use_deepest_state(false);
+	current->flags &= ~PF_IDLE;
+
+	preempt_fold_need_resched();
+	preempt_enable();
+}
+EXPORT_SYMBOL_GPL(play_idle);
+
 void cpu_startup_entry(enum cpuhp_state state)
 {
 	/*
@@ -302,5 +341,6 @@ void cpu_startup_entry(enum cpuhp_state state)
 #endif
 	arch_cpu_idle_prepare();
 	cpuhp_online_idle(state);
-	cpu_idle_loop();
+	while (1)
+		do_idle();
 }