Diffstat (limited to 'kernel/time/tick-sched.c')
 kernel/time/tick-sched.c | 563 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 563 insertions(+), 0 deletions(-)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
new file mode 100644
index 000000000000..95e41f7f850b
--- /dev/null
+++ b/kernel/time/tick-sched.c
@@ -0,0 +1,563 @@
/*
 * linux/kernel/time/tick-sched.c
 *
 * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
 * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
 * Copyright(C) 2006-2007  Timesys Corp., Thomas Gleixner
 *
 * No idle tick ("NO_HZ") implementation for low and high resolution timers
 *
 * Started by: Thomas Gleixner and Ingo Molnar
 *
 * For licensing details see kernel-base/COPYING
 */
#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/hrtimer.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <linux/percpu.h>
#include <linux/profile.h>
#include <linux/sched.h>
#include <linux/tick.h>

#include "tick-internal.h"

/*
 * Per-cpu nohz control structure
 */
static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);

/*
 * The time when the last jiffy update happened. Protected by xtime_lock.
 */
static ktime_t last_jiffies_update;

struct tick_sched *tick_get_tick_sched(int cpu)
{
	return &per_cpu(tick_cpu_sched, cpu);
}

/*
 * Must be called with interrupts disabled!
 */
static void tick_do_update_jiffies64(ktime_t now)
{
	unsigned long ticks = 0;
	ktime_t delta;

	/* Reevaluate with xtime_lock held */
	write_seqlock(&xtime_lock);

	delta = ktime_sub(now, last_jiffies_update);
	if (delta.tv64 >= tick_period.tv64) {

		delta = ktime_sub(delta, tick_period);
		last_jiffies_update = ktime_add(last_jiffies_update,
						tick_period);

		/* Slow path for long timeouts */
		if (unlikely(delta.tv64 >= tick_period.tv64)) {
			s64 incr = ktime_to_ns(tick_period);

			ticks = ktime_divns(delta, incr);

			last_jiffies_update = ktime_add_ns(last_jiffies_update,
							   incr * ticks);
		}
		do_timer(++ticks);
	}
	write_sequnlock(&xtime_lock);
}
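
/*
 * Worked example of the catch-up arithmetic above (editorial sketch,
 * not part of the original file; HZ=250 is an assumed config): with
 * HZ=250, tick_period is 4 ms. If the CPU slept for 22 ms, delta is
 * 5.5 tick periods. The first subtraction consumes one period
 * (delta = 18 ms), the slow path then computes
 * ticks = 18 ms / 4 ms = 4 via ktime_divns(), and do_timer(++ticks)
 * advances jiffies by 5: one from the fast path plus four from the
 * division. last_jiffies_update ends up 2 ms behind "now"; that
 * remainder is picked up by a later update.
 */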

/*
 * Initialize and return the time of the last jiffies update.
 */
static ktime_t tick_init_jiffy_update(void)
{
	ktime_t period;

	write_seqlock(&xtime_lock);
	/* Did we start the jiffies update yet? */
	if (last_jiffies_update.tv64 == 0)
		last_jiffies_update = tick_next_period;
	period = last_jiffies_update;
	write_sequnlock(&xtime_lock);
	return period;
}

/*
 * NOHZ - aka dynamic tick functionality
 */
#ifdef CONFIG_NO_HZ
/*
 * NO HZ enabled?
 */
static int tick_nohz_enabled __read_mostly = 1;

/*
 * Enable / Disable tickless mode
 */
static int __init setup_tick_nohz(char *str)
{
	if (!strcmp(str, "off"))
		tick_nohz_enabled = 0;
	else if (!strcmp(str, "on"))
		tick_nohz_enabled = 1;
	else
		return 0;
	return 1;
}

__setup("nohz=", setup_tick_nohz);
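
/*
 * Usage note (editorial): the handler above is wired to the "nohz="
 * kernel command line parameter, so booting with
 *
 *	nohz=off
 *
 * disables dynamic ticks, and "nohz=on" (the default) enables them.
 * Any other value is rejected by returning 0 from the __setup handler.
 */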

/**
 * tick_nohz_update_jiffies - update jiffies when idle was interrupted
 *
 * Called from interrupt entry when the CPU was idle
 *
 * In case the sched_tick was stopped on this CPU, we have to check if jiffies
 * must be updated. Otherwise an interrupt handler could use a stale jiffy
 * value. We do this unconditionally on any cpu, as we don't know whether the
 * cpu which has the update task assigned is in a long sleep.
 */
void tick_nohz_update_jiffies(void)
{
	int cpu = smp_processor_id();
	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
	unsigned long flags;
	ktime_t now;

	if (!ts->tick_stopped)
		return;

	cpu_clear(cpu, nohz_cpu_mask);
	now = ktime_get();

	local_irq_save(flags);
	tick_do_update_jiffies64(now);
	local_irq_restore(flags);
}

/**
 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
 *
 * When the next event is more than a tick into the future, stop the idle tick.
 * Called either from the idle loop or from irq_exit() when an idle period was
 * just interrupted by an interrupt which did not cause a reschedule.
 */
void tick_nohz_stop_sched_tick(void)
{
	unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
	struct tick_sched *ts;
	ktime_t last_update, expires, now, delta;
	int cpu;

	local_irq_save(flags);

	cpu = smp_processor_id();
	ts = &per_cpu(tick_cpu_sched, cpu);

	if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
		goto end;

	if (need_resched())
		goto end;

	BUG_ON(local_softirq_pending());

	now = ktime_get();
	/*
	 * When called from irq_exit we need to account the idle sleep time
	 * correctly.
	 */
	if (ts->tick_stopped) {
		delta = ktime_sub(now, ts->idle_entrytime);
		ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
	}

	ts->idle_entrytime = now;
	ts->idle_calls++;

	/* Read jiffies and the time when jiffies were updated last */
	do {
		seq = read_seqbegin(&xtime_lock);
		last_update = last_jiffies_update;
		last_jiffies = jiffies;
	} while (read_seqretry(&xtime_lock, seq));

	/* Get the next timer wheel timer */
	next_jiffies = get_next_timer_interrupt(last_jiffies);
	delta_jiffies = next_jiffies - last_jiffies;

	/*
	 * Do not stop the tick if we are only one jiffy off,
	 * or if the cpu is required for RCU.
	 */
	if (!ts->tick_stopped && (delta_jiffies == 1 || rcu_needs_cpu(cpu)))
		goto out;

	/* Schedule the tick, if we are at least one jiffy off */
	if ((long)delta_jiffies >= 1) {

		if (rcu_needs_cpu(cpu))
			delta_jiffies = 1;
		else
			cpu_set(cpu, nohz_cpu_mask);
		/*
		 * tick_nohz_stop_sched_tick can be called several times
		 * before tick_nohz_restart_sched_tick is called. This
		 * happens when interrupts arrive which do not cause a
		 * reschedule. In the first call we save the current tick
		 * time, so we can restart the scheduler tick in
		 * tick_nohz_restart_sched_tick.
		 */
		if (!ts->tick_stopped) {
			ts->idle_tick = ts->sched_timer.expires;
			ts->tick_stopped = 1;
			ts->idle_jiffies = last_jiffies;
		}
		/*
		 * Calculate the expiry time for the next timer wheel
		 * timer.
		 */
		expires = ktime_add_ns(last_update, tick_period.tv64 *
				       delta_jiffies);
		ts->idle_expires = expires;
		ts->idle_sleeps++;

		if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
			hrtimer_start(&ts->sched_timer, expires,
				      HRTIMER_MODE_ABS);
			/* Check if the timer was already in the past */
			if (hrtimer_active(&ts->sched_timer))
				goto out;
		} else if (!tick_program_event(expires, 0))
			goto out;
		/*
		 * We are past the event already. So we crossed a
		 * jiffy boundary. Update jiffies and raise the
		 * softirq.
		 */
		tick_do_update_jiffies64(ktime_get());
		cpu_clear(cpu, nohz_cpu_mask);
	}
	raise_softirq_irqoff(TIMER_SOFTIRQ);
out:
	ts->next_jiffies = next_jiffies;
	ts->last_jiffies = last_jiffies;
end:
	local_irq_restore(flags);
}
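
/*
 * Editorial sketch of the intended call order: an architecture's idle
 * loop brackets its halt instruction with the stop/restart pair defined
 * here. Assuming a cpu_idle() along these lines (names illustrative,
 * not taken from this file):
 *
 *	void cpu_idle(void)
 *	{
 *		while (1) {
 *			tick_nohz_stop_sched_tick();
 *			while (!need_resched())
 *				safe_halt();	// arch-specific idle/wait
 *			tick_nohz_restart_sched_tick();
 *			schedule();
 *		}
 *	}
 *
 * Interrupts that do not set need_resched() return through irq_exit(),
 * which may call tick_nohz_stop_sched_tick() again; hence the "can be
 * called several times" comment above.
 */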

/**
 * tick_nohz_restart_sched_tick - restart the idle tick from the idle task
 *
 * Restart the idle tick when the CPU is woken up from idle
 */
void tick_nohz_restart_sched_tick(void)
{
	int cpu = smp_processor_id();
	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
	unsigned long ticks;
	ktime_t now, delta;

	if (!ts->tick_stopped)
		return;

	/* Update jiffies first */
	now = ktime_get();

	local_irq_disable();
	tick_do_update_jiffies64(now);
	cpu_clear(cpu, nohz_cpu_mask);

	/* Account the idle time */
	delta = ktime_sub(now, ts->idle_entrytime);
	ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);

	/*
	 * We stopped the tick in idle. update_process_times() would miss
	 * the time we slept, as it does only a single tick of accounting.
	 * Enforce that this time is accounted to idle!
	 */
	ticks = jiffies - ts->idle_jiffies;
	/*
	 * We might be one off. Do not randomly account a huge number of ticks!
	 */
	if (ticks && ticks < LONG_MAX) {
		add_preempt_count(HARDIRQ_OFFSET);
		account_system_time(current, HARDIRQ_OFFSET,
				    jiffies_to_cputime(ticks));
		sub_preempt_count(HARDIRQ_OFFSET);
	}

	/*
	 * Cancel the scheduled timer and restore the tick
	 */
	ts->tick_stopped = 0;
	hrtimer_cancel(&ts->sched_timer);
	ts->sched_timer.expires = ts->idle_tick;

	while (1) {
		/* Forward the time to expire in the future */
		hrtimer_forward(&ts->sched_timer, now, tick_period);

		if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
			hrtimer_start(&ts->sched_timer,
				      ts->sched_timer.expires,
				      HRTIMER_MODE_ABS);
			/* Check if the timer was already in the past */
			if (hrtimer_active(&ts->sched_timer))
				break;
		} else {
			if (!tick_program_event(ts->sched_timer.expires, 0))
				break;
		}
		/* Update jiffies and reread time */
		tick_do_update_jiffies64(now);
		now = ktime_get();
	}
	local_irq_enable();
}
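
/*
 * Worked example for the accounting above (editorial, with assumed
 * numbers): with HZ=250, a CPU that entered idle at jiffies=1000 and
 * wakes at jiffies=1250 computes ticks = 250, i.e. one full second.
 * That second is charged in one go via account_system_time() under
 * HARDIRQ_OFFSET, so it is attributed to the idle task rather than to
 * whatever task happens to run after wakeup.
 */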

static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
{
	hrtimer_forward(&ts->sched_timer, now, tick_period);
	return tick_program_event(ts->sched_timer.expires, 0);
}

/*
 * The nohz low res interrupt handler
 */
static void tick_nohz_handler(struct clock_event_device *dev)
{
	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
	struct pt_regs *regs = get_irq_regs();
	ktime_t now = ktime_get();

	dev->next_event.tv64 = KTIME_MAX;

	/* Check if jiffies need an update */
	tick_do_update_jiffies64(now);

	/*
	 * When we are idle and the tick is stopped, we have to touch
	 * the watchdog as we might not schedule for a really long
	 * time. This happens on completely idle SMP systems while
	 * waiting on the login prompt. We also increment the "start
	 * of idle" jiffy stamp so the idle accounting adjustment we
	 * do when we go busy again does not account too many ticks.
	 */
	if (ts->tick_stopped) {
		touch_softlockup_watchdog();
		ts->idle_jiffies++;
	}

	update_process_times(user_mode(regs));
	profile_tick(CPU_PROFILING);

	/* Do not restart when we are in the idle loop */
	if (ts->tick_stopped)
		return;

	while (tick_nohz_reprogram(ts, now)) {
		now = ktime_get();
		tick_do_update_jiffies64(now);
	}
}

/**
 * tick_nohz_switch_to_nohz - switch to nohz mode
 */
static void tick_nohz_switch_to_nohz(void)
{
	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
	ktime_t next;

	if (!tick_nohz_enabled)
		return;

	local_irq_disable();
	if (tick_switch_to_oneshot(tick_nohz_handler)) {
		local_irq_enable();
		return;
	}

	ts->nohz_mode = NOHZ_MODE_LOWRES;

	/*
	 * Recycle the hrtimer in ts, so we can share the
	 * hrtimer_forward() code with the highres path.
	 */
	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	/* Get the next period */
	next = tick_init_jiffy_update();

	for (;;) {
		ts->sched_timer.expires = next;
		if (!tick_program_event(next, 0))
			break;
		next = ktime_add(next, tick_period);
	}
	local_irq_enable();

	printk(KERN_INFO "Switched to NOHZ mode on CPU #%d\n",
	       smp_processor_id());
}

#else

static inline void tick_nohz_switch_to_nohz(void) { }

#endif /* NO_HZ */

/*
 * High resolution timer specific code
 */
#ifdef CONFIG_HIGH_RES_TIMERS
/*
 * We rearm the timer until we get disabled by the idle code.
 * Called with interrupts disabled and timer->base->cpu_base->lock held.
 */
static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
{
	struct tick_sched *ts =
		container_of(timer, struct tick_sched, sched_timer);
	struct hrtimer_cpu_base *base = timer->base->cpu_base;
	struct pt_regs *regs = get_irq_regs();
	ktime_t now = ktime_get();

	/* Check if jiffies need an update */
	tick_do_update_jiffies64(now);

	/*
	 * Do not call update_process_times() when we are not in irq
	 * context and have no valid regs pointer.
	 */
	if (regs) {
		/*
		 * When we are idle and the tick is stopped, we have to touch
		 * the watchdog as we might not schedule for a really long
		 * time. This happens on completely idle SMP systems while
		 * waiting on the login prompt. We also increment the "start of
		 * idle" jiffy stamp so the idle accounting adjustment we do
		 * when we go busy again does not account too many ticks.
		 */
		if (ts->tick_stopped) {
			touch_softlockup_watchdog();
			ts->idle_jiffies++;
		}
		/*
		 * update_process_times() might take tasklist_lock, hence
		 * drop the base lock. sched-tick hrtimers are per-CPU and
		 * never accessible by userspace APIs, so this is safe to do.
		 */
		spin_unlock(&base->lock);
		update_process_times(user_mode(regs));
		profile_tick(CPU_PROFILING);
		spin_lock(&base->lock);
	}

	/* Do not restart when we are in the idle loop */
	if (ts->tick_stopped)
		return HRTIMER_NORESTART;

	hrtimer_forward(timer, now, tick_period);

	return HRTIMER_RESTART;
}
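
/*
 * Editorial sketch of the self-rearming pattern used above (names
 * hypothetical, not part of this file): a periodic hrtimer callback
 * forwards its own expiry past "now" and returns HRTIMER_RESTART, so
 * the hrtimer core re-enqueues it without the callback having to call
 * hrtimer_start() on itself:
 *
 *	static enum hrtimer_restart my_periodic_cb(struct hrtimer *t)
 *	{
 *		do_periodic_work();			// hypothetical
 *		hrtimer_forward(t, ktime_get(), my_period);
 *		return HRTIMER_RESTART;
 *	}
 *
 * tick_sched_timer() follows exactly this shape, except that it returns
 * HRTIMER_NORESTART once the idle code has stopped the tick.
 */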

/**
 * tick_setup_sched_timer - setup the tick emulation timer
 */
void tick_setup_sched_timer(void)
{
	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
	ktime_t now = ktime_get();

	/*
	 * Emulate tick processing via per-CPU hrtimers:
	 */
	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	ts->sched_timer.function = tick_sched_timer;
	ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;

	/* Get the next period */
	ts->sched_timer.expires = tick_init_jiffy_update();

	for (;;) {
		hrtimer_forward(&ts->sched_timer, now, tick_period);
		hrtimer_start(&ts->sched_timer, ts->sched_timer.expires,
			      HRTIMER_MODE_ABS);
		/* Check if the timer was already in the past */
		if (hrtimer_active(&ts->sched_timer))
			break;
		now = ktime_get();
	}

#ifdef CONFIG_NO_HZ
	if (tick_nohz_enabled)
		ts->nohz_mode = NOHZ_MODE_HIGHRES;
#endif
}
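
/*
 * Usage note (editorial): tick_setup_sched_timer() is expected to be
 * called on each CPU when the hrtimer core switches that CPU to high
 * resolution mode (e.g. from a helper like hrtimer_switch_to_hres();
 * the caller name is an assumption, not shown in this file), so the
 * per-CPU sched_timer takes over tick duties from the periodic tick
 * device.
 */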

void tick_cancel_sched_timer(int cpu)
{
	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);

	if (ts->sched_timer.base)
		hrtimer_cancel(&ts->sched_timer);
	ts->tick_stopped = 0;
	ts->nohz_mode = NOHZ_MODE_INACTIVE;
}
#endif /* HIGH_RES_TIMERS */

/*
 * Async notification about clocksource changes
 */
void tick_clock_notify(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks);
}

/*
 * Async notification about clock event changes
 */
void tick_oneshot_notify(void)
{
	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);

	set_bit(0, &ts->check_clocks);
}

/*
 * Check if a change happened which makes oneshot possible.
 *
 * Called cyclically from the hrtimer softirq (driven by the timer
 * softirq). allow_nohz signals that we can switch into low-res nohz
 * mode, because high resolution timers are disabled (either at compile
 * time or at runtime).
 */
int tick_check_oneshot_change(int allow_nohz)
{
	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);

	if (!test_and_clear_bit(0, &ts->check_clocks))
		return 0;

	if (ts->nohz_mode != NOHZ_MODE_INACTIVE)
		return 0;

	if (!timekeeping_is_continuous() || !tick_is_oneshot_available())
		return 0;

	if (!allow_nohz)
		return 1;

	tick_nohz_switch_to_nohz();
	return 0;
}
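
/*
 * Usage sketch (editorial): a caller in the hrtimer softirq path would
 * typically do something like
 *
 *	if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
 *		hrtimer_switch_to_hres();
 *
 * so that when high resolution timers are available (allow_nohz is 0)
 * the return value of 1 triggers the highres switch, and otherwise this
 * function switches to low-res nohz itself and returns 0. The exact
 * caller is an assumption, not shown in this file.
 */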