aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2007-08-23 09:18:02 -0400
committerIngo Molnar <mingo@elte.hu>2007-08-23 09:18:02 -0400
commit2aa44d0567ed21b47b87d68819415d48194cb923 (patch)
tree7be2a8a30a23b363e1e2aecd41934e75f581e115
parentb377fd3982ad957c796758a90e2988401a884241 (diff)
sched: sched_clock_idle_[sleep|wakeup]_event()
construct a more or less wall-clock time out of sched_clock(), by using ACPI-idle's existing knowledge about how much time we spent idling. This allows the rq clock to work around TSC-stops-in-C2, TSC-gets-corrupted-in-C3 type of problems. ( Besides the scheduler's statistics this also benefits blktrace and printk-timestamps as well. ) Furthermore, the precise before-C2/C3-sleep and after-C2/C3-wakeup callbacks allow the scheduler to get out the most of the period where the CPU has a reliable TSC. This results in slightly more precise task statistics. the ACPI bits were acked by Len. Signed-off-by: Ingo Molnar <mingo@elte.hu> Acked-by: Len Brown <len.brown@intel.com>
-rw-r--r--arch/i386/kernel/tsc.c1
-rw-r--r--drivers/acpi/processor_idle.c32
-rw-r--r--include/linux/sched.h3
-rw-r--r--kernel/sched.c41
-rw-r--r--kernel/sched_debug.c3
5 files changed, 61 insertions, 19 deletions
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index debd7dbb4158..a39280b4dd3a 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -292,7 +292,6 @@ static struct clocksource clocksource_tsc = {
292 292
293void mark_tsc_unstable(char *reason) 293void mark_tsc_unstable(char *reason)
294{ 294{
295 sched_clock_unstable_event();
296 if (!tsc_unstable) { 295 if (!tsc_unstable) {
297 tsc_unstable = 1; 296 tsc_unstable = 1;
298 tsc_enabled = 0; 297 tsc_enabled = 0;
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index a8634a0655fc..d9b8af763e1e 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -63,6 +63,7 @@
63ACPI_MODULE_NAME("processor_idle"); 63ACPI_MODULE_NAME("processor_idle");
64#define ACPI_PROCESSOR_FILE_POWER "power" 64#define ACPI_PROCESSOR_FILE_POWER "power"
65#define US_TO_PM_TIMER_TICKS(t) ((t * (PM_TIMER_FREQUENCY/1000)) / 1000) 65#define US_TO_PM_TIMER_TICKS(t) ((t * (PM_TIMER_FREQUENCY/1000)) / 1000)
66#define PM_TIMER_TICK_NS (1000000000ULL/PM_TIMER_FREQUENCY)
66#define C2_OVERHEAD 4 /* 1us (3.579 ticks per us) */ 67#define C2_OVERHEAD 4 /* 1us (3.579 ticks per us) */
67#define C3_OVERHEAD 4 /* 1us (3.579 ticks per us) */ 68#define C3_OVERHEAD 4 /* 1us (3.579 ticks per us) */
68static void (*pm_idle_save) (void) __read_mostly; 69static void (*pm_idle_save) (void) __read_mostly;
@@ -462,6 +463,9 @@ static void acpi_processor_idle(void)
462 * TBD: Can't get time duration while in C1, as resumes 463 * TBD: Can't get time duration while in C1, as resumes
463 * go to an ISR rather than here. Need to instrument 464 * go to an ISR rather than here. Need to instrument
464 * base interrupt handler. 465 * base interrupt handler.
466 *
467 * Note: the TSC better not stop in C1, sched_clock() will
468 * skew otherwise.
465 */ 469 */
466 sleep_ticks = 0xFFFFFFFF; 470 sleep_ticks = 0xFFFFFFFF;
467 break; 471 break;
@@ -469,6 +473,8 @@ static void acpi_processor_idle(void)
469 case ACPI_STATE_C2: 473 case ACPI_STATE_C2:
470 /* Get start time (ticks) */ 474 /* Get start time (ticks) */
471 t1 = inl(acpi_gbl_FADT.xpm_timer_block.address); 475 t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
476 /* Tell the scheduler that we are going deep-idle: */
477 sched_clock_idle_sleep_event();
472 /* Invoke C2 */ 478 /* Invoke C2 */
473 acpi_state_timer_broadcast(pr, cx, 1); 479 acpi_state_timer_broadcast(pr, cx, 1);
474 acpi_cstate_enter(cx); 480 acpi_cstate_enter(cx);
@@ -479,17 +485,22 @@ static void acpi_processor_idle(void)
479 /* TSC halts in C2, so notify users */ 485 /* TSC halts in C2, so notify users */
480 mark_tsc_unstable("possible TSC halt in C2"); 486 mark_tsc_unstable("possible TSC halt in C2");
481#endif 487#endif
488 /* Compute time (ticks) that we were actually asleep */
489 sleep_ticks = ticks_elapsed(t1, t2);
490
491 /* Tell the scheduler how much we idled: */
492 sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
493
482 /* Re-enable interrupts */ 494 /* Re-enable interrupts */
483 local_irq_enable(); 495 local_irq_enable();
496 /* Do not account our idle-switching overhead: */
497 sleep_ticks -= cx->latency_ticks + C2_OVERHEAD;
498
484 current_thread_info()->status |= TS_POLLING; 499 current_thread_info()->status |= TS_POLLING;
485 /* Compute time (ticks) that we were actually asleep */
486 sleep_ticks =
487 ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD;
488 acpi_state_timer_broadcast(pr, cx, 0); 500 acpi_state_timer_broadcast(pr, cx, 0);
489 break; 501 break;
490 502
491 case ACPI_STATE_C3: 503 case ACPI_STATE_C3:
492
493 /* 504 /*
494 * disable bus master 505 * disable bus master
495 * bm_check implies we need ARB_DIS 506 * bm_check implies we need ARB_DIS
@@ -518,6 +529,8 @@ static void acpi_processor_idle(void)
518 t1 = inl(acpi_gbl_FADT.xpm_timer_block.address); 529 t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
519 /* Invoke C3 */ 530 /* Invoke C3 */
520 acpi_state_timer_broadcast(pr, cx, 1); 531 acpi_state_timer_broadcast(pr, cx, 1);
532 /* Tell the scheduler that we are going deep-idle: */
533 sched_clock_idle_sleep_event();
521 acpi_cstate_enter(cx); 534 acpi_cstate_enter(cx);
522 /* Get end time (ticks) */ 535 /* Get end time (ticks) */
523 t2 = inl(acpi_gbl_FADT.xpm_timer_block.address); 536 t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
@@ -531,12 +544,17 @@ static void acpi_processor_idle(void)
531 /* TSC halts in C3, so notify users */ 544 /* TSC halts in C3, so notify users */
532 mark_tsc_unstable("TSC halts in C3"); 545 mark_tsc_unstable("TSC halts in C3");
533#endif 546#endif
547 /* Compute time (ticks) that we were actually asleep */
548 sleep_ticks = ticks_elapsed(t1, t2);
549 /* Tell the scheduler how much we idled: */
550 sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
551
534 /* Re-enable interrupts */ 552 /* Re-enable interrupts */
535 local_irq_enable(); 553 local_irq_enable();
554 /* Do not account our idle-switching overhead: */
555 sleep_ticks -= cx->latency_ticks + C3_OVERHEAD;
556
536 current_thread_info()->status |= TS_POLLING; 557 current_thread_info()->status |= TS_POLLING;
537 /* Compute time (ticks) that we were actually asleep */
538 sleep_ticks =
539 ticks_elapsed(t1, t2) - cx->latency_ticks - C3_OVERHEAD;
540 acpi_state_timer_broadcast(pr, cx, 0); 558 acpi_state_timer_broadcast(pr, cx, 0);
541 break; 559 break;
542 560
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 682ef87da6eb..1845b2e99a87 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1388,7 +1388,8 @@ extern void sched_exec(void);
1388#define sched_exec() {} 1388#define sched_exec() {}
1389#endif 1389#endif
1390 1390
1391extern void sched_clock_unstable_event(void); 1391extern void sched_clock_idle_sleep_event(void);
1392extern void sched_clock_idle_wakeup_event(u64 delta_ns);
1392 1393
1393#ifdef CONFIG_HOTPLUG_CPU 1394#ifdef CONFIG_HOTPLUG_CPU
1394extern void idle_task_exit(void); 1395extern void idle_task_exit(void);
diff --git a/kernel/sched.c b/kernel/sched.c
index 45e17b83b7f1..48e7586168ef 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -262,7 +262,8 @@ struct rq {
262 s64 clock_max_delta; 262 s64 clock_max_delta;
263 263
264 unsigned int clock_warps, clock_overflows; 264 unsigned int clock_warps, clock_overflows;
265 unsigned int clock_unstable_events; 265 u64 idle_clock;
266 unsigned int clock_deep_idle_events;
266 u64 tick_timestamp; 267 u64 tick_timestamp;
267 268
268 atomic_t nr_iowait; 269 atomic_t nr_iowait;
@@ -556,18 +557,40 @@ static inline struct rq *this_rq_lock(void)
556} 557}
557 558
558/* 559/*
559 * CPU frequency is/was unstable - start new by setting prev_clock_raw: 560 * We are going deep-idle (irqs are disabled):
560 */ 561 */
561void sched_clock_unstable_event(void) 562void sched_clock_idle_sleep_event(void)
562{ 563{
563 unsigned long flags; 564 struct rq *rq = cpu_rq(smp_processor_id());
564 struct rq *rq;
565 565
566 rq = task_rq_lock(current, &flags); 566 spin_lock(&rq->lock);
567 rq->prev_clock_raw = sched_clock(); 567 __update_rq_clock(rq);
568 rq->clock_unstable_events++; 568 spin_unlock(&rq->lock);
569 task_rq_unlock(rq, &flags); 569 rq->clock_deep_idle_events++;
570}
571EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
572
573/*
574 * We just idled delta nanoseconds (called with irqs disabled):
575 */
576void sched_clock_idle_wakeup_event(u64 delta_ns)
577{
578 struct rq *rq = cpu_rq(smp_processor_id());
579 u64 now = sched_clock();
580
581 rq->idle_clock += delta_ns;
582 /*
583 * Override the previous timestamp and ignore all
584 * sched_clock() deltas that occured while we idled,
585 * and use the PM-provided delta_ns to advance the
586 * rq clock:
587 */
588 spin_lock(&rq->lock);
589 rq->prev_clock_raw = now;
590 rq->clock += delta_ns;
591 spin_unlock(&rq->lock);
570} 592}
593EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
571 594
572/* 595/*
573 * resched_task - mark a task 'to be rescheduled now'. 596 * resched_task - mark a task 'to be rescheduled now'.
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 87e524762b85..ab18f45f2ab2 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -154,10 +154,11 @@ static void print_cpu(struct seq_file *m, int cpu)
154 P(next_balance); 154 P(next_balance);
155 P(curr->pid); 155 P(curr->pid);
156 P(clock); 156 P(clock);
157 P(idle_clock);
157 P(prev_clock_raw); 158 P(prev_clock_raw);
158 P(clock_warps); 159 P(clock_warps);
159 P(clock_overflows); 160 P(clock_overflows);
160 P(clock_unstable_events); 161 P(clock_deep_idle_events);
161 P(clock_max_delta); 162 P(clock_max_delta);
162 P(cpu_load[0]); 163 P(cpu_load[0]);
163 P(cpu_load[1]); 164 P(cpu_load[1]);