author		Ingo Molnar <mingo@elte.hu>	2007-08-23 09:18:02 -0400
committer	Ingo Molnar <mingo@elte.hu>	2007-08-23 09:18:02 -0400
commit		2aa44d0567ed21b47b87d68819415d48194cb923 (patch)
tree		7be2a8a30a23b363e1e2aecd41934e75f581e115
parent		b377fd3982ad957c796758a90e2988401a884241 (diff)
sched: sched_clock_idle_[sleep|wakeup]_event()
Construct a more or less wall-clock time out of sched_clock(), by
using ACPI idle's existing knowledge about how much time we spent
idling. This allows the rq clock to work around TSC-stops-in-C2,
TSC-gets-corrupted-in-C3 types of problems.
(Besides the scheduler's statistics, this also benefits blktrace
and printk timestamps.)
Furthermore, the precise before-C2/C3-sleep and after-C2/C3-wakeup
callbacks allow the scheduler to get the most out of the period where
the CPU has a reliable TSC. This results in slightly more precise
task statistics.
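Condensed to its core, the call pattern this introduces around a deep
C-state looks like the following (simplified from the C2 path in the
processor_idle.c hunk below; the broadcast-timer and bus-master details
are omitted):

	/* Get start time (ticks) from the ACPI PM timer: */
	t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
	/* Tell the scheduler we go deep-idle, while the TSC is still valid: */
	sched_clock_idle_sleep_event();
	acpi_cstate_enter(cx);		/* the TSC may stop or skew in here */
	/* Get end time (ticks): */
	t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
	/* Advance the rq clock by PM-timer time, not the bogus TSC delta: */
	sched_clock_idle_wakeup_event(ticks_elapsed(t1, t2) * PM_TIMER_TICK_NS);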
The ACPI bits were acked by Len.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Len Brown <len.brown@intel.com>
-rw-r--r--	arch/i386/kernel/tsc.c		|  1
-rw-r--r--	drivers/acpi/processor_idle.c	| 32
-rw-r--r--	include/linux/sched.h		|  3
-rw-r--r--	kernel/sched.c			| 41
-rw-r--r--	kernel/sched_debug.c		|  3
5 files changed, 61 insertions(+), 19 deletions(-)
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index debd7dbb4158..a39280b4dd3a 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -292,7 +292,6 @@ static struct clocksource clocksource_tsc = {
 
 void mark_tsc_unstable(char *reason)
 {
-	sched_clock_unstable_event();
 	if (!tsc_unstable) {
 		tsc_unstable = 1;
 		tsc_enabled = 0;
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index a8634a0655fc..d9b8af763e1e 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -63,6 +63,7 @@
 ACPI_MODULE_NAME("processor_idle");
 #define ACPI_PROCESSOR_FILE_POWER	"power"
 #define US_TO_PM_TIMER_TICKS(t)	((t * (PM_TIMER_FREQUENCY/1000)) / 1000)
+#define PM_TIMER_TICK_NS		(1000000000ULL/PM_TIMER_FREQUENCY)
 #define C2_OVERHEAD			4	/* 1us (3.579 ticks per us) */
 #define C3_OVERHEAD			4	/* 1us (3.579 ticks per us) */
 static void (*pm_idle_save) (void) __read_mostly;
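(For reference: the ACPI PM timer runs at PM_TIMER_FREQUENCY = 3579545 Hz,
i.e. roughly 3.579 ticks per microsecond as the overhead comments note, so
the new PM_TIMER_TICK_NS constant works out to 1000000000ULL / 3579545 =
279 ns per tick after integer truncation.)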
@@ -462,6 +463,9 @@ static void acpi_processor_idle(void)
 		 * TBD: Can't get time duration while in C1, as resumes
 		 *      go to an ISR rather than here.  Need to instrument
 		 *      base interrupt handler.
+		 *
+		 * Note: the TSC better not stop in C1, sched_clock() will
+		 *       skew otherwise.
 		 */
 		sleep_ticks = 0xFFFFFFFF;
 		break;
@@ -469,6 +473,8 @@ static void acpi_processor_idle(void)
 	case ACPI_STATE_C2:
 		/* Get start time (ticks) */
 		t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
+		/* Tell the scheduler that we are going deep-idle: */
+		sched_clock_idle_sleep_event();
 		/* Invoke C2 */
 		acpi_state_timer_broadcast(pr, cx, 1);
 		acpi_cstate_enter(cx);
@@ -479,17 +485,22 @@ static void acpi_processor_idle(void)
 		/* TSC halts in C2, so notify users */
 		mark_tsc_unstable("possible TSC halt in C2");
 #endif
+		/* Compute time (ticks) that we were actually asleep */
+		sleep_ticks = ticks_elapsed(t1, t2);
+
+		/* Tell the scheduler how much we idled: */
+		sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
+
 		/* Re-enable interrupts */
 		local_irq_enable();
+		/* Do not account our idle-switching overhead: */
+		sleep_ticks -= cx->latency_ticks + C2_OVERHEAD;
+
 		current_thread_info()->status |= TS_POLLING;
-		/* Compute time (ticks) that we were actually asleep */
-		sleep_ticks =
-		    ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD;
 		acpi_state_timer_broadcast(pr, cx, 0);
 		break;
 
 	case ACPI_STATE_C3:
-
 		/*
 		 * disable bus master
 		 * bm_check implies we need ARB_DIS
@@ -518,6 +529,8 @@ static void acpi_processor_idle(void)
 		t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
 		/* Invoke C3 */
 		acpi_state_timer_broadcast(pr, cx, 1);
+		/* Tell the scheduler that we are going deep-idle: */
+		sched_clock_idle_sleep_event();
 		acpi_cstate_enter(cx);
 		/* Get end time (ticks) */
 		t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
@@ -531,12 +544,17 @@ static void acpi_processor_idle(void)
 		/* TSC halts in C3, so notify users */
 		mark_tsc_unstable("TSC halts in C3");
 #endif
+		/* Compute time (ticks) that we were actually asleep */
+		sleep_ticks = ticks_elapsed(t1, t2);
+		/* Tell the scheduler how much we idled: */
+		sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
+
 		/* Re-enable interrupts */
 		local_irq_enable();
+		/* Do not account our idle-switching overhead: */
+		sleep_ticks -= cx->latency_ticks + C3_OVERHEAD;
+
 		current_thread_info()->status |= TS_POLLING;
-		/* Compute time (ticks) that we were actually asleep */
-		sleep_ticks =
-		    ticks_elapsed(t1, t2) - cx->latency_ticks - C3_OVERHEAD;
 		acpi_state_timer_broadcast(pr, cx, 0);
 		break;
 
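To make the new accounting order concrete with a rough worked example (the
latency value is hypothetical): suppose the CPU sat in C2 for about 1 ms and
cx->latency_ticks is 16. Then ticks_elapsed(t1, t2) is about 3580 PM-timer
ticks, the scheduler is credited 3580 * 279 ns ≈ 999 us of idle time, and
only afterwards is sleep_ticks trimmed to 3580 - 16 - 4 = 3560 ticks for the
C-state promotion/demotion statistics, whereas the old code only ever
computed the trimmed value.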
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 682ef87da6eb..1845b2e99a87 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1388,7 +1388,8 @@ extern void sched_exec(void);
 #define sched_exec()   {}
 #endif
 
-extern void sched_clock_unstable_event(void);
+extern void sched_clock_idle_sleep_event(void);
+extern void sched_clock_idle_wakeup_event(u64 delta_ns);
 
 #ifdef CONFIG_HOTPLUG_CPU
 extern void idle_task_exit(void);
diff --git a/kernel/sched.c b/kernel/sched.c
index 45e17b83b7f1..48e7586168ef 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -262,7 +262,8 @@ struct rq {
 	s64 clock_max_delta;
 
 	unsigned int clock_warps, clock_overflows;
-	unsigned int clock_unstable_events;
+	u64 idle_clock;
+	unsigned int clock_deep_idle_events;
 	u64 tick_timestamp;
 
 	atomic_t nr_iowait;
@@ -556,18 +557,40 @@ static inline struct rq *this_rq_lock(void)
 }
 
 /*
- * CPU frequency is/was unstable - start new by setting prev_clock_raw:
+ * We are going deep-idle (irqs are disabled):
  */
-void sched_clock_unstable_event(void)
+void sched_clock_idle_sleep_event(void)
 {
-	unsigned long flags;
-	struct rq *rq;
+	struct rq *rq = cpu_rq(smp_processor_id());
 
-	rq = task_rq_lock(current, &flags);
-	rq->prev_clock_raw = sched_clock();
-	rq->clock_unstable_events++;
-	task_rq_unlock(rq, &flags);
+	spin_lock(&rq->lock);
+	__update_rq_clock(rq);
+	spin_unlock(&rq->lock);
+	rq->clock_deep_idle_events++;
+}
+EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
+
+/*
+ * We just idled delta nanoseconds (called with irqs disabled):
+ */
+void sched_clock_idle_wakeup_event(u64 delta_ns)
+{
+	struct rq *rq = cpu_rq(smp_processor_id());
+	u64 now = sched_clock();
+
+	rq->idle_clock += delta_ns;
+	/*
+	 * Override the previous timestamp and ignore all
+	 * sched_clock() deltas that occurred while we idled,
+	 * and use the PM-provided delta_ns to advance the
+	 * rq clock:
+	 */
+	spin_lock(&rq->lock);
+	rq->prev_clock_raw = now;
+	rq->clock += delta_ns;
+	spin_unlock(&rq->lock);
 }
+EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 
 /*
  * resched_task - mark a task 'to be rescheduled now'.
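As a sanity check of the wakeup-side arithmetic, here is a small userspace
toy model (an illustration only, with made-up numbers; it is not kernel
code): during idle sched_clock() contributes nothing, and the rq clock
advances exactly by the PM-timer-provided delta.

	#include <stdio.h>

	typedef unsigned long long u64;

	/* toy stand-ins for the scheduler's per-CPU runqueue fields */
	struct rq {
		u64 prev_clock_raw;
		u64 clock;
		u64 idle_clock;
	};

	/* pretend the TSC stopped while idling: sched_clock() is stale */
	static u64 sched_clock(void)
	{
		return 1000000;	/* ns; unchanged across the idle period */
	}

	/* mirrors sched_clock_idle_wakeup_event() (locking omitted) */
	static void idle_wakeup_event(struct rq *rq, u64 delta_ns)
	{
		u64 now = sched_clock();

		rq->idle_clock += delta_ns;
		rq->prev_clock_raw = now;	/* discard whatever the TSC did */
		rq->clock += delta_ns;		/* credit PM-timer time instead */
	}

	int main(void)
	{
		struct rq rq = { .prev_clock_raw = 1000000, .clock = 5000000 };

		/* 3580 PM-timer ticks * 279 ns/tick, as in the example above */
		idle_wakeup_event(&rq, 3580ULL * 279);

		printf("clock: %llu ns (advanced by %llu ns)\n",
		       rq.clock, rq.idle_clock);
		return 0;
	}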
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 87e524762b85..ab18f45f2ab2 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -154,10 +154,11 @@ static void print_cpu(struct seq_file *m, int cpu)
 	P(next_balance);
 	P(curr->pid);
 	P(clock);
+	P(idle_clock);
 	P(prev_clock_raw);
 	P(clock_warps);
 	P(clock_overflows);
-	P(clock_unstable_events);
+	P(clock_deep_idle_events);
 	P(clock_max_delta);
 	P(cpu_load[0]);
 	P(cpu_load[1]);