diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2010-08-06 16:12:36 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2010-08-06 16:12:36 -0400 |
| commit | af390084359a5de20046c901529b2b6a50b941cb (patch) | |
| tree | b73a6261d1b1f9fb34432cc9a47411a49330b8dc | |
| parent | 7645e4320497b35ce9fb6c2269ebcd57af9fe735 (diff) | |
| parent | 0fcb80818bc3ade5befd409051089f710adcf7b0 (diff) | |
Merge branch 'timers-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'timers-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
Documentation: Add timers/timers-howto.txt
timer: Added usleep_range timer
Revert "timer: Added usleep[_range] timer"
clockevents: Remove the per cpu tick skew
posix_timer: Move copy_to_user(created_timer_id) down in timer_create()
timer: Added usleep[_range] timer
timers: Document meaning of deferrable timer
| -rw-r--r-- | Documentation/timers/timers-howto.txt | 105 | ||||
| -rw-r--r-- | include/linux/delay.h | 1 | ||||
| -rw-r--r-- | kernel/posix-timers.c | 11 | ||||
| -rw-r--r-- | kernel/time/tick-sched.c | 5 | ||||
| -rw-r--r-- | kernel/timer.c | 31 |
5 files changed, 141 insertions, 12 deletions
diff --git a/Documentation/timers/timers-howto.txt b/Documentation/timers/timers-howto.txt new file mode 100644 index 000000000000..c9ef29d2ede3 --- /dev/null +++ b/Documentation/timers/timers-howto.txt | |||
| @@ -0,0 +1,105 @@ | |||
| 1 | delays - Information on the various kernel delay / sleep mechanisms | ||
| 2 | ------------------------------------------------------------------- | ||
| 3 | |||
| 4 | This document seeks to answer the common question: "What is the | ||
| 5 | RightWay (TM) to insert a delay?" | ||
| 6 | |||
| 7 | This question is most often faced by driver writers who have to | ||
| 8 | deal with hardware delays and who may not be the most intimately | ||
| 9 | familiar with the inner workings of the Linux Kernel. | ||
| 10 | |||
| 11 | |||
| 12 | Inserting Delays | ||
| 13 | ---------------- | ||
| 14 | |||
| 15 | The first, and most important, question you need to ask is "Is my | ||
| 16 | code in an atomic context?" This should be followed closely by "Does | ||
| 17 | it really need to delay in atomic context?" If so... | ||
| 18 | |||
| 19 | ATOMIC CONTEXT: | ||
| 20 | You must use the *delay family of functions. These | ||
| 21 | functions use the jiffy estimation of clock speed | ||
| 22 | and will busy wait for enough loop cycles to achieve | ||
| 23 | the desired delay: | ||
| 24 | |||
| 25 | ndelay(unsigned long nsecs) | ||
| 26 | udelay(unsigned long usecs) | ||
| 27 | mdelay(unsigned long msecs) | ||
| 28 | |||
| 29 | udelay is the generally preferred API; ndelay-level | ||
| 30 | precision may not actually exist on many non-PC devices. | ||
| 31 | |||
| 32 | mdelay is a macro wrapper around udelay, to account for | ||
| 33 | possible overflow when passing large arguments to udelay. | ||
| 34 | In general, use of mdelay is discouraged and code should | ||
| 35 | be refactored to allow for the use of msleep. | ||
| 36 | |||
| 37 | NON-ATOMIC CONTEXT: | ||
| 38 | You should use the *sleep[_range] family of functions. | ||
| 39 | There are a few more options here; while any of them may | ||
| 40 | work correctly, using the "right" sleep function will | ||
| 41 | help the scheduler, power management, and just make your | ||
| 42 | driver better :) | ||
| 43 | |||
| 44 | -- Backed by busy-wait loop: | ||
| 45 | udelay(unsigned long usecs) | ||
| 46 | -- Backed by hrtimers: | ||
| 47 | usleep_range(unsigned long min, unsigned long max) | ||
| 48 | -- Backed by jiffies / legacy_timers: | ||
| 49 | msleep(unsigned long msecs) | ||
| 50 | msleep_interruptible(unsigned long msecs) | ||
| 51 | |||
| 52 | Unlike the *delay family, the underlying mechanism | ||
| 53 | driving each of these calls varies, thus there are | ||
| 54 | quirks you should be aware of. | ||
| 55 | |||
| 56 | |||
| 57 | SLEEPING FOR "A FEW" USECS ( < ~10us? ): | ||
| 58 | * Use udelay | ||
| 59 | |||
| 60 | - Why not usleep? | ||
| 61 | On slower systems (embedded, OR perhaps a speed- | ||
| 62 | stepped PC!) the overhead of setting up the hrtimers | ||
| 63 | for usleep *may* not be worth it. Such an evaluation | ||
| 64 | will obviously depend on your specific situation, but | ||
| 65 | it is something to be aware of. | ||
| 66 | |||
| 67 | SLEEPING FOR ~USECS OR SMALL MSECS ( 10us - 20ms): | ||
| 68 | * Use usleep_range | ||
| 69 | |||
| 70 | - Why not msleep for (1ms - 20ms)? | ||
| 71 | Explained originally here: | ||
| 72 | http://lkml.org/lkml/2007/8/3/250 | ||
| 73 | msleep(1~20) may not do what the caller intends, and | ||
| 74 | will often sleep longer (~20 ms actual sleep for any | ||
| 75 | value given in the 1~20ms range). In many cases this | ||
| 76 | is not the desired behavior. | ||
| 77 | |||
| 78 | - Why is there no "usleep" / What is a good range? | ||
| 79 | Since usleep_range is built on top of hrtimers, the | ||
| 80 | wakeup will be very precise (ish), thus a simple | ||
| 81 | usleep function would likely introduce a large number | ||
| 82 | of undesired interrupts. | ||
| 83 | |||
| 84 | With the introduction of a range, the scheduler is | ||
| 85 | free to coalesce your wakeup with any other wakeup | ||
| 86 | that may have happened for other reasons, or at the | ||
| 87 | worst case, fire an interrupt for your upper bound. | ||
| 88 | |||
| 89 | The larger a range you supply, the greater a chance | ||
| 90 | that you will not trigger an interrupt; this should | ||
| 91 | be balanced with what is an acceptable upper bound on | ||
| 92 | delay / performance for your specific code path. Exact | ||
| 93 | tolerances here are very situation specific, thus it | ||
| 94 | is left to the caller to determine a reasonable range. | ||
| 95 | |||
| 96 | SLEEPING FOR LARGER MSECS ( 10ms+ ): | ||
| 97 | * Use msleep or possibly msleep_interruptible | ||
| 98 | |||
| 99 | - What's the difference? | ||
| 100 | msleep sets the current task to TASK_UNINTERRUPTIBLE | ||
| 101 | whereas msleep_interruptible sets the current task to | ||
| 102 | TASK_INTERRUPTIBLE before scheduling the sleep. In | ||
| 103 | short, the difference is whether the sleep can be ended | ||
| 104 | early by a signal. In general, just use msleep unless | ||
| 105 | you know you have a need for the interruptible variant. | ||
diff --git a/include/linux/delay.h b/include/linux/delay.h index fd832c6d419e..a6ecb34cf547 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h | |||
| @@ -45,6 +45,7 @@ extern unsigned long lpj_fine; | |||
| 45 | void calibrate_delay(void); | 45 | void calibrate_delay(void); |
| 46 | void msleep(unsigned int msecs); | 46 | void msleep(unsigned int msecs); |
| 47 | unsigned long msleep_interruptible(unsigned int msecs); | 47 | unsigned long msleep_interruptible(unsigned int msecs); |
| 48 | void usleep_range(unsigned long min, unsigned long max); | ||
| 48 | 49 | ||
| 49 | static inline void ssleep(unsigned int seconds) | 50 | static inline void ssleep(unsigned int seconds) |
| 50 | { | 51 | { |
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index ad723420acc3..9ca4973f736d 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
| @@ -560,11 +560,6 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, | |||
| 560 | new_timer->it_clock = which_clock; | 560 | new_timer->it_clock = which_clock; |
| 561 | new_timer->it_overrun = -1; | 561 | new_timer->it_overrun = -1; |
| 562 | 562 | ||
| 563 | if (copy_to_user(created_timer_id, | ||
| 564 | &new_timer_id, sizeof (new_timer_id))) { | ||
| 565 | error = -EFAULT; | ||
| 566 | goto out; | ||
| 567 | } | ||
| 568 | if (timer_event_spec) { | 563 | if (timer_event_spec) { |
| 569 | if (copy_from_user(&event, timer_event_spec, sizeof (event))) { | 564 | if (copy_from_user(&event, timer_event_spec, sizeof (event))) { |
| 570 | error = -EFAULT; | 565 | error = -EFAULT; |
| @@ -590,6 +585,12 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, | |||
| 590 | new_timer->sigq->info.si_tid = new_timer->it_id; | 585 | new_timer->sigq->info.si_tid = new_timer->it_id; |
| 591 | new_timer->sigq->info.si_code = SI_TIMER; | 586 | new_timer->sigq->info.si_code = SI_TIMER; |
| 592 | 587 | ||
| 588 | if (copy_to_user(created_timer_id, | ||
| 589 | &new_timer_id, sizeof (new_timer_id))) { | ||
| 590 | error = -EFAULT; | ||
| 591 | goto out; | ||
| 592 | } | ||
| 593 | |||
| 593 | error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); | 594 | error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); |
| 594 | if (error) | 595 | if (error) |
| 595 | goto out; | 596 | goto out; |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 021d2f878f19..3e216e01bbd1 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
| @@ -774,7 +774,6 @@ void tick_setup_sched_timer(void) | |||
| 774 | { | 774 | { |
| 775 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 775 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); |
| 776 | ktime_t now = ktime_get(); | 776 | ktime_t now = ktime_get(); |
| 777 | u64 offset; | ||
| 778 | 777 | ||
| 779 | /* | 778 | /* |
| 780 | * Emulate tick processing via per-CPU hrtimers: | 779 | * Emulate tick processing via per-CPU hrtimers: |
| @@ -784,10 +783,6 @@ void tick_setup_sched_timer(void) | |||
| 784 | 783 | ||
| 785 | /* Get the next period (per cpu) */ | 784 | /* Get the next period (per cpu) */ |
| 786 | hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); | 785 | hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); |
| 787 | offset = ktime_to_ns(tick_period) >> 1; | ||
| 788 | do_div(offset, num_possible_cpus()); | ||
| 789 | offset *= smp_processor_id(); | ||
| 790 | hrtimer_add_expires_ns(&ts->sched_timer, offset); | ||
| 791 | 786 | ||
| 792 | for (;;) { | 787 | for (;;) { |
| 793 | hrtimer_forward(&ts->sched_timer, now, tick_period); | 788 | hrtimer_forward(&ts->sched_timer, now, tick_period); |
diff --git a/kernel/timer.c b/kernel/timer.c index d61d16da0b64..f1b8afe1ad86 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
| @@ -90,8 +90,13 @@ static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; | |||
| 90 | 90 | ||
| 91 | /* | 91 | /* |
| 92 | * Note that all tvec_bases are 2 byte aligned and lower bit of | 92 | * Note that all tvec_bases are 2 byte aligned and lower bit of |
| 93 | * base in timer_list is guaranteed to be zero. Use the LSB for | 93 | * base in timer_list is guaranteed to be zero. Use the LSB to |
| 94 | * the new flag to indicate whether the timer is deferrable | 94 | * indicate whether the timer is deferrable. |
| 95 | * | ||
| 96 | * A deferrable timer will work normally when the system is busy, but | ||
| 97 | * will not cause a CPU to come out of idle just to service it; instead, | ||
| 98 | * the timer will be serviced when the CPU eventually wakes up with a | ||
| 99 | * subsequent non-deferrable timer. | ||
| 95 | */ | 100 | */ |
| 96 | #define TBASE_DEFERRABLE_FLAG (0x1) | 101 | #define TBASE_DEFERRABLE_FLAG (0x1) |
| 97 | 102 | ||
| @@ -1758,3 +1763,25 @@ unsigned long msleep_interruptible(unsigned int msecs) | |||
| 1758 | } | 1763 | } |
| 1759 | 1764 | ||
| 1760 | EXPORT_SYMBOL(msleep_interruptible); | 1765 | EXPORT_SYMBOL(msleep_interruptible); |
| 1766 | |||
| 1767 | static int __sched do_usleep_range(unsigned long min, unsigned long max) | ||
| 1768 | { | ||
| 1769 | ktime_t kmin; | ||
| 1770 | unsigned long delta; | ||
| 1771 | |||
| 1772 | kmin = ktime_set(0, min * NSEC_PER_USEC); | ||
| 1773 | delta = (max - min) * NSEC_PER_USEC; | ||
| 1774 | return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); | ||
| 1775 | } | ||
| 1776 | |||
| 1777 | /** | ||
| 1778 | * usleep_range - Drop in replacement for udelay where wakeup is flexible | ||
| 1779 | * @min: Minimum time in usecs to sleep | ||
| 1780 | * @max: Maximum time in usecs to sleep | ||
| 1781 | */ | ||
| 1782 | void usleep_range(unsigned long min, unsigned long max) | ||
| 1783 | { | ||
| 1784 | __set_current_state(TASK_UNINTERRUPTIBLE); | ||
| 1785 | do_usleep_range(min, max); | ||
| 1786 | } | ||
| 1787 | EXPORT_SYMBOL(usleep_range); | ||
