author     Dmitry Torokhov <dmitry.torokhov@gmail.com>   2012-03-09 13:55:17 -0500
committer  Dmitry Torokhov <dmitry.torokhov@gmail.com>   2012-03-09 13:55:17 -0500
commit     b675b3667f6729dcd1036a2a129b35445947f905 (patch)
tree       0d58791e9063d3ca2c352da6f3e7df2bdb876f9d /kernel/time
parent     104a5f3cad8f2f27cadbdf0029400ecd9e17ccc0 (diff)
parent     192cfd58774b4d17b2fe8bdc77d89c2ef4e0591d (diff)
Merge commit 'v3.3-rc6' into next
Diffstat (limited to 'kernel/time')
-rw-r--r--  kernel/time/Kconfig           |   2
-rw-r--r--  kernel/time/alarmtimer.c      |   2
-rw-r--r--  kernel/time/clockevents.c     |   1
-rw-r--r--  kernel/time/clocksource.c     | 111
-rw-r--r--  kernel/time/tick-broadcast.c  |   2
-rw-r--r--  kernel/time/tick-sched.c      | 105
-rw-r--r--  kernel/time/timekeeping.c     |  94
7 files changed, 241 insertions(+), 76 deletions(-)
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index b26c2228fe92..2cf9cc7aa103 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -25,7 +25,7 @@ config HIGH_RES_TIMERS
 config GENERIC_CLOCKEVENTS_BUILD
 	bool
 	default y
-	depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR
+	depends on GENERIC_CLOCKEVENTS
 
 config GENERIC_CLOCKEVENTS_MIN_ADJUST
 	bool
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index c436e790b21b..8a46f5d64504 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -195,7 +195,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
 		struct alarm *alarm;
 		ktime_t expired = next->expires;
 
-		if (expired.tv64 >= now.tv64)
+		if (expired.tv64 > now.tv64)
 			break;
 
 		alarm = container_of(next, struct alarm, node);
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 1ecd6ba36d6c..9cd928f7a7c6 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -17,7 +17,6 @@
 #include <linux/module.h>
 #include <linux/notifier.h>
 #include <linux/smp.h>
-#include <linux/sysdev.h>
 
 #include "tick-internal.h"
 
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index cf52fda2e096..a45ca167ab24 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -23,8 +23,8 @@
  * o Allow clocksource drivers to be unregistered
  */
 
+#include <linux/device.h>
 #include <linux/clocksource.h>
-#include <linux/sysdev.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
@@ -492,6 +492,22 @@ void clocksource_touch_watchdog(void)
 }
 
 /**
+ * clocksource_max_adjustment- Returns max adjustment amount
+ * @cs: Pointer to clocksource
+ *
+ */
+static u32 clocksource_max_adjustment(struct clocksource *cs)
+{
+	u64 ret;
+	/*
+	 * We won't try to correct for more then 11% adjustments (110,000 ppm),
+	 */
+	ret = (u64)cs->mult * 11;
+	do_div(ret,100);
+	return (u32)ret;
+}
+
+/**
  * clocksource_max_deferment - Returns max time the clocksource can be deferred
  * @cs: Pointer to clocksource
  *
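The new helper caps NTP frequency correction at 11% of the clocksource's mult. Here is a minimal userspace sketch of the same arithmetic, using an invented 20 MHz counter with shift 24 (neither value comes from the patch):

#include <stdint.h>
#include <stdio.h>

/* Mirrors clocksource_max_adjustment(): 11% of mult, computed in 64 bits. */
static uint32_t max_adjustment(uint32_t mult)
{
	return (uint32_t)(((uint64_t)mult * 11) / 100);
}

int main(void)
{
	/* Hypothetical clocksource: mult = (1e9 ns << 24) / 20 MHz */
	uint32_t mult = (uint32_t)((1000000000ULL << 24) / 20000000ULL);

	printf("mult = %u, maxadj = %u\n", mult, max_adjustment(mult));
	/* prints: mult = 838860800, maxadj = 92274688 (roughly 11%) */
	return 0;
}

Registration (later in this file's diff) warns when mult + maxadj would wrap a u32, since such a clocksource could overflow once NTP pushes the multiplier to its limit.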
@@ -503,25 +519,28 @@ static u64 clocksource_max_deferment(struct clocksource *cs)
 	/*
 	 * Calculate the maximum number of cycles that we can pass to the
 	 * cyc2ns function without overflowing a 64-bit signed result. The
-	 * maximum number of cycles is equal to ULLONG_MAX/cs->mult which
-	 * is equivalent to the below.
-	 * max_cycles < (2^63)/cs->mult
-	 * max_cycles < 2^(log2((2^63)/cs->mult))
-	 * max_cycles < 2^(log2(2^63) - log2(cs->mult))
-	 * max_cycles < 2^(63 - log2(cs->mult))
-	 * max_cycles < 1 << (63 - log2(cs->mult))
+	 * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj)
+	 * which is equivalent to the below.
+	 * max_cycles < (2^63)/(cs->mult + cs->maxadj)
+	 * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj)))
+	 * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj))
+	 * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj))
+	 * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj))
 	 * Please note that we add 1 to the result of the log2 to account for
 	 * any rounding errors, ensure the above inequality is satisfied and
 	 * no overflow will occur.
 	 */
-	max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1));
+	max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1));
 
 	/*
 	 * The actual maximum number of cycles we can defer the clocksource is
 	 * determined by the minimum of max_cycles and cs->mask.
+	 * Note: Here we subtract the maxadj to make sure we don't sleep for
+	 * too long if there's a large negative adjustment.
 	 */
 	max_cycles = min_t(u64, max_cycles, (u64) cs->mask);
-	max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift);
+	max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj,
+					cs->shift);
 
 	/*
 	 * To ensure that the clocksource does not wrap whilst we are idle,
@@ -529,7 +548,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs)
 	 * note a margin of 12.5% is used because this can be computed with
 	 * a shift, versus say 10% which would require division.
 	 */
-	return max_nsecs - (max_nsecs >> 5);
+	return max_nsecs - (max_nsecs >> 3);
 }
 
 #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
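Continuing the invented 20 MHz example (mult = 838860800, maxadj = 92274688, shift = 24, 32-bit counter mask), a userspace sketch of the deferment bound; the 12.5% head-room is written as x - (x >> 3) because a shift is cheaper than the division a 10% margin would need, which is what the fixed return statement now matches:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t mult = 838860800, maxadj = 92274688, shift = 24;
	uint64_t mask = 0xffffffffULL;			/* assumed 32-bit counter */

	/* ilog2(mult + maxadj) is 29 for these values, so cap at 1 << (63 - 30). */
	uint64_t max_cycles = 1ULL << (63 - (29 + 1));
	if (max_cycles > mask)
		max_cycles = mask;			/* the counter wraps first */

	/* Convert with the slowest adjusted rate so a -11% NTP slew cannot
	 * make the CPU sleep past the wrap point. */
	uint64_t max_nsecs = (max_cycles * (mult - maxadj)) >> shift;

	printf("max idle ~= %llu ns\n",
	       (unsigned long long)(max_nsecs - (max_nsecs >> 3)));
	return 0;
}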
@@ -628,7 +647,7 @@ static void clocksource_enqueue(struct clocksource *cs)
 
 /**
  * __clocksource_updatefreq_scale - Used update clocksource with new freq
- * @t: clocksource to be registered
+ * @cs: clocksource to be registered
  * @scale: Scale factor multiplied against freq to get clocksource hz
  * @freq: clocksource frequency (cycles per second) divided by scale
  *
@@ -640,7 +659,6 @@ static void clocksource_enqueue(struct clocksource *cs)
 void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
 {
 	u64 sec;
-
 	/*
 	 * Calc the maximum number of seconds which we can run before
 	 * wrapping around. For clocksources which have a mask > 32bit
@@ -651,7 +669,7 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
 	 * ~ 0.06ppm granularity for NTP. We apply the same 12.5%
 	 * margin as we do in clocksource_max_deferment()
 	 */
-	sec = (cs->mask - (cs->mask >> 5));
+	sec = (cs->mask - (cs->mask >> 3));
 	do_div(sec, freq);
 	do_div(sec, scale);
 	if (!sec)
@@ -661,13 +679,27 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
 
 	clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
 			       NSEC_PER_SEC / scale, sec * scale);
+
+	/*
+	 * for clocksources that have large mults, to avoid overflow.
+	 * Since mult may be adjusted by ntp, add an safety extra margin
+	 *
+	 */
+	cs->maxadj = clocksource_max_adjustment(cs);
+	while ((cs->mult + cs->maxadj < cs->mult)
+		|| (cs->mult - cs->maxadj > cs->mult)) {
+		cs->mult >>= 1;
+		cs->shift--;
+		cs->maxadj = clocksource_max_adjustment(cs);
+	}
+
 	cs->max_idle_ns = clocksource_max_deferment(cs);
 }
 EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
 
 /**
  * __clocksource_register_scale - Used to install new clocksources
- * @t: clocksource to be registered
+ * @cs: clocksource to be registered
  * @scale: Scale factor multiplied against freq to get clocksource hz
  * @freq: clocksource frequency (cycles per second) divided by scale
  *
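The new loop trades resolution for head-room: halving mult while decrementing shift leaves the cycles-to-nanoseconds conversion unchanged (up to rounding), but shrinks the 11% adjustment until mult + maxadj fits in a u32 again. A small userspace illustration with an invented, deliberately oversized mult:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t cycles = 123456789;
	uint32_t mult = 4000000000u, shift = 31;	/* invented; 4e9 * 1.11 overflows u32 */

	uint64_t before = (cycles * mult) >> shift;
	mult >>= 1;					/* 2000000000: 11% of it now fits */
	shift--;
	uint64_t after = (cycles * mult) >> shift;

	printf("ns before = %llu, ns after = %llu (same conversion)\n",
	       (unsigned long long)before, (unsigned long long)after);
	return 0;
}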
@@ -695,12 +727,18 @@ EXPORT_SYMBOL_GPL(__clocksource_register_scale);
 
 /**
  * clocksource_register - Used to install new clocksources
- * @t: clocksource to be registered
+ * @cs: clocksource to be registered
  *
  * Returns -EBUSY if registration fails, zero otherwise.
  */
 int clocksource_register(struct clocksource *cs)
 {
+	/* calculate max adjustment for given mult/shift */
+	cs->maxadj = clocksource_max_adjustment(cs);
+	WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
+		"Clocksource %s might overflow on 11%% adjustment\n",
+		cs->name);
+
 	/* calculate max idle time permitted for this clocksource */
 	cs->max_idle_ns = clocksource_max_deferment(cs);
 
@@ -723,6 +761,8 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating)
 
 /**
  * clocksource_change_rating - Change the rating of a registered clocksource
+ * @cs: clocksource to be changed
+ * @rating: new rating
  */
 void clocksource_change_rating(struct clocksource *cs, int rating)
 {
@@ -734,6 +774,7 @@ EXPORT_SYMBOL(clocksource_change_rating);
 
 /**
  * clocksource_unregister - remove a registered clocksource
+ * @cs: clocksource to be unregistered
  */
 void clocksource_unregister(struct clocksource *cs)
 {
@@ -749,13 +790,14 @@ EXPORT_SYMBOL(clocksource_unregister);
 /**
  * sysfs_show_current_clocksources - sysfs interface for current clocksource
  * @dev: unused
+ * @attr: unused
  * @buf: char buffer to be filled with clocksource list
  *
  * Provides sysfs interface for listing current clocksource.
  */
 static ssize_t
-sysfs_show_current_clocksources(struct sys_device *dev,
-				struct sysdev_attribute *attr, char *buf)
+sysfs_show_current_clocksources(struct device *dev,
+				struct device_attribute *attr, char *buf)
 {
 	ssize_t count = 0;
 
@@ -769,14 +811,15 @@ sysfs_show_current_clocksources(struct sys_device *dev,
 /**
  * sysfs_override_clocksource - interface for manually overriding clocksource
  * @dev: unused
+ * @attr: unused
  * @buf: name of override clocksource
  * @count: length of buffer
  *
  * Takes input from sysfs interface for manually overriding the default
  * clocksource selection.
  */
-static ssize_t sysfs_override_clocksource(struct sys_device *dev,
-					  struct sysdev_attribute *attr,
+static ssize_t sysfs_override_clocksource(struct device *dev,
+					  struct device_attribute *attr,
 					  const char *buf, size_t count)
 {
 	size_t ret = count;
@@ -804,13 +847,14 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
 /**
  * sysfs_show_available_clocksources - sysfs interface for listing clocksource
  * @dev: unused
+ * @attr: unused
  * @buf: char buffer to be filled with clocksource list
  *
  * Provides sysfs interface for listing registered clocksources
  */
 static ssize_t
-sysfs_show_available_clocksources(struct sys_device *dev,
-				  struct sysdev_attribute *attr,
+sysfs_show_available_clocksources(struct device *dev,
+				  struct device_attribute *attr,
 				  char *buf)
 {
 	struct clocksource *src;
@@ -839,35 +883,36 @@ sysfs_show_available_clocksources(struct sys_device *dev,
 /*
  * Sysfs setup bits:
  */
-static SYSDEV_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources,
+static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources,
 		   sysfs_override_clocksource);
 
-static SYSDEV_ATTR(available_clocksource, 0444,
+static DEVICE_ATTR(available_clocksource, 0444,
 		   sysfs_show_available_clocksources, NULL);
 
-static struct sysdev_class clocksource_sysclass = {
+static struct bus_type clocksource_subsys = {
 	.name = "clocksource",
+	.dev_name = "clocksource",
 };
 
-static struct sys_device device_clocksource = {
+static struct device device_clocksource = {
 	.id	= 0,
-	.cls	= &clocksource_sysclass,
+	.bus	= &clocksource_subsys,
 };
 
 static int __init init_clocksource_sysfs(void)
 {
-	int error = sysdev_class_register(&clocksource_sysclass);
+	int error = subsys_system_register(&clocksource_subsys, NULL);
 
 	if (!error)
-		error = sysdev_register(&device_clocksource);
+		error = device_register(&device_clocksource);
 	if (!error)
-		error = sysdev_create_file(
+		error = device_create_file(
 			&device_clocksource,
-			&attr_current_clocksource);
+			&dev_attr_current_clocksource);
 	if (!error)
-		error = sysdev_create_file(
+		error = device_create_file(
 			&device_clocksource,
-			&attr_available_clocksource);
+			&dev_attr_available_clocksource);
 	return error;
 }
 
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index f954282d9a82..fd4a7b1625a2 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -71,7 +71,7 @@ int tick_check_broadcast_device(struct clock_event_device *dev)
 	    (dev->features & CLOCK_EVT_FEAT_C3STOP))
 		return 0;
 
-	clockevents_exchange_device(NULL, dev);
+	clockevents_exchange_device(tick_broadcast_device.evtdev, dev);
 	tick_broadcast_device.evtdev = dev;
 	if (!cpumask_empty(tick_get_broadcast_mask()))
 		tick_broadcast_start_periodic(dev);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 40420644d0ba..7656642e4b8e 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -275,42 +275,17 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
 }
 EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
 
-/**
- * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
- *
- * When the next event is more than a tick into the future, stop the idle tick
- * Called either from the idle loop or from irq_exit() when an idle period was
- * just interrupted by an interrupt which did not cause a reschedule.
- */
-void tick_nohz_stop_sched_tick(int inidle)
+static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
 {
-	unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
-	struct tick_sched *ts;
+	unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
 	ktime_t last_update, expires, now;
 	struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
 	u64 time_delta;
 	int cpu;
 
-	local_irq_save(flags);
-
 	cpu = smp_processor_id();
 	ts = &per_cpu(tick_cpu_sched, cpu);
 
-	/*
-	 * Call to tick_nohz_start_idle stops the last_update_time from being
-	 * updated. Thus, it must not be called in the event we are called from
-	 * irq_exit() with the prior state different than idle.
-	 */
-	if (!inidle && !ts->inidle)
-		goto end;
-
-	/*
-	 * Set ts->inidle unconditionally. Even if the system did not
-	 * switch to NOHZ mode the cpu frequency governers rely on the
-	 * update of the idle time accounting in tick_nohz_start_idle().
-	 */
-	ts->inidle = 1;
-
 	now = tick_nohz_start_idle(cpu, ts);
 
 	/*
@@ -326,10 +301,10 @@ void tick_nohz_stop_sched_tick(int inidle)
 	}
 
 	if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
-		goto end;
+		return;
 
 	if (need_resched())
-		goto end;
+		return;
 
 	if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
 		static int ratelimit;
@@ -339,7 +314,7 @@ void tick_nohz_stop_sched_tick(int inidle)
 			       (unsigned int) local_softirq_pending());
 			ratelimit++;
 		}
-		goto end;
+		return;
 	}
 
 	ts->idle_calls++;
@@ -434,7 +409,6 @@ void tick_nohz_stop_sched_tick(int inidle)
 			ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
 			ts->tick_stopped = 1;
 			ts->idle_jiffies = last_jiffies;
-			rcu_enter_nohz();
 		}
 
 		ts->idle_sleeps++;
@@ -472,8 +446,64 @@ out:
 	ts->next_jiffies = next_jiffies;
 	ts->last_jiffies = last_jiffies;
 	ts->sleep_length = ktime_sub(dev->next_event, now);
-end:
-	local_irq_restore(flags);
+}
+
+/**
+ * tick_nohz_idle_enter - stop the idle tick from the idle task
+ *
+ * When the next event is more than a tick into the future, stop the idle tick
+ * Called when we start the idle loop.
+ *
+ * The arch is responsible of calling:
+ *
+ * - rcu_idle_enter() after its last use of RCU before the CPU is put
+ * to sleep.
+ * - rcu_idle_exit() before the first use of RCU after the CPU is woken up.
+ */
+void tick_nohz_idle_enter(void)
+{
+	struct tick_sched *ts;
+
+	WARN_ON_ONCE(irqs_disabled());
+
+	/*
+	 * Update the idle state in the scheduler domain hierarchy
+	 * when tick_nohz_stop_sched_tick() is called from the idle loop.
+	 * State will be updated to busy during the first busy tick after
+	 * exiting idle.
+	 */
+	set_cpu_sd_state_idle();
+
+	local_irq_disable();
+
+	ts = &__get_cpu_var(tick_cpu_sched);
+	/*
+	 * set ts->inidle unconditionally. even if the system did not
+	 * switch to nohz mode the cpu frequency governers rely on the
+	 * update of the idle time accounting in tick_nohz_start_idle().
+	 */
+	ts->inidle = 1;
+	tick_nohz_stop_sched_tick(ts);
+
+	local_irq_enable();
+}
+
+/**
+ * tick_nohz_irq_exit - update next tick event from interrupt exit
+ *
+ * When an interrupt fires while we are idle and it doesn't cause
+ * a reschedule, it may still add, modify or delete a timer, enqueue
+ * an RCU callback, etc...
+ * So we need to re-calculate and reprogram the next tick event.
+ */
+void tick_nohz_irq_exit(void)
+{
+	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+
+	if (!ts->inidle)
+		return;
+
+	tick_nohz_stop_sched_tick(ts);
 }
 
 /**
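The old single entry point is now split: tick_nohz_idle_enter() is called once when the idle loop is entered, tick_nohz_irq_exit() re-evaluates the next event on the interrupt-exit path, and (further down) tick_nohz_idle_exit() replaces tick_nohz_restart_sched_tick(). Below is a hedged sketch of the call order an architecture's idle loop is then expected to follow; the loop shape and the halt primitive are illustrative only, and just the tick_nohz_*() and rcu_idle_*() calls come from the patch's kerneldoc:

/* Illustrative idle loop only; real arch code differs in detail. */
static void example_cpu_idle(void)
{
	while (1) {
		tick_nohz_idle_enter();		/* may stop the periodic tick */

		while (!need_resched()) {
			rcu_idle_enter();	/* arch's duty, per the kerneldoc above */
			safe_halt();		/* placeholder for the arch sleep primitive */
			rcu_idle_exit();
		}

		tick_nohz_idle_exit();		/* restart the tick, catch up jiffies */
		schedule();
	}
}

An interrupt that wakes the CPU without forcing a reschedule goes back to sleep through the inner loop; its interrupt-exit path is the intended caller of tick_nohz_irq_exit(), which reprograms the next event.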
@@ -515,11 +545,13 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
 }
 
 /**
- * tick_nohz_restart_sched_tick - restart the idle tick from the idle task
+ * tick_nohz_idle_exit - restart the idle tick from the idle task
  *
  * Restart the idle tick when the CPU is woken up from idle
+ * This also exit the RCU extended quiescent state. The CPU
+ * can use RCU again after this function is called.
  */
-void tick_nohz_restart_sched_tick(void)
+void tick_nohz_idle_exit(void)
 {
 	int cpu = smp_processor_id();
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
@@ -529,6 +561,7 @@ void tick_nohz_restart_sched_tick(void)
 	ktime_t now;
 
 	local_irq_disable();
+
 	if (ts->idle_active || (ts->inidle && ts->tick_stopped))
 		now = ktime_get();
 
@@ -543,8 +576,6 @@ void tick_nohz_restart_sched_tick(void)
 
 	ts->inidle = 0;
 
-	rcu_exit_nohz();
-
 	/* Update jiffies first */
 	select_nohz_load_balancer(0);
 	tick_do_update_jiffies64(now);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 169479994755..e6a5a6bc2769 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -131,7 +131,7 @@ static inline s64 timekeeping_get_ns_raw(void)
 	/* calculate the delta since the last update_wall_time: */
 	cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
 
-	/* return delta convert to nanoseconds using ntp adjusted mult. */
+	/* return delta convert to nanoseconds. */
 	return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
 }
 
@@ -249,6 +249,8 @@ ktime_t ktime_get(void)
 		secs = xtime.tv_sec + wall_to_monotonic.tv_sec;
 		nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec;
 		nsecs += timekeeping_get_ns();
+		/* If arch requires, add in gettimeoffset() */
+		nsecs += arch_gettimeoffset();
 
 	} while (read_seqretry(&xtime_lock, seq));
 	/*
@@ -280,6 +282,8 @@ void ktime_get_ts(struct timespec *ts)
 		*ts = xtime;
 		tomono = wall_to_monotonic;
 		nsecs = timekeeping_get_ns();
+		/* If arch requires, add in gettimeoffset() */
+		nsecs += arch_gettimeoffset();
 
 	} while (read_seqretry(&xtime_lock, seq));
 
@@ -802,14 +806,44 @@ static void timekeeping_adjust(s64 offset)
 	s64 error, interval = timekeeper.cycle_interval;
 	int adj;
 
+	/*
+	 * The point of this is to check if the error is greater then half
+	 * an interval.
+	 *
+	 * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs.
+	 *
+	 * Note we subtract one in the shift, so that error is really error*2.
+	 * This "saves" dividing(shifting) interval twice, but keeps the
+	 * (error > interval) comparison as still measuring if error is
+	 * larger then half an interval.
+	 *
+	 * Note: It does not "save" on aggravation when reading the code.
+	 */
 	error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1);
 	if (error > interval) {
+		/*
+		 * We now divide error by 4(via shift), which checks if
+		 * the error is greater then twice the interval.
+		 * If it is greater, we need a bigadjust, if its smaller,
+		 * we can adjust by 1.
+		 */
 		error >>= 2;
+		/*
+		 * XXX - In update_wall_time, we round up to the next
+		 * nanosecond, and store the amount rounded up into
+		 * the error. This causes the likely below to be unlikely.
+		 *
+		 * The proper fix is to avoid rounding up by using
+		 * the high precision timekeeper.xtime_nsec instead of
+		 * xtime.tv_nsec everywhere. Fixing this will take some
+		 * time.
+		 */
 		if (likely(error <= interval))
 			adj = 1;
 		else
 			adj = timekeeping_bigadjust(error, &interval, &offset);
 	} else if (error < -interval) {
+		/* See comment above, this is just switched for the negative */
 		error >>= 2;
 		if (likely(error >= -interval)) {
 			adj = -1;
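The "subtract one from the shift" trick described in the new comment compares twice the error against one interval, i.e. it asks whether the error exceeds half an interval without ever dividing interval. A tiny userspace check with invented numbers (the shift of 4 and the raw values are illustrations, not kernel defaults):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned int ntp_error_shift = 4;	/* invented */
	int64_t interval = 1000;		/* invented, clocksource-shifted ns */
	int64_t ntp_error = 9000;		/* real error = 9000 >> 4 = 562 > interval/2 */

	int64_t error = ntp_error >> (ntp_error_shift - 1);	/* 1125, i.e. error*2 */

	printf("error*2 = %lld vs interval = %lld -> adjust? %s\n",
	       (long long)error, (long long)interval,
	       error > interval ? "yes" : "no");
	return 0;
}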
@@ -817,9 +851,65 @@ static void timekeeping_adjust(s64 offset)
 			offset = -offset;
 		} else
 			adj = timekeeping_bigadjust(error, &interval, &offset);
-	} else
+	} else /* No adjustment needed */
 		return;
 
+	WARN_ONCE(timekeeper.clock->maxadj &&
+		(timekeeper.mult + adj > timekeeper.clock->mult +
+					timekeeper.clock->maxadj),
+		"Adjusting %s more then 11%% (%ld vs %ld)\n",
+		timekeeper.clock->name, (long)timekeeper.mult + adj,
+		(long)timekeeper.clock->mult +
+			timekeeper.clock->maxadj);
+	/*
+	 * So the following can be confusing.
+	 *
+	 * To keep things simple, lets assume adj == 1 for now.
+	 *
+	 * When adj != 1, remember that the interval and offset values
+	 * have been appropriately scaled so the math is the same.
+	 *
+	 * The basic idea here is that we're increasing the multiplier
+	 * by one, this causes the xtime_interval to be incremented by
+	 * one cycle_interval. This is because:
+	 * 	xtime_interval = cycle_interval * mult
+	 * So if mult is being incremented by one:
+	 * 	xtime_interval = cycle_interval * (mult + 1)
+	 * Its the same as:
+	 * 	xtime_interval = (cycle_interval * mult) + cycle_interval
+	 * Which can be shortened to:
+	 * 	xtime_interval += cycle_interval
+	 *
+	 * So offset stores the non-accumulated cycles. Thus the current
+	 * time (in shifted nanoseconds) is:
+	 * 	now = (offset * adj) + xtime_nsec
+	 * Now, even though we're adjusting the clock frequency, we have
+	 * to keep time consistent. In other words, we can't jump back
+	 * in time, and we also want to avoid jumping forward in time.
+	 *
+	 * So given the same offset value, we need the time to be the same
+	 * both before and after the freq adjustment.
+	 * 	now = (offset * adj_1) + xtime_nsec_1
+	 * 	now = (offset * adj_2) + xtime_nsec_2
+	 * So:
+	 * 	(offset * adj_1) + xtime_nsec_1 =
+	 * 		(offset * adj_2) + xtime_nsec_2
+	 * And we know:
+	 * 	adj_2 = adj_1 + 1
+	 * So:
+	 * 	(offset * adj_1) + xtime_nsec_1 =
+	 * 		(offset * (adj_1+1)) + xtime_nsec_2
+	 * 	(offset * adj_1) + xtime_nsec_1 =
+	 * 		(offset * adj_1) + offset + xtime_nsec_2
+	 * Canceling the sides:
+	 * 	xtime_nsec_1 = offset + xtime_nsec_2
+	 * Which gives us:
+	 * 	xtime_nsec_2 = xtime_nsec_1 - offset
+	 * Which simplfies to:
+	 * 	xtime_nsec -= offset
+	 *
+	 * XXX - TODO: Doc ntp_error calculation.
+	 */
 	timekeeper.mult += adj;
 	timekeeper.xtime_interval += interval;
 	timekeeper.xtime_nsec -= offset;
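The long comment derives why bumping the multiplier must be paired with xtime_nsec -= offset, so that accumulated-plus-pending time does not jump at the adjustment point. A minimal numeric sanity check of that invariant; adj_1, adj_2 and the other values are invented for illustration:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t offset = 5000;			/* non-accumulated cycles (invented) */
	uint64_t adj_1 = 400, adj_2 = 401;	/* adj_2 = adj_1 + 1, as in the comment */
	uint64_t xtime_nsec_1 = 123456;		/* invented, in shifted nanoseconds */

	uint64_t now_1 = (offset * adj_1) + xtime_nsec_1;

	/* xtime_nsec_2 = xtime_nsec_1 - offset, per the derivation above */
	uint64_t xtime_nsec_2 = xtime_nsec_1 - offset;
	uint64_t now_2 = (offset * adj_2) + xtime_nsec_2;

	printf("now before = %llu, now after = %llu (identical)\n",
	       (unsigned long long)now_1, (unsigned long long)now_2);
	return 0;
}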