diff options
| author | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2012-03-09 13:55:17 -0500 |
|---|---|---|
| committer | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2012-03-09 13:55:17 -0500 |
| commit | b675b3667f6729dcd1036a2a129b35445947f905 (patch) | |
| tree | 0d58791e9063d3ca2c352da6f3e7df2bdb876f9d /kernel/time | |
| parent | 104a5f3cad8f2f27cadbdf0029400ecd9e17ccc0 (diff) | |
| parent | 192cfd58774b4d17b2fe8bdc77d89c2ef4e0591d (diff) | |
Merge commit 'v3.3-rc6' into next
Diffstat (limited to 'kernel/time')
| -rw-r--r-- | kernel/time/Kconfig | 2 | ||||
| -rw-r--r-- | kernel/time/alarmtimer.c | 2 | ||||
| -rw-r--r-- | kernel/time/clockevents.c | 1 | ||||
| -rw-r--r-- | kernel/time/clocksource.c | 111 | ||||
| -rw-r--r-- | kernel/time/tick-broadcast.c | 2 | ||||
| -rw-r--r-- | kernel/time/tick-sched.c | 105 | ||||
| -rw-r--r-- | kernel/time/timekeeping.c | 94 |
7 files changed, 241 insertions, 76 deletions
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index b26c2228fe92..2cf9cc7aa103 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig | |||
| @@ -25,7 +25,7 @@ config HIGH_RES_TIMERS | |||
| 25 | config GENERIC_CLOCKEVENTS_BUILD | 25 | config GENERIC_CLOCKEVENTS_BUILD |
| 26 | bool | 26 | bool |
| 27 | default y | 27 | default y |
| 28 | depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR | 28 | depends on GENERIC_CLOCKEVENTS |
| 29 | 29 | ||
| 30 | config GENERIC_CLOCKEVENTS_MIN_ADJUST | 30 | config GENERIC_CLOCKEVENTS_MIN_ADJUST |
| 31 | bool | 31 | bool |
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index c436e790b21b..8a46f5d64504 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c | |||
| @@ -195,7 +195,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) | |||
| 195 | struct alarm *alarm; | 195 | struct alarm *alarm; |
| 196 | ktime_t expired = next->expires; | 196 | ktime_t expired = next->expires; |
| 197 | 197 | ||
| 198 | if (expired.tv64 >= now.tv64) | 198 | if (expired.tv64 > now.tv64) |
| 199 | break; | 199 | break; |
| 200 | 200 | ||
| 201 | alarm = container_of(next, struct alarm, node); | 201 | alarm = container_of(next, struct alarm, node); |
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 1ecd6ba36d6c..9cd928f7a7c6 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
| @@ -17,7 +17,6 @@ | |||
| 17 | #include <linux/module.h> | 17 | #include <linux/module.h> |
| 18 | #include <linux/notifier.h> | 18 | #include <linux/notifier.h> |
| 19 | #include <linux/smp.h> | 19 | #include <linux/smp.h> |
| 20 | #include <linux/sysdev.h> | ||
| 21 | 20 | ||
| 22 | #include "tick-internal.h" | 21 | #include "tick-internal.h" |
| 23 | 22 | ||
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index cf52fda2e096..a45ca167ab24 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
| @@ -23,8 +23,8 @@ | |||
| 23 | * o Allow clocksource drivers to be unregistered | 23 | * o Allow clocksource drivers to be unregistered |
| 24 | */ | 24 | */ |
| 25 | 25 | ||
| 26 | #include <linux/device.h> | ||
| 26 | #include <linux/clocksource.h> | 27 | #include <linux/clocksource.h> |
| 27 | #include <linux/sysdev.h> | ||
| 28 | #include <linux/init.h> | 28 | #include <linux/init.h> |
| 29 | #include <linux/module.h> | 29 | #include <linux/module.h> |
| 30 | #include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ | 30 | #include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ |
| @@ -492,6 +492,22 @@ void clocksource_touch_watchdog(void) | |||
| 492 | } | 492 | } |
| 493 | 493 | ||
| 494 | /** | 494 | /** |
| 495 | * clocksource_max_adjustment - Returns max adjustment amount | ||
| 496 | * @cs: Pointer to clocksource | ||
| 497 | * | ||
| 498 | */ | ||
| 499 | static u32 clocksource_max_adjustment(struct clocksource *cs) | ||
| 500 | { | ||
| 501 | u64 ret; | ||
| 502 | /* | ||
| 503 | * We won't try to correct for more than 11% adjustments (110,000 ppm), | ||
| 504 | */ | ||
| 505 | ret = (u64)cs->mult * 11; | ||
| 506 | do_div(ret,100); | ||
| 507 | return (u32)ret; | ||
| 508 | } | ||
| 509 | |||
| 510 | /** | ||
| 495 | * clocksource_max_deferment - Returns max time the clocksource can be deferred | 511 | * clocksource_max_deferment - Returns max time the clocksource can be deferred |
| 496 | * @cs: Pointer to clocksource | 512 | * @cs: Pointer to clocksource |
| 497 | * | 513 | * |
| @@ -503,25 +519,28 @@ static u64 clocksource_max_deferment(struct clocksource *cs) | |||
| 503 | /* | 519 | /* |
| 504 | * Calculate the maximum number of cycles that we can pass to the | 520 | * Calculate the maximum number of cycles that we can pass to the |
| 505 | * cyc2ns function without overflowing a 64-bit signed result. The | 521 | * cyc2ns function without overflowing a 64-bit signed result. The |
| 506 | * maximum number of cycles is equal to ULLONG_MAX/cs->mult which | 522 | * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj) |
| 507 | * is equivalent to the below. | 523 | * which is equivalent to the below. |
| 508 | * max_cycles < (2^63)/cs->mult | 524 | * max_cycles < (2^63)/(cs->mult + cs->maxadj) |
| 509 | * max_cycles < 2^(log2((2^63)/cs->mult)) | 525 | * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj))) |
| 510 | * max_cycles < 2^(log2(2^63) - log2(cs->mult)) | 526 | * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj)) |
| 511 | * max_cycles < 2^(63 - log2(cs->mult)) | 527 | * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj)) |
| 512 | * max_cycles < 1 << (63 - log2(cs->mult)) | 528 | * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj)) |
| 513 | * Please note that we add 1 to the result of the log2 to account for | 529 | * Please note that we add 1 to the result of the log2 to account for |
| 514 | * any rounding errors, ensure the above inequality is satisfied and | 530 | * any rounding errors, ensure the above inequality is satisfied and |
| 515 | * no overflow will occur. | 531 | * no overflow will occur. |
| 516 | */ | 532 | */ |
| 517 | max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1)); | 533 | max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1)); |
| 518 | 534 | ||
| 519 | /* | 535 | /* |
| 520 | * The actual maximum number of cycles we can defer the clocksource is | 536 | * The actual maximum number of cycles we can defer the clocksource is |
| 521 | * determined by the minimum of max_cycles and cs->mask. | 537 | * determined by the minimum of max_cycles and cs->mask. |
| 538 | * Note: Here we subtract the maxadj to make sure we don't sleep for | ||
| 539 | * too long if there's a large negative adjustment. | ||
| 522 | */ | 540 | */ |
| 523 | max_cycles = min_t(u64, max_cycles, (u64) cs->mask); | 541 | max_cycles = min_t(u64, max_cycles, (u64) cs->mask); |
| 524 | max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift); | 542 | max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj, |
| 543 | cs->shift); | ||
| 525 | 544 | ||
| 526 | /* | 545 | /* |
| 527 | * To ensure that the clocksource does not wrap whilst we are idle, | 546 | * To ensure that the clocksource does not wrap whilst we are idle, |
| @@ -529,7 +548,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs) | |||
| 529 | * note a margin of 12.5% is used because this can be computed with | 548 | * note a margin of 12.5% is used because this can be computed with |
| 530 | * a shift, versus say 10% which would require division. | 549 | * a shift, versus say 10% which would require division. |
| 531 | */ | 550 | */ |
| 532 | return max_nsecs - (max_nsecs >> 5); | 551 | return max_nsecs - (max_nsecs >> 3); |
| 533 | } | 552 | } |
| 534 | 553 | ||
| 535 | #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET | 554 | #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET |
| @@ -628,7 +647,7 @@ static void clocksource_enqueue(struct clocksource *cs) | |||
| 628 | 647 | ||
| 629 | /** | 648 | /** |
| 630 | * __clocksource_updatefreq_scale - Used update clocksource with new freq | 649 | * __clocksource_updatefreq_scale - Used update clocksource with new freq |
| 631 | * @t: clocksource to be registered | 650 | * @cs: clocksource to be registered |
| 632 | * @scale: Scale factor multiplied against freq to get clocksource hz | 651 | * @scale: Scale factor multiplied against freq to get clocksource hz |
| 633 | * @freq: clocksource frequency (cycles per second) divided by scale | 652 | * @freq: clocksource frequency (cycles per second) divided by scale |
| 634 | * | 653 | * |
| @@ -640,7 +659,6 @@ static void clocksource_enqueue(struct clocksource *cs) | |||
| 640 | void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) | 659 | void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) |
| 641 | { | 660 | { |
| 642 | u64 sec; | 661 | u64 sec; |
| 643 | |||
| 644 | /* | 662 | /* |
| 645 | * Calc the maximum number of seconds which we can run before | 663 | * Calc the maximum number of seconds which we can run before |
| 646 | * wrapping around. For clocksources which have a mask > 32bit | 664 | * wrapping around. For clocksources which have a mask > 32bit |
| @@ -651,7 +669,7 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) | |||
| 651 | * ~ 0.06ppm granularity for NTP. We apply the same 12.5% | 669 | * ~ 0.06ppm granularity for NTP. We apply the same 12.5% |
| 652 | * margin as we do in clocksource_max_deferment() | 670 | * margin as we do in clocksource_max_deferment() |
| 653 | */ | 671 | */ |
| 654 | sec = (cs->mask - (cs->mask >> 5)); | 672 | sec = (cs->mask - (cs->mask >> 3)); |
| 655 | do_div(sec, freq); | 673 | do_div(sec, freq); |
| 656 | do_div(sec, scale); | 674 | do_div(sec, scale); |
| 657 | if (!sec) | 675 | if (!sec) |
| @@ -661,13 +679,27 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) | |||
| 661 | 679 | ||
| 662 | clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, | 680 | clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, |
| 663 | NSEC_PER_SEC / scale, sec * scale); | 681 | NSEC_PER_SEC / scale, sec * scale); |
| 682 | |||
| 683 | /* | ||
| 684 | * for clocksources that have large mults, to avoid overflow. | ||
| 685 | * Since mult may be adjusted by ntp, add an extra safety margin | ||
| 686 | * | ||
| 687 | */ | ||
| 688 | cs->maxadj = clocksource_max_adjustment(cs); | ||
| 689 | while ((cs->mult + cs->maxadj < cs->mult) | ||
| 690 | || (cs->mult - cs->maxadj > cs->mult)) { | ||
| 691 | cs->mult >>= 1; | ||
| 692 | cs->shift--; | ||
| 693 | cs->maxadj = clocksource_max_adjustment(cs); | ||
| 694 | } | ||
| 695 | |||
| 664 | cs->max_idle_ns = clocksource_max_deferment(cs); | 696 | cs->max_idle_ns = clocksource_max_deferment(cs); |
| 665 | } | 697 | } |
| 666 | EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); | 698 | EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); |
| 667 | 699 | ||
| 668 | /** | 700 | /** |
| 669 | * __clocksource_register_scale - Used to install new clocksources | 701 | * __clocksource_register_scale - Used to install new clocksources |
| 670 | * @t: clocksource to be registered | 702 | * @cs: clocksource to be registered |
| 671 | * @scale: Scale factor multiplied against freq to get clocksource hz | 703 | * @scale: Scale factor multiplied against freq to get clocksource hz |
| 672 | * @freq: clocksource frequency (cycles per second) divided by scale | 704 | * @freq: clocksource frequency (cycles per second) divided by scale |
| 673 | * | 705 | * |
| @@ -695,12 +727,18 @@ EXPORT_SYMBOL_GPL(__clocksource_register_scale); | |||
| 695 | 727 | ||
| 696 | /** | 728 | /** |
| 697 | * clocksource_register - Used to install new clocksources | 729 | * clocksource_register - Used to install new clocksources |
| 698 | * @t: clocksource to be registered | 730 | * @cs: clocksource to be registered |
| 699 | * | 731 | * |
| 700 | * Returns -EBUSY if registration fails, zero otherwise. | 732 | * Returns -EBUSY if registration fails, zero otherwise. |
| 701 | */ | 733 | */ |
| 702 | int clocksource_register(struct clocksource *cs) | 734 | int clocksource_register(struct clocksource *cs) |
| 703 | { | 735 | { |
| 736 | /* calculate max adjustment for given mult/shift */ | ||
| 737 | cs->maxadj = clocksource_max_adjustment(cs); | ||
| 738 | WARN_ONCE(cs->mult + cs->maxadj < cs->mult, | ||
| 739 | "Clocksource %s might overflow on 11%% adjustment\n", | ||
| 740 | cs->name); | ||
| 741 | |||
| 704 | /* calculate max idle time permitted for this clocksource */ | 742 | /* calculate max idle time permitted for this clocksource */ |
| 705 | cs->max_idle_ns = clocksource_max_deferment(cs); | 743 | cs->max_idle_ns = clocksource_max_deferment(cs); |
| 706 | 744 | ||
| @@ -723,6 +761,8 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating) | |||
| 723 | 761 | ||
| 724 | /** | 762 | /** |
| 725 | * clocksource_change_rating - Change the rating of a registered clocksource | 763 | * clocksource_change_rating - Change the rating of a registered clocksource |
| 764 | * @cs: clocksource to be changed | ||
| 765 | * @rating: new rating | ||
| 726 | */ | 766 | */ |
| 727 | void clocksource_change_rating(struct clocksource *cs, int rating) | 767 | void clocksource_change_rating(struct clocksource *cs, int rating) |
| 728 | { | 768 | { |
| @@ -734,6 +774,7 @@ EXPORT_SYMBOL(clocksource_change_rating); | |||
| 734 | 774 | ||
| 735 | /** | 775 | /** |
| 736 | * clocksource_unregister - remove a registered clocksource | 776 | * clocksource_unregister - remove a registered clocksource |
| 777 | * @cs: clocksource to be unregistered | ||
| 737 | */ | 778 | */ |
| 738 | void clocksource_unregister(struct clocksource *cs) | 779 | void clocksource_unregister(struct clocksource *cs) |
| 739 | { | 780 | { |
| @@ -749,13 +790,14 @@ EXPORT_SYMBOL(clocksource_unregister); | |||
| 749 | /** | 790 | /** |
| 750 | * sysfs_show_current_clocksources - sysfs interface for current clocksource | 791 | * sysfs_show_current_clocksources - sysfs interface for current clocksource |
| 751 | * @dev: unused | 792 | * @dev: unused |
| 793 | * @attr: unused | ||
| 752 | * @buf: char buffer to be filled with clocksource list | 794 | * @buf: char buffer to be filled with clocksource list |
| 753 | * | 795 | * |
| 754 | * Provides sysfs interface for listing current clocksource. | 796 | * Provides sysfs interface for listing current clocksource. |
| 755 | */ | 797 | */ |
| 756 | static ssize_t | 798 | static ssize_t |
| 757 | sysfs_show_current_clocksources(struct sys_device *dev, | 799 | sysfs_show_current_clocksources(struct device *dev, |
| 758 | struct sysdev_attribute *attr, char *buf) | 800 | struct device_attribute *attr, char *buf) |
| 759 | { | 801 | { |
| 760 | ssize_t count = 0; | 802 | ssize_t count = 0; |
| 761 | 803 | ||
| @@ -769,14 +811,15 @@ sysfs_show_current_clocksources(struct sys_device *dev, | |||
| 769 | /** | 811 | /** |
| 770 | * sysfs_override_clocksource - interface for manually overriding clocksource | 812 | * sysfs_override_clocksource - interface for manually overriding clocksource |
| 771 | * @dev: unused | 813 | * @dev: unused |
| 814 | * @attr: unused | ||
| 772 | * @buf: name of override clocksource | 815 | * @buf: name of override clocksource |
| 773 | * @count: length of buffer | 816 | * @count: length of buffer |
| 774 | * | 817 | * |
| 775 | * Takes input from sysfs interface for manually overriding the default | 818 | * Takes input from sysfs interface for manually overriding the default |
| 776 | * clocksource selection. | 819 | * clocksource selection. |
| 777 | */ | 820 | */ |
| 778 | static ssize_t sysfs_override_clocksource(struct sys_device *dev, | 821 | static ssize_t sysfs_override_clocksource(struct device *dev, |
| 779 | struct sysdev_attribute *attr, | 822 | struct device_attribute *attr, |
| 780 | const char *buf, size_t count) | 823 | const char *buf, size_t count) |
| 781 | { | 824 | { |
| 782 | size_t ret = count; | 825 | size_t ret = count; |
| @@ -804,13 +847,14 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, | |||
| 804 | /** | 847 | /** |
| 805 | * sysfs_show_available_clocksources - sysfs interface for listing clocksource | 848 | * sysfs_show_available_clocksources - sysfs interface for listing clocksource |
| 806 | * @dev: unused | 849 | * @dev: unused |
| 850 | * @attr: unused | ||
| 807 | * @buf: char buffer to be filled with clocksource list | 851 | * @buf: char buffer to be filled with clocksource list |
| 808 | * | 852 | * |
| 809 | * Provides sysfs interface for listing registered clocksources | 853 | * Provides sysfs interface for listing registered clocksources |
| 810 | */ | 854 | */ |
| 811 | static ssize_t | 855 | static ssize_t |
| 812 | sysfs_show_available_clocksources(struct sys_device *dev, | 856 | sysfs_show_available_clocksources(struct device *dev, |
| 813 | struct sysdev_attribute *attr, | 857 | struct device_attribute *attr, |
| 814 | char *buf) | 858 | char *buf) |
| 815 | { | 859 | { |
| 816 | struct clocksource *src; | 860 | struct clocksource *src; |
| @@ -839,35 +883,36 @@ sysfs_show_available_clocksources(struct sys_device *dev, | |||
| 839 | /* | 883 | /* |
| 840 | * Sysfs setup bits: | 884 | * Sysfs setup bits: |
| 841 | */ | 885 | */ |
| 842 | static SYSDEV_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, | 886 | static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, |
| 843 | sysfs_override_clocksource); | 887 | sysfs_override_clocksource); |
| 844 | 888 | ||
| 845 | static SYSDEV_ATTR(available_clocksource, 0444, | 889 | static DEVICE_ATTR(available_clocksource, 0444, |
| 846 | sysfs_show_available_clocksources, NULL); | 890 | sysfs_show_available_clocksources, NULL); |
| 847 | 891 | ||
| 848 | static struct sysdev_class clocksource_sysclass = { | 892 | static struct bus_type clocksource_subsys = { |
| 849 | .name = "clocksource", | 893 | .name = "clocksource", |
| 894 | .dev_name = "clocksource", | ||
| 850 | }; | 895 | }; |
| 851 | 896 | ||
| 852 | static struct sys_device device_clocksource = { | 897 | static struct device device_clocksource = { |
| 853 | .id = 0, | 898 | .id = 0, |
| 854 | .cls = &clocksource_sysclass, | 899 | .bus = &clocksource_subsys, |
| 855 | }; | 900 | }; |
| 856 | 901 | ||
| 857 | static int __init init_clocksource_sysfs(void) | 902 | static int __init init_clocksource_sysfs(void) |
| 858 | { | 903 | { |
| 859 | int error = sysdev_class_register(&clocksource_sysclass); | 904 | int error = subsys_system_register(&clocksource_subsys, NULL); |
| 860 | 905 | ||
| 861 | if (!error) | 906 | if (!error) |
| 862 | error = sysdev_register(&device_clocksource); | 907 | error = device_register(&device_clocksource); |
| 863 | if (!error) | 908 | if (!error) |
| 864 | error = sysdev_create_file( | 909 | error = device_create_file( |
| 865 | &device_clocksource, | 910 | &device_clocksource, |
| 866 | &attr_current_clocksource); | 911 | &dev_attr_current_clocksource); |
| 867 | if (!error) | 912 | if (!error) |
| 868 | error = sysdev_create_file( | 913 | error = device_create_file( |
| 869 | &device_clocksource, | 914 | &device_clocksource, |
| 870 | &attr_available_clocksource); | 915 | &dev_attr_available_clocksource); |
| 871 | return error; | 916 | return error; |
| 872 | } | 917 | } |
| 873 | 918 | ||
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index f954282d9a82..fd4a7b1625a2 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
| @@ -71,7 +71,7 @@ int tick_check_broadcast_device(struct clock_event_device *dev) | |||
| 71 | (dev->features & CLOCK_EVT_FEAT_C3STOP)) | 71 | (dev->features & CLOCK_EVT_FEAT_C3STOP)) |
| 72 | return 0; | 72 | return 0; |
| 73 | 73 | ||
| 74 | clockevents_exchange_device(NULL, dev); | 74 | clockevents_exchange_device(tick_broadcast_device.evtdev, dev); |
| 75 | tick_broadcast_device.evtdev = dev; | 75 | tick_broadcast_device.evtdev = dev; |
| 76 | if (!cpumask_empty(tick_get_broadcast_mask())) | 76 | if (!cpumask_empty(tick_get_broadcast_mask())) |
| 77 | tick_broadcast_start_periodic(dev); | 77 | tick_broadcast_start_periodic(dev); |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 40420644d0ba..7656642e4b8e 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
| @@ -275,42 +275,17 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) | |||
| 275 | } | 275 | } |
| 276 | EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); | 276 | EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); |
| 277 | 277 | ||
| 278 | /** | 278 | static void tick_nohz_stop_sched_tick(struct tick_sched *ts) |
| 279 | * tick_nohz_stop_sched_tick - stop the idle tick from the idle task | ||
| 280 | * | ||
| 281 | * When the next event is more than a tick into the future, stop the idle tick | ||
| 282 | * Called either from the idle loop or from irq_exit() when an idle period was | ||
| 283 | * just interrupted by an interrupt which did not cause a reschedule. | ||
| 284 | */ | ||
| 285 | void tick_nohz_stop_sched_tick(int inidle) | ||
| 286 | { | 279 | { |
| 287 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; | 280 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; |
| 288 | struct tick_sched *ts; | ||
| 289 | ktime_t last_update, expires, now; | 281 | ktime_t last_update, expires, now; |
| 290 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | 282 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; |
| 291 | u64 time_delta; | 283 | u64 time_delta; |
| 292 | int cpu; | 284 | int cpu; |
| 293 | 285 | ||
| 294 | local_irq_save(flags); | ||
| 295 | |||
| 296 | cpu = smp_processor_id(); | 286 | cpu = smp_processor_id(); |
| 297 | ts = &per_cpu(tick_cpu_sched, cpu); | 287 | ts = &per_cpu(tick_cpu_sched, cpu); |
| 298 | 288 | ||
| 299 | /* | ||
| 300 | * Call to tick_nohz_start_idle stops the last_update_time from being | ||
| 301 | * updated. Thus, it must not be called in the event we are called from | ||
| 302 | * irq_exit() with the prior state different than idle. | ||
| 303 | */ | ||
| 304 | if (!inidle && !ts->inidle) | ||
| 305 | goto end; | ||
| 306 | |||
| 307 | /* | ||
| 308 | * Set ts->inidle unconditionally. Even if the system did not | ||
| 309 | * switch to NOHZ mode the cpu frequency governers rely on the | ||
| 310 | * update of the idle time accounting in tick_nohz_start_idle(). | ||
| 311 | */ | ||
| 312 | ts->inidle = 1; | ||
| 313 | |||
| 314 | now = tick_nohz_start_idle(cpu, ts); | 289 | now = tick_nohz_start_idle(cpu, ts); |
| 315 | 290 | ||
| 316 | /* | 291 | /* |
| @@ -326,10 +301,10 @@ void tick_nohz_stop_sched_tick(int inidle) | |||
| 326 | } | 301 | } |
| 327 | 302 | ||
| 328 | if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) | 303 | if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) |
| 329 | goto end; | 304 | return; |
| 330 | 305 | ||
| 331 | if (need_resched()) | 306 | if (need_resched()) |
| 332 | goto end; | 307 | return; |
| 333 | 308 | ||
| 334 | if (unlikely(local_softirq_pending() && cpu_online(cpu))) { | 309 | if (unlikely(local_softirq_pending() && cpu_online(cpu))) { |
| 335 | static int ratelimit; | 310 | static int ratelimit; |
| @@ -339,7 +314,7 @@ void tick_nohz_stop_sched_tick(int inidle) | |||
| 339 | (unsigned int) local_softirq_pending()); | 314 | (unsigned int) local_softirq_pending()); |
| 340 | ratelimit++; | 315 | ratelimit++; |
| 341 | } | 316 | } |
| 342 | goto end; | 317 | return; |
| 343 | } | 318 | } |
| 344 | 319 | ||
| 345 | ts->idle_calls++; | 320 | ts->idle_calls++; |
| @@ -434,7 +409,6 @@ void tick_nohz_stop_sched_tick(int inidle) | |||
| 434 | ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); | 409 | ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); |
| 435 | ts->tick_stopped = 1; | 410 | ts->tick_stopped = 1; |
| 436 | ts->idle_jiffies = last_jiffies; | 411 | ts->idle_jiffies = last_jiffies; |
| 437 | rcu_enter_nohz(); | ||
| 438 | } | 412 | } |
| 439 | 413 | ||
| 440 | ts->idle_sleeps++; | 414 | ts->idle_sleeps++; |
| @@ -472,8 +446,64 @@ out: | |||
| 472 | ts->next_jiffies = next_jiffies; | 446 | ts->next_jiffies = next_jiffies; |
| 473 | ts->last_jiffies = last_jiffies; | 447 | ts->last_jiffies = last_jiffies; |
| 474 | ts->sleep_length = ktime_sub(dev->next_event, now); | 448 | ts->sleep_length = ktime_sub(dev->next_event, now); |
| 475 | end: | 449 | } |
| 476 | local_irq_restore(flags); | 450 | |
| 451 | /** | ||
| 452 | * tick_nohz_idle_enter - stop the idle tick from the idle task | ||
| 453 | * | ||
| 454 | * When the next event is more than a tick into the future, stop the idle tick | ||
| 455 | * Called when we start the idle loop. | ||
| 456 | * | ||
| 457 | * The arch is responsible for calling: | ||
| 458 | * | ||
| 459 | * - rcu_idle_enter() after its last use of RCU before the CPU is put | ||
| 460 | * to sleep. | ||
| 461 | * - rcu_idle_exit() before the first use of RCU after the CPU is woken up. | ||
| 462 | */ | ||
| 463 | void tick_nohz_idle_enter(void) | ||
| 464 | { | ||
| 465 | struct tick_sched *ts; | ||
| 466 | |||
| 467 | WARN_ON_ONCE(irqs_disabled()); | ||
| 468 | |||
| 469 | /* | ||
| 470 | * Update the idle state in the scheduler domain hierarchy | ||
| 471 | * when tick_nohz_stop_sched_tick() is called from the idle loop. | ||
| 472 | * State will be updated to busy during the first busy tick after | ||
| 473 | * exiting idle. | ||
| 474 | */ | ||
| 475 | set_cpu_sd_state_idle(); | ||
| 476 | |||
| 477 | local_irq_disable(); | ||
| 478 | |||
| 479 | ts = &__get_cpu_var(tick_cpu_sched); | ||
| 480 | /* | ||
| 481 | * set ts->inidle unconditionally. even if the system did not | ||
| 482 | * switch to nohz mode the cpu frequency governers rely on the | ||
| 483 | * update of the idle time accounting in tick_nohz_start_idle(). | ||
| 484 | */ | ||
| 485 | ts->inidle = 1; | ||
| 486 | tick_nohz_stop_sched_tick(ts); | ||
| 487 | |||
| 488 | local_irq_enable(); | ||
| 489 | } | ||
| 490 | |||
| 491 | /** | ||
| 492 | * tick_nohz_irq_exit - update next tick event from interrupt exit | ||
| 493 | * | ||
| 494 | * When an interrupt fires while we are idle and it doesn't cause | ||
| 495 | * a reschedule, it may still add, modify or delete a timer, enqueue | ||
| 496 | * an RCU callback, etc... | ||
| 497 | * So we need to re-calculate and reprogram the next tick event. | ||
| 498 | */ | ||
| 499 | void tick_nohz_irq_exit(void) | ||
| 500 | { | ||
| 501 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | ||
| 502 | |||
| 503 | if (!ts->inidle) | ||
| 504 | return; | ||
| 505 | |||
| 506 | tick_nohz_stop_sched_tick(ts); | ||
| 477 | } | 507 | } |
| 478 | 508 | ||
| 479 | /** | 509 | /** |
| @@ -515,11 +545,13 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) | |||
| 515 | } | 545 | } |
| 516 | 546 | ||
| 517 | /** | 547 | /** |
| 518 | * tick_nohz_restart_sched_tick - restart the idle tick from the idle task | 548 | * tick_nohz_idle_exit - restart the idle tick from the idle task |
| 519 | * | 549 | * |
| 520 | * Restart the idle tick when the CPU is woken up from idle | 550 | * Restart the idle tick when the CPU is woken up from idle |
| 551 | * This also exits the RCU extended quiescent state. The CPU | ||
| 552 | * can use RCU again after this function is called. | ||
| 521 | */ | 553 | */ |
| 522 | void tick_nohz_restart_sched_tick(void) | 554 | void tick_nohz_idle_exit(void) |
| 523 | { | 555 | { |
| 524 | int cpu = smp_processor_id(); | 556 | int cpu = smp_processor_id(); |
| 525 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | 557 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); |
| @@ -529,6 +561,7 @@ void tick_nohz_restart_sched_tick(void) | |||
| 529 | ktime_t now; | 561 | ktime_t now; |
| 530 | 562 | ||
| 531 | local_irq_disable(); | 563 | local_irq_disable(); |
| 564 | |||
| 532 | if (ts->idle_active || (ts->inidle && ts->tick_stopped)) | 565 | if (ts->idle_active || (ts->inidle && ts->tick_stopped)) |
| 533 | now = ktime_get(); | 566 | now = ktime_get(); |
| 534 | 567 | ||
| @@ -543,8 +576,6 @@ void tick_nohz_restart_sched_tick(void) | |||
| 543 | 576 | ||
| 544 | ts->inidle = 0; | 577 | ts->inidle = 0; |
| 545 | 578 | ||
| 546 | rcu_exit_nohz(); | ||
| 547 | |||
| 548 | /* Update jiffies first */ | 579 | /* Update jiffies first */ |
| 549 | select_nohz_load_balancer(0); | 580 | select_nohz_load_balancer(0); |
| 550 | tick_do_update_jiffies64(now); | 581 | tick_do_update_jiffies64(now); |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 169479994755..e6a5a6bc2769 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
| @@ -131,7 +131,7 @@ static inline s64 timekeeping_get_ns_raw(void) | |||
| 131 | /* calculate the delta since the last update_wall_time: */ | 131 | /* calculate the delta since the last update_wall_time: */ |
| 132 | cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; | 132 | cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; |
| 133 | 133 | ||
| 134 | /* return delta convert to nanoseconds using ntp adjusted mult. */ | 134 | /* return delta convert to nanoseconds. */ |
| 135 | return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); | 135 | return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); |
| 136 | } | 136 | } |
| 137 | 137 | ||
| @@ -249,6 +249,8 @@ ktime_t ktime_get(void) | |||
| 249 | secs = xtime.tv_sec + wall_to_monotonic.tv_sec; | 249 | secs = xtime.tv_sec + wall_to_monotonic.tv_sec; |
| 250 | nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; | 250 | nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; |
| 251 | nsecs += timekeeping_get_ns(); | 251 | nsecs += timekeeping_get_ns(); |
| 252 | /* If arch requires, add in gettimeoffset() */ | ||
| 253 | nsecs += arch_gettimeoffset(); | ||
| 252 | 254 | ||
| 253 | } while (read_seqretry(&xtime_lock, seq)); | 255 | } while (read_seqretry(&xtime_lock, seq)); |
| 254 | /* | 256 | /* |
| @@ -280,6 +282,8 @@ void ktime_get_ts(struct timespec *ts) | |||
| 280 | *ts = xtime; | 282 | *ts = xtime; |
| 281 | tomono = wall_to_monotonic; | 283 | tomono = wall_to_monotonic; |
| 282 | nsecs = timekeeping_get_ns(); | 284 | nsecs = timekeeping_get_ns(); |
| 285 | /* If arch requires, add in gettimeoffset() */ | ||
| 286 | nsecs += arch_gettimeoffset(); | ||
| 283 | 287 | ||
| 284 | } while (read_seqretry(&xtime_lock, seq)); | 288 | } while (read_seqretry(&xtime_lock, seq)); |
| 285 | 289 | ||
| @@ -802,14 +806,44 @@ static void timekeeping_adjust(s64 offset) | |||
| 802 | s64 error, interval = timekeeper.cycle_interval; | 806 | s64 error, interval = timekeeper.cycle_interval; |
| 803 | int adj; | 807 | int adj; |
| 804 | 808 | ||
| 809 | /* | ||
| 810 | * The point of this is to check if the error is greater than half | ||
| 811 | * an interval. | ||
| 812 | * | ||
| 813 | * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. | ||
| 814 | * | ||
| 815 | * Note we subtract one in the shift, so that error is really error*2. | ||
| 816 | * This "saves" dividing(shifting) interval twice, but keeps the | ||
| 817 | * (error > interval) comparison as still measuring if error is | ||
| 818 | * larger than half an interval. | ||
| 819 | * | ||
| 820 | * Note: It does not "save" on aggravation when reading the code. | ||
| 821 | */ | ||
| 805 | error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); | 822 | error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); |
| 806 | if (error > interval) { | 823 | if (error > interval) { |
| 824 | /* | ||
| 825 | * We now divide error by 4(via shift), which checks if | ||
| 826 | * the error is greater than twice the interval. | ||
| 827 | * If it is greater, we need a bigadjust, if it's smaller, | ||
| 828 | * we can adjust by 1. | ||
| 829 | */ | ||
| 807 | error >>= 2; | 830 | error >>= 2; |
| 831 | /* | ||
| 832 | * XXX - In update_wall_time, we round up to the next | ||
| 833 | * nanosecond, and store the amount rounded up into | ||
| 834 | * the error. This causes the likely below to be unlikely. | ||
| 835 | * | ||
| 836 | * The proper fix is to avoid rounding up by using | ||
| 837 | * the high precision timekeeper.xtime_nsec instead of | ||
| 838 | * xtime.tv_nsec everywhere. Fixing this will take some | ||
| 839 | * time. | ||
| 840 | */ | ||
| 808 | if (likely(error <= interval)) | 841 | if (likely(error <= interval)) |
| 809 | adj = 1; | 842 | adj = 1; |
| 810 | else | 843 | else |
| 811 | adj = timekeeping_bigadjust(error, &interval, &offset); | 844 | adj = timekeeping_bigadjust(error, &interval, &offset); |
| 812 | } else if (error < -interval) { | 845 | } else if (error < -interval) { |
| 846 | /* See comment above, this is just switched for the negative */ | ||
| 813 | error >>= 2; | 847 | error >>= 2; |
| 814 | if (likely(error >= -interval)) { | 848 | if (likely(error >= -interval)) { |
| 815 | adj = -1; | 849 | adj = -1; |
| @@ -817,9 +851,65 @@ static void timekeeping_adjust(s64 offset) | |||
| 817 | offset = -offset; | 851 | offset = -offset; |
| 818 | } else | 852 | } else |
| 819 | adj = timekeeping_bigadjust(error, &interval, &offset); | 853 | adj = timekeeping_bigadjust(error, &interval, &offset); |
| 820 | } else | 854 | } else /* No adjustment needed */ |
| 821 | return; | 855 | return; |
| 822 | 856 | ||
| 857 | WARN_ONCE(timekeeper.clock->maxadj && | ||
| 858 | (timekeeper.mult + adj > timekeeper.clock->mult + | ||
| 859 | timekeeper.clock->maxadj), | ||
| 860 | "Adjusting %s more then 11%% (%ld vs %ld)\n", | ||
| 861 | timekeeper.clock->name, (long)timekeeper.mult + adj, | ||
| 862 | (long)timekeeper.clock->mult + | ||
| 863 | timekeeper.clock->maxadj); | ||
| 864 | /* | ||
| 865 | * So the following can be confusing. | ||
| 866 | * | ||
| 867 | * To keep things simple, let's assume adj == 1 for now. | ||
| 868 | * | ||
| 869 | * When adj != 1, remember that the interval and offset values | ||
| 870 | * have been appropriately scaled so the math is the same. | ||
| 871 | * | ||
| 872 | * The basic idea here is that we're increasing the multiplier | ||
| 873 | * by one, this causes the xtime_interval to be incremented by | ||
| 874 | * one cycle_interval. This is because: | ||
| 875 | * xtime_interval = cycle_interval * mult | ||
| 876 | * So if mult is being incremented by one: | ||
| 877 | * xtime_interval = cycle_interval * (mult + 1) | ||
| 878 | * It's the same as: | ||
| 879 | * xtime_interval = (cycle_interval * mult) + cycle_interval | ||
| 880 | * Which can be shortened to: | ||
| 881 | * xtime_interval += cycle_interval | ||
| 882 | * | ||
| 883 | * So offset stores the non-accumulated cycles. Thus the current | ||
| 884 | * time (in shifted nanoseconds) is: | ||
| 885 | * now = (offset * adj) + xtime_nsec | ||
| 886 | * Now, even though we're adjusting the clock frequency, we have | ||
| 887 | * to keep time consistent. In other words, we can't jump back | ||
| 888 | * in time, and we also want to avoid jumping forward in time. | ||
| 889 | * | ||
| 890 | * So given the same offset value, we need the time to be the same | ||
| 891 | * both before and after the freq adjustment. | ||
| 892 | * now = (offset * adj_1) + xtime_nsec_1 | ||
| 893 | * now = (offset * adj_2) + xtime_nsec_2 | ||
| 894 | * So: | ||
| 895 | * (offset * adj_1) + xtime_nsec_1 = | ||
| 896 | * (offset * adj_2) + xtime_nsec_2 | ||
| 897 | * And we know: | ||
| 898 | * adj_2 = adj_1 + 1 | ||
| 899 | * So: | ||
| 900 | * (offset * adj_1) + xtime_nsec_1 = | ||
| 901 | * (offset * (adj_1+1)) + xtime_nsec_2 | ||
| 902 | * (offset * adj_1) + xtime_nsec_1 = | ||
| 903 | * (offset * adj_1) + offset + xtime_nsec_2 | ||
| 904 | * Canceling the sides: | ||
| 905 | * xtime_nsec_1 = offset + xtime_nsec_2 | ||
| 906 | * Which gives us: | ||
| 907 | * xtime_nsec_2 = xtime_nsec_1 - offset | ||
| 908 | * Which simplifies to: | ||
| 909 | * xtime_nsec -= offset | ||
| 910 | * | ||
| 911 | * XXX - TODO: Doc ntp_error calculation. | ||
| 912 | */ | ||
| 823 | timekeeper.mult += adj; | 913 | timekeeper.mult += adj; |
| 824 | timekeeper.xtime_interval += interval; | 914 | timekeeper.xtime_interval += interval; |
| 825 | timekeeper.xtime_nsec -= offset; | 915 | timekeeper.xtime_nsec -= offset; |
