aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2008-10-20 16:19:56 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2008-10-20 16:19:56 -0400
commit99ebcf8285df28f32fd2d1c19a7166e70f00309c (patch)
treecaf45f39a77026b2fae2413c145067a1e5164701 /kernel
parent72558dde738b06cc01e16b3247a9659ca739e22d (diff)
parentc465a76af658b443075d6efee1c3131257643020 (diff)
Merge branch 'v28-timers-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'v28-timers-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (36 commits) fix documentation of sysrq-q really Fix documentation of sysrq-q timer_list: add base address to clock base timer_list: print cpu number of clockevents device timer_list: print real timer address NOHZ: restart tick device from irq_enter() NOHZ: split tick_nohz_restart_sched_tick() NOHZ: unify the nohz function calls in irq_enter() timers: fix itimer/many thread hang, fix timers: fix itimer/many thread hang, v3 ntp: improve adjtimex frequency rounding timekeeping: fix rounding problem during clock update ntp: let update_persistent_clock() sleep hrtimer: reorder struct hrtimer to save 8 bytes on 64bit builds posix-timers: lock_timer: make it readable posix-timers: lock_timer: kill the bogus ->it_id check posix-timers: kill ->it_sigev_signo and ->it_sigev_value posix-timers: sys_timer_create: cleanup the error handling posix-timers: move the initialization of timer->sigq from send to create path posix-timers: sys_timer_create: simplify and s/tasklist/rcu/ ... Fix trivial conflicts due to sysrq-q description clahes in Documentation/sysrq.txt and drivers/char/sysrq.c
Diffstat (limited to 'kernel')
-rw-r--r--kernel/compat.c53
-rw-r--r--kernel/exit.c19
-rw-r--r--kernel/fork.c92
-rw-r--r--kernel/hrtimer.c15
-rw-r--r--kernel/itimer.c33
-rw-r--r--kernel/posix-cpu-timers.c512
-rw-r--r--kernel/posix-timers.c153
-rw-r--r--kernel/sched.c19
-rw-r--r--kernel/sched_fair.c1
-rw-r--r--kernel/sched_rt.c4
-rw-r--r--kernel/sched_stats.h86
-rw-r--r--kernel/signal.c8
-rw-r--r--kernel/softirq.c10
-rw-r--r--kernel/sys.c75
-rw-r--r--kernel/time/clocksource.c3
-rw-r--r--kernel/time/jiffies.c1
-rw-r--r--kernel/time/ntp.c93
-rw-r--r--kernel/time/tick-broadcast.c13
-rw-r--r--kernel/time/tick-internal.h2
-rw-r--r--kernel/time/tick-sched.c93
-rw-r--r--kernel/time/timekeeping.c122
-rw-r--r--kernel/time/timer_list.c20
-rw-r--r--kernel/timer.c11
23 files changed, 778 insertions, 660 deletions
diff --git a/kernel/compat.c b/kernel/compat.c
index 143990e48cb9..8eafe3eb50d9 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -23,6 +23,7 @@
23#include <linux/timex.h> 23#include <linux/timex.h>
24#include <linux/migrate.h> 24#include <linux/migrate.h>
25#include <linux/posix-timers.h> 25#include <linux/posix-timers.h>
26#include <linux/times.h>
26 27
27#include <asm/uaccess.h> 28#include <asm/uaccess.h>
28 29
@@ -208,49 +209,23 @@ asmlinkage long compat_sys_setitimer(int which,
208 return 0; 209 return 0;
209} 210}
210 211
212static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
213{
214 return compat_jiffies_to_clock_t(clock_t_to_jiffies(x));
215}
216
211asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) 217asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
212{ 218{
213 /*
214 * In the SMP world we might just be unlucky and have one of
215 * the times increment as we use it. Since the value is an
216 * atomically safe type this is just fine. Conceptually its
217 * as if the syscall took an instant longer to occur.
218 */
219 if (tbuf) { 219 if (tbuf) {
220 struct tms tms;
220 struct compat_tms tmp; 221 struct compat_tms tmp;
221 struct task_struct *tsk = current; 222
222 struct task_struct *t; 223 do_sys_times(&tms);
223 cputime_t utime, stime, cutime, cstime; 224 /* Convert our struct tms to the compat version. */
224 225 tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime);
225 read_lock(&tasklist_lock); 226 tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime);
226 utime = tsk->signal->utime; 227 tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime);
227 stime = tsk->signal->stime; 228 tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime);
228 t = tsk;
229 do {
230 utime = cputime_add(utime, t->utime);
231 stime = cputime_add(stime, t->stime);
232 t = next_thread(t);
233 } while (t != tsk);
234
235 /*
236 * While we have tasklist_lock read-locked, no dying thread
237 * can be updating current->signal->[us]time. Instead,
238 * we got their counts included in the live thread loop.
239 * However, another thread can come in right now and
240 * do a wait call that updates current->signal->c[us]time.
241 * To make sure we always see that pair updated atomically,
242 * we take the siglock around fetching them.
243 */
244 spin_lock_irq(&tsk->sighand->siglock);
245 cutime = tsk->signal->cutime;
246 cstime = tsk->signal->cstime;
247 spin_unlock_irq(&tsk->sighand->siglock);
248 read_unlock(&tasklist_lock);
249
250 tmp.tms_utime = compat_jiffies_to_clock_t(cputime_to_jiffies(utime));
251 tmp.tms_stime = compat_jiffies_to_clock_t(cputime_to_jiffies(stime));
252 tmp.tms_cutime = compat_jiffies_to_clock_t(cputime_to_jiffies(cutime));
253 tmp.tms_cstime = compat_jiffies_to_clock_t(cputime_to_jiffies(cstime));
254 if (copy_to_user(tbuf, &tmp, sizeof(tmp))) 229 if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
255 return -EFAULT; 230 return -EFAULT;
256 } 231 }
diff --git a/kernel/exit.c b/kernel/exit.c
index 0ef4673e351b..059b38cae384 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -112,8 +112,6 @@ static void __exit_signal(struct task_struct *tsk)
112 * We won't ever get here for the group leader, since it 112 * We won't ever get here for the group leader, since it
113 * will have been the last reference on the signal_struct. 113 * will have been the last reference on the signal_struct.
114 */ 114 */
115 sig->utime = cputime_add(sig->utime, task_utime(tsk));
116 sig->stime = cputime_add(sig->stime, task_stime(tsk));
117 sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); 115 sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
118 sig->min_flt += tsk->min_flt; 116 sig->min_flt += tsk->min_flt;
119 sig->maj_flt += tsk->maj_flt; 117 sig->maj_flt += tsk->maj_flt;
@@ -122,7 +120,6 @@ static void __exit_signal(struct task_struct *tsk)
122 sig->inblock += task_io_get_inblock(tsk); 120 sig->inblock += task_io_get_inblock(tsk);
123 sig->oublock += task_io_get_oublock(tsk); 121 sig->oublock += task_io_get_oublock(tsk);
124 task_io_accounting_add(&sig->ioac, &tsk->ioac); 122 task_io_accounting_add(&sig->ioac, &tsk->ioac);
125 sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
126 sig = NULL; /* Marker for below. */ 123 sig = NULL; /* Marker for below. */
127 } 124 }
128 125
@@ -1301,6 +1298,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
1301 if (likely(!traced)) { 1298 if (likely(!traced)) {
1302 struct signal_struct *psig; 1299 struct signal_struct *psig;
1303 struct signal_struct *sig; 1300 struct signal_struct *sig;
1301 struct task_cputime cputime;
1304 1302
1305 /* 1303 /*
1306 * The resource counters for the group leader are in its 1304 * The resource counters for the group leader are in its
@@ -1316,20 +1314,23 @@ static int wait_task_zombie(struct task_struct *p, int options,
1316 * need to protect the access to p->parent->signal fields, 1314 * need to protect the access to p->parent->signal fields,
1317 * as other threads in the parent group can be right 1315 * as other threads in the parent group can be right
1318 * here reaping other children at the same time. 1316 * here reaping other children at the same time.
1317 *
1318 * We use thread_group_cputime() to get times for the thread
1319 * group, which consolidates times for all threads in the
1320 * group including the group leader.
1319 */ 1321 */
1320 spin_lock_irq(&p->parent->sighand->siglock); 1322 spin_lock_irq(&p->parent->sighand->siglock);
1321 psig = p->parent->signal; 1323 psig = p->parent->signal;
1322 sig = p->signal; 1324 sig = p->signal;
1325 thread_group_cputime(p, &cputime);
1323 psig->cutime = 1326 psig->cutime =
1324 cputime_add(psig->cutime, 1327 cputime_add(psig->cutime,
1325 cputime_add(p->utime, 1328 cputime_add(cputime.utime,
1326 cputime_add(sig->utime, 1329 sig->cutime));
1327 sig->cutime)));
1328 psig->cstime = 1330 psig->cstime =
1329 cputime_add(psig->cstime, 1331 cputime_add(psig->cstime,
1330 cputime_add(p->stime, 1332 cputime_add(cputime.stime,
1331 cputime_add(sig->stime, 1333 sig->cstime));
1332 sig->cstime)));
1333 psig->cgtime = 1334 psig->cgtime =
1334 cputime_add(psig->cgtime, 1335 cputime_add(psig->cgtime,
1335 cputime_add(p->gtime, 1336 cputime_add(p->gtime,
diff --git a/kernel/fork.c b/kernel/fork.c
index 30de644a40c4..44e64d7ba29b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -759,15 +759,44 @@ void __cleanup_sighand(struct sighand_struct *sighand)
759 kmem_cache_free(sighand_cachep, sighand); 759 kmem_cache_free(sighand_cachep, sighand);
760} 760}
761 761
762
763/*
764 * Initialize POSIX timer handling for a thread group.
765 */
766static void posix_cpu_timers_init_group(struct signal_struct *sig)
767{
768 /* Thread group counters. */
769 thread_group_cputime_init(sig);
770
771 /* Expiration times and increments. */
772 sig->it_virt_expires = cputime_zero;
773 sig->it_virt_incr = cputime_zero;
774 sig->it_prof_expires = cputime_zero;
775 sig->it_prof_incr = cputime_zero;
776
777 /* Cached expiration times. */
778 sig->cputime_expires.prof_exp = cputime_zero;
779 sig->cputime_expires.virt_exp = cputime_zero;
780 sig->cputime_expires.sched_exp = 0;
781
782 /* The timer lists. */
783 INIT_LIST_HEAD(&sig->cpu_timers[0]);
784 INIT_LIST_HEAD(&sig->cpu_timers[1]);
785 INIT_LIST_HEAD(&sig->cpu_timers[2]);
786}
787
762static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) 788static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
763{ 789{
764 struct signal_struct *sig; 790 struct signal_struct *sig;
765 int ret; 791 int ret;
766 792
767 if (clone_flags & CLONE_THREAD) { 793 if (clone_flags & CLONE_THREAD) {
768 atomic_inc(&current->signal->count); 794 ret = thread_group_cputime_clone_thread(current);
769 atomic_inc(&current->signal->live); 795 if (likely(!ret)) {
770 return 0; 796 atomic_inc(&current->signal->count);
797 atomic_inc(&current->signal->live);
798 }
799 return ret;
771 } 800 }
772 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); 801 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
773 tsk->signal = sig; 802 tsk->signal = sig;
@@ -795,40 +824,25 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
795 sig->it_real_incr.tv64 = 0; 824 sig->it_real_incr.tv64 = 0;
796 sig->real_timer.function = it_real_fn; 825 sig->real_timer.function = it_real_fn;
797 826
798 sig->it_virt_expires = cputime_zero;
799 sig->it_virt_incr = cputime_zero;
800 sig->it_prof_expires = cputime_zero;
801 sig->it_prof_incr = cputime_zero;
802
803 sig->leader = 0; /* session leadership doesn't inherit */ 827 sig->leader = 0; /* session leadership doesn't inherit */
804 sig->tty_old_pgrp = NULL; 828 sig->tty_old_pgrp = NULL;
805 sig->tty = NULL; 829 sig->tty = NULL;
806 830
807 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; 831 sig->cutime = sig->cstime = cputime_zero;
808 sig->gtime = cputime_zero; 832 sig->gtime = cputime_zero;
809 sig->cgtime = cputime_zero; 833 sig->cgtime = cputime_zero;
810 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; 834 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
811 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; 835 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
812 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; 836 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
813 task_io_accounting_init(&sig->ioac); 837 task_io_accounting_init(&sig->ioac);
814 sig->sum_sched_runtime = 0;
815 INIT_LIST_HEAD(&sig->cpu_timers[0]);
816 INIT_LIST_HEAD(&sig->cpu_timers[1]);
817 INIT_LIST_HEAD(&sig->cpu_timers[2]);
818 taskstats_tgid_init(sig); 838 taskstats_tgid_init(sig);
819 839
820 task_lock(current->group_leader); 840 task_lock(current->group_leader);
821 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); 841 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
822 task_unlock(current->group_leader); 842 task_unlock(current->group_leader);
823 843
824 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { 844 posix_cpu_timers_init_group(sig);
825 /* 845
826 * New sole thread in the process gets an expiry time
827 * of the whole CPU time limit.
828 */
829 tsk->it_prof_expires =
830 secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
831 }
832 acct_init_pacct(&sig->pacct); 846 acct_init_pacct(&sig->pacct);
833 847
834 tty_audit_fork(sig); 848 tty_audit_fork(sig);
@@ -838,6 +852,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
838 852
839void __cleanup_signal(struct signal_struct *sig) 853void __cleanup_signal(struct signal_struct *sig)
840{ 854{
855 thread_group_cputime_free(sig);
841 exit_thread_group_keys(sig); 856 exit_thread_group_keys(sig);
842 tty_kref_put(sig->tty); 857 tty_kref_put(sig->tty);
843 kmem_cache_free(signal_cachep, sig); 858 kmem_cache_free(signal_cachep, sig);
@@ -888,6 +903,19 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
888#endif /* CONFIG_MM_OWNER */ 903#endif /* CONFIG_MM_OWNER */
889 904
890/* 905/*
906 * Initialize POSIX timer handling for a single task.
907 */
908static void posix_cpu_timers_init(struct task_struct *tsk)
909{
910 tsk->cputime_expires.prof_exp = cputime_zero;
911 tsk->cputime_expires.virt_exp = cputime_zero;
912 tsk->cputime_expires.sched_exp = 0;
913 INIT_LIST_HEAD(&tsk->cpu_timers[0]);
914 INIT_LIST_HEAD(&tsk->cpu_timers[1]);
915 INIT_LIST_HEAD(&tsk->cpu_timers[2]);
916}
917
918/*
891 * This creates a new process as a copy of the old one, 919 * This creates a new process as a copy of the old one,
892 * but does not actually start it yet. 920 * but does not actually start it yet.
893 * 921 *
@@ -997,12 +1025,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
997 task_io_accounting_init(&p->ioac); 1025 task_io_accounting_init(&p->ioac);
998 acct_clear_integrals(p); 1026 acct_clear_integrals(p);
999 1027
1000 p->it_virt_expires = cputime_zero; 1028 posix_cpu_timers_init(p);
1001 p->it_prof_expires = cputime_zero;
1002 p->it_sched_expires = 0;
1003 INIT_LIST_HEAD(&p->cpu_timers[0]);
1004 INIT_LIST_HEAD(&p->cpu_timers[1]);
1005 INIT_LIST_HEAD(&p->cpu_timers[2]);
1006 1029
1007 p->lock_depth = -1; /* -1 = no lock */ 1030 p->lock_depth = -1; /* -1 = no lock */
1008 do_posix_clock_monotonic_gettime(&p->start_time); 1031 do_posix_clock_monotonic_gettime(&p->start_time);
@@ -1203,21 +1226,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1203 if (clone_flags & CLONE_THREAD) { 1226 if (clone_flags & CLONE_THREAD) {
1204 p->group_leader = current->group_leader; 1227 p->group_leader = current->group_leader;
1205 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); 1228 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1206
1207 if (!cputime_eq(current->signal->it_virt_expires,
1208 cputime_zero) ||
1209 !cputime_eq(current->signal->it_prof_expires,
1210 cputime_zero) ||
1211 current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY ||
1212 !list_empty(&current->signal->cpu_timers[0]) ||
1213 !list_empty(&current->signal->cpu_timers[1]) ||
1214 !list_empty(&current->signal->cpu_timers[2])) {
1215 /*
1216 * Have child wake up on its first tick to check
1217 * for process CPU timers.
1218 */
1219 p->it_prof_expires = jiffies_to_cputime(1);
1220 }
1221 } 1229 }
1222 1230
1223 if (likely(p->pid)) { 1231 if (likely(p->pid)) {
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index cdec83e722fa..95978f48e039 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1403,9 +1403,7 @@ void hrtimer_run_queues(void)
1403 if (!base->first) 1403 if (!base->first)
1404 continue; 1404 continue;
1405 1405
1406 if (base->get_softirq_time) 1406 if (gettime) {
1407 base->softirq_time = base->get_softirq_time();
1408 else if (gettime) {
1409 hrtimer_get_softirq_time(cpu_base); 1407 hrtimer_get_softirq_time(cpu_base);
1410 gettime = 0; 1408 gettime = 0;
1411 } 1409 }
@@ -1688,9 +1686,11 @@ static void migrate_hrtimers(int cpu)
1688 new_base = &get_cpu_var(hrtimer_bases); 1686 new_base = &get_cpu_var(hrtimer_bases);
1689 1687
1690 tick_cancel_sched_timer(cpu); 1688 tick_cancel_sched_timer(cpu);
1691 1689 /*
1692 local_irq_disable(); 1690 * The caller is globally serialized and nobody else
1693 spin_lock(&new_base->lock); 1691 * takes two locks at once, deadlock is not possible.
1692 */
1693 spin_lock_irq(&new_base->lock);
1694 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); 1694 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1695 1695
1696 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1696 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
@@ -1703,8 +1703,7 @@ static void migrate_hrtimers(int cpu)
1703 raise = 1; 1703 raise = 1;
1704 1704
1705 spin_unlock(&old_base->lock); 1705 spin_unlock(&old_base->lock);
1706 spin_unlock(&new_base->lock); 1706 spin_unlock_irq(&new_base->lock);
1707 local_irq_enable();
1708 put_cpu_var(hrtimer_bases); 1707 put_cpu_var(hrtimer_bases);
1709 1708
1710 if (raise) 1709 if (raise)
diff --git a/kernel/itimer.c b/kernel/itimer.c
index ab982747d9bd..db7c358b9a02 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -55,17 +55,15 @@ int do_getitimer(int which, struct itimerval *value)
55 spin_unlock_irq(&tsk->sighand->siglock); 55 spin_unlock_irq(&tsk->sighand->siglock);
56 break; 56 break;
57 case ITIMER_VIRTUAL: 57 case ITIMER_VIRTUAL:
58 read_lock(&tasklist_lock);
59 spin_lock_irq(&tsk->sighand->siglock); 58 spin_lock_irq(&tsk->sighand->siglock);
60 cval = tsk->signal->it_virt_expires; 59 cval = tsk->signal->it_virt_expires;
61 cinterval = tsk->signal->it_virt_incr; 60 cinterval = tsk->signal->it_virt_incr;
62 if (!cputime_eq(cval, cputime_zero)) { 61 if (!cputime_eq(cval, cputime_zero)) {
63 struct task_struct *t = tsk; 62 struct task_cputime cputime;
64 cputime_t utime = tsk->signal->utime; 63 cputime_t utime;
65 do { 64
66 utime = cputime_add(utime, t->utime); 65 thread_group_cputime(tsk, &cputime);
67 t = next_thread(t); 66 utime = cputime.utime;
68 } while (t != tsk);
69 if (cputime_le(cval, utime)) { /* about to fire */ 67 if (cputime_le(cval, utime)) { /* about to fire */
70 cval = jiffies_to_cputime(1); 68 cval = jiffies_to_cputime(1);
71 } else { 69 } else {
@@ -73,25 +71,19 @@ int do_getitimer(int which, struct itimerval *value)
73 } 71 }
74 } 72 }
75 spin_unlock_irq(&tsk->sighand->siglock); 73 spin_unlock_irq(&tsk->sighand->siglock);
76 read_unlock(&tasklist_lock);
77 cputime_to_timeval(cval, &value->it_value); 74 cputime_to_timeval(cval, &value->it_value);
78 cputime_to_timeval(cinterval, &value->it_interval); 75 cputime_to_timeval(cinterval, &value->it_interval);
79 break; 76 break;
80 case ITIMER_PROF: 77 case ITIMER_PROF:
81 read_lock(&tasklist_lock);
82 spin_lock_irq(&tsk->sighand->siglock); 78 spin_lock_irq(&tsk->sighand->siglock);
83 cval = tsk->signal->it_prof_expires; 79 cval = tsk->signal->it_prof_expires;
84 cinterval = tsk->signal->it_prof_incr; 80 cinterval = tsk->signal->it_prof_incr;
85 if (!cputime_eq(cval, cputime_zero)) { 81 if (!cputime_eq(cval, cputime_zero)) {
86 struct task_struct *t = tsk; 82 struct task_cputime times;
87 cputime_t ptime = cputime_add(tsk->signal->utime, 83 cputime_t ptime;
88 tsk->signal->stime); 84
89 do { 85 thread_group_cputime(tsk, &times);
90 ptime = cputime_add(ptime, 86 ptime = cputime_add(times.utime, times.stime);
91 cputime_add(t->utime,
92 t->stime));
93 t = next_thread(t);
94 } while (t != tsk);
95 if (cputime_le(cval, ptime)) { /* about to fire */ 87 if (cputime_le(cval, ptime)) { /* about to fire */
96 cval = jiffies_to_cputime(1); 88 cval = jiffies_to_cputime(1);
97 } else { 89 } else {
@@ -99,7 +91,6 @@ int do_getitimer(int which, struct itimerval *value)
99 } 91 }
100 } 92 }
101 spin_unlock_irq(&tsk->sighand->siglock); 93 spin_unlock_irq(&tsk->sighand->siglock);
102 read_unlock(&tasklist_lock);
103 cputime_to_timeval(cval, &value->it_value); 94 cputime_to_timeval(cval, &value->it_value);
104 cputime_to_timeval(cinterval, &value->it_interval); 95 cputime_to_timeval(cinterval, &value->it_interval);
105 break; 96 break;
@@ -185,7 +176,6 @@ again:
185 case ITIMER_VIRTUAL: 176 case ITIMER_VIRTUAL:
186 nval = timeval_to_cputime(&value->it_value); 177 nval = timeval_to_cputime(&value->it_value);
187 ninterval = timeval_to_cputime(&value->it_interval); 178 ninterval = timeval_to_cputime(&value->it_interval);
188 read_lock(&tasklist_lock);
189 spin_lock_irq(&tsk->sighand->siglock); 179 spin_lock_irq(&tsk->sighand->siglock);
190 cval = tsk->signal->it_virt_expires; 180 cval = tsk->signal->it_virt_expires;
191 cinterval = tsk->signal->it_virt_incr; 181 cinterval = tsk->signal->it_virt_incr;
@@ -200,7 +190,6 @@ again:
200 tsk->signal->it_virt_expires = nval; 190 tsk->signal->it_virt_expires = nval;
201 tsk->signal->it_virt_incr = ninterval; 191 tsk->signal->it_virt_incr = ninterval;
202 spin_unlock_irq(&tsk->sighand->siglock); 192 spin_unlock_irq(&tsk->sighand->siglock);
203 read_unlock(&tasklist_lock);
204 if (ovalue) { 193 if (ovalue) {
205 cputime_to_timeval(cval, &ovalue->it_value); 194 cputime_to_timeval(cval, &ovalue->it_value);
206 cputime_to_timeval(cinterval, &ovalue->it_interval); 195 cputime_to_timeval(cinterval, &ovalue->it_interval);
@@ -209,7 +198,6 @@ again:
209 case ITIMER_PROF: 198 case ITIMER_PROF:
210 nval = timeval_to_cputime(&value->it_value); 199 nval = timeval_to_cputime(&value->it_value);
211 ninterval = timeval_to_cputime(&value->it_interval); 200 ninterval = timeval_to_cputime(&value->it_interval);
212 read_lock(&tasklist_lock);
213 spin_lock_irq(&tsk->sighand->siglock); 201 spin_lock_irq(&tsk->sighand->siglock);
214 cval = tsk->signal->it_prof_expires; 202 cval = tsk->signal->it_prof_expires;
215 cinterval = tsk->signal->it_prof_incr; 203 cinterval = tsk->signal->it_prof_incr;
@@ -224,7 +212,6 @@ again:
224 tsk->signal->it_prof_expires = nval; 212 tsk->signal->it_prof_expires = nval;
225 tsk->signal->it_prof_incr = ninterval; 213 tsk->signal->it_prof_incr = ninterval;
226 spin_unlock_irq(&tsk->sighand->siglock); 214 spin_unlock_irq(&tsk->sighand->siglock);
227 read_unlock(&tasklist_lock);
228 if (ovalue) { 215 if (ovalue) {
229 cputime_to_timeval(cval, &ovalue->it_value); 216 cputime_to_timeval(cval, &ovalue->it_value);
230 cputime_to_timeval(cinterval, &ovalue->it_interval); 217 cputime_to_timeval(cinterval, &ovalue->it_interval);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index c42a03aef36f..153dcb2639c3 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -7,6 +7,93 @@
7#include <linux/errno.h> 7#include <linux/errno.h>
8#include <linux/math64.h> 8#include <linux/math64.h>
9#include <asm/uaccess.h> 9#include <asm/uaccess.h>
10#include <linux/kernel_stat.h>
11
12/*
13 * Allocate the thread_group_cputime structure appropriately and fill in the
14 * current values of the fields. Called from copy_signal() via
15 * thread_group_cputime_clone_thread() when adding a second or subsequent
16 * thread to a thread group. Assumes interrupts are enabled when called.
17 */
18int thread_group_cputime_alloc(struct task_struct *tsk)
19{
20 struct signal_struct *sig = tsk->signal;
21 struct task_cputime *cputime;
22
23 /*
24 * If we have multiple threads and we don't already have a
25 * per-CPU task_cputime struct (checked in the caller), allocate
26 * one and fill it in with the times accumulated so far. We may
27 * race with another thread so recheck after we pick up the sighand
28 * lock.
29 */
30 cputime = alloc_percpu(struct task_cputime);
31 if (cputime == NULL)
32 return -ENOMEM;
33 spin_lock_irq(&tsk->sighand->siglock);
34 if (sig->cputime.totals) {
35 spin_unlock_irq(&tsk->sighand->siglock);
36 free_percpu(cputime);
37 return 0;
38 }
39 sig->cputime.totals = cputime;
40 cputime = per_cpu_ptr(sig->cputime.totals, smp_processor_id());
41 cputime->utime = tsk->utime;
42 cputime->stime = tsk->stime;
43 cputime->sum_exec_runtime = tsk->se.sum_exec_runtime;
44 spin_unlock_irq(&tsk->sighand->siglock);
45 return 0;
46}
47
48/**
49 * thread_group_cputime - Sum the thread group time fields across all CPUs.
50 *
51 * @tsk: The task we use to identify the thread group.
52 * @times: task_cputime structure in which we return the summed fields.
53 *
54 * Walk the list of CPUs to sum the per-CPU time fields in the thread group
55 * time structure.
56 */
57void thread_group_cputime(
58 struct task_struct *tsk,
59 struct task_cputime *times)
60{
61 struct signal_struct *sig;
62 int i;
63 struct task_cputime *tot;
64
65 sig = tsk->signal;
66 if (unlikely(!sig) || !sig->cputime.totals) {
67 times->utime = tsk->utime;
68 times->stime = tsk->stime;
69 times->sum_exec_runtime = tsk->se.sum_exec_runtime;
70 return;
71 }
72 times->stime = times->utime = cputime_zero;
73 times->sum_exec_runtime = 0;
74 for_each_possible_cpu(i) {
75 tot = per_cpu_ptr(tsk->signal->cputime.totals, i);
76 times->utime = cputime_add(times->utime, tot->utime);
77 times->stime = cputime_add(times->stime, tot->stime);
78 times->sum_exec_runtime += tot->sum_exec_runtime;
79 }
80}
81
82/*
83 * Called after updating RLIMIT_CPU to set timer expiration if necessary.
84 */
85void update_rlimit_cpu(unsigned long rlim_new)
86{
87 cputime_t cputime;
88
89 cputime = secs_to_cputime(rlim_new);
90 if (cputime_eq(current->signal->it_prof_expires, cputime_zero) ||
91 cputime_lt(current->signal->it_prof_expires, cputime)) {
92 spin_lock_irq(&current->sighand->siglock);
93 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
94 spin_unlock_irq(&current->sighand->siglock);
95 }
96}
10 97
11static int check_clock(const clockid_t which_clock) 98static int check_clock(const clockid_t which_clock)
12{ 99{
@@ -158,10 +245,6 @@ static inline cputime_t virt_ticks(struct task_struct *p)
158{ 245{
159 return p->utime; 246 return p->utime;
160} 247}
161static inline unsigned long long sched_ns(struct task_struct *p)
162{
163 return task_sched_runtime(p);
164}
165 248
166int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) 249int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
167{ 250{
@@ -211,7 +294,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
211 cpu->cpu = virt_ticks(p); 294 cpu->cpu = virt_ticks(p);
212 break; 295 break;
213 case CPUCLOCK_SCHED: 296 case CPUCLOCK_SCHED:
214 cpu->sched = sched_ns(p); 297 cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p);
215 break; 298 break;
216 } 299 }
217 return 0; 300 return 0;
@@ -220,59 +303,30 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
220/* 303/*
221 * Sample a process (thread group) clock for the given group_leader task. 304 * Sample a process (thread group) clock for the given group_leader task.
222 * Must be called with tasklist_lock held for reading. 305 * Must be called with tasklist_lock held for reading.
223 * Must be called with tasklist_lock held for reading, and p->sighand->siglock.
224 */ 306 */
225static int cpu_clock_sample_group_locked(unsigned int clock_idx, 307static int cpu_clock_sample_group(const clockid_t which_clock,
226 struct task_struct *p, 308 struct task_struct *p,
227 union cpu_time_count *cpu) 309 union cpu_time_count *cpu)
228{ 310{
229 struct task_struct *t = p; 311 struct task_cputime cputime;
230 switch (clock_idx) { 312
313 thread_group_cputime(p, &cputime);
314 switch (which_clock) {
231 default: 315 default:
232 return -EINVAL; 316 return -EINVAL;
233 case CPUCLOCK_PROF: 317 case CPUCLOCK_PROF:
234 cpu->cpu = cputime_add(p->signal->utime, p->signal->stime); 318 cpu->cpu = cputime_add(cputime.utime, cputime.stime);
235 do {
236 cpu->cpu = cputime_add(cpu->cpu, prof_ticks(t));
237 t = next_thread(t);
238 } while (t != p);
239 break; 319 break;
240 case CPUCLOCK_VIRT: 320 case CPUCLOCK_VIRT:
241 cpu->cpu = p->signal->utime; 321 cpu->cpu = cputime.utime;
242 do {
243 cpu->cpu = cputime_add(cpu->cpu, virt_ticks(t));
244 t = next_thread(t);
245 } while (t != p);
246 break; 322 break;
247 case CPUCLOCK_SCHED: 323 case CPUCLOCK_SCHED:
248 cpu->sched = p->signal->sum_sched_runtime; 324 cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
249 /* Add in each other live thread. */
250 while ((t = next_thread(t)) != p) {
251 cpu->sched += t->se.sum_exec_runtime;
252 }
253 cpu->sched += sched_ns(p);
254 break; 325 break;
255 } 326 }
256 return 0; 327 return 0;
257} 328}
258 329
259/*
260 * Sample a process (thread group) clock for the given group_leader task.
261 * Must be called with tasklist_lock held for reading.
262 */
263static int cpu_clock_sample_group(const clockid_t which_clock,
264 struct task_struct *p,
265 union cpu_time_count *cpu)
266{
267 int ret;
268 unsigned long flags;
269 spin_lock_irqsave(&p->sighand->siglock, flags);
270 ret = cpu_clock_sample_group_locked(CPUCLOCK_WHICH(which_clock), p,
271 cpu);
272 spin_unlock_irqrestore(&p->sighand->siglock, flags);
273 return ret;
274}
275
276 330
277int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) 331int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
278{ 332{
@@ -471,80 +525,11 @@ void posix_cpu_timers_exit(struct task_struct *tsk)
471} 525}
472void posix_cpu_timers_exit_group(struct task_struct *tsk) 526void posix_cpu_timers_exit_group(struct task_struct *tsk)
473{ 527{
474 cleanup_timers(tsk->signal->cpu_timers, 528 struct task_cputime cputime;
475 cputime_add(tsk->utime, tsk->signal->utime),
476 cputime_add(tsk->stime, tsk->signal->stime),
477 tsk->se.sum_exec_runtime + tsk->signal->sum_sched_runtime);
478}
479 529
480 530 thread_group_cputime(tsk, &cputime);
481/* 531 cleanup_timers(tsk->signal->cpu_timers,
482 * Set the expiry times of all the threads in the process so one of them 532 cputime.utime, cputime.stime, cputime.sum_exec_runtime);
483 * will go off before the process cumulative expiry total is reached.
484 */
485static void process_timer_rebalance(struct task_struct *p,
486 unsigned int clock_idx,
487 union cpu_time_count expires,
488 union cpu_time_count val)
489{
490 cputime_t ticks, left;
491 unsigned long long ns, nsleft;
492 struct task_struct *t = p;
493 unsigned int nthreads = atomic_read(&p->signal->live);
494
495 if (!nthreads)
496 return;
497
498 switch (clock_idx) {
499 default:
500 BUG();
501 break;
502 case CPUCLOCK_PROF:
503 left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu),
504 nthreads);
505 do {
506 if (likely(!(t->flags & PF_EXITING))) {
507 ticks = cputime_add(prof_ticks(t), left);
508 if (cputime_eq(t->it_prof_expires,
509 cputime_zero) ||
510 cputime_gt(t->it_prof_expires, ticks)) {
511 t->it_prof_expires = ticks;
512 }
513 }
514 t = next_thread(t);
515 } while (t != p);
516 break;
517 case CPUCLOCK_VIRT:
518 left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu),
519 nthreads);
520 do {
521 if (likely(!(t->flags & PF_EXITING))) {
522 ticks = cputime_add(virt_ticks(t), left);
523 if (cputime_eq(t->it_virt_expires,
524 cputime_zero) ||
525 cputime_gt(t->it_virt_expires, ticks)) {
526 t->it_virt_expires = ticks;
527 }
528 }
529 t = next_thread(t);
530 } while (t != p);
531 break;
532 case CPUCLOCK_SCHED:
533 nsleft = expires.sched - val.sched;
534 do_div(nsleft, nthreads);
535 nsleft = max_t(unsigned long long, nsleft, 1);
536 do {
537 if (likely(!(t->flags & PF_EXITING))) {
538 ns = t->se.sum_exec_runtime + nsleft;
539 if (t->it_sched_expires == 0 ||
540 t->it_sched_expires > ns) {
541 t->it_sched_expires = ns;
542 }
543 }
544 t = next_thread(t);
545 } while (t != p);
546 break;
547 }
548} 533}
549 534
550static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) 535static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
@@ -608,29 +593,32 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
608 default: 593 default:
609 BUG(); 594 BUG();
610 case CPUCLOCK_PROF: 595 case CPUCLOCK_PROF:
611 if (cputime_eq(p->it_prof_expires, 596 if (cputime_eq(p->cputime_expires.prof_exp,
612 cputime_zero) || 597 cputime_zero) ||
613 cputime_gt(p->it_prof_expires, 598 cputime_gt(p->cputime_expires.prof_exp,
614 nt->expires.cpu)) 599 nt->expires.cpu))
615 p->it_prof_expires = nt->expires.cpu; 600 p->cputime_expires.prof_exp =
601 nt->expires.cpu;
616 break; 602 break;
617 case CPUCLOCK_VIRT: 603 case CPUCLOCK_VIRT:
618 if (cputime_eq(p->it_virt_expires, 604 if (cputime_eq(p->cputime_expires.virt_exp,
619 cputime_zero) || 605 cputime_zero) ||
620 cputime_gt(p->it_virt_expires, 606 cputime_gt(p->cputime_expires.virt_exp,
621 nt->expires.cpu)) 607 nt->expires.cpu))
622 p->it_virt_expires = nt->expires.cpu; 608 p->cputime_expires.virt_exp =
609 nt->expires.cpu;
623 break; 610 break;
624 case CPUCLOCK_SCHED: 611 case CPUCLOCK_SCHED:
625 if (p->it_sched_expires == 0 || 612 if (p->cputime_expires.sched_exp == 0 ||
626 p->it_sched_expires > nt->expires.sched) 613 p->cputime_expires.sched_exp >
627 p->it_sched_expires = nt->expires.sched; 614 nt->expires.sched)
615 p->cputime_expires.sched_exp =
616 nt->expires.sched;
628 break; 617 break;
629 } 618 }
630 } else { 619 } else {
631 /* 620 /*
632 * For a process timer, we must balance 621 * For a process timer, set the cached expiration time.
633 * all the live threads' expirations.
634 */ 622 */
635 switch (CPUCLOCK_WHICH(timer->it_clock)) { 623 switch (CPUCLOCK_WHICH(timer->it_clock)) {
636 default: 624 default:
@@ -641,7 +629,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
641 cputime_lt(p->signal->it_virt_expires, 629 cputime_lt(p->signal->it_virt_expires,
642 timer->it.cpu.expires.cpu)) 630 timer->it.cpu.expires.cpu))
643 break; 631 break;
644 goto rebalance; 632 p->signal->cputime_expires.virt_exp =
633 timer->it.cpu.expires.cpu;
634 break;
645 case CPUCLOCK_PROF: 635 case CPUCLOCK_PROF:
646 if (!cputime_eq(p->signal->it_prof_expires, 636 if (!cputime_eq(p->signal->it_prof_expires,
647 cputime_zero) && 637 cputime_zero) &&
@@ -652,13 +642,12 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
652 if (i != RLIM_INFINITY && 642 if (i != RLIM_INFINITY &&
653 i <= cputime_to_secs(timer->it.cpu.expires.cpu)) 643 i <= cputime_to_secs(timer->it.cpu.expires.cpu))
654 break; 644 break;
655 goto rebalance; 645 p->signal->cputime_expires.prof_exp =
646 timer->it.cpu.expires.cpu;
647 break;
656 case CPUCLOCK_SCHED: 648 case CPUCLOCK_SCHED:
657 rebalance: 649 p->signal->cputime_expires.sched_exp =
658 process_timer_rebalance( 650 timer->it.cpu.expires.sched;
659 timer->it.cpu.task,
660 CPUCLOCK_WHICH(timer->it_clock),
661 timer->it.cpu.expires, now);
662 break; 651 break;
663 } 652 }
664 } 653 }
@@ -969,13 +958,13 @@ static void check_thread_timers(struct task_struct *tsk,
969 struct signal_struct *const sig = tsk->signal; 958 struct signal_struct *const sig = tsk->signal;
970 959
971 maxfire = 20; 960 maxfire = 20;
972 tsk->it_prof_expires = cputime_zero; 961 tsk->cputime_expires.prof_exp = cputime_zero;
973 while (!list_empty(timers)) { 962 while (!list_empty(timers)) {
974 struct cpu_timer_list *t = list_first_entry(timers, 963 struct cpu_timer_list *t = list_first_entry(timers,
975 struct cpu_timer_list, 964 struct cpu_timer_list,
976 entry); 965 entry);
977 if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { 966 if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
978 tsk->it_prof_expires = t->expires.cpu; 967 tsk->cputime_expires.prof_exp = t->expires.cpu;
979 break; 968 break;
980 } 969 }
981 t->firing = 1; 970 t->firing = 1;
@@ -984,13 +973,13 @@ static void check_thread_timers(struct task_struct *tsk,
984 973
985 ++timers; 974 ++timers;
986 maxfire = 20; 975 maxfire = 20;
987 tsk->it_virt_expires = cputime_zero; 976 tsk->cputime_expires.virt_exp = cputime_zero;
988 while (!list_empty(timers)) { 977 while (!list_empty(timers)) {
989 struct cpu_timer_list *t = list_first_entry(timers, 978 struct cpu_timer_list *t = list_first_entry(timers,
990 struct cpu_timer_list, 979 struct cpu_timer_list,
991 entry); 980 entry);
992 if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { 981 if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
993 tsk->it_virt_expires = t->expires.cpu; 982 tsk->cputime_expires.virt_exp = t->expires.cpu;
994 break; 983 break;
995 } 984 }
996 t->firing = 1; 985 t->firing = 1;
@@ -999,13 +988,13 @@ static void check_thread_timers(struct task_struct *tsk,
999 988
1000 ++timers; 989 ++timers;
1001 maxfire = 20; 990 maxfire = 20;
1002 tsk->it_sched_expires = 0; 991 tsk->cputime_expires.sched_exp = 0;
1003 while (!list_empty(timers)) { 992 while (!list_empty(timers)) {
1004 struct cpu_timer_list *t = list_first_entry(timers, 993 struct cpu_timer_list *t = list_first_entry(timers,
1005 struct cpu_timer_list, 994 struct cpu_timer_list,
1006 entry); 995 entry);
1007 if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) { 996 if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
1008 tsk->it_sched_expires = t->expires.sched; 997 tsk->cputime_expires.sched_exp = t->expires.sched;
1009 break; 998 break;
1010 } 999 }
1011 t->firing = 1; 1000 t->firing = 1;
@@ -1055,10 +1044,10 @@ static void check_process_timers(struct task_struct *tsk,
1055{ 1044{
1056 int maxfire; 1045 int maxfire;
1057 struct signal_struct *const sig = tsk->signal; 1046 struct signal_struct *const sig = tsk->signal;
1058 cputime_t utime, stime, ptime, virt_expires, prof_expires; 1047 cputime_t utime, ptime, virt_expires, prof_expires;
1059 unsigned long long sum_sched_runtime, sched_expires; 1048 unsigned long long sum_sched_runtime, sched_expires;
1060 struct task_struct *t;
1061 struct list_head *timers = sig->cpu_timers; 1049 struct list_head *timers = sig->cpu_timers;
1050 struct task_cputime cputime;
1062 1051
1063 /* 1052 /*
1064 * Don't sample the current process CPU clocks if there are no timers. 1053 * Don't sample the current process CPU clocks if there are no timers.
@@ -1074,18 +1063,10 @@ static void check_process_timers(struct task_struct *tsk,
1074 /* 1063 /*
1075 * Collect the current process totals. 1064 * Collect the current process totals.
1076 */ 1065 */
1077 utime = sig->utime; 1066 thread_group_cputime(tsk, &cputime);
1078 stime = sig->stime; 1067 utime = cputime.utime;
1079 sum_sched_runtime = sig->sum_sched_runtime; 1068 ptime = cputime_add(utime, cputime.stime);
1080 t = tsk; 1069 sum_sched_runtime = cputime.sum_exec_runtime;
1081 do {
1082 utime = cputime_add(utime, t->utime);
1083 stime = cputime_add(stime, t->stime);
1084 sum_sched_runtime += t->se.sum_exec_runtime;
1085 t = next_thread(t);
1086 } while (t != tsk);
1087 ptime = cputime_add(utime, stime);
1088
1089 maxfire = 20; 1070 maxfire = 20;
1090 prof_expires = cputime_zero; 1071 prof_expires = cputime_zero;
1091 while (!list_empty(timers)) { 1072 while (!list_empty(timers)) {
@@ -1193,60 +1174,18 @@ static void check_process_timers(struct task_struct *tsk,
1193 } 1174 }
1194 } 1175 }
1195 1176
1196 if (!cputime_eq(prof_expires, cputime_zero) || 1177 if (!cputime_eq(prof_expires, cputime_zero) &&
1197 !cputime_eq(virt_expires, cputime_zero) || 1178 (cputime_eq(sig->cputime_expires.prof_exp, cputime_zero) ||
1198 sched_expires != 0) { 1179 cputime_gt(sig->cputime_expires.prof_exp, prof_expires)))
1199 /* 1180 sig->cputime_expires.prof_exp = prof_expires;
1200 * Rebalance the threads' expiry times for the remaining 1181 if (!cputime_eq(virt_expires, cputime_zero) &&
1201 * process CPU timers. 1182 (cputime_eq(sig->cputime_expires.virt_exp, cputime_zero) ||
1202 */ 1183 cputime_gt(sig->cputime_expires.virt_exp, virt_expires)))
1203 1184 sig->cputime_expires.virt_exp = virt_expires;
1204 cputime_t prof_left, virt_left, ticks; 1185 if (sched_expires != 0 &&
1205 unsigned long long sched_left, sched; 1186 (sig->cputime_expires.sched_exp == 0 ||
1206 const unsigned int nthreads = atomic_read(&sig->live); 1187 sig->cputime_expires.sched_exp > sched_expires))
1207 1188 sig->cputime_expires.sched_exp = sched_expires;
1208 if (!nthreads)
1209 return;
1210
1211 prof_left = cputime_sub(prof_expires, utime);
1212 prof_left = cputime_sub(prof_left, stime);
1213 prof_left = cputime_div_non_zero(prof_left, nthreads);
1214 virt_left = cputime_sub(virt_expires, utime);
1215 virt_left = cputime_div_non_zero(virt_left, nthreads);
1216 if (sched_expires) {
1217 sched_left = sched_expires - sum_sched_runtime;
1218 do_div(sched_left, nthreads);
1219 sched_left = max_t(unsigned long long, sched_left, 1);
1220 } else {
1221 sched_left = 0;
1222 }
1223 t = tsk;
1224 do {
1225 if (unlikely(t->flags & PF_EXITING))
1226 continue;
1227
1228 ticks = cputime_add(cputime_add(t->utime, t->stime),
1229 prof_left);
1230 if (!cputime_eq(prof_expires, cputime_zero) &&
1231 (cputime_eq(t->it_prof_expires, cputime_zero) ||
1232 cputime_gt(t->it_prof_expires, ticks))) {
1233 t->it_prof_expires = ticks;
1234 }
1235
1236 ticks = cputime_add(t->utime, virt_left);
1237 if (!cputime_eq(virt_expires, cputime_zero) &&
1238 (cputime_eq(t->it_virt_expires, cputime_zero) ||
1239 cputime_gt(t->it_virt_expires, ticks))) {
1240 t->it_virt_expires = ticks;
1241 }
1242
1243 sched = t->se.sum_exec_runtime + sched_left;
1244 if (sched_expires && (t->it_sched_expires == 0 ||
1245 t->it_sched_expires > sched)) {
1246 t->it_sched_expires = sched;
1247 }
1248 } while ((t = next_thread(t)) != tsk);
1249 }
1250} 1189}
1251 1190
1252/* 1191/*
@@ -1314,6 +1253,86 @@ out:
1314 ++timer->it_requeue_pending; 1253 ++timer->it_requeue_pending;
1315} 1254}
1316 1255
1256/**
1257 * task_cputime_zero - Check a task_cputime struct for all zero fields.
1258 *
1259 * @cputime: The struct to compare.
1260 *
1261 * Checks @cputime to see if all fields are zero. Returns true if all fields
1262 * are zero, false if any field is nonzero.
1263 */
1264static inline int task_cputime_zero(const struct task_cputime *cputime)
1265{
1266 if (cputime_eq(cputime->utime, cputime_zero) &&
1267 cputime_eq(cputime->stime, cputime_zero) &&
1268 cputime->sum_exec_runtime == 0)
1269 return 1;
1270 return 0;
1271}
1272
1273/**
1274 * task_cputime_expired - Compare two task_cputime entities.
1275 *
1276 * @sample: The task_cputime structure to be checked for expiration.
1277 * @expires: Expiration times, against which @sample will be checked.
1278 *
1279 * Checks @sample against @expires to see if any field of @sample has expired.
1280 * Returns true if any field of the former is greater than the corresponding
1281 * field of the latter if the latter field is set. Otherwise returns false.
1282 */
1283static inline int task_cputime_expired(const struct task_cputime *sample,
1284 const struct task_cputime *expires)
1285{
1286 if (!cputime_eq(expires->utime, cputime_zero) &&
1287 cputime_ge(sample->utime, expires->utime))
1288 return 1;
1289 if (!cputime_eq(expires->stime, cputime_zero) &&
1290 cputime_ge(cputime_add(sample->utime, sample->stime),
1291 expires->stime))
1292 return 1;
1293 if (expires->sum_exec_runtime != 0 &&
1294 sample->sum_exec_runtime >= expires->sum_exec_runtime)
1295 return 1;
1296 return 0;
1297}
1298
1299/**
1300 * fastpath_timer_check - POSIX CPU timers fast path.
1301 *
1302 * @tsk: The task (thread) being checked.
1303 *
1304 * Check the task and thread group timers. If both are zero (there are no
1305 * timers set) return false. Otherwise snapshot the task and thread group
1306 * timers and compare them with the corresponding expiration times. Return
1307 * true if a timer has expired, else return false.
1308 */
1309static inline int fastpath_timer_check(struct task_struct *tsk)
1310{
1311 struct signal_struct *sig = tsk->signal;
1312
1313 if (unlikely(!sig))
1314 return 0;
1315
1316 if (!task_cputime_zero(&tsk->cputime_expires)) {
1317 struct task_cputime task_sample = {
1318 .utime = tsk->utime,
1319 .stime = tsk->stime,
1320 .sum_exec_runtime = tsk->se.sum_exec_runtime
1321 };
1322
1323 if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
1324 return 1;
1325 }
1326 if (!task_cputime_zero(&sig->cputime_expires)) {
1327 struct task_cputime group_sample;
1328
1329 thread_group_cputime(tsk, &group_sample);
1330 if (task_cputime_expired(&group_sample, &sig->cputime_expires))
1331 return 1;
1332 }
1333 return 0;
1334}
1335
1317/* 1336/*
1318 * This is called from the timer interrupt handler. The irq handler has 1337 * This is called from the timer interrupt handler. The irq handler has
1319 * already updated our counts. We need to check if any timers fire now. 1338 * already updated our counts. We need to check if any timers fire now.
@@ -1326,42 +1345,31 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1326 1345
1327 BUG_ON(!irqs_disabled()); 1346 BUG_ON(!irqs_disabled());
1328 1347
1329#define UNEXPIRED(clock) \ 1348 /*
1330 (cputime_eq(tsk->it_##clock##_expires, cputime_zero) || \ 1349 * The fast path checks that there are no expired thread or thread
1331 cputime_lt(clock##_ticks(tsk), tsk->it_##clock##_expires)) 1350 * group timers. If that's so, just return.
1332 1351 */
1333 if (UNEXPIRED(prof) && UNEXPIRED(virt) && 1352 if (!fastpath_timer_check(tsk))
1334 (tsk->it_sched_expires == 0 ||
1335 tsk->se.sum_exec_runtime < tsk->it_sched_expires))
1336 return; 1353 return;
1337 1354
1338#undef UNEXPIRED 1355 spin_lock(&tsk->sighand->siglock);
1339
1340 /* 1356 /*
1341 * Double-check with locks held. 1357 * Here we take off tsk->signal->cpu_timers[N] and
1358 * tsk->cpu_timers[N] all the timers that are firing, and
1359 * put them on the firing list.
1342 */ 1360 */
1343 read_lock(&tasklist_lock); 1361 check_thread_timers(tsk, &firing);
1344 if (likely(tsk->signal != NULL)) { 1362 check_process_timers(tsk, &firing);
1345 spin_lock(&tsk->sighand->siglock);
1346 1363
1347 /* 1364 /*
1348 * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N] 1365 * We must release these locks before taking any timer's lock.
1349 * all the timers that are firing, and put them on the firing list. 1366 * There is a potential race with timer deletion here, as the
1350 */ 1367 * siglock now protects our private firing list. We have set
1351 check_thread_timers(tsk, &firing); 1368 * the firing flag in each timer, so that a deletion attempt
1352 check_process_timers(tsk, &firing); 1369 * that gets the timer lock before we do will give it up and
1353 1370 * spin until we've taken care of that timer below.
1354 /* 1371 */
1355 * We must release these locks before taking any timer's lock. 1372 spin_unlock(&tsk->sighand->siglock);
1356 * There is a potential race with timer deletion here, as the
1357 * siglock now protects our private firing list. We have set
1358 * the firing flag in each timer, so that a deletion attempt
1359 * that gets the timer lock before we do will give it up and
1360 * spin until we've taken care of that timer below.
1361 */
1362 spin_unlock(&tsk->sighand->siglock);
1363 }
1364 read_unlock(&tasklist_lock);
1365 1373
1366 /* 1374 /*
1367 * Now that all the timers on our list have the firing flag, 1375 * Now that all the timers on our list have the firing flag,
@@ -1389,10 +1397,9 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1389 1397
1390/* 1398/*
1391 * Set one of the process-wide special case CPU timers. 1399 * Set one of the process-wide special case CPU timers.
1392 * The tasklist_lock and tsk->sighand->siglock must be held by the caller. 1400 * The tsk->sighand->siglock must be held by the caller.
1393 * The oldval argument is null for the RLIMIT_CPU timer, where *newval is 1401 * The *newval argument is relative and we update it to be absolute, *oldval
1394 * absolute; non-null for ITIMER_*, where *newval is relative and we update 1402 * is absolute and we update it to be relative.
1395 * it to be absolute, *oldval is absolute and we update it to be relative.
1396 */ 1403 */
1397void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, 1404void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1398 cputime_t *newval, cputime_t *oldval) 1405 cputime_t *newval, cputime_t *oldval)
@@ -1401,7 +1408,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1401 struct list_head *head; 1408 struct list_head *head;
1402 1409
1403 BUG_ON(clock_idx == CPUCLOCK_SCHED); 1410 BUG_ON(clock_idx == CPUCLOCK_SCHED);
1404 cpu_clock_sample_group_locked(clock_idx, tsk, &now); 1411 cpu_clock_sample_group(clock_idx, tsk, &now);
1405 1412
1406 if (oldval) { 1413 if (oldval) {
1407 if (!cputime_eq(*oldval, cputime_zero)) { 1414 if (!cputime_eq(*oldval, cputime_zero)) {
@@ -1435,13 +1442,14 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1435 cputime_ge(list_first_entry(head, 1442 cputime_ge(list_first_entry(head,
1436 struct cpu_timer_list, entry)->expires.cpu, 1443 struct cpu_timer_list, entry)->expires.cpu,
1437 *newval)) { 1444 *newval)) {
1438 /* 1445 switch (clock_idx) {
1439 * Rejigger each thread's expiry time so that one will 1446 case CPUCLOCK_PROF:
1440 * notice before we hit the process-cumulative expiry time. 1447 tsk->signal->cputime_expires.prof_exp = *newval;
1441 */ 1448 break;
1442 union cpu_time_count expires = { .sched = 0 }; 1449 case CPUCLOCK_VIRT:
1443 expires.cpu = *newval; 1450 tsk->signal->cputime_expires.virt_exp = *newval;
1444 process_timer_rebalance(tsk, clock_idx, expires, now); 1451 break;
1452 }
1445 } 1453 }
1446} 1454}
1447 1455
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 5131e5471169..b931d7cedbfa 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -223,6 +223,15 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
223} 223}
224 224
225/* 225/*
226 * Get monotonic time for posix timers
227 */
228static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
229{
230 getrawmonotonic(tp);
231 return 0;
232}
233
234/*
226 * Initialize everything, well, just everything in Posix clocks/timers ;) 235 * Initialize everything, well, just everything in Posix clocks/timers ;)
227 */ 236 */
228static __init int init_posix_timers(void) 237static __init int init_posix_timers(void)
@@ -235,9 +244,15 @@ static __init int init_posix_timers(void)
235 .clock_get = posix_ktime_get_ts, 244 .clock_get = posix_ktime_get_ts,
236 .clock_set = do_posix_clock_nosettime, 245 .clock_set = do_posix_clock_nosettime,
237 }; 246 };
247 struct k_clock clock_monotonic_raw = {
248 .clock_getres = hrtimer_get_res,
249 .clock_get = posix_get_monotonic_raw,
250 .clock_set = do_posix_clock_nosettime,
251 };
238 252
239 register_posix_clock(CLOCK_REALTIME, &clock_realtime); 253 register_posix_clock(CLOCK_REALTIME, &clock_realtime);
240 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); 254 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
255 register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
241 256
242 posix_timers_cache = kmem_cache_create("posix_timers_cache", 257 posix_timers_cache = kmem_cache_create("posix_timers_cache",
243 sizeof (struct k_itimer), 0, SLAB_PANIC, 258 sizeof (struct k_itimer), 0, SLAB_PANIC,
@@ -298,6 +313,7 @@ void do_schedule_next_timer(struct siginfo *info)
298 313
299int posix_timer_event(struct k_itimer *timr, int si_private) 314int posix_timer_event(struct k_itimer *timr, int si_private)
300{ 315{
316 int shared, ret;
301 /* 317 /*
302 * FIXME: if ->sigq is queued we can race with 318 * FIXME: if ->sigq is queued we can race with
303 * dequeue_signal()->do_schedule_next_timer(). 319 * dequeue_signal()->do_schedule_next_timer().
@@ -311,25 +327,10 @@ int posix_timer_event(struct k_itimer *timr, int si_private)
311 */ 327 */
312 timr->sigq->info.si_sys_private = si_private; 328 timr->sigq->info.si_sys_private = si_private;
313 329
314 timr->sigq->info.si_signo = timr->it_sigev_signo; 330 shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID);
315 timr->sigq->info.si_code = SI_TIMER; 331 ret = send_sigqueue(timr->sigq, timr->it_process, shared);
316 timr->sigq->info.si_tid = timr->it_id; 332 /* If we failed to send the signal the timer stops. */
317 timr->sigq->info.si_value = timr->it_sigev_value; 333 return ret > 0;
318
319 if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
320 struct task_struct *leader;
321 int ret = send_sigqueue(timr->sigq, timr->it_process, 0);
322
323 if (likely(ret >= 0))
324 return ret;
325
326 timr->it_sigev_notify = SIGEV_SIGNAL;
327 leader = timr->it_process->group_leader;
328 put_task_struct(timr->it_process);
329 timr->it_process = leader;
330 }
331
332 return send_sigqueue(timr->sigq, timr->it_process, 1);
333} 334}
334EXPORT_SYMBOL_GPL(posix_timer_event); 335EXPORT_SYMBOL_GPL(posix_timer_event);
335 336
@@ -468,11 +469,9 @@ sys_timer_create(const clockid_t which_clock,
468 struct sigevent __user *timer_event_spec, 469 struct sigevent __user *timer_event_spec,
469 timer_t __user * created_timer_id) 470 timer_t __user * created_timer_id)
470{ 471{
471 int error = 0; 472 struct k_itimer *new_timer;
472 struct k_itimer *new_timer = NULL; 473 int error, new_timer_id;
473 int new_timer_id; 474 struct task_struct *process;
474 struct task_struct *process = NULL;
475 unsigned long flags;
476 sigevent_t event; 475 sigevent_t event;
477 int it_id_set = IT_ID_NOT_SET; 476 int it_id_set = IT_ID_NOT_SET;
478 477
@@ -490,12 +489,11 @@ sys_timer_create(const clockid_t which_clock,
490 goto out; 489 goto out;
491 } 490 }
492 spin_lock_irq(&idr_lock); 491 spin_lock_irq(&idr_lock);
493 error = idr_get_new(&posix_timers_id, (void *) new_timer, 492 error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id);
494 &new_timer_id);
495 spin_unlock_irq(&idr_lock); 493 spin_unlock_irq(&idr_lock);
496 if (error == -EAGAIN) 494 if (error) {
497 goto retry; 495 if (error == -EAGAIN)
498 else if (error) { 496 goto retry;
499 /* 497 /*
500 * Weird looking, but we return EAGAIN if the IDR is 498 * Weird looking, but we return EAGAIN if the IDR is
501 * full (proper POSIX return value for this) 499 * full (proper POSIX return value for this)
@@ -526,67 +524,43 @@ sys_timer_create(const clockid_t which_clock,
526 error = -EFAULT; 524 error = -EFAULT;
527 goto out; 525 goto out;
528 } 526 }
529 new_timer->it_sigev_notify = event.sigev_notify; 527 rcu_read_lock();
530 new_timer->it_sigev_signo = event.sigev_signo; 528 process = good_sigevent(&event);
531 new_timer->it_sigev_value = event.sigev_value; 529 if (process)
532 530 get_task_struct(process);
533 read_lock(&tasklist_lock); 531 rcu_read_unlock();
534 if ((process = good_sigevent(&event))) {
535 /*
536 * We may be setting up this process for another
537 * thread. It may be exiting. To catch this
538 * case the we check the PF_EXITING flag. If
539 * the flag is not set, the siglock will catch
540 * him before it is too late (in exit_itimers).
541 *
542 * The exec case is a bit more invloved but easy
543 * to code. If the process is in our thread
544 * group (and it must be or we would not allow
545 * it here) and is doing an exec, it will cause
546 * us to be killed. In this case it will wait
547 * for us to die which means we can finish this
548 * linkage with our last gasp. I.e. no code :)
549 */
550 spin_lock_irqsave(&process->sighand->siglock, flags);
551 if (!(process->flags & PF_EXITING)) {
552 new_timer->it_process = process;
553 list_add(&new_timer->list,
554 &process->signal->posix_timers);
555 if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
556 get_task_struct(process);
557 spin_unlock_irqrestore(&process->sighand->siglock, flags);
558 } else {
559 spin_unlock_irqrestore(&process->sighand->siglock, flags);
560 process = NULL;
561 }
562 }
563 read_unlock(&tasklist_lock);
564 if (!process) { 532 if (!process) {
565 error = -EINVAL; 533 error = -EINVAL;
566 goto out; 534 goto out;
567 } 535 }
568 } else { 536 } else {
569 new_timer->it_sigev_notify = SIGEV_SIGNAL; 537 event.sigev_notify = SIGEV_SIGNAL;
570 new_timer->it_sigev_signo = SIGALRM; 538 event.sigev_signo = SIGALRM;
571 new_timer->it_sigev_value.sival_int = new_timer->it_id; 539 event.sigev_value.sival_int = new_timer->it_id;
572 process = current->group_leader; 540 process = current->group_leader;
573 spin_lock_irqsave(&process->sighand->siglock, flags); 541 get_task_struct(process);
574 new_timer->it_process = process;
575 list_add(&new_timer->list, &process->signal->posix_timers);
576 spin_unlock_irqrestore(&process->sighand->siglock, flags);
577 } 542 }
578 543
544 new_timer->it_sigev_notify = event.sigev_notify;
545 new_timer->sigq->info.si_signo = event.sigev_signo;
546 new_timer->sigq->info.si_value = event.sigev_value;
547 new_timer->sigq->info.si_tid = new_timer->it_id;
548 new_timer->sigq->info.si_code = SI_TIMER;
549
550 spin_lock_irq(&current->sighand->siglock);
551 new_timer->it_process = process;
552 list_add(&new_timer->list, &current->signal->posix_timers);
553 spin_unlock_irq(&current->sighand->siglock);
554
555 return 0;
579 /* 556 /*
580 * In the case of the timer belonging to another task, after 557 * In the case of the timer belonging to another task, after
581 * the task is unlocked, the timer is owned by the other task 558 * the task is unlocked, the timer is owned by the other task
582 * and may cease to exist at any time. Don't use or modify 559 * and may cease to exist at any time. Don't use or modify
583 * new_timer after the unlock call. 560 * new_timer after the unlock call.
584 */ 561 */
585
586out: 562out:
587 if (error) 563 release_posix_timer(new_timer, it_id_set);
588 release_posix_timer(new_timer, it_id_set);
589
590 return error; 564 return error;
591} 565}
592 566
@@ -597,7 +571,7 @@ out:
597 * the find to the timer lock. To avoid a dead lock, the timer id MUST 571 * the find to the timer lock. To avoid a dead lock, the timer id MUST
598 * be release with out holding the timer lock. 572 * be release with out holding the timer lock.
599 */ 573 */
600static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags) 574static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags)
601{ 575{
602 struct k_itimer *timr; 576 struct k_itimer *timr;
603 /* 577 /*
@@ -605,23 +579,20 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags)
605 * flags part over to the timer lock. Must not let interrupts in 579 * flags part over to the timer lock. Must not let interrupts in
606 * while we are moving the lock. 580 * while we are moving the lock.
607 */ 581 */
608
609 spin_lock_irqsave(&idr_lock, *flags); 582 spin_lock_irqsave(&idr_lock, *flags);
610 timr = (struct k_itimer *) idr_find(&posix_timers_id, (int) timer_id); 583 timr = idr_find(&posix_timers_id, (int)timer_id);
611 if (timr) { 584 if (timr) {
612 spin_lock(&timr->it_lock); 585 spin_lock(&timr->it_lock);
613 586 if (timr->it_process &&
614 if ((timr->it_id != timer_id) || !(timr->it_process) || 587 same_thread_group(timr->it_process, current)) {
615 !same_thread_group(timr->it_process, current)) {
616 spin_unlock(&timr->it_lock);
617 spin_unlock_irqrestore(&idr_lock, *flags);
618 timr = NULL;
619 } else
620 spin_unlock(&idr_lock); 588 spin_unlock(&idr_lock);
621 } else 589 return timr;
622 spin_unlock_irqrestore(&idr_lock, *flags); 590 }
591 spin_unlock(&timr->it_lock);
592 }
593 spin_unlock_irqrestore(&idr_lock, *flags);
623 594
624 return timr; 595 return NULL;
625} 596}
626 597
627/* 598/*
@@ -862,8 +833,7 @@ retry_delete:
862 * This keeps any tasks waiting on the spin lock from thinking 833 * This keeps any tasks waiting on the spin lock from thinking
863 * they got something (see the lock code above). 834 * they got something (see the lock code above).
864 */ 835 */
865 if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) 836 put_task_struct(timer->it_process);
866 put_task_struct(timer->it_process);
867 timer->it_process = NULL; 837 timer->it_process = NULL;
868 838
869 unlock_timer(timer, flags); 839 unlock_timer(timer, flags);
@@ -890,8 +860,7 @@ retry_delete:
890 * This keeps any tasks waiting on the spin lock from thinking 860 * This keeps any tasks waiting on the spin lock from thinking
891 * they got something (see the lock code above). 861 * they got something (see the lock code above).
892 */ 862 */
893 if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) 863 put_task_struct(timer->it_process);
894 put_task_struct(timer->it_process);
895 timer->it_process = NULL; 864 timer->it_process = NULL;
896 865
897 unlock_timer(timer, flags); 866 unlock_timer(timer, flags);
diff --git a/kernel/sched.c b/kernel/sched.c
index 6f230596bd0c..09a8c15748f1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4052,23 +4052,26 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
4052EXPORT_PER_CPU_SYMBOL(kstat); 4052EXPORT_PER_CPU_SYMBOL(kstat);
4053 4053
4054/* 4054/*
4055 * Return p->sum_exec_runtime plus any more ns on the sched_clock 4055 * Return any ns on the sched_clock that have not yet been banked in
4056 * that have not yet been banked in case the task is currently running. 4056 * @p in case that task is currently running.
4057 */ 4057 */
4058unsigned long long task_sched_runtime(struct task_struct *p) 4058unsigned long long task_delta_exec(struct task_struct *p)
4059{ 4059{
4060 unsigned long flags; 4060 unsigned long flags;
4061 u64 ns, delta_exec;
4062 struct rq *rq; 4061 struct rq *rq;
4062 u64 ns = 0;
4063 4063
4064 rq = task_rq_lock(p, &flags); 4064 rq = task_rq_lock(p, &flags);
4065 ns = p->se.sum_exec_runtime; 4065
4066 if (task_current(rq, p)) { 4066 if (task_current(rq, p)) {
4067 u64 delta_exec;
4068
4067 update_rq_clock(rq); 4069 update_rq_clock(rq);
4068 delta_exec = rq->clock - p->se.exec_start; 4070 delta_exec = rq->clock - p->se.exec_start;
4069 if ((s64)delta_exec > 0) 4071 if ((s64)delta_exec > 0)
4070 ns += delta_exec; 4072 ns = delta_exec;
4071 } 4073 }
4074
4072 task_rq_unlock(rq, &flags); 4075 task_rq_unlock(rq, &flags);
4073 4076
4074 return ns; 4077 return ns;
@@ -4085,6 +4088,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
4085 cputime64_t tmp; 4088 cputime64_t tmp;
4086 4089
4087 p->utime = cputime_add(p->utime, cputime); 4090 p->utime = cputime_add(p->utime, cputime);
4091 account_group_user_time(p, cputime);
4088 4092
4089 /* Add user time to cpustat. */ 4093 /* Add user time to cpustat. */
4090 tmp = cputime_to_cputime64(cputime); 4094 tmp = cputime_to_cputime64(cputime);
@@ -4109,6 +4113,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime)
4109 tmp = cputime_to_cputime64(cputime); 4113 tmp = cputime_to_cputime64(cputime);
4110 4114
4111 p->utime = cputime_add(p->utime, cputime); 4115 p->utime = cputime_add(p->utime, cputime);
4116 account_group_user_time(p, cputime);
4112 p->gtime = cputime_add(p->gtime, cputime); 4117 p->gtime = cputime_add(p->gtime, cputime);
4113 4118
4114 cpustat->user = cputime64_add(cpustat->user, tmp); 4119 cpustat->user = cputime64_add(cpustat->user, tmp);
@@ -4144,6 +4149,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
4144 } 4149 }
4145 4150
4146 p->stime = cputime_add(p->stime, cputime); 4151 p->stime = cputime_add(p->stime, cputime);
4152 account_group_system_time(p, cputime);
4147 4153
4148 /* Add system time to cpustat. */ 4154 /* Add system time to cpustat. */
4149 tmp = cputime_to_cputime64(cputime); 4155 tmp = cputime_to_cputime64(cputime);
@@ -4185,6 +4191,7 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
4185 4191
4186 if (p == rq->idle) { 4192 if (p == rq->idle) {
4187 p->stime = cputime_add(p->stime, steal); 4193 p->stime = cputime_add(p->stime, steal);
4194 account_group_system_time(p, steal);
4188 if (atomic_read(&rq->nr_iowait) > 0) 4195 if (atomic_read(&rq->nr_iowait) > 0)
4189 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 4196 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
4190 else 4197 else
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 18fd17172eb6..f604dae71316 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -449,6 +449,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
449 struct task_struct *curtask = task_of(curr); 449 struct task_struct *curtask = task_of(curr);
450 450
451 cpuacct_charge(curtask, delta_exec); 451 cpuacct_charge(curtask, delta_exec);
452 account_group_exec_runtime(curtask, delta_exec);
452 } 453 }
453} 454}
454 455
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index cdf5740ab03e..b446dc87494f 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -526,6 +526,8 @@ static void update_curr_rt(struct rq *rq)
526 schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); 526 schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
527 527
528 curr->se.sum_exec_runtime += delta_exec; 528 curr->se.sum_exec_runtime += delta_exec;
529 account_group_exec_runtime(curr, delta_exec);
530
529 curr->se.exec_start = rq->clock; 531 curr->se.exec_start = rq->clock;
530 cpuacct_charge(curr, delta_exec); 532 cpuacct_charge(curr, delta_exec);
531 533
@@ -1458,7 +1460,7 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1458 p->rt.timeout++; 1460 p->rt.timeout++;
1459 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); 1461 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
1460 if (p->rt.timeout > next) 1462 if (p->rt.timeout > next)
1461 p->it_sched_expires = p->se.sum_exec_runtime; 1463 p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
1462 } 1464 }
1463} 1465}
1464 1466
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 8385d43987e2..b8c156979cf2 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -270,3 +270,89 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
270#define sched_info_switch(t, next) do { } while (0) 270#define sched_info_switch(t, next) do { } while (0)
271#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ 271#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
272 272
273/*
274 * The following are functions that support scheduler-internal time accounting.
275 * These functions are generally called at the timer tick. None of this depends
276 * on CONFIG_SCHEDSTATS.
277 */
278
279/**
280 * account_group_user_time - Maintain utime for a thread group.
281 *
282 * @tsk: Pointer to task structure.
283 * @cputime: Time value by which to increment the utime field of the
284 * thread_group_cputime structure.
285 *
286 * If thread group time is being maintained, get the structure for the
287 * running CPU and update the utime field there.
288 */
289static inline void account_group_user_time(struct task_struct *tsk,
290 cputime_t cputime)
291{
292 struct signal_struct *sig;
293
294 sig = tsk->signal;
295 if (unlikely(!sig))
296 return;
297 if (sig->cputime.totals) {
298 struct task_cputime *times;
299
300 times = per_cpu_ptr(sig->cputime.totals, get_cpu());
301 times->utime = cputime_add(times->utime, cputime);
302 put_cpu_no_resched();
303 }
304}
305
306/**
307 * account_group_system_time - Maintain stime for a thread group.
308 *
309 * @tsk: Pointer to task structure.
310 * @cputime: Time value by which to increment the stime field of the
311 * thread_group_cputime structure.
312 *
313 * If thread group time is being maintained, get the structure for the
314 * running CPU and update the stime field there.
315 */
316static inline void account_group_system_time(struct task_struct *tsk,
317 cputime_t cputime)
318{
319 struct signal_struct *sig;
320
321 sig = tsk->signal;
322 if (unlikely(!sig))
323 return;
324 if (sig->cputime.totals) {
325 struct task_cputime *times;
326
327 times = per_cpu_ptr(sig->cputime.totals, get_cpu());
328 times->stime = cputime_add(times->stime, cputime);
329 put_cpu_no_resched();
330 }
331}
332
333/**
334 * account_group_exec_runtime - Maintain exec runtime for a thread group.
335 *
336 * @tsk: Pointer to task structure.
337 * @ns: Time value by which to increment the sum_exec_runtime field
338 * of the thread_group_cputime structure.
339 *
340 * If thread group time is being maintained, get the structure for the
341 * running CPU and update the sum_exec_runtime field there.
342 */
343static inline void account_group_exec_runtime(struct task_struct *tsk,
344 unsigned long long ns)
345{
346 struct signal_struct *sig;
347
348 sig = tsk->signal;
349 if (unlikely(!sig))
350 return;
351 if (sig->cputime.totals) {
352 struct task_cputime *times;
353
354 times = per_cpu_ptr(sig->cputime.totals, get_cpu());
355 times->sum_exec_runtime += ns;
356 put_cpu_no_resched();
357 }
358}
diff --git a/kernel/signal.c b/kernel/signal.c
index e661b01d340f..6eea5826d618 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1338,6 +1338,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1338 struct siginfo info; 1338 struct siginfo info;
1339 unsigned long flags; 1339 unsigned long flags;
1340 struct sighand_struct *psig; 1340 struct sighand_struct *psig;
1341 struct task_cputime cputime;
1341 int ret = sig; 1342 int ret = sig;
1342 1343
1343 BUG_ON(sig == -1); 1344 BUG_ON(sig == -1);
@@ -1368,10 +1369,9 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1368 1369
1369 info.si_uid = tsk->uid; 1370 info.si_uid = tsk->uid;
1370 1371
1371 info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime, 1372 thread_group_cputime(tsk, &cputime);
1372 tsk->signal->utime)); 1373 info.si_utime = cputime_to_jiffies(cputime.utime);
1373 info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime, 1374 info.si_stime = cputime_to_jiffies(cputime.stime);
1374 tsk->signal->stime));
1375 1375
1376 info.si_status = tsk->exit_code & 0x7f; 1376 info.si_status = tsk->exit_code & 0x7f;
1377 if (tsk->exit_code & 0x80) 1377 if (tsk->exit_code & 0x80)
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 83ba21a13bd4..7110daeb9a90 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -267,16 +267,12 @@ asmlinkage void do_softirq(void)
267 */ 267 */
268void irq_enter(void) 268void irq_enter(void)
269{ 269{
270#ifdef CONFIG_NO_HZ
271 int cpu = smp_processor_id(); 270 int cpu = smp_processor_id();
271
272 if (idle_cpu(cpu) && !in_interrupt()) 272 if (idle_cpu(cpu) && !in_interrupt())
273 tick_nohz_stop_idle(cpu); 273 tick_check_idle(cpu);
274#endif 274
275 __irq_enter(); 275 __irq_enter();
276#ifdef CONFIG_NO_HZ
277 if (idle_cpu(cpu))
278 tick_nohz_update_jiffies();
279#endif
280} 276}
281 277
282#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED 278#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
diff --git a/kernel/sys.c b/kernel/sys.c
index 0bc8fa3c2288..53879cdae483 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -853,38 +853,28 @@ asmlinkage long sys_setfsgid(gid_t gid)
853 return old_fsgid; 853 return old_fsgid;
854} 854}
855 855
856void do_sys_times(struct tms *tms)
857{
858 struct task_cputime cputime;
859 cputime_t cutime, cstime;
860
861 spin_lock_irq(&current->sighand->siglock);
862 thread_group_cputime(current, &cputime);
863 cutime = current->signal->cutime;
864 cstime = current->signal->cstime;
865 spin_unlock_irq(&current->sighand->siglock);
866 tms->tms_utime = cputime_to_clock_t(cputime.utime);
867 tms->tms_stime = cputime_to_clock_t(cputime.stime);
868 tms->tms_cutime = cputime_to_clock_t(cutime);
869 tms->tms_cstime = cputime_to_clock_t(cstime);
870}
871
856asmlinkage long sys_times(struct tms __user * tbuf) 872asmlinkage long sys_times(struct tms __user * tbuf)
857{ 873{
858 /*
859 * In the SMP world we might just be unlucky and have one of
860 * the times increment as we use it. Since the value is an
861 * atomically safe type this is just fine. Conceptually its
862 * as if the syscall took an instant longer to occur.
863 */
864 if (tbuf) { 874 if (tbuf) {
865 struct tms tmp; 875 struct tms tmp;
866 struct task_struct *tsk = current; 876
867 struct task_struct *t; 877 do_sys_times(&tmp);
868 cputime_t utime, stime, cutime, cstime;
869
870 spin_lock_irq(&tsk->sighand->siglock);
871 utime = tsk->signal->utime;
872 stime = tsk->signal->stime;
873 t = tsk;
874 do {
875 utime = cputime_add(utime, t->utime);
876 stime = cputime_add(stime, t->stime);
877 t = next_thread(t);
878 } while (t != tsk);
879
880 cutime = tsk->signal->cutime;
881 cstime = tsk->signal->cstime;
882 spin_unlock_irq(&tsk->sighand->siglock);
883
884 tmp.tms_utime = cputime_to_clock_t(utime);
885 tmp.tms_stime = cputime_to_clock_t(stime);
886 tmp.tms_cutime = cputime_to_clock_t(cutime);
887 tmp.tms_cstime = cputime_to_clock_t(cstime);
888 if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) 878 if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
889 return -EFAULT; 879 return -EFAULT;
890 } 880 }
@@ -1449,7 +1439,6 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r
1449asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) 1439asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1450{ 1440{
1451 struct rlimit new_rlim, *old_rlim; 1441 struct rlimit new_rlim, *old_rlim;
1452 unsigned long it_prof_secs;
1453 int retval; 1442 int retval;
1454 1443
1455 if (resource >= RLIM_NLIMITS) 1444 if (resource >= RLIM_NLIMITS)
@@ -1503,18 +1492,7 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1503 if (new_rlim.rlim_cur == RLIM_INFINITY) 1492 if (new_rlim.rlim_cur == RLIM_INFINITY)
1504 goto out; 1493 goto out;
1505 1494
1506 it_prof_secs = cputime_to_secs(current->signal->it_prof_expires); 1495 update_rlimit_cpu(new_rlim.rlim_cur);
1507 if (it_prof_secs == 0 || new_rlim.rlim_cur <= it_prof_secs) {
1508 unsigned long rlim_cur = new_rlim.rlim_cur;
1509 cputime_t cputime;
1510
1511 cputime = secs_to_cputime(rlim_cur);
1512 read_lock(&tasklist_lock);
1513 spin_lock_irq(&current->sighand->siglock);
1514 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
1515 spin_unlock_irq(&current->sighand->siglock);
1516 read_unlock(&tasklist_lock);
1517 }
1518out: 1496out:
1519 return 0; 1497 return 0;
1520} 1498}
@@ -1552,11 +1530,8 @@ out:
1552 * 1530 *
1553 */ 1531 */
1554 1532
1555static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r, 1533static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r)
1556 cputime_t *utimep, cputime_t *stimep)
1557{ 1534{
1558 *utimep = cputime_add(*utimep, t->utime);
1559 *stimep = cputime_add(*stimep, t->stime);
1560 r->ru_nvcsw += t->nvcsw; 1535 r->ru_nvcsw += t->nvcsw;
1561 r->ru_nivcsw += t->nivcsw; 1536 r->ru_nivcsw += t->nivcsw;
1562 r->ru_minflt += t->min_flt; 1537 r->ru_minflt += t->min_flt;
@@ -1570,12 +1545,13 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1570 struct task_struct *t; 1545 struct task_struct *t;
1571 unsigned long flags; 1546 unsigned long flags;
1572 cputime_t utime, stime; 1547 cputime_t utime, stime;
1548 struct task_cputime cputime;
1573 1549
1574 memset((char *) r, 0, sizeof *r); 1550 memset((char *) r, 0, sizeof *r);
1575 utime = stime = cputime_zero; 1551 utime = stime = cputime_zero;
1576 1552
1577 if (who == RUSAGE_THREAD) { 1553 if (who == RUSAGE_THREAD) {
1578 accumulate_thread_rusage(p, r, &utime, &stime); 1554 accumulate_thread_rusage(p, r);
1579 goto out; 1555 goto out;
1580 } 1556 }
1581 1557
@@ -1598,8 +1574,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1598 break; 1574 break;
1599 1575
1600 case RUSAGE_SELF: 1576 case RUSAGE_SELF:
1601 utime = cputime_add(utime, p->signal->utime); 1577 thread_group_cputime(p, &cputime);
1602 stime = cputime_add(stime, p->signal->stime); 1578 utime = cputime_add(utime, cputime.utime);
1579 stime = cputime_add(stime, cputime.stime);
1603 r->ru_nvcsw += p->signal->nvcsw; 1580 r->ru_nvcsw += p->signal->nvcsw;
1604 r->ru_nivcsw += p->signal->nivcsw; 1581 r->ru_nivcsw += p->signal->nivcsw;
1605 r->ru_minflt += p->signal->min_flt; 1582 r->ru_minflt += p->signal->min_flt;
@@ -1608,7 +1585,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1608 r->ru_oublock += p->signal->oublock; 1585 r->ru_oublock += p->signal->oublock;
1609 t = p; 1586 t = p;
1610 do { 1587 do {
1611 accumulate_thread_rusage(t, r, &utime, &stime); 1588 accumulate_thread_rusage(t, r);
1612 t = next_thread(t); 1589 t = next_thread(t);
1613 } while (t != p); 1590 } while (t != p);
1614 break; 1591 break;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 093d4acf993b..9ed2eec97526 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -325,6 +325,9 @@ int clocksource_register(struct clocksource *c)
325 unsigned long flags; 325 unsigned long flags;
326 int ret; 326 int ret;
327 327
328 /* save mult_orig on registration */
329 c->mult_orig = c->mult;
330
328 spin_lock_irqsave(&clocksource_lock, flags); 331 spin_lock_irqsave(&clocksource_lock, flags);
329 ret = clocksource_enqueue(c); 332 ret = clocksource_enqueue(c);
330 if (!ret) 333 if (!ret)
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 4c256fdb8875..1ca99557e929 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -61,6 +61,7 @@ struct clocksource clocksource_jiffies = {
61 .read = jiffies_read, 61 .read = jiffies_read,
62 .mask = 0xffffffff, /*32bits*/ 62 .mask = 0xffffffff, /*32bits*/
63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ 63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
64 .mult_orig = NSEC_PER_JIFFY << JIFFIES_SHIFT,
64 .shift = JIFFIES_SHIFT, 65 .shift = JIFFIES_SHIFT,
65}; 66};
66 67
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 1ad46f3df6e7..1a20715bfd6e 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -10,13 +10,13 @@
10 10
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/time.h> 12#include <linux/time.h>
13#include <linux/timer.h>
14#include <linux/timex.h> 13#include <linux/timex.h>
15#include <linux/jiffies.h> 14#include <linux/jiffies.h>
16#include <linux/hrtimer.h> 15#include <linux/hrtimer.h>
17#include <linux/capability.h> 16#include <linux/capability.h>
18#include <linux/math64.h> 17#include <linux/math64.h>
19#include <linux/clocksource.h> 18#include <linux/clocksource.h>
19#include <linux/workqueue.h>
20#include <asm/timex.h> 20#include <asm/timex.h>
21 21
22/* 22/*
@@ -218,11 +218,11 @@ void second_overflow(void)
218/* Disable the cmos update - used by virtualization and embedded */ 218/* Disable the cmos update - used by virtualization and embedded */
219int no_sync_cmos_clock __read_mostly; 219int no_sync_cmos_clock __read_mostly;
220 220
221static void sync_cmos_clock(unsigned long dummy); 221static void sync_cmos_clock(struct work_struct *work);
222 222
223static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); 223static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
224 224
225static void sync_cmos_clock(unsigned long dummy) 225static void sync_cmos_clock(struct work_struct *work)
226{ 226{
227 struct timespec now, next; 227 struct timespec now, next;
228 int fail = 1; 228 int fail = 1;
@@ -258,13 +258,13 @@ static void sync_cmos_clock(unsigned long dummy)
258 next.tv_sec++; 258 next.tv_sec++;
259 next.tv_nsec -= NSEC_PER_SEC; 259 next.tv_nsec -= NSEC_PER_SEC;
260 } 260 }
261 mod_timer(&sync_cmos_timer, jiffies + timespec_to_jiffies(&next)); 261 schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next));
262} 262}
263 263
264static void notify_cmos_timer(void) 264static void notify_cmos_timer(void)
265{ 265{
266 if (!no_sync_cmos_clock) 266 if (!no_sync_cmos_clock)
267 mod_timer(&sync_cmos_timer, jiffies + 1); 267 schedule_delayed_work(&sync_cmos_work, 0);
268} 268}
269 269
270#else 270#else
@@ -277,38 +277,50 @@ static inline void notify_cmos_timer(void) { }
277int do_adjtimex(struct timex *txc) 277int do_adjtimex(struct timex *txc)
278{ 278{
279 struct timespec ts; 279 struct timespec ts;
280 long save_adjust, sec;
281 int result; 280 int result;
282 281
283 /* In order to modify anything, you gotta be super-user! */ 282 /* Validate the data before disabling interrupts */
284 if (txc->modes && !capable(CAP_SYS_TIME)) 283 if (txc->modes & ADJ_ADJTIME) {
285 return -EPERM;
286
287 /* Now we validate the data before disabling interrupts */
288
289 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) {
290 /* singleshot must not be used with any other mode bits */ 284 /* singleshot must not be used with any other mode bits */
291 if (txc->modes & ~ADJ_OFFSET_SS_READ) 285 if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))
292 return -EINVAL; 286 return -EINVAL;
287 if (!(txc->modes & ADJ_OFFSET_READONLY) &&
288 !capable(CAP_SYS_TIME))
289 return -EPERM;
290 } else {
291 /* In order to modify anything, you gotta be super-user! */
292 if (txc->modes && !capable(CAP_SYS_TIME))
293 return -EPERM;
294
295 /* if the quartz is off by more than 10% something is VERY wrong! */
296 if (txc->modes & ADJ_TICK &&
297 (txc->tick < 900000/USER_HZ ||
298 txc->tick > 1100000/USER_HZ))
299 return -EINVAL;
300
301 if (txc->modes & ADJ_STATUS && time_state != TIME_OK)
302 hrtimer_cancel(&leap_timer);
293 } 303 }
294 304
295 /* if the quartz is off by more than 10% something is VERY wrong ! */
296 if (txc->modes & ADJ_TICK)
297 if (txc->tick < 900000/USER_HZ ||
298 txc->tick > 1100000/USER_HZ)
299 return -EINVAL;
300
301 if (time_state != TIME_OK && txc->modes & ADJ_STATUS)
302 hrtimer_cancel(&leap_timer);
303 getnstimeofday(&ts); 305 getnstimeofday(&ts);
304 306
305 write_seqlock_irq(&xtime_lock); 307 write_seqlock_irq(&xtime_lock);
306 308
307 /* Save for later - semantics of adjtime is to return old value */
308 save_adjust = time_adjust;
309
310 /* If there are input parameters, then process them */ 309 /* If there are input parameters, then process them */
310 if (txc->modes & ADJ_ADJTIME) {
311 long save_adjust = time_adjust;
312
313 if (!(txc->modes & ADJ_OFFSET_READONLY)) {
314 /* adjtime() is independent from ntp_adjtime() */
315 time_adjust = txc->offset;
316 ntp_update_frequency();
317 }
318 txc->offset = save_adjust;
319 goto adj_done;
320 }
311 if (txc->modes) { 321 if (txc->modes) {
322 long sec;
323
312 if (txc->modes & ADJ_STATUS) { 324 if (txc->modes & ADJ_STATUS) {
313 if ((time_status & STA_PLL) && 325 if ((time_status & STA_PLL) &&
314 !(txc->status & STA_PLL)) { 326 !(txc->status & STA_PLL)) {
@@ -375,13 +387,8 @@ int do_adjtimex(struct timex *txc)
375 if (txc->modes & ADJ_TAI && txc->constant > 0) 387 if (txc->modes & ADJ_TAI && txc->constant > 0)
376 time_tai = txc->constant; 388 time_tai = txc->constant;
377 389
378 if (txc->modes & ADJ_OFFSET) { 390 if (txc->modes & ADJ_OFFSET)
379 if (txc->modes == ADJ_OFFSET_SINGLESHOT) 391 ntp_update_offset(txc->offset);
380 /* adjtime() is independent from ntp_adjtime() */
381 time_adjust = txc->offset;
382 else
383 ntp_update_offset(txc->offset);
384 }
385 if (txc->modes & ADJ_TICK) 392 if (txc->modes & ADJ_TICK)
386 tick_usec = txc->tick; 393 tick_usec = txc->tick;
387 394
@@ -389,22 +396,18 @@ int do_adjtimex(struct timex *txc)
389 ntp_update_frequency(); 396 ntp_update_frequency();
390 } 397 }
391 398
399 txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
400 NTP_SCALE_SHIFT);
401 if (!(time_status & STA_NANO))
402 txc->offset /= NSEC_PER_USEC;
403
404adj_done:
392 result = time_state; /* mostly `TIME_OK' */ 405 result = time_state; /* mostly `TIME_OK' */
393 if (time_status & (STA_UNSYNC|STA_CLOCKERR)) 406 if (time_status & (STA_UNSYNC|STA_CLOCKERR))
394 result = TIME_ERROR; 407 result = TIME_ERROR;
395 408
396 if ((txc->modes == ADJ_OFFSET_SINGLESHOT) || 409 txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) *
397 (txc->modes == ADJ_OFFSET_SS_READ)) 410 (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT);
398 txc->offset = save_adjust;
399 else {
400 txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
401 NTP_SCALE_SHIFT);
402 if (!(time_status & STA_NANO))
403 txc->offset /= NSEC_PER_USEC;
404 }
405 txc->freq = shift_right((s32)(time_freq >> PPM_SCALE_INV_SHIFT) *
406 (s64)PPM_SCALE_INV,
407 NTP_SCALE_SHIFT);
408 txc->maxerror = time_maxerror; 411 txc->maxerror = time_maxerror;
409 txc->esterror = time_esterror; 412 txc->esterror = time_esterror;
410 txc->status = time_status; 413 txc->status = time_status;
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index cb01cd8f919b..f98a1b7b16e9 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -384,6 +384,19 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
384} 384}
385 385
386/* 386/*
387 * Called from irq_enter() when idle was interrupted to reenable the
388 * per cpu device.
389 */
390void tick_check_oneshot_broadcast(int cpu)
391{
392 if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) {
393 struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
394
395 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT);
396 }
397}
398
399/*
387 * Handle oneshot mode broadcasting 400 * Handle oneshot mode broadcasting
388 */ 401 */
389static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) 402static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 469248782c23..b1c05bf75ee0 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -36,6 +36,7 @@ extern void tick_broadcast_switch_to_oneshot(void);
36extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); 36extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
37extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); 37extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
38extern int tick_broadcast_oneshot_active(void); 38extern int tick_broadcast_oneshot_active(void);
39extern void tick_check_oneshot_broadcast(int cpu);
39# else /* BROADCAST */ 40# else /* BROADCAST */
40static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 41static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
41{ 42{
@@ -45,6 +46,7 @@ static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
45static inline void tick_broadcast_switch_to_oneshot(void) { } 46static inline void tick_broadcast_switch_to_oneshot(void) { }
46static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } 47static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
47static inline int tick_broadcast_oneshot_active(void) { return 0; } 48static inline int tick_broadcast_oneshot_active(void) { return 0; }
49static inline void tick_check_oneshot_broadcast(int cpu) { }
48# endif /* !BROADCAST */ 50# endif /* !BROADCAST */
49 51
50#else /* !ONESHOT */ 52#else /* !ONESHOT */
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index b711ffcb106c..0581c11fe6c6 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -155,7 +155,7 @@ void tick_nohz_update_jiffies(void)
155 touch_softlockup_watchdog(); 155 touch_softlockup_watchdog();
156} 156}
157 157
158void tick_nohz_stop_idle(int cpu) 158static void tick_nohz_stop_idle(int cpu)
159{ 159{
160 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 160 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
161 161
@@ -377,6 +377,32 @@ ktime_t tick_nohz_get_sleep_length(void)
377 return ts->sleep_length; 377 return ts->sleep_length;
378} 378}
379 379
380static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
381{
382 hrtimer_cancel(&ts->sched_timer);
383 ts->sched_timer.expires = ts->idle_tick;
384
385 while (1) {
386 /* Forward the time to expire in the future */
387 hrtimer_forward(&ts->sched_timer, now, tick_period);
388
389 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
390 hrtimer_start(&ts->sched_timer,
391 ts->sched_timer.expires,
392 HRTIMER_MODE_ABS);
393 /* Check, if the timer was already in the past */
394 if (hrtimer_active(&ts->sched_timer))
395 break;
396 } else {
397 if (!tick_program_event(ts->sched_timer.expires, 0))
398 break;
399 }
400 /* Update jiffies and reread time */
401 tick_do_update_jiffies64(now);
402 now = ktime_get();
403 }
404}
405
380/** 406/**
381 * tick_nohz_restart_sched_tick - restart the idle tick from the idle task 407 * tick_nohz_restart_sched_tick - restart the idle tick from the idle task
382 * 408 *
@@ -430,28 +456,7 @@ void tick_nohz_restart_sched_tick(void)
430 */ 456 */
431 ts->tick_stopped = 0; 457 ts->tick_stopped = 0;
432 ts->idle_exittime = now; 458 ts->idle_exittime = now;
433 hrtimer_cancel(&ts->sched_timer); 459 tick_nohz_restart(ts, now);
434 ts->sched_timer.expires = ts->idle_tick;
435
436 while (1) {
437 /* Forward the time to expire in the future */
438 hrtimer_forward(&ts->sched_timer, now, tick_period);
439
440 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
441 hrtimer_start(&ts->sched_timer,
442 ts->sched_timer.expires,
443 HRTIMER_MODE_ABS);
444 /* Check, if the timer was already in the past */
445 if (hrtimer_active(&ts->sched_timer))
446 break;
447 } else {
448 if (!tick_program_event(ts->sched_timer.expires, 0))
449 break;
450 }
451 /* Update jiffies and reread time */
452 tick_do_update_jiffies64(now);
453 now = ktime_get();
454 }
455 local_irq_enable(); 460 local_irq_enable();
456} 461}
457 462
@@ -503,10 +508,6 @@ static void tick_nohz_handler(struct clock_event_device *dev)
503 update_process_times(user_mode(regs)); 508 update_process_times(user_mode(regs));
504 profile_tick(CPU_PROFILING); 509 profile_tick(CPU_PROFILING);
505 510
506 /* Do not restart, when we are in the idle loop */
507 if (ts->tick_stopped)
508 return;
509
510 while (tick_nohz_reprogram(ts, now)) { 511 while (tick_nohz_reprogram(ts, now)) {
511 now = ktime_get(); 512 now = ktime_get();
512 tick_do_update_jiffies64(now); 513 tick_do_update_jiffies64(now);
@@ -552,6 +553,27 @@ static void tick_nohz_switch_to_nohz(void)
552 smp_processor_id()); 553 smp_processor_id());
553} 554}
554 555
556/*
557 * When NOHZ is enabled and the tick is stopped, we need to kick the
558 * tick timer from irq_enter() so that the jiffies update is kept
559 * alive during long running softirqs. That's ugly as hell, but
560 * correctness is key even if we need to fix the offending softirq in
561 * the first place.
562 *
563 * Note, this is different to tick_nohz_restart. We just kick the
564 * timer and do not touch the other magic bits which need to be done
565 * when idle is left.
566 */
567static void tick_nohz_kick_tick(int cpu)
568{
569 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
570
571 if (!ts->tick_stopped)
572 return;
573
574 tick_nohz_restart(ts, ktime_get());
575}
576
555#else 577#else
556 578
557static inline void tick_nohz_switch_to_nohz(void) { } 579static inline void tick_nohz_switch_to_nohz(void) { }
@@ -559,6 +581,19 @@ static inline void tick_nohz_switch_to_nohz(void) { }
559#endif /* NO_HZ */ 581#endif /* NO_HZ */
560 582
561/* 583/*
584 * Called from irq_enter to notify about the possible interruption of idle()
585 */
586void tick_check_idle(int cpu)
587{
588 tick_check_oneshot_broadcast(cpu);
589#ifdef CONFIG_NO_HZ
590 tick_nohz_stop_idle(cpu);
591 tick_nohz_update_jiffies();
592 tick_nohz_kick_tick(cpu);
593#endif
594}
595
596/*
562 * High resolution timer specific code 597 * High resolution timer specific code
563 */ 598 */
564#ifdef CONFIG_HIGH_RES_TIMERS 599#ifdef CONFIG_HIGH_RES_TIMERS
@@ -611,10 +646,6 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
611 profile_tick(CPU_PROFILING); 646 profile_tick(CPU_PROFILING);
612 } 647 }
613 648
614 /* Do not restart, when we are in the idle loop */
615 if (ts->tick_stopped)
616 return HRTIMER_NORESTART;
617
618 hrtimer_forward(timer, now, tick_period); 649 hrtimer_forward(timer, now, tick_period);
619 650
620 return HRTIMER_RESTART; 651 return HRTIMER_RESTART;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e91c29f961c9..e7acfb482a68 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -58,27 +58,26 @@ struct clocksource *clock;
58 58
59#ifdef CONFIG_GENERIC_TIME 59#ifdef CONFIG_GENERIC_TIME
60/** 60/**
61 * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook 61 * clocksource_forward_now - update clock to the current time
62 * 62 *
63 * private function, must hold xtime_lock lock when being 63 * Forward the current clock to update its state since the last call to
64 * called. Returns the number of nanoseconds since the 64 * update_wall_time(). This is useful before significant clock changes,
65 * last call to update_wall_time() (adjusted by NTP scaling) 65 * as it avoids having to deal with this time offset explicitly.
66 */ 66 */
67static inline s64 __get_nsec_offset(void) 67static void clocksource_forward_now(void)
68{ 68{
69 cycle_t cycle_now, cycle_delta; 69 cycle_t cycle_now, cycle_delta;
70 s64 ns_offset; 70 s64 nsec;
71 71
72 /* read clocksource: */
73 cycle_now = clocksource_read(clock); 72 cycle_now = clocksource_read(clock);
74
75 /* calculate the delta since the last update_wall_time: */
76 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 73 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
74 clock->cycle_last = cycle_now;
77 75
78 /* convert to nanoseconds: */ 76 nsec = cyc2ns(clock, cycle_delta);
79 ns_offset = cyc2ns(clock, cycle_delta); 77 timespec_add_ns(&xtime, nsec);
80 78
81 return ns_offset; 79 nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
80 clock->raw_time.tv_nsec += nsec;
82} 81}
83 82
84/** 83/**
@@ -89,6 +88,7 @@ static inline s64 __get_nsec_offset(void)
89 */ 88 */
90void getnstimeofday(struct timespec *ts) 89void getnstimeofday(struct timespec *ts)
91{ 90{
91 cycle_t cycle_now, cycle_delta;
92 unsigned long seq; 92 unsigned long seq;
93 s64 nsecs; 93 s64 nsecs;
94 94
@@ -96,7 +96,15 @@ void getnstimeofday(struct timespec *ts)
96 seq = read_seqbegin(&xtime_lock); 96 seq = read_seqbegin(&xtime_lock);
97 97
98 *ts = xtime; 98 *ts = xtime;
99 nsecs = __get_nsec_offset(); 99
100 /* read clocksource: */
101 cycle_now = clocksource_read(clock);
102
103 /* calculate the delta since the last update_wall_time: */
104 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
105
106 /* convert to nanoseconds: */
107 nsecs = cyc2ns(clock, cycle_delta);
100 108
101 } while (read_seqretry(&xtime_lock, seq)); 109 } while (read_seqretry(&xtime_lock, seq));
102 110
@@ -129,22 +137,22 @@ EXPORT_SYMBOL(do_gettimeofday);
129 */ 137 */
130int do_settimeofday(struct timespec *tv) 138int do_settimeofday(struct timespec *tv)
131{ 139{
140 struct timespec ts_delta;
132 unsigned long flags; 141 unsigned long flags;
133 time_t wtm_sec, sec = tv->tv_sec;
134 long wtm_nsec, nsec = tv->tv_nsec;
135 142
136 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) 143 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
137 return -EINVAL; 144 return -EINVAL;
138 145
139 write_seqlock_irqsave(&xtime_lock, flags); 146 write_seqlock_irqsave(&xtime_lock, flags);
140 147
141 nsec -= __get_nsec_offset(); 148 clocksource_forward_now();
149
150 ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec;
151 ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec;
152 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts_delta);
142 153
143 wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); 154 xtime = *tv;
144 wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
145 155
146 set_normalized_timespec(&xtime, sec, nsec);
147 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
148 update_xtime_cache(0); 156 update_xtime_cache(0);
149 157
150 clock->error = 0; 158 clock->error = 0;
@@ -170,22 +178,19 @@ EXPORT_SYMBOL(do_settimeofday);
170static void change_clocksource(void) 178static void change_clocksource(void)
171{ 179{
172 struct clocksource *new; 180 struct clocksource *new;
173 cycle_t now;
174 u64 nsec;
175 181
176 new = clocksource_get_next(); 182 new = clocksource_get_next();
177 183
178 if (clock == new) 184 if (clock == new)
179 return; 185 return;
180 186
181 new->cycle_last = 0; 187 clocksource_forward_now();
182 now = clocksource_read(new);
183 nsec = __get_nsec_offset();
184 timespec_add_ns(&xtime, nsec);
185 188
186 clock = new; 189 new->raw_time = clock->raw_time;
187 clock->cycle_last = now;
188 190
191 clock = new;
192 clock->cycle_last = 0;
193 clock->cycle_last = clocksource_read(new);
189 clock->error = 0; 194 clock->error = 0;
190 clock->xtime_nsec = 0; 195 clock->xtime_nsec = 0;
191 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); 196 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
@@ -200,11 +205,44 @@ static void change_clocksource(void)
200 */ 205 */
201} 206}
202#else 207#else
208static inline void clocksource_forward_now(void) { }
203static inline void change_clocksource(void) { } 209static inline void change_clocksource(void) { }
204static inline s64 __get_nsec_offset(void) { return 0; }
205#endif 210#endif
206 211
207/** 212/**
213 * getrawmonotonic - Returns the raw monotonic time in a timespec
214 * @ts: pointer to the timespec to be set
215 *
216 * Returns the raw monotonic time (completely un-modified by ntp)
217 */
218void getrawmonotonic(struct timespec *ts)
219{
220 unsigned long seq;
221 s64 nsecs;
222 cycle_t cycle_now, cycle_delta;
223
224 do {
225 seq = read_seqbegin(&xtime_lock);
226
227 /* read clocksource: */
228 cycle_now = clocksource_read(clock);
229
230 /* calculate the delta since the last update_wall_time: */
231 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
232
233 /* convert to nanoseconds: */
234 nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
235
236 *ts = clock->raw_time;
237
238 } while (read_seqretry(&xtime_lock, seq));
239
240 timespec_add_ns(ts, nsecs);
241}
242EXPORT_SYMBOL(getrawmonotonic);
243
244
245/**
208 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres 246 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
209 */ 247 */
210int timekeeping_valid_for_hres(void) 248int timekeeping_valid_for_hres(void)
@@ -265,8 +303,6 @@ void __init timekeeping_init(void)
265static int timekeeping_suspended; 303static int timekeeping_suspended;
266/* time in seconds when suspend began */ 304/* time in seconds when suspend began */
267static unsigned long timekeeping_suspend_time; 305static unsigned long timekeeping_suspend_time;
268/* xtime offset when we went into suspend */
269static s64 timekeeping_suspend_nsecs;
270 306
271/** 307/**
272 * timekeeping_resume - Resumes the generic timekeeping subsystem. 308 * timekeeping_resume - Resumes the generic timekeeping subsystem.
@@ -292,8 +328,6 @@ static int timekeeping_resume(struct sys_device *dev)
292 wall_to_monotonic.tv_sec -= sleep_length; 328 wall_to_monotonic.tv_sec -= sleep_length;
293 total_sleep_time += sleep_length; 329 total_sleep_time += sleep_length;
294 } 330 }
295 /* Make sure that we have the correct xtime reference */
296 timespec_add_ns(&xtime, timekeeping_suspend_nsecs);
297 update_xtime_cache(0); 331 update_xtime_cache(0);
298 /* re-base the last cycle value */ 332 /* re-base the last cycle value */
299 clock->cycle_last = 0; 333 clock->cycle_last = 0;
@@ -319,8 +353,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
319 timekeeping_suspend_time = read_persistent_clock(); 353 timekeeping_suspend_time = read_persistent_clock();
320 354
321 write_seqlock_irqsave(&xtime_lock, flags); 355 write_seqlock_irqsave(&xtime_lock, flags);
322 /* Get the current xtime offset */ 356 clocksource_forward_now();
323 timekeeping_suspend_nsecs = __get_nsec_offset();
324 timekeeping_suspended = 1; 357 timekeeping_suspended = 1;
325 write_sequnlock_irqrestore(&xtime_lock, flags); 358 write_sequnlock_irqrestore(&xtime_lock, flags);
326 359
@@ -454,23 +487,29 @@ void update_wall_time(void)
454#else 487#else
455 offset = clock->cycle_interval; 488 offset = clock->cycle_interval;
456#endif 489#endif
457 clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; 490 clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift;
458 491
459 /* normally this loop will run just once, however in the 492 /* normally this loop will run just once, however in the
460 * case of lost or late ticks, it will accumulate correctly. 493 * case of lost or late ticks, it will accumulate correctly.
461 */ 494 */
462 while (offset >= clock->cycle_interval) { 495 while (offset >= clock->cycle_interval) {
463 /* accumulate one interval */ 496 /* accumulate one interval */
464 clock->xtime_nsec += clock->xtime_interval;
465 clock->cycle_last += clock->cycle_interval;
466 offset -= clock->cycle_interval; 497 offset -= clock->cycle_interval;
498 clock->cycle_last += clock->cycle_interval;
467 499
500 clock->xtime_nsec += clock->xtime_interval;
468 if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { 501 if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
469 clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; 502 clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
470 xtime.tv_sec++; 503 xtime.tv_sec++;
471 second_overflow(); 504 second_overflow();
472 } 505 }
473 506
507 clock->raw_time.tv_nsec += clock->raw_interval;
508 if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) {
509 clock->raw_time.tv_nsec -= NSEC_PER_SEC;
510 clock->raw_time.tv_sec++;
511 }
512
474 /* accumulate error between NTP and clock interval */ 513 /* accumulate error between NTP and clock interval */
475 clock->error += tick_length; 514 clock->error += tick_length;
476 clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift); 515 clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift);
@@ -479,9 +518,12 @@ void update_wall_time(void)
479 /* correct the clock when NTP error is too big */ 518 /* correct the clock when NTP error is too big */
480 clocksource_adjust(offset); 519 clocksource_adjust(offset);
481 520
482 /* store full nanoseconds into xtime */ 521 /* store full nanoseconds into xtime after rounding it up and
483 xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; 522 * add the remainder to the error difference.
523 */
524 xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1;
484 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; 525 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
526 clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift);
485 527
486 update_xtime_cache(cyc2ns(clock, offset)); 528 update_xtime_cache(cyc2ns(clock, offset));
487 529
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index a40e20fd0001..f6426911e35a 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -47,13 +47,14 @@ static void print_name_offset(struct seq_file *m, void *sym)
47} 47}
48 48
49static void 49static void
50print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now) 50print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
51 int idx, u64 now)
51{ 52{
52#ifdef CONFIG_TIMER_STATS 53#ifdef CONFIG_TIMER_STATS
53 char tmp[TASK_COMM_LEN + 1]; 54 char tmp[TASK_COMM_LEN + 1];
54#endif 55#endif
55 SEQ_printf(m, " #%d: ", idx); 56 SEQ_printf(m, " #%d: ", idx);
56 print_name_offset(m, timer); 57 print_name_offset(m, taddr);
57 SEQ_printf(m, ", "); 58 SEQ_printf(m, ", ");
58 print_name_offset(m, timer->function); 59 print_name_offset(m, timer->function);
59 SEQ_printf(m, ", S:%02lx", timer->state); 60 SEQ_printf(m, ", S:%02lx", timer->state);
@@ -99,7 +100,7 @@ next_one:
99 tmp = *timer; 100 tmp = *timer;
100 spin_unlock_irqrestore(&base->cpu_base->lock, flags); 101 spin_unlock_irqrestore(&base->cpu_base->lock, flags);
101 102
102 print_timer(m, &tmp, i, now); 103 print_timer(m, timer, &tmp, i, now);
103 next++; 104 next++;
104 goto next_one; 105 goto next_one;
105 } 106 }
@@ -109,6 +110,7 @@ next_one:
109static void 110static void
110print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) 111print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
111{ 112{
113 SEQ_printf(m, " .base: %p\n", base);
112 SEQ_printf(m, " .index: %d\n", 114 SEQ_printf(m, " .index: %d\n",
113 base->index); 115 base->index);
114 SEQ_printf(m, " .resolution: %Lu nsecs\n", 116 SEQ_printf(m, " .resolution: %Lu nsecs\n",
@@ -183,12 +185,16 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
183 185
184#ifdef CONFIG_GENERIC_CLOCKEVENTS 186#ifdef CONFIG_GENERIC_CLOCKEVENTS
185static void 187static void
186print_tickdevice(struct seq_file *m, struct tick_device *td) 188print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
187{ 189{
188 struct clock_event_device *dev = td->evtdev; 190 struct clock_event_device *dev = td->evtdev;
189 191
190 SEQ_printf(m, "\n"); 192 SEQ_printf(m, "\n");
191 SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); 193 SEQ_printf(m, "Tick Device: mode: %d\n", td->mode);
194 if (cpu < 0)
195 SEQ_printf(m, "Broadcast device\n");
196 else
197 SEQ_printf(m, "Per CPU device: %d\n", cpu);
192 198
193 SEQ_printf(m, "Clock Event Device: "); 199 SEQ_printf(m, "Clock Event Device: ");
194 if (!dev) { 200 if (!dev) {
@@ -222,7 +228,7 @@ static void timer_list_show_tickdevices(struct seq_file *m)
222 int cpu; 228 int cpu;
223 229
224#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST 230#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
225 print_tickdevice(m, tick_get_broadcast_device()); 231 print_tickdevice(m, tick_get_broadcast_device(), -1);
226 SEQ_printf(m, "tick_broadcast_mask: %08lx\n", 232 SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
227 tick_get_broadcast_mask()->bits[0]); 233 tick_get_broadcast_mask()->bits[0]);
228#ifdef CONFIG_TICK_ONESHOT 234#ifdef CONFIG_TICK_ONESHOT
@@ -232,7 +238,7 @@ static void timer_list_show_tickdevices(struct seq_file *m)
232 SEQ_printf(m, "\n"); 238 SEQ_printf(m, "\n");
233#endif 239#endif
234 for_each_online_cpu(cpu) 240 for_each_online_cpu(cpu)
235 print_tickdevice(m, tick_get_device(cpu)); 241 print_tickdevice(m, tick_get_device(cpu), cpu);
236 SEQ_printf(m, "\n"); 242 SEQ_printf(m, "\n");
237} 243}
238#else 244#else
@@ -244,7 +250,7 @@ static int timer_list_show(struct seq_file *m, void *v)
244 u64 now = ktime_to_ns(ktime_get()); 250 u64 now = ktime_to_ns(ktime_get());
245 int cpu; 251 int cpu;
246 252
247 SEQ_printf(m, "Timer List Version: v0.3\n"); 253 SEQ_printf(m, "Timer List Version: v0.4\n");
248 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); 254 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
249 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); 255 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
250 256
diff --git a/kernel/timer.c b/kernel/timer.c
index 510fe69351ca..56becf373c58 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1436,9 +1436,11 @@ static void __cpuinit migrate_timers(int cpu)
1436 BUG_ON(cpu_online(cpu)); 1436 BUG_ON(cpu_online(cpu));
1437 old_base = per_cpu(tvec_bases, cpu); 1437 old_base = per_cpu(tvec_bases, cpu);
1438 new_base = get_cpu_var(tvec_bases); 1438 new_base = get_cpu_var(tvec_bases);
1439 1439 /*
1440 local_irq_disable(); 1440 * The caller is globally serialized and nobody else
1441 spin_lock(&new_base->lock); 1441 * takes two locks at once, deadlock is not possible.
1442 */
1443 spin_lock_irq(&new_base->lock);
1442 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); 1444 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1443 1445
1444 BUG_ON(old_base->running_timer); 1446 BUG_ON(old_base->running_timer);
@@ -1453,8 +1455,7 @@ static void __cpuinit migrate_timers(int cpu)
1453 } 1455 }
1454 1456
1455 spin_unlock(&old_base->lock); 1457 spin_unlock(&old_base->lock);
1456 spin_unlock(&new_base->lock); 1458 spin_unlock_irq(&new_base->lock);
1457 local_irq_enable();
1458 put_cpu_var(tvec_bases); 1459 put_cpu_var(tvec_bases);
1459} 1460}
1460#endif /* CONFIG_HOTPLUG_CPU */ 1461#endif /* CONFIG_HOTPLUG_CPU */