sched: add optional support for CONFIG_HAVE_UNSTABLE_SCHED_CLOCK

This replaces the rq->clock stuff (and possibly cpu_clock()).

 - architectures that have an 'imperfect' hardware clock can set CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
 - the 'jiffy' window might be superfluous when we update tick_gtod before the __update_sched_clock() call in sched_clock_tick()
 - cpu_clock() might be implemented as: sched_clock_cpu(smp_processor_id()) if the accuracy proves good enough
 - how far can TSC drift in a single jiffy when considering the filtering and idle hooks?

[ mingo@elte.hu: various fixes and cleanups ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
author: Peter Zijlstra <a.p.zijlstra@chello.nl> 2008-05-03 12:29:28 -0400
committer: Ingo Molnar <mingo@elte.hu> 2008-05-05 17:56:18 -0400
commit: 3e51f33fcc7f55e6df25d15b55ed10c8b4da84cd (patch)
tree: 3752f9ea8e014ec40e95a1b197b0a3d18e1056a8 /kernel
parent: a5574cf65b5f03ce9ade3918764fe22e5e2371e3 (diff)
5 files changed, 251 insertions, 161 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 188c43223f52..1c9938addb9d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
            rcupdate.o extable.o params.o posix-timers.o \
            kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
            hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
-            notifier.o ksysfs.o pm_qos_params.o
+            notifier.o ksysfs.o pm_qos_params.o sched_clock.o
 obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
diff --git a/kernel/sched.c b/kernel/sched.c
index 9457106b18af..58fb8af15776 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,16 +75,6 @@
 #include <asm/irq_regs.h>
 /*
- * Scheduler clock - returns current time in nanosec units.
- * This is default implementation.
- * Architectures and sub-architectures can override this.
- */
-unsigned long long __attribute__((weak)) sched_clock(void)
-{
-        return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
-}
-/*
 * Convert user-nice values [ -20 ... 0 ... 19 ]
 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
 * and back.
@@ -557,13 +547,7 @@ struct rq {
        unsigned long next_balance;
        struct mm_struct *prev_mm;
-        u64 clock, prev_clock_raw;
+        u64 clock;
-        s64 clock_max_delta;
-        unsigned int clock_warps, clock_overflows, clock_underflows;
-        u64 idle_clock;
-        unsigned int clock_deep_idle_events;
-        u64 tick_timestamp;
        atomic_t nr_iowait;
@@ -628,82 +612,6 @@ static inline int cpu_of(struct rq *rq)
 #endif
 }
-#ifdef CONFIG_NO_HZ
-static inline bool nohz_on(int cpu)
-{
-        return tick_get_tick_sched(cpu)->nohz_mode != NOHZ_MODE_INACTIVE;
-}
-static inline u64 max_skipped_ticks(struct rq *rq)
-{
-        return nohz_on(cpu_of(rq)) ? jiffies - rq->last_tick_seen + 2 : 1;
-}
-static inline void update_last_tick_seen(struct rq *rq)
-{
-        rq->last_tick_seen = jiffies;
-}
-#else
-static inline u64 max_skipped_ticks(struct rq *rq)
-{
-        return 1;
-}
-static inline void update_last_tick_seen(struct rq *rq)
-{
-}
-#endif
-/*
- * Update the per-runqueue clock, as finegrained as the platform can give
- * us, but without assuming monotonicity, etc.:
- */
-static void __update_rq_clock(struct rq *rq)
-{
-        u64 prev_raw = rq->prev_clock_raw;
-        u64 now = sched_clock();
-        s64 delta = now - prev_raw;
-        u64 clock = rq->clock;
-#ifdef CONFIG_SCHED_DEBUG
-        WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
-#endif
-        /*
-         * Protect against sched_clock() occasionally going backwards:
-         */
-        if (unlikely(delta < 0)) {
-                clock++;
-                rq->clock_warps++;
-        } else {
-                /*
-                 * Catch too large forward jumps too:
-                 */
-                u64 max_jump = max_skipped_ticks(rq) * TICK_NSEC;
-                u64 max_time = rq->tick_timestamp + max_jump;
-                if (unlikely(clock + delta > max_time)) {
-                        if (clock < max_time)
-                                clock = max_time;
-                        else
-                                clock++;
-                        rq->clock_overflows++;
-                } else {
-                        if (unlikely(delta > rq->clock_max_delta))
-                                rq->clock_max_delta = delta;
-                        clock += delta;
-                }
-        }
-        rq->prev_clock_raw = now;
-        rq->clock = clock;
-}
-static void update_rq_clock(struct rq *rq)
-{
-        if (likely(smp_processor_id() == cpu_of(rq)))
-                __update_rq_clock(rq);
-}
 /*
 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
 * See detach_destroy_domains: synchronize_sched for details.
@@ -719,6 +627,11 @@ static void update_rq_clock(struct rq *rq)
 #define task_rq(p)              cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)           (cpu_rq(cpu)->curr)
+static inline void update_rq_clock(struct rq *rq)
+{
+        rq->clock = sched_clock_cpu(cpu_of(rq));
+}
 /*
 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
 */
@@ -935,7 +848,6 @@ static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu)
 static unsigned long long __cpu_clock(int cpu)
 {
        unsigned long long now;
-        struct rq *rq;
        /*
         * Only call sched_clock() if the scheduler has already been
@@ -944,9 +856,7 @@ static unsigned long long __cpu_clock(int cpu)
        if (unlikely(!scheduler_running))
                return 0;
-        rq = cpu_rq(cpu);
+        now = sched_clock_cpu(cpu);
-        update_rq_clock(rq);
-        now = rq->clock;
        return now;
 }
@@ -1120,45 +1030,6 @@ static struct rq *this_rq_lock(void)
        return rq;
 }
-/*
- * We are going deep-idle (irqs are disabled):
- */
-void sched_clock_idle_sleep_event(void)
-{
-        struct rq *rq = cpu_rq(smp_processor_id());
-        WARN_ON(!irqs_disabled());
-        spin_lock(&rq->lock);
-        __update_rq_clock(rq);
-        spin_unlock(&rq->lock);
-        rq->clock_deep_idle_events++;
-}
-EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
-/*
- * We just idled delta nanoseconds (called with irqs disabled):
- */
-void sched_clock_idle_wakeup_event(u64 delta_ns)
-{
-        struct rq *rq = cpu_rq(smp_processor_id());
-        u64 now = sched_clock();
-        WARN_ON(!irqs_disabled());
-        rq->idle_clock += delta_ns;
-        /*
-         * Override the previous timestamp and ignore all
-         * sched_clock() deltas that occured while we idled,
-         * and use the PM-provided delta_ns to advance the
-         * rq clock:
-         */
-        spin_lock(&rq->lock);
-        rq->prev_clock_raw = now;
-        rq->clock += delta_ns;
-        spin_unlock(&rq->lock);
-        touch_softlockup_watchdog();
-}
-EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 static void __resched_task(struct task_struct *p, int tif_bit);
 static inline void resched_task(struct task_struct *p)
@@ -1283,7 +1154,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
        WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
        spin_lock(&rq->lock);
-        __update_rq_clock(rq);
+        update_rq_clock(rq);
        rq->curr->sched_class->task_tick(rq, rq->curr, 1);
        spin_unlock(&rq->lock);
@@ -4476,19 +4347,11 @@ void scheduler_tick(void)
        int cpu = smp_processor_id();
        struct rq *rq = cpu_rq(cpu);
        struct task_struct *curr = rq->curr;
-        u64 next_tick = rq->tick_timestamp + TICK_NSEC;
+        sched_clock_tick();
        spin_lock(&rq->lock);
-        __update_rq_clock(rq);
+        update_rq_clock(rq);
-        /*
-         * Let rq->clock advance by at least TICK_NSEC:
-         */
-        if (unlikely(rq->clock < next_tick)) {
-                rq->clock = next_tick;
-                rq->clock_underflows++;
-        }
-        rq->tick_timestamp = rq->clock;
-        update_last_tick_seen(rq);
        update_cpu_load(rq);
        curr->sched_class->task_tick(rq, curr, 0);
        spin_unlock(&rq->lock);
@@ -4642,7 +4505,7 @@ need_resched_nonpreemptible:
         * Do the rq-clock update outside the rq lock:
         */
        local_irq_disable();
-        __update_rq_clock(rq);
+        update_rq_clock(rq);
        spin_lock(&rq->lock);
        clear_tsk_need_resched(prev);
@@ -8226,8 +8089,6 @@ void __init sched_init(void)
                spin_lock_init(&rq->lock);
                lockdep_set_class(&rq->lock, &rq->rq_lock_key);
                rq->nr_running = 0;
-                rq->clock = 1;
-                update_last_tick_seen(rq);
                init_cfs_rq(&rq->cfs, rq);
                init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -8371,6 +8232,7 @@ EXPORT_SYMBOL(__might_sleep);
 static void normalize_task(struct rq *rq, struct task_struct *p)
 {
        int on_rq;
        update_rq_clock(rq);
        on_rq = p->se.on_rq;
        if (on_rq)
@@ -8402,7 +8264,6 @@ void normalize_rt_tasks(void)
                p->se.sleep_start               = 0;
                p->se.block_start               = 0;
 #endif
-                task_rq(p)->clock               = 0;
                if (!rt_task(p)) {
                        /*
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
new file mode 100644
index 000000000000..9c597e37f7de
--- /dev/null
+++ b/kernel/sched_clock.c
@@ -0,0 +1,236 @@
+/*
+ * sched_clock for unstable cpu clocks
+ *
+ *  Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ * Based on code by:
+ *   Ingo Molnar <mingo@redhat.com>
+ *   Guillaume Chazarain <guichaz@gmail.com>
+ *
+ * Create a semi stable clock from a mixture of other events, including:
+ *  - gtod
+ *  - jiffies
+ *  - sched_clock()
+ *  - explicit idle events
+ *
+ * We use gtod as base and the unstable clock deltas. The deltas are filtered,
+ * making it monotonic and keeping it within an expected window.  This window
+ * is set up using jiffies.
+ *
+ * Furthermore, explicit sleep and wakeup hooks allow us to account for time
+ * that is otherwise invisible (TSC gets stopped).
+ *
+ * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
+ * consistent between cpus (never more than 1 jiffy difference).
+ */
+#include <linux/sched.h>
+#include <linux/percpu.h>
+#include <linux/spinlock.h>
+#include <linux/ktime.h>
+#include <linux/module.h>
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+struct sched_clock_data {
+        /*
+         * Raw spinlock - this is a special case: this might be called
+         * from within instrumentation code so we don't want to do any
+         * instrumentation ourselves.
+         */
+        raw_spinlock_t          lock;
+        unsigned long           prev_jiffies;
+        u64                     prev_raw;
+        u64                     tick_raw;
+        u64                     tick_gtod;
+        u64                     clock;
+};
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
+static inline struct sched_clock_data *this_scd(void)
+{
+        return &__get_cpu_var(sched_clock_data);
+}
+static inline struct sched_clock_data *cpu_sdc(int cpu)
+{
+        return &per_cpu(sched_clock_data, cpu);
+}
+void sched_clock_init(void)
+{
+        u64 ktime_now = ktime_to_ns(ktime_get());
+        u64 now = 0;
+        int cpu;
+        for_each_possible_cpu(cpu) {
+                struct sched_clock_data *scd = cpu_sdc(cpu);
+                scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+                scd->prev_jiffies = jiffies;
+                scd->prev_raw = now;
+                scd->tick_raw = now;
+                scd->tick_gtod = ktime_now;
+                scd->clock = ktime_now;
+        }
+}
+/*
+ * update the percpu scd from the raw @now value
+ *
+ *  - filter out backward motion
+ *  - use jiffies to generate a min,max window to clip the raw values
+ */
+static void __update_sched_clock(struct sched_clock_data *scd, u64 now)
+{
+        unsigned long now_jiffies = jiffies;
+        long delta_jiffies = now_jiffies - scd->prev_jiffies;
+        u64 clock = scd->clock;
+        u64 min_clock, max_clock;
+        s64 delta = now - scd->prev_raw;
+        WARN_ON_ONCE(!irqs_disabled());
+        min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC;
+        if (unlikely(delta < 0)) {
+                clock++;
+                goto out;
+        }
+        max_clock = min_clock + TICK_NSEC;
+        if (unlikely(clock + delta > max_clock)) {
+                if (clock < max_clock)
+                        clock = max_clock;
+                else
+                        clock++;
+        } else {
+                clock += delta;
+        }
+ out:
+        if (unlikely(clock < min_clock))
+                clock = min_clock;
+        scd->prev_raw = now;
+        scd->prev_jiffies = now_jiffies;
+        scd->clock = clock;
+}
+static void lock_double_clock(struct sched_clock_data *data1,
+                                struct sched_clock_data *data2)
+{
+        if (data1 < data2) {
+                __raw_spin_lock(&data1->lock);
+                __raw_spin_lock(&data2->lock);
+        } else {
+                __raw_spin_lock(&data2->lock);
+                __raw_spin_lock(&data1->lock);
+        }
+}
+u64 sched_clock_cpu(int cpu)
+{
+        struct sched_clock_data *scd = cpu_sdc(cpu);
+        u64 now, clock;
+        WARN_ON_ONCE(!irqs_disabled());
+        now = sched_clock();
+        if (cpu != raw_smp_processor_id()) {
+                /*
+                 * in order to update a remote cpu's clock based on our
+                 * unstable raw time rebase it against:
+                 *   tick_raw           (offset between raw counters)
+                 *   tick_gtod          (tick offset between cpus)
+                 */
+                struct sched_clock_data *my_scd = this_scd();
+                lock_double_clock(scd, my_scd);
+                now -= my_scd->tick_raw;
+                now += scd->tick_raw;
+                now -= my_scd->tick_gtod;
+                now += scd->tick_gtod;
+                __raw_spin_unlock(&my_scd->lock);
+        } else {
+                __raw_spin_lock(&scd->lock);
+        }
+        __update_sched_clock(scd, now);
+        clock = scd->clock;
+        __raw_spin_unlock(&scd->lock);
+        return clock;
+}
+void sched_clock_tick(void)
+{
+        struct sched_clock_data *scd = this_scd();
+        u64 now, now_gtod;
+        WARN_ON_ONCE(!irqs_disabled());
+        now = sched_clock();
+        now_gtod = ktime_to_ns(ktime_get());
+        __raw_spin_lock(&scd->lock);
+        __update_sched_clock(scd, now);
+        /*
+         * update tick_gtod after __update_sched_clock() because that will
+         * already observe 1 new jiffy; adding a new tick_gtod to that would
+         * increase the clock 2 jiffies.
+         */
+        scd->tick_raw = now;
+        scd->tick_gtod = now_gtod;
+        __raw_spin_unlock(&scd->lock);
+}
+/*
+ * We are going deep-idle (irqs are disabled):
+ */
+void sched_clock_idle_sleep_event(void)
+{
+        sched_clock_cpu(smp_processor_id());
+}
+EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
+/*
+ * We just idled delta nanoseconds (called with irqs disabled):
+ */
+void sched_clock_idle_wakeup_event(u64 delta_ns)
+{
+        struct sched_clock_data *scd = this_scd();
+        u64 now = sched_clock();
+        /*
+         * Override the previous timestamp and ignore all
+         * sched_clock() deltas that occurred while we idled,
+         * and use the PM-provided delta_ns to advance the
+         * rq clock:
+         */
+        __raw_spin_lock(&scd->lock);
+        scd->prev_raw = now;
+        scd->clock += delta_ns;
+        __raw_spin_unlock(&scd->lock);
+        touch_softlockup_watchdog();
+}
+EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
+#endif
+/*
+ * Scheduler clock - returns current time in nanosec units.
+ * This is default implementation.
+ * Architectures and sub-architectures can override this.
+ */
+unsigned long long __attribute__((weak)) sched_clock(void)
+{
+        return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
+}
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 6b4a12558e88..5f06118fbc31 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -204,13 +204,6 @@ static void print_cpu(struct seq_file *m, int cpu)
        PN(next_balance);
        P(curr->pid);
        PN(clock);
-        PN(idle_clock);
-        PN(prev_clock_raw);
-        P(clock_warps);
-        P(clock_overflows);
-        P(clock_underflows);
-        P(clock_deep_idle_events);
-        PN(clock_max_delta);
        P(cpu_load[0]);
        P(cpu_load[1]);
        P(cpu_load[2]);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index d99e01f6929a..c863663d204d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -959,7 +959,7 @@ static void yield_task_fair(struct rq *rq)
                return;
        if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
-                __update_rq_clock(rq);
+                update_rq_clock(rq);
                /*
                 * Update run-time statistics of the 'current'.
                 */
author	Peter Zijlstra <a.p.zijlstra@chello.nl>	2008-05-03 12:29:28 -0400
committer	Ingo Molnar <mingo@elte.hu>	2008-05-05 17:56:18 -0400
commit	3e51f33fcc7f55e6df25d15b55ed10c8b4da84cd (patch)
tree	3752f9ea8e014ec40e95a1b197b0a3d18e1056a8 /kernel
parent	a5574cf65b5f03ce9ade3918764fe22e5e2371e3 (diff)

diff --git a/kernel/Makefile b/kernel/Makefile index 188c43223f52..1c9938addb9d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
9	rcupdate.o extable.o params.o posix-timers.o \	9	rcupdate.o extable.o params.o posix-timers.o \
10	kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \	10	kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11	hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \	11	hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12	notifier.o ksysfs.o pm_qos_params.o	12	notifier.o ksysfs.o pm_qos_params.o sched_clock.o
13		13
14	obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o	14	obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
15	obj-$(CONFIG_STACKTRACE) += stacktrace.o	15	obj-$(CONFIG_STACKTRACE) += stacktrace.o


diff --git a/kernel/sched.c b/kernel/sched.c index 9457106b18af..58fb8af15776 100644 --- a/kernel/sched.c +++ b/kernel/sched.c
@@ -75,16 +75,6 @@
75	#include <asm/irq_regs.h>	75	#include <asm/irq_regs.h>
76		76
77	/*	77	/*
78	* Scheduler clock - returns current time in nanosec units.
79	* This is default implementation.
80	* Architectures and sub-architectures can override this.
81	*/
82	unsigned long long __attribute__((weak)) sched_clock(void)
83	{
84	return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
85	}
86
87	/*
88	* Convert user-nice values [ -20 ... 0 ... 19 ]	78	* Convert user-nice values [ -20 ... 0 ... 19 ]
89	* to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],	79	* to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
90	* and back.	80	* and back.
@@ -557,13 +547,7 @@ struct rq {
557	unsigned long next_balance;	547	unsigned long next_balance;
558	struct mm_struct *prev_mm;	548	struct mm_struct *prev_mm;
559		549
560	u64 clock, prev_clock_raw;	550	u64 clock;
561	s64 clock_max_delta;
562
563	unsigned int clock_warps, clock_overflows, clock_underflows;
564	u64 idle_clock;
565	unsigned int clock_deep_idle_events;
566	u64 tick_timestamp;
567		551
568	atomic_t nr_iowait;	552	atomic_t nr_iowait;
569		553
@@ -628,82 +612,6 @@ static inline int cpu_of(struct rq *rq)
628	#endif	612	#endif
629	}	613	}
630		614
631	#ifdef CONFIG_NO_HZ
632	static inline bool nohz_on(int cpu)
633	{
634	return tick_get_tick_sched(cpu)->nohz_mode != NOHZ_MODE_INACTIVE;
635	}
636
637	static inline u64 max_skipped_ticks(struct rq *rq)
638	{
639	return nohz_on(cpu_of(rq)) ? jiffies - rq->last_tick_seen + 2 : 1;
640	}
641
642	static inline void update_last_tick_seen(struct rq *rq)
643	{
644	rq->last_tick_seen = jiffies;
645	}
646	#else
647	static inline u64 max_skipped_ticks(struct rq *rq)
648	{
649	return 1;
650	}
651
652	static inline void update_last_tick_seen(struct rq *rq)
653	{
654	}
655	#endif
656
657	/*
658	* Update the per-runqueue clock, as finegrained as the platform can give
659	* us, but without assuming monotonicity, etc.:
660	*/
661	static void __update_rq_clock(struct rq *rq)
662	{
663	u64 prev_raw = rq->prev_clock_raw;
664	u64 now = sched_clock();
665	s64 delta = now - prev_raw;
666	u64 clock = rq->clock;
667
668	#ifdef CONFIG_SCHED_DEBUG
669	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
670	#endif
671	/*
672	* Protect against sched_clock() occasionally going backwards:
673	*/
674	if (unlikely(delta < 0)) {
675	clock++;
676	rq->clock_warps++;
677	} else {
678	/*
679	* Catch too large forward jumps too:
680	*/
681	u64 max_jump = max_skipped_ticks(rq) * TICK_NSEC;
682	u64 max_time = rq->tick_timestamp + max_jump;
683
684	if (unlikely(clock + delta > max_time)) {
685	if (clock < max_time)
686	clock = max_time;
687	else
688	clock++;
689	rq->clock_overflows++;
690	} else {
691	if (unlikely(delta > rq->clock_max_delta))
692	rq->clock_max_delta = delta;
693	clock += delta;
694	}
695	}
696
697	rq->prev_clock_raw = now;
698	rq->clock = clock;
699	}
700
701	static void update_rq_clock(struct rq *rq)
702	{
703	if (likely(smp_processor_id() == cpu_of(rq)))
704	__update_rq_clock(rq);
705	}
706
707	/*	615	/*
708	* The domain tree (rq->sd) is protected by RCU's quiescent state transition.	616	* The domain tree (rq->sd) is protected by RCU's quiescent state transition.
709	* See detach_destroy_domains: synchronize_sched for details.	617	* See detach_destroy_domains: synchronize_sched for details.
@@ -719,6 +627,11 @@ static void update_rq_clock(struct rq *rq)
719	#define task_rq(p) cpu_rq(task_cpu(p))	627	#define task_rq(p) cpu_rq(task_cpu(p))
720	#define cpu_curr(cpu) (cpu_rq(cpu)->curr)	628	#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
721		629
		630	static inline void update_rq_clock(struct rq *rq)
		631	{
		632	rq->clock = sched_clock_cpu(cpu_of(rq));
		633	}
		634
722	/*	635	/*
723	* Tunables that become constants when CONFIG_SCHED_DEBUG is off:	636	* Tunables that become constants when CONFIG_SCHED_DEBUG is off:
724	*/	637	*/
@@ -935,7 +848,6 @@ static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu)
935	static unsigned long long __cpu_clock(int cpu)	848	static unsigned long long __cpu_clock(int cpu)
936	{	849	{
937	unsigned long long now;	850	unsigned long long now;
938	struct rq *rq;
939		851
940	/*	852	/*
941	* Only call sched_clock() if the scheduler has already been	853	* Only call sched_clock() if the scheduler has already been
@@ -944,9 +856,7 @@ static unsigned long long __cpu_clock(int cpu)
944	if (unlikely(!scheduler_running))	856	if (unlikely(!scheduler_running))
945	return 0;	857	return 0;
946		858
947	rq = cpu_rq(cpu);	859	now = sched_clock_cpu(cpu);
948	update_rq_clock(rq);
949	now = rq->clock;
950		860
951	return now;	861	return now;
952	}	862	}
@@ -1120,45 +1030,6 @@ static struct rq *this_rq_lock(void)
1120	return rq;	1030	return rq;
1121	}	1031	}
1122		1032
1123	/*
1124	* We are going deep-idle (irqs are disabled):
1125	*/
1126	void sched_clock_idle_sleep_event(void)
1127	{
1128	struct rq *rq = cpu_rq(smp_processor_id());
1129
1130	WARN_ON(!irqs_disabled());
1131	spin_lock(&rq->lock);
1132	__update_rq_clock(rq);
1133	spin_unlock(&rq->lock);
1134	rq->clock_deep_idle_events++;
1135	}
1136	EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
1137
1138	/*
1139	* We just idled delta nanoseconds (called with irqs disabled):
1140	*/
1141	void sched_clock_idle_wakeup_event(u64 delta_ns)
1142	{
1143	struct rq *rq = cpu_rq(smp_processor_id());
1144	u64 now = sched_clock();
1145
1146	WARN_ON(!irqs_disabled());
1147	rq->idle_clock += delta_ns;
1148	/*
1149	* Override the previous timestamp and ignore all
1150	* sched_clock() deltas that occured while we idled,
1151	* and use the PM-provided delta_ns to advance the
1152	* rq clock:
1153	*/
1154	spin_lock(&rq->lock);
1155	rq->prev_clock_raw = now;
1156	rq->clock += delta_ns;
1157	spin_unlock(&rq->lock);
1158	touch_softlockup_watchdog();
1159	}
1160	EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
1161
1162	static void __resched_task(struct task_struct *p, int tif_bit);	1033	static void __resched_task(struct task_struct *p, int tif_bit);
1163		1034
1164	static inline void resched_task(struct task_struct *p)	1035	static inline void resched_task(struct task_struct *p)
@@ -1283,7 +1154,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
1283	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());	1154	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1284		1155
1285	spin_lock(&rq->lock);	1156	spin_lock(&rq->lock);
1286	__update_rq_clock(rq);	1157	update_rq_clock(rq);
1287	rq->curr->sched_class->task_tick(rq, rq->curr, 1);	1158	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1288	spin_unlock(&rq->lock);	1159	spin_unlock(&rq->lock);
1289		1160
@@ -4476,19 +4347,11 @@ void scheduler_tick(void)
4476	int cpu = smp_processor_id();	4347	int cpu = smp_processor_id();
4477	struct rq *rq = cpu_rq(cpu);	4348	struct rq *rq = cpu_rq(cpu);
4478	struct task_struct *curr = rq->curr;	4349	struct task_struct *curr = rq->curr;
4479	u64 next_tick = rq->tick_timestamp + TICK_NSEC;	4350
		4351	sched_clock_tick();
4480		4352
4481	spin_lock(&rq->lock);	4353	spin_lock(&rq->lock);
4482	__update_rq_clock(rq);	4354	update_rq_clock(rq);
4483	/*
4484	* Let rq->clock advance by at least TICK_NSEC:
4485	*/
4486	if (unlikely(rq->clock < next_tick)) {
4487	rq->clock = next_tick;
4488	rq->clock_underflows++;
4489	}
4490	rq->tick_timestamp = rq->clock;
4491	update_last_tick_seen(rq);
4492	update_cpu_load(rq);	4355	update_cpu_load(rq);
4493	curr->sched_class->task_tick(rq, curr, 0);	4356	curr->sched_class->task_tick(rq, curr, 0);
4494	spin_unlock(&rq->lock);	4357	spin_unlock(&rq->lock);
@@ -4642,7 +4505,7 @@ need_resched_nonpreemptible:
4642	* Do the rq-clock update outside the rq lock:	4505	* Do the rq-clock update outside the rq lock:
4643	*/	4506	*/
4644	local_irq_disable();	4507	local_irq_disable();
4645	__update_rq_clock(rq);	4508	update_rq_clock(rq);
4646	spin_lock(&rq->lock);	4509	spin_lock(&rq->lock);
4647	clear_tsk_need_resched(prev);	4510	clear_tsk_need_resched(prev);
4648		4511
@@ -8226,8 +8089,6 @@ void __init sched_init(void)
8226	spin_lock_init(&rq->lock);	8089	spin_lock_init(&rq->lock);
8227	lockdep_set_class(&rq->lock, &rq->rq_lock_key);	8090	lockdep_set_class(&rq->lock, &rq->rq_lock_key);
8228	rq->nr_running = 0;	8091	rq->nr_running = 0;
8229	rq->clock = 1;
8230	update_last_tick_seen(rq);
8231	init_cfs_rq(&rq->cfs, rq);	8092	init_cfs_rq(&rq->cfs, rq);
8232	init_rt_rq(&rq->rt, rq);	8093	init_rt_rq(&rq->rt, rq);
8233	#ifdef CONFIG_FAIR_GROUP_SCHED	8094	#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -8371,6 +8232,7 @@ EXPORT_SYMBOL(__might_sleep);
8371	static void normalize_task(struct rq *rq, struct task_struct *p)	8232	static void normalize_task(struct rq *rq, struct task_struct *p)
8372	{	8233	{
8373	int on_rq;	8234	int on_rq;
		8235
8374	update_rq_clock(rq);	8236	update_rq_clock(rq);
8375	on_rq = p->se.on_rq;	8237	on_rq = p->se.on_rq;
8376	if (on_rq)	8238	if (on_rq)
@@ -8402,7 +8264,6 @@ void normalize_rt_tasks(void)
8402	p->se.sleep_start = 0;	8264	p->se.sleep_start = 0;
8403	p->se.block_start = 0;	8265	p->se.block_start = 0;
8404	#endif	8266	#endif
8405	task_rq(p)->clock = 0;
8406		8267
8407	if (!rt_task(p)) {	8268	if (!rt_task(p)) {
8408	/*	8269	/*


diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c new file mode 100644 index 000000000000..9c597e37f7de --- /dev/null +++ b/kernel/sched_clock.c
@@ -0,0 +1,236 @@
		1	/*
		2	* sched_clock for unstable cpu clocks
		3	*
		4	* Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
		5	*
		6	* Based on code by:
		7	* Ingo Molnar <mingo@redhat.com>
		8	* Guillaume Chazarain <guichaz@gmail.com>
		9	*
		10	* Create a semi stable clock from a mixture of other events, including:
		11	* - gtod
		12	* - jiffies
		13	* - sched_clock()
		14	* - explicit idle events
		15	*
		16	* We use gtod as base and the unstable clock deltas. The deltas are filtered,
		17	* making it monotonic and keeping it within an expected window. This window
		18	* is set up using jiffies.
		19	*
		20	* Furthermore, explicit sleep and wakeup hooks allow us to account for time
		21	* that is otherwise invisible (TSC gets stopped).
		22	*
		23	* The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
		24	* consistent between cpus (never more than 1 jiffy difference).
		25	*/
		26	#include <linux/sched.h>
		27	#include <linux/percpu.h>
		28	#include <linux/spinlock.h>
		29	#include <linux/ktime.h>
		30	#include <linux/module.h>
		31
		32
		33	#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
		34
		35	struct sched_clock_data {
		36	/*
		37	* Raw spinlock - this is a special case: this might be called
		38	* from within instrumentation code so we don't want to do any
		39	* instrumentation ourselves.
		40	*/
		41	raw_spinlock_t lock;
		42
		43	unsigned long prev_jiffies;
		44	u64 prev_raw;
		45	u64 tick_raw;
		46	u64 tick_gtod;
		47	u64 clock;
		48	};
		49
		50	static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
		51
		52	static inline struct sched_clock_data *this_scd(void)
		53	{
		54	return &__get_cpu_var(sched_clock_data);
		55	}
		56
		57	static inline struct sched_clock_data *cpu_sdc(int cpu)
		58	{
		59	return &per_cpu(sched_clock_data, cpu);
		60	}
		61
		62	void sched_clock_init(void)
		63	{
		64	u64 ktime_now = ktime_to_ns(ktime_get());
		65	u64 now = 0;
		66	int cpu;
		67
		68	for_each_possible_cpu(cpu) {
		69	struct sched_clock_data *scd = cpu_sdc(cpu);
		70
		71	scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
		72	scd->prev_jiffies = jiffies;
		73	scd->prev_raw = now;
		74	scd->tick_raw = now;
		75	scd->tick_gtod = ktime_now;
		76	scd->clock = ktime_now;
		77	}
		78	}
		79
		80	/*
		81	* update the percpu scd from the raw @now value
		82	*
		83	* - filter out backward motion
		84	* - use jiffies to generate a min,max window to clip the raw values
		85	*/
		86	static void __update_sched_clock(struct sched_clock_data *scd, u64 now)
		87	{
		88	unsigned long now_jiffies = jiffies;
		89	long delta_jiffies = now_jiffies - scd->prev_jiffies;
		90	u64 clock = scd->clock;
		91	u64 min_clock, max_clock;
		92	s64 delta = now - scd->prev_raw;
		93
		94	WARN_ON_ONCE(!irqs_disabled());
		95	min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC;
		96
		97	if (unlikely(delta < 0)) {
		98	clock++;
		99	goto out;
		100	}
		101
		102	max_clock = min_clock + TICK_NSEC;
		103
		104	if (unlikely(clock + delta > max_clock)) {
		105	if (clock < max_clock)
		106	clock = max_clock;
		107	else
		108	clock++;
		109	} else {
		110	clock += delta;
		111	}
		112
		113	out:
		114	if (unlikely(clock < min_clock))
		115	clock = min_clock;
		116
		117	scd->prev_raw = now;
		118	scd->prev_jiffies = now_jiffies;
		119	scd->clock = clock;
		120	}
		121
		122	static void lock_double_clock(struct sched_clock_data *data1,
		123	struct sched_clock_data *data2)
		124	{
		125	if (data1 < data2) {
		126	__raw_spin_lock(&data1->lock);
		127	__raw_spin_lock(&data2->lock);
		128	} else {
		129	__raw_spin_lock(&data2->lock);
		130	__raw_spin_lock(&data1->lock);
		131	}
		132	}
		133
		134	u64 sched_clock_cpu(int cpu)
		135	{
		136	struct sched_clock_data *scd = cpu_sdc(cpu);
		137	u64 now, clock;
		138
		139	WARN_ON_ONCE(!irqs_disabled());
		140	now = sched_clock();
		141
		142	if (cpu != raw_smp_processor_id()) {
		143	/*
		144	* in order to update a remote cpu's clock based on our
		145	* unstable raw time rebase it against:
		146	* tick_raw (offset between raw counters)
		147	* tick_gtod (tick offset between cpus)
		148	*/
		149	struct sched_clock_data *my_scd = this_scd();
		150
		151	lock_double_clock(scd, my_scd);
		152
		153	now -= my_scd->tick_raw;
		154	now += scd->tick_raw;
		155
		156	now -= my_scd->tick_gtod;
		157	now += scd->tick_gtod;
		158
		159	__raw_spin_unlock(&my_scd->lock);
		160	} else {
		161	__raw_spin_lock(&scd->lock);
		162	}
		163
		164	__update_sched_clock(scd, now);
		165	clock = scd->clock;
		166
		167	__raw_spin_unlock(&scd->lock);
		168
		169	return clock;
		170	}
		171
		172	void sched_clock_tick(void)
		173	{
		174	struct sched_clock_data *scd = this_scd();
		175	u64 now, now_gtod;
		176
		177	WARN_ON_ONCE(!irqs_disabled());
		178
		179	now = sched_clock();
		180	now_gtod = ktime_to_ns(ktime_get());
		181
		182	__raw_spin_lock(&scd->lock);
		183	__update_sched_clock(scd, now);
		184	/*
		185	* update tick_gtod after __update_sched_clock() because that will
		186	* already observe 1 new jiffy; adding a new tick_gtod to that would
		187	* increase the clock 2 jiffies.
		188	*/
		189	scd->tick_raw = now;
		190	scd->tick_gtod = now_gtod;
		191	__raw_spin_unlock(&scd->lock);
		192	}
		193
		194	/*
		195	* We are going deep-idle (irqs are disabled):
		196	*/
		197	void sched_clock_idle_sleep_event(void)
		198	{
		199	sched_clock_cpu(smp_processor_id());
		200	}
		201	EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
		202
		203	/*
		204	* We just idled delta nanoseconds (called with irqs disabled):
		205	*/
		206	void sched_clock_idle_wakeup_event(u64 delta_ns)
		207	{
		208	struct sched_clock_data *scd = this_scd();
		209	u64 now = sched_clock();
		210
		211	/*
		212	* Override the previous timestamp and ignore all
		213	* sched_clock() deltas that occurred while we idled,
		214	* and use the PM-provided delta_ns to advance the
		215	* rq clock:
		216	*/
		217	__raw_spin_lock(&scd->lock);
		218	scd->prev_raw = now;
		219	scd->clock += delta_ns;
		220	__raw_spin_unlock(&scd->lock);
		221
		222	touch_softlockup_watchdog();
		223	}
		224	EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
		225
		226	#endif
		227
		228	/*
		229	* Scheduler clock - returns current time in nanosec units.
		230	* This is default implementation.
		231	* Architectures and sub-architectures can override this.
		232	*/
		233	unsigned long long __attribute__((weak)) sched_clock(void)
		234	{
		235	return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
		236	}


diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 6b4a12558e88..5f06118fbc31 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c
@@ -204,13 +204,6 @@ static void print_cpu(struct seq_file *m, int cpu)
204	PN(next_balance);	204	PN(next_balance);
205	P(curr->pid);	205	P(curr->pid);
206	PN(clock);	206	PN(clock);
207	PN(idle_clock);
208	PN(prev_clock_raw);
209	P(clock_warps);
210	P(clock_overflows);
211	P(clock_underflows);
212	P(clock_deep_idle_events);
213	PN(clock_max_delta);
214	P(cpu_load[0]);	207	P(cpu_load[0]);
215	P(cpu_load[1]);	208	P(cpu_load[1]);
216	P(cpu_load[2]);	209	P(cpu_load[2]);


diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index d99e01f6929a..c863663d204d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c
@@ -959,7 +959,7 @@ static void yield_task_fair(struct rq *rq)
959	return;	959	return;
960		960
961	if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {	961	if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
962	__update_rq_clock(rq);	962	update_rq_clock(rq);
963	/*	963	/*
964	* Update run-time statistics of the 'current'.	964	* Update run-time statistics of the 'current'.
965	*/	965	*/