path: root/arch/x86/xen/time.c
author	Thomas Gleixner <tglx@linutronix.de>	2007-10-11 05:16:51 -0400
committer	Thomas Gleixner <tglx@linutronix.de>	2007-10-11 05:16:51 -0400
commit	9702785a747aa27baf46ff504beab6528f21f2dd (patch)
tree	ab69d6f802f5b680c33999dc089e44982c74595d /arch/x86/xen/time.c
parent	334e621a01f86d5bc25e4f742e1eaae6e2d2a97a (diff)
i386: move xen
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86/xen/time.c')
-rw-r--r--	arch/x86/xen/time.c	593
1 files changed, 593 insertions, 0 deletions
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
new file mode 100644
index 00000000000..dfd6db69ead
--- /dev/null
+++ b/arch/x86/xen/time.c
@@ -0,0 +1,593 @@
/*
 * Xen time implementation.
 *
 * This is implemented in terms of a clocksource driver which uses
 * the hypervisor clock as a nanosecond timebase, and a clockevent
 * driver which uses the hypervisor's timer mechanism.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/kernel_stat.h>

#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/events.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>

#include "xen-ops.h"

#define XEN_SHIFT 22

/* Xen may fire a timer up to this many ns early */
#define TIMER_SLOP	100000
#define NS_PER_TICK	(1000000000LL / HZ)
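/*
 * Worked example (assuming a HZ=250 build, a common configuration for
 * this era): NS_PER_TICK = 1,000,000,000 / 250 = 4,000,000 ns, i.e.
 * one tick every 4 ms, while TIMER_SLOP (100,000 ns = 100 us) is how
 * far ahead of the requested deadline Xen may deliver a timer event.
 */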

static cycle_t xen_clocksource_read(void);

/* These are periodically updated in shared_info, and then copied here. */
struct shadow_time_info {
	u64 tsc_timestamp;	/* TSC at last update of time vals.  */
	u64 system_timestamp;	/* Time, in nanosecs, since boot.    */
	u32 tsc_to_nsec_mul;
	int tsc_shift;
	u32 version;
};

static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);

/* runstate info updated by Xen */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);

/* snapshots of runstate info */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);

/* unused ns of stolen and blocked time */
static DEFINE_PER_CPU(u64, residual_stolen);
static DEFINE_PER_CPU(u64, residual_blocked);

/* return a consistent snapshot of 64-bit time/counter value */
static u64 get64(const u64 *p)
{
	u64 ret;

	if (BITS_PER_LONG < 64) {
		u32 *p32 = (u32 *)p;
		u32 h, l;

		/*
		 * Read high then low, and then make sure high is
		 * still the same; this will only loop if low wraps
		 * and carries into high.
		 * XXX some clean way to make this endian-proof?
		 */
		do {
			h = p32[1];
			barrier();
			l = p32[0];
			barrier();
		} while (p32[1] != h);

		ret = (((u64)h) << 32) | l;
	} else
		ret = *p;

	return ret;
}
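/*
 * Illustrative torn-read scenario the loop above prevents (sketch for
 * a 32-bit guest): if the counter advances from 0x00000000ffffffff to
 * 0x0000000100000000 between the two 32-bit reads, naively combining
 * the halves could yield 0x00000001ffffffff or 0x0000000000000000;
 * re-checking that the high word is unchanged rules both out.
 */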

/*
 * Runstate accounting
 */
static void get_runstate_snapshot(struct vcpu_runstate_info *res)
{
	u64 state_time;
	struct vcpu_runstate_info *state;

	BUG_ON(preemptible());

	state = &__get_cpu_var(runstate);

	/*
	 * The runstate info is always updated by the hypervisor on
	 * the current CPU, so there's no need to use anything
	 * stronger than a compiler barrier when fetching it.
	 */
	do {
		state_time = get64(&state->state_entry_time);
		barrier();
		*res = *state;
		barrier();
	} while (get64(&state->state_entry_time) != state_time);
}

static void setup_runstate_info(int cpu)
{
	struct vcpu_register_runstate_memory_area area;

	area.addr.v = &per_cpu(runstate, cpu);

	if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
			       cpu, &area))
		BUG();
}

static void do_stolen_accounting(void)
{
	struct vcpu_runstate_info state;
	struct vcpu_runstate_info *snap;
	s64 blocked, runnable, offline, stolen;
	cputime_t ticks;

	get_runstate_snapshot(&state);

	WARN_ON(state.state != RUNSTATE_running);

	snap = &__get_cpu_var(runstate_snapshot);

	/* work out how much time the VCPU has not been runn*ing*  */
	blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
	runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
	offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];

	*snap = state;

	/* Add the appropriate number of ticks of stolen time,
	   including any left-overs from last time.  Passing NULL to
	   account_steal_time accounts the time as stolen. */
	stolen = runnable + offline + __get_cpu_var(residual_stolen);

	if (stolen < 0)
		stolen = 0;

	ticks = 0;
	while (stolen >= NS_PER_TICK) {
		ticks++;
		stolen -= NS_PER_TICK;
	}
	__get_cpu_var(residual_stolen) = stolen;
	account_steal_time(NULL, ticks);

	/* Add the appropriate number of ticks of blocked time,
	   including any left-overs from last time.  Passing idle to
	   account_steal_time accounts the time as idle/wait. */
	blocked += __get_cpu_var(residual_blocked);

	if (blocked < 0)
		blocked = 0;

	ticks = 0;
	while (blocked >= NS_PER_TICK) {
		ticks++;
		blocked -= NS_PER_TICK;
	}
	__get_cpu_var(residual_blocked) = blocked;
	account_steal_time(idle_task(smp_processor_id()), ticks);
}
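/*
 * Worked example (assuming HZ=250, so NS_PER_TICK = 4,000,000 ns): if
 * the runstate delta shows 9,500,000 ns of runnable+offline time and
 * residual_stolen already held 1,000,000 ns, then stolen = 10,500,000,
 * two ticks are accounted as stolen, and the remaining 2,500,000 ns
 * are carried in residual_stolen until the next timer interrupt.
 */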

/*
 * Xen sched_clock implementation.  Returns the number of unstolen
 * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
 * states.
 */
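/*
 * Expressed as a formula, the value computed below is
 *   sched_clock = time[RUNSTATE_running] + time[RUNSTATE_blocked]
 *                 + (now - state_entry_time)
 * where the final term credits the time spent in the current runstate
 * since the runstate record was last updated by the hypervisor.
 */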
unsigned long long xen_sched_clock(void)
{
	struct vcpu_runstate_info state;
	cycle_t now;
	u64 ret;
	s64 offset;

	/*
	 * Ideally sched_clock should be called on a per-cpu basis
	 * anyway, so preempt should already be disabled, but that's
	 * not current practice at the moment.
	 */
	preempt_disable();

	now = xen_clocksource_read();

	get_runstate_snapshot(&state);

	WARN_ON(state.state != RUNSTATE_running);

	offset = now - state.state_entry_time;
	if (offset < 0)
		offset = 0;

	ret = state.time[RUNSTATE_blocked] +
		state.time[RUNSTATE_running] +
		offset;

	preempt_enable();

	return ret;
}


/* Get the CPU speed from Xen */
unsigned long xen_cpu_khz(void)
{
	u64 cpu_khz = 1000000ULL << 32;
	const struct vcpu_time_info *info =
		&HYPERVISOR_shared_info->vcpu_info[0].time;

	do_div(cpu_khz, info->tsc_to_system_mul);
	if (info->tsc_shift < 0)
		cpu_khz <<= -info->tsc_shift;
	else
		cpu_khz >>= info->tsc_shift;

	return cpu_khz;
}
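/*
 * Derivation (sketch): Xen converts TSC ticks to nanoseconds roughly
 * as ns = (tsc_ticks * tsc_to_system_mul) >> 32, with tsc_ticks
 * pre-shifted by tsc_shift, so the TSC frequency in kHz is
 * 2^32 * 10^6 / tsc_to_system_mul, corrected for the shift.
 * Hypothetical example: a 2 GHz TSC is 0.5 ns per tick, giving
 * tsc_to_system_mul = 0x80000000 with tsc_shift = 0, and
 * (1000000ULL << 32) / 0x80000000 = 2,000,000 kHz as expected.
 */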

/*
 * Reads a consistent set of time-base values from Xen, into a shadow data
 * area.
 */
static unsigned get_time_values_from_xen(void)
{
	struct vcpu_time_info *src;
	struct shadow_time_info *dst;

	/* src is shared memory with the hypervisor, so we need to
	   make sure we get a consistent snapshot, even in the face of
	   being preempted. */
	src = &__get_cpu_var(xen_vcpu)->time;
	dst = &__get_cpu_var(shadow_time);

	do {
		dst->version = src->version;
		rmb();		/* fetch version before data */
		dst->tsc_timestamp = src->tsc_timestamp;
		dst->system_timestamp = src->system_time;
		dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
		dst->tsc_shift = src->tsc_shift;
		rmb();		/* test version after fetching data */
	} while ((src->version & 1) | (dst->version ^ src->version));

	return dst->version;
}
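/*
 * The exit condition above is a seqlock-style check: an odd
 * src->version means the hypervisor is in the middle of an update,
 * and a version that changed between the first and second read means
 * the copy may be torn; either case forces another pass of the loop.
 */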

/*
 * Scale a 64-bit delta by a 32-bit multiplicative fraction (after
 * shifting), yielding a 64-bit result.
 */
static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
{
	u64 product;
#ifdef __i386__
	u32 tmp1, tmp2;
#endif

	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;

#ifdef __i386__
	__asm__ (
		"mul  %5       ; "
		"mov  %4,%%eax ; "
		"mov  %%edx,%4 ; "
		"mul  %5       ; "
		"xor  %5,%5    ; "
		"add  %4,%%eax ; "
		"adc  %5,%%edx ; "
		: "=A" (product), "=r" (tmp1), "=r" (tmp2)
		: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
#elif __x86_64__
	__asm__ (
		"mul %%rdx ; shrd $32,%%rdx,%%rax"
		: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
#else
#error implement me!
#endif

	return product;
}
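/*
 * In formula form the assembly computes
 *   product = (delta * mul_frac) >> 32
 * after delta has been pre-shifted by 'shift'.  Small hypothetical
 * example: delta = 6, mul_frac = 0x80000000 (0.5 as a 32-bit binary
 * fraction), shift = 0 gives product = 3.
 */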

static u64 get_nsec_offset(struct shadow_time_info *shadow)
{
	u64 now, delta;
	now = native_read_tsc();
	delta = now - shadow->tsc_timestamp;
	return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
}

static cycle_t xen_clocksource_read(void)
{
	struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
	cycle_t ret;
	unsigned version;

	do {
		version = get_time_values_from_xen();
		barrier();
		ret = shadow->system_timestamp + get_nsec_offset(shadow);
		barrier();
	} while (version != __get_cpu_var(xen_vcpu)->time.version);

	put_cpu_var(shadow_time);

	return ret;
}
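/*
 * Putting the pieces together, the clocksource value is
 *   now_ns = shadow.system_timestamp
 *            + scale_delta(tsc_now - shadow.tsc_timestamp,
 *                          shadow.tsc_to_nsec_mul, shadow.tsc_shift),
 * i.e. the hypervisor's system time at its last update plus the TSC
 * delta since then, converted to nanoseconds.
 */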

static void xen_read_wallclock(struct timespec *ts)
{
	const struct shared_info *s = HYPERVISOR_shared_info;
	u32 version;
	u64 delta;
	struct timespec now;

	/* get wallclock at system boot */
	do {
		version = s->wc_version;
		rmb();		/* fetch version before time */
		now.tv_sec = s->wc_sec;
		now.tv_nsec = s->wc_nsec;
		rmb();		/* fetch time before checking version */
	} while ((s->wc_version & 1) | (version ^ s->wc_version));

	delta = xen_clocksource_read();	/* time since system boot */
	delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;

	now.tv_nsec = do_div(delta, NSEC_PER_SEC);
	now.tv_sec = delta;

	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}
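/*
 * Sketch of the arithmetic with made-up numbers: wc_sec/wc_nsec hold
 * the wallclock time at which system time was zero, so a boot
 * wallclock of 1,000,000,000 s plus 90 s of uptime from
 * xen_clocksource_read() gives delta = 1,000,000,090 * 10^9 ns, which
 * do_div() splits back into tv_sec = 1,000,000,090 and tv_nsec = 0.
 */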

unsigned long xen_get_wallclock(void)
{
	struct timespec ts;

	xen_read_wallclock(&ts);

	return ts.tv_sec;
}

int xen_set_wallclock(unsigned long now)
{
	/* do nothing for domU */
	return -1;
}

static struct clocksource xen_clocksource __read_mostly = {
	.name = "xen",
	.rating = 400,
	.read = xen_clocksource_read,
	.mask = ~0,
	.mult = 1<<XEN_SHIFT,		/* time directly in nanoseconds */
	.shift = XEN_SHIFT,
	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
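/*
 * With .mult = 1 << XEN_SHIFT and .shift = XEN_SHIFT, the generic
 * timekeeping conversion (cycles * mult) >> shift is the identity,
 * which is why this clocksource can hand back nanoseconds directly.
 * The rating of 400 makes it preferred over the raw TSC clocksource
 * when both are registered.
 */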

/*
  Xen clockevent implementation

  Xen has two clockevent implementations:

  The old timer_op one works with all released versions of Xen prior
  to version 3.0.4.  This version of the hypervisor provides a
  single-shot timer with nanosecond resolution.  However, a 100Hz
  tick, which shares the same event channel, is delivered while the
  vcpu is running.  We don't care about or use this tick, but it will
  cause the core time code to think the timer fired too soon, and
  will end up resetting it each time.  It could be filtered, but
  doing so has complications when the ktime clocksource is not yet
  the xen clocksource (ie, at boot time).

  The new vcpu_op-based timer interface allows the tick timer period
  to be changed or turned off.  The tick timer is not useful as a
  periodic timer because events are only delivered to running vcpus.
  The one-shot timer can report when a timeout is in the past, so
  set_next_event is capable of returning -ETIME when appropriate.
  This interface is used when available.
*/


/*
  Get a hypervisor absolute time.  In theory we could maintain an
  offset between the kernel's time and the hypervisor's time, and
  apply that to a kernel's absolute timeout.  Unfortunately the
  hypervisor and kernel times can drift even if the kernel is using
  the Xen clocksource, because ntp can warp the kernel's clocksource.
*/
static s64 get_abs_timeout(unsigned long delta)
{
	return xen_clocksource_read() + delta;
}

static void xen_timerop_set_mode(enum clock_event_mode mode,
				 struct clock_event_device *evt)
{
	switch (mode) {
	case CLOCK_EVT_MODE_PERIODIC:
		/* unsupported */
		WARN_ON(1);
		break;

	case CLOCK_EVT_MODE_ONESHOT:
	case CLOCK_EVT_MODE_RESUME:
		break;

	case CLOCK_EVT_MODE_UNUSED:
	case CLOCK_EVT_MODE_SHUTDOWN:
		HYPERVISOR_set_timer_op(0);	/* cancel timeout */
		break;
	}
}

static int xen_timerop_set_next_event(unsigned long delta,
				      struct clock_event_device *evt)
{
	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);

	if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
		BUG();

	/* We may have missed the deadline, but there's no real way of
	   knowing for sure.  If the event was in the past, then we'll
	   get an immediate interrupt. */

	return 0;
}

static const struct clock_event_device xen_timerop_clockevent = {
	.name = "xen",
	.features = CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns = 0xffffffff,
	.min_delta_ns = TIMER_SLOP,

	.mult = 1,
	.shift = 0,
	.rating = 500,

	.set_mode = xen_timerop_set_mode,
	.set_next_event = xen_timerop_set_next_event,
};
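/*
 * Because .mult = 1 and .shift = 0, the clockevent core's nanosecond
 * conversion is the identity and the 'delta' handed to set_next_event
 * is already in nanoseconds; max_delta_ns = 0xffffffff therefore caps
 * a programmed timeout at roughly 4.29 seconds, and min_delta_ns at
 * TIMER_SLOP (100 us).
 */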



static void xen_vcpuop_set_mode(enum clock_event_mode mode,
				struct clock_event_device *evt)
{
	int cpu = smp_processor_id();

	switch (mode) {
	case CLOCK_EVT_MODE_PERIODIC:
		WARN_ON(1);	/* unsupported */
		break;

	case CLOCK_EVT_MODE_ONESHOT:
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			BUG();
		break;

	case CLOCK_EVT_MODE_UNUSED:
	case CLOCK_EVT_MODE_SHUTDOWN:
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
		    HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
			BUG();
		break;
	case CLOCK_EVT_MODE_RESUME:
		break;
	}
}

static int xen_vcpuop_set_next_event(unsigned long delta,
				     struct clock_event_device *evt)
{
	int cpu = smp_processor_id();
	struct vcpu_set_singleshot_timer single;
	int ret;

	WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);

	single.timeout_abs_ns = get_abs_timeout(delta);
	single.flags = VCPU_SSHOTTMR_future;

	ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);

	BUG_ON(ret != 0 && ret != -ETIME);

	return ret;
}

static const struct clock_event_device xen_vcpuop_clockevent = {
	.name = "xen",
	.features = CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns = 0xffffffff,
	.min_delta_ns = TIMER_SLOP,

	.mult = 1,
	.shift = 0,
	.rating = 500,

	.set_mode = xen_vcpuop_set_mode,
	.set_next_event = xen_vcpuop_set_next_event,
};

static const struct clock_event_device *xen_clockevent =
	&xen_timerop_clockevent;
static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);

static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
{
	struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
	irqreturn_t ret;

	ret = IRQ_NONE;
	if (evt->event_handler) {
		evt->event_handler(evt);
		ret = IRQ_HANDLED;
	}

	do_stolen_accounting();

	return ret;
}

void xen_setup_timer(int cpu)
{
	const char *name;
	struct clock_event_device *evt;
	int irq;

	printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);

	name = kasprintf(GFP_KERNEL, "timer%d", cpu);
	if (!name)
		name = "<timer kasprintf failed>";

	irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
				      IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
				      name, NULL);

	evt = &per_cpu(xen_clock_events, cpu);
	memcpy(evt, xen_clockevent, sizeof(*evt));

	evt->cpumask = cpumask_of_cpu(cpu);
	evt->irq = irq;

	setup_runstate_info(cpu);
}

void xen_setup_cpu_clockevents(void)
{
	BUG_ON(preemptible());

	clockevents_register_device(&__get_cpu_var(xen_clock_events));
}

__init void xen_time_init(void)
{
	int cpu = smp_processor_id();

	get_time_values_from_xen();

	clocksource_register(&xen_clocksource);

	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
		/* Successfully turned off 100Hz tick, so we have the
		   vcpuop-based timer interface */
		printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
		xen_clockevent = &xen_vcpuop_clockevent;
	}

	/* Set initial system time with full resolution */
	xen_read_wallclock(&xtime);
	set_normalized_timespec(&wall_to_monotonic,
				-xtime.tv_sec, -xtime.tv_nsec);

	tsc_disable = 0;

	xen_setup_timer(cpu);
	xen_setup_cpu_clockevents();
}