author	Zachary Amsden <zach@vmware.com>	2007-02-13 07:26:21 -0500
committer	Andi Kleen <andi@basil.nowhere.org>	2007-02-13 07:26:21 -0500
commit	bbab4f3bb7f528d2b8ccb5de9ae5f6ff3fb29684 (patch)
tree	141d035b9d79711e6679fadc31c9583f908dfedb
parent	7ce0bcfd1667736f1293cff845139bbee53186de (diff)
[PATCH] i386: vMI timer patches
VMI timer code.  It works by taking over the local APIC clock when the
APIC is configured, which requires a couple of hooks into the APIC code.
The backend timer code could be commonized into the timer
infrastructure, but there are some pieces missing (stolen time, in
particular), and the exact semantics of when to do accounting for
NO_IDLE_HZ need to be shared between different hypervisors as well.  So
for now, the VMI timer is a separate module.

[Adrian Bunk: cleanups]

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Cc: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
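For readers unfamiliar with the paravirt_ops indirection this patch extends, here is a minimal, standalone sketch (illustrative only, not code from the patch; pv_clock_ops, native_boot_clock and vmi_boot_alarm are hypothetical stand-ins) of how routing clock setup through a function-pointer table lets a hypervisor backend displace the native APIC timer setup:

/* Sketch: paravirt-style clock hook dispatch.  All names here are
 * invented stand-ins for the kernel's paravirt_ops plumbing. */
#include <stdio.h>

struct pv_clock_ops {
	void (*setup_boot_clock)(void);
};

static void native_boot_clock(void) { printf("program local APIC timer\n"); }
static void vmi_boot_alarm(void)    { printf("program VMI periodic alarm\n"); }

/* Defaults to native; a detected hypervisor ROM overwrites the slot,
 * much as activate_vmi() below rewires paravirt_ops.setup_boot_clock. */
static struct pv_clock_ops pv_clock = { native_boot_clock };

static void setup_boot_clock(void) { pv_clock.setup_boot_clock(); }

int main(void)
{
	setup_boot_clock();                         /* native path */
	pv_clock.setup_boot_clock = vmi_boot_alarm;
	setup_boot_clock();                         /* paravirtualized path */
	return 0;
}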
-rw-r--r--	arch/i386/Kconfig	9
-rw-r--r--	arch/i386/kernel/Makefile	2
-rw-r--r--	arch/i386/kernel/apic.c	2
-rw-r--r--	arch/i386/kernel/entry.S	5
-rw-r--r--	arch/i386/kernel/paravirt.c	2
-rw-r--r--	arch/i386/kernel/smpboot.c	4
-rw-r--r--	arch/i386/kernel/time.c	4
-rw-r--r--	arch/i386/kernel/tsc.c	4
-rw-r--r--	arch/i386/kernel/vmi.c	45
-rw-r--r--	arch/i386/kernel/vmitime.c	495
-rw-r--r--	include/asm-i386/apic.h	2
-rw-r--r--	include/asm-i386/paravirt.h	12
-rw-r--r--	include/asm-i386/time.h	1
-rw-r--r--	include/asm-i386/timer.h	2
-rw-r--r--	include/asm-i386/vmi_time.h	103
15 files changed, 687 insertions, 5 deletions
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index a3b3f6ee3642..595fb771366e 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -1272,3 +1272,12 @@ config X86_TRAMPOLINE
 config KTIME_SCALAR
 	bool
 	default y
+
+config NO_IDLE_HZ
+	bool
+	depends on PARAVIRT
+	default y
+	help
+	  Switches the regular HZ timer off when the system is going idle.
+	  This helps a hypervisor detect that the Linux system is idle,
+	  reducing the overhead of idle systems.
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
index 9cfb58911f14..97f1e961d684 100644
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -40,7 +40,7 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
 obj-$(CONFIG_HPET_TIMER)	+= hpet.o
 obj-$(CONFIG_K8_NB)		+= k8.o
 
-obj-$(CONFIG_VMI)		+= vmi.o
+obj-$(CONFIG_VMI)		+= vmi.o vmitime.o
 
 # Make sure this is linked after any other paravirt_ops structs: see head.S
 obj-$(CONFIG_PARAVIRT)		+= paravirt.o
diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
index 776d9be26af9..629c5ed94260 100644
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -1395,7 +1395,7 @@ int __init APIC_init_uniprocessor (void)
 	if (!skip_ioapic_setup && nr_ioapics)
 		setup_IO_APIC();
 #endif
-	setup_boot_APIC_clock();
+	setup_boot_clock();
 
 	return 0;
 }
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 8c6a22a42d2e..d4b4ffc9eacb 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -626,6 +626,11 @@ ENTRY(name) \
 /* The include is where all of the SMP etc. interrupts come from */
 #include "entry_arch.h"
 
+/* This alternate entry is needed because we hijack the apic LVTT */
+#if defined(CONFIG_VMI) && defined(CONFIG_X86_LOCAL_APIC)
+BUILD_INTERRUPT(apic_vmi_timer_interrupt,LOCAL_TIMER_VECTOR)
+#endif
+
 KPROBE_ENTRY(page_fault)
 	RING0_EC_FRAME
 	pushl $do_page_fault
diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c
index 5bf81059a7e6..2003733310dc 100644
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -544,6 +544,8 @@ struct paravirt_ops paravirt_ops = {
 	.apic_write = native_apic_write,
 	.apic_write_atomic = native_apic_write_atomic,
 	.apic_read = native_apic_read,
+	.setup_boot_clock = setup_boot_APIC_clock,
+	.setup_secondary_clock = setup_secondary_APIC_clock,
 #endif
 	.set_lazy_mode = (void *)native_nop,
 
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 42502d820e4f..5a00b07e7194 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -554,7 +554,7 @@ static void __cpuinit start_secondary(void *unused)
 	smp_callin();
 	while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
 		rep_nop();
-	setup_secondary_APIC_clock();
+	setup_secondary_clock();
 	if (nmi_watchdog == NMI_IO_APIC) {
 		disable_8259A_irq(0);
 		enable_NMI_through_LVT0(NULL);
@@ -1331,7 +1331,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
 
 	smpboot_setup_io_apic();
 
-	setup_boot_APIC_clock();
+	setup_boot_clock();
 
 	/*
 	 * Synchronize the TSC with the AP
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index c505b16c0990..9603ccaba997 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -232,6 +232,7 @@ EXPORT_SYMBOL(get_cmos_time);
 static void sync_cmos_clock(unsigned long dummy);
 
 static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
+int no_sync_cmos_clock;
 
 static void sync_cmos_clock(unsigned long dummy)
 {
@@ -275,7 +276,8 @@ static void sync_cmos_clock(unsigned long dummy)
 
 void notify_arch_cmos_timer(void)
 {
-	mod_timer(&sync_cmos_timer, jiffies + 1);
+	if (!no_sync_cmos_clock)
+		mod_timer(&sync_cmos_timer, jiffies + 1);
 }
 
 static long clock_cmos_diff;
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index 2cfc7b09b925..12fef14995a5 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -23,6 +23,7 @@
  * an extra value to store the TSC freq
  */
 unsigned int tsc_khz;
+unsigned long long (*custom_sched_clock)(void);
 
 int tsc_disable;
 
@@ -107,6 +108,9 @@ unsigned long long sched_clock(void)
 {
 	unsigned long long this_offset;
 
+	if (unlikely(custom_sched_clock))
+		return (*custom_sched_clock)();
+
 	/*
 	 * in the NUMA case we dont use the TSC as they are not
 	 * synchronized across all CPUs.
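The custom_sched_clock hook above is a plain function pointer consulted before the TSC path. A minimal, standalone model of that dispatch (tsc_ns() and its return values are invented stand-ins; only the pointer-check shape mirrors the hunk):

/* Standalone model of the sched_clock() override added above. */
#include <stdio.h>

static unsigned long long (*custom_sched_clock)(void);

static unsigned long long tsc_ns(void)
{
	return 123456789ULL;            /* pretend: TSC converted to ns */
}

static unsigned long long vmi_sched_clock(void)
{
	return 42ULL;                   /* pretend: available-cycle time */
}

static unsigned long long sched_clock(void)
{
	if (custom_sched_clock)         /* unlikely() in the kernel */
		return custom_sched_clock();
	return tsc_ns();
}

int main(void)
{
	printf("native: %llu\n", sched_clock());
	custom_sched_clock = vmi_sched_clock;   /* what activate_vmi() does */
	printf("hooked: %llu\n", sched_clock());
	return 0;
}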
diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c
index a94d64b10f75..bb5a7abf949c 100644
--- a/arch/i386/kernel/vmi.c
+++ b/arch/i386/kernel/vmi.c
@@ -34,6 +34,7 @@
 #include <asm/apic.h>
 #include <asm/processor.h>
 #include <asm/timer.h>
+#include <asm/vmi_time.h>
 
 /* Convenient for calling VMI functions indirectly in the ROM */
 typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
@@ -67,6 +68,7 @@ struct {
 	void (*set_linear_mapping)(int, u32, u32, u32);
 	void (*flush_tlb)(int);
 	void (*set_initial_ap_state)(int, int);
+	void (*halt)(void);
 } vmi_ops;
 
 /* XXX move this to alternative.h */
@@ -252,6 +254,19 @@ static void vmi_nop(void)
 {
 }
 
+/* For NO_IDLE_HZ, we stop the clock when halting the kernel */
+#ifdef CONFIG_NO_IDLE_HZ
+static fastcall void vmi_safe_halt(void)
+{
+	int idle = vmi_stop_hz_timer();
+	vmi_ops.halt();
+	if (idle) {
+		local_irq_disable();
+		vmi_account_time_restart_hz_timer();
+		local_irq_enable();
+	}
+}
+#endif
 
 #ifdef CONFIG_DEBUG_PAGE_TYPE
 
@@ -727,7 +742,12 @@ static inline int __init activate_vmi(void)
 		     (char *)paravirt_ops.save_fl);
 	patch_offset(&irq_save_disable_callout[IRQ_PATCH_DISABLE],
 		     (char *)paravirt_ops.irq_disable);
+#ifndef CONFIG_NO_IDLE_HZ
 	para_fill(safe_halt, Halt);
+#else
+	vmi_ops.halt = vmi_get_function(VMI_CALL_Halt);
+	paravirt_ops.safe_halt = vmi_safe_halt;
+#endif
 	para_fill(wbinvd, WBINVD);
 	/* paravirt_ops.read_msr = vmi_rdmsr */
 	/* paravirt_ops.write_msr = vmi_wrmsr */
@@ -838,6 +858,31 @@ static inline int __init activate_vmi(void)
 #endif
 
 	/*
+	 * Check for VMI timer functionality by probing for a cycle frequency method
+	 */
+	reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_GetCycleFrequency);
+	if (rel->type != VMI_RELOCATION_NONE) {
+		vmi_timer_ops.get_cycle_frequency = (void *)rel->eip;
+		vmi_timer_ops.get_cycle_counter =
+			vmi_get_function(VMI_CALL_GetCycleCounter);
+		vmi_timer_ops.get_wallclock =
+			vmi_get_function(VMI_CALL_GetWallclockTime);
+		vmi_timer_ops.wallclock_updated =
+			vmi_get_function(VMI_CALL_WallclockUpdated);
+		vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
+		vmi_timer_ops.cancel_alarm =
+			vmi_get_function(VMI_CALL_CancelAlarm);
+		paravirt_ops.time_init = vmi_time_init;
+		paravirt_ops.get_wallclock = vmi_get_wallclock;
+		paravirt_ops.set_wallclock = vmi_set_wallclock;
+#ifdef CONFIG_X86_LOCAL_APIC
+		paravirt_ops.setup_boot_clock = vmi_timer_setup_boot_alarm;
+		paravirt_ops.setup_secondary_clock = vmi_timer_setup_secondary_alarm;
+#endif
+		custom_sched_clock = vmi_sched_clock;
+	}
+
+	/*
 	 * Alternative instruction rewriting doesn't happen soon enough
 	 * to convert VMI_IRET to a call instead of a jump; so we have
 	 * to do this before IRQs get reenabled.  Fortunately, it is
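The probing idiom in the hunk above gates the whole timer group on a single relocation: if the ROM answers for GetCycleFrequency, the remaining timer calls are assumed present. A compilable sketch of that shape (the types and probe_rom() are hypothetical stand-ins for the VMI relocation machinery, and probe_rom() here always reports "absent" so nothing is installed when run):

/* Sketch of relocation-gated feature install. */
#include <stdio.h>

enum { VMI_RELOCATION_NONE, VMI_RELOCATION_CALL_REL };

struct reloc_info {
	void *eip;
	int type;
};

static struct reloc_info probe_rom(int call_index)
{
	struct reloc_info rel = { NULL, VMI_RELOCATION_NONE };
	(void)call_index;               /* a real ROM lookup would use this */
	return rel;
}

int main(void)
{
	struct reloc_info rel = probe_rom(66 /* VMI_CALL_GetCycleFrequency */);

	if (rel.type != VMI_RELOCATION_NONE)
		printf("install vmi_timer_ops, redirect paravirt clock hooks\n");
	else
		printf("no timer support in ROM; keep native timekeeping\n");
	return 0;
}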
diff --git a/arch/i386/kernel/vmitime.c b/arch/i386/kernel/vmitime.c
new file mode 100644
index 000000000000..7c3033dbe5f5
--- /dev/null
+++ b/arch/i386/kernel/vmitime.c
@@ -0,0 +1,495 @@
+/*
+ * VMI paravirtual timer support routines.
+ *
+ * Copyright (C) 2005, VMware, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to dhecht@vmware.com
+ *
+ */
+
+/*
+ * Portions of this code from arch/i386/kernel/timers/timer_tsc.c.
+ * Portions of the CONFIG_NO_IDLE_HZ code from arch/s390/kernel/time.c.
+ * See comments there for proper credits.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/jiffies.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+#include <linux/rcupdate.h>
+#include <linux/clocksource.h>
+
+#include <asm/timer.h>
+#include <asm/io.h>
+#include <asm/apic.h>
+#include <asm/div64.h>
+#include <asm/timer.h>
+#include <asm/desc.h>
+
+#include <asm/vmi.h>
+#include <asm/vmi_time.h>
+
+#include <mach_timer.h>
+#include <io_ports.h>
+
+#ifdef CONFIG_X86_LOCAL_APIC
+#define VMI_ALARM_WIRING VMI_ALARM_WIRED_LVTT
+#else
+#define VMI_ALARM_WIRING VMI_ALARM_WIRED_IRQ0
+#endif
+
+/* Cached VMI operations */
+struct vmi_timer_ops vmi_timer_ops;
+
+#ifdef CONFIG_NO_IDLE_HZ
+
+/* /proc/sys/kernel/hz_timer state. */
+int sysctl_hz_timer;
+
+/* Some stats */
+static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_irqs);
+static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_jiffies);
+static DEFINE_PER_CPU(unsigned long, idle_start_jiffies);
+
+#endif /* CONFIG_NO_IDLE_HZ */
+
+/* Number of alarms per second.  By default this is CONFIG_VMI_ALARM_HZ. */
+static int alarm_hz = CONFIG_VMI_ALARM_HZ;
+
+/* Cache of the value get_cycle_frequency / HZ. */
+static signed long long cycles_per_jiffy;
+
+/* Cache of the value get_cycle_frequency / alarm_hz. */
+static signed long long cycles_per_alarm;
+
+/* The number of cycles accounted for by the 'jiffies'/'xtime' count.
+ * Protected by xtime_lock. */
+static unsigned long long real_cycles_accounted_system;
+
+/* The number of cycles accounted for by update_process_times(), per cpu. */
+static DEFINE_PER_CPU(unsigned long long, process_times_cycles_accounted_cpu);
+
+/* The number of stolen cycles accounted, per cpu. */
+static DEFINE_PER_CPU(unsigned long long, stolen_cycles_accounted_cpu);
+
+/* Clock source. */
+static cycle_t read_real_cycles(void)
+{
+	return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
+}
+
+static cycle_t read_available_cycles(void)
+{
+	return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE);
+}
+
+#if 0
+static cycle_t read_stolen_cycles(void)
+{
+	return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_STOLEN);
+}
+#endif /* 0 */
+
+static struct clocksource clocksource_vmi = {
+	.name			= "vmi-timer",
+	.rating			= 450,
+	.read			= read_real_cycles,
+	.mask			= CLOCKSOURCE_MASK(64),
+	.mult			= 0, /* to be set */
+	.shift			= 22,
+	.is_continuous		= 1,
+};
+
+
+/* Timer interrupt handler. */
+static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id);
+
+static struct irqaction vmi_timer_irq = {
+	vmi_timer_interrupt,
+	SA_INTERRUPT,
+	CPU_MASK_NONE,
+	"VMI-alarm",
+	NULL,
+	NULL
+};
+
+/* Alarm rate */
+static int __init vmi_timer_alarm_rate_setup(char* str)
+{
+	int alarm_rate;
+	if (get_option(&str, &alarm_rate) == 1 && alarm_rate > 0) {
+		alarm_hz = alarm_rate;
+		printk(KERN_WARNING "VMI timer alarm HZ set to %d\n", alarm_hz);
+	}
+	return 1;
+}
+__setup("vmi_timer_alarm_hz=", vmi_timer_alarm_rate_setup);
+
+
+/* Initialization */
+static void vmi_get_wallclock_ts(struct timespec *ts)
+{
+	unsigned long long wallclock;
+	wallclock = vmi_timer_ops.get_wallclock(); // nsec units
+	ts->tv_nsec = do_div(wallclock, 1000000000);
+	ts->tv_sec = wallclock;
+}
+
+static void update_xtime_from_wallclock(void)
+{
+	struct timespec ts;
+	vmi_get_wallclock_ts(&ts);
+	do_settimeofday(&ts);
+}
+
+unsigned long vmi_get_wallclock(void)
+{
+	struct timespec ts;
+	vmi_get_wallclock_ts(&ts);
+	return ts.tv_sec;
+}
+
+int vmi_set_wallclock(unsigned long now)
+{
+	return -1;
+}
+
+unsigned long long vmi_sched_clock(void)
+{
+	return read_available_cycles();
+}
+
+void __init vmi_time_init(void)
+{
+	unsigned long long cycles_per_sec, cycles_per_msec;
+
+	setup_irq(0, &vmi_timer_irq);
+#ifdef CONFIG_X86_LOCAL_APIC
+	set_intr_gate(LOCAL_TIMER_VECTOR, apic_vmi_timer_interrupt);
+#endif
+
+	no_sync_cmos_clock = 1;
+
+	vmi_get_wallclock_ts(&xtime);
+	set_normalized_timespec(&wall_to_monotonic,
+		-xtime.tv_sec, -xtime.tv_nsec);
+
+	real_cycles_accounted_system = read_real_cycles();
+	update_xtime_from_wallclock();
+	per_cpu(process_times_cycles_accounted_cpu, 0) = read_available_cycles();
+
+	cycles_per_sec = vmi_timer_ops.get_cycle_frequency();
+
+	cycles_per_jiffy = cycles_per_sec;
+	(void)do_div(cycles_per_jiffy, HZ);
+	cycles_per_alarm = cycles_per_sec;
+	(void)do_div(cycles_per_alarm, alarm_hz);
+	cycles_per_msec = cycles_per_sec;
+	(void)do_div(cycles_per_msec, 1000);
+	cpu_khz = cycles_per_msec;
+
+	printk(KERN_WARNING "VMI timer cycles/sec = %llu ; cycles/jiffy = %llu ;"
+	       "cycles/alarm = %llu\n", cycles_per_sec, cycles_per_jiffy,
+	       cycles_per_alarm);
+
+	clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec,
+						    clocksource_vmi.shift);
+	if (clocksource_register(&clocksource_vmi))
+		printk(KERN_WARNING "Error registering VMITIME clocksource.");
+
+	/* Disable PIT. */
+	outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
+
+	/* Schedule the alarm.  Do this in phase with process_times_cycles_accounted_cpu
+	 * to reduce the latency of calling update_process_times(). */
+	vmi_timer_ops.set_alarm(
+		      VMI_ALARM_WIRED_IRQ0 | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
+		      per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm,
+		      cycles_per_alarm);
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+
+void __init vmi_timer_setup_boot_alarm(void)
+{
+	local_irq_disable();
+
+	/* Route the interrupt to the correct vector. */
+	apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR);
+
+	/* Cancel the IRQ0 wired alarm, and setup the LVTT alarm. */
+	vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE);
+	vmi_timer_ops.set_alarm(
+		      VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
+		      per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm,
+		      cycles_per_alarm);
+	local_irq_enable();
+}
+
+/* Initialize the time accounting variables for an AP on an SMP system.
+ * Also, set the local alarm for the AP. */
+void __init vmi_timer_setup_secondary_alarm(void)
+{
+	int cpu = smp_processor_id();
+
+	/* Route the interrupt to the correct vector. */
+	apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR);
+
+	per_cpu(process_times_cycles_accounted_cpu, cpu) = read_available_cycles();
+
+	vmi_timer_ops.set_alarm(
+		      VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
+		      per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm,
+		      cycles_per_alarm);
+}
+
+#endif
+
+/* Update system wide (real) time accounting (e.g. jiffies, xtime). */
+static void vmi_account_real_cycles(unsigned long long cur_real_cycles)
+{
+	long long cycles_not_accounted;
+
+	write_seqlock(&xtime_lock);
+
+	cycles_not_accounted = cur_real_cycles - real_cycles_accounted_system;
+	while (cycles_not_accounted >= cycles_per_jiffy) {
+		/* system wide jiffies and wallclock. */
+		do_timer(1);
+
+		cycles_not_accounted -= cycles_per_jiffy;
+		real_cycles_accounted_system += cycles_per_jiffy;
+	}
+
+	if (vmi_timer_ops.wallclock_updated())
+		update_xtime_from_wallclock();
+
+	write_sequnlock(&xtime_lock);
+}
+
+/* Update per-cpu process times. */
+static void vmi_account_process_times_cycles(struct pt_regs *regs, int cpu,
+					     unsigned long long cur_process_times_cycles)
+{
+	long long cycles_not_accounted;
+	cycles_not_accounted = cur_process_times_cycles -
+		per_cpu(process_times_cycles_accounted_cpu, cpu);
+
+	while (cycles_not_accounted >= cycles_per_jiffy) {
+		/* Account time to the current process.  This includes
+		 * calling into the scheduler to decrement the timeslice
+		 * and possibly reschedule. */
+		update_process_times(user_mode(regs));
+		/* XXX handle /proc/profile multiplier. */
+		profile_tick(CPU_PROFILING);
+
+		cycles_not_accounted -= cycles_per_jiffy;
+		per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy;
+	}
+}
+
+#ifdef CONFIG_NO_IDLE_HZ
+/* Update per-cpu idle times.  Used when a no-hz halt is ended. */
+static void vmi_account_no_hz_idle_cycles(int cpu,
+					  unsigned long long cur_process_times_cycles)
+{
+	long long cycles_not_accounted;
+	unsigned long no_idle_hz_jiffies = 0;
+
+	cycles_not_accounted = cur_process_times_cycles -
+		per_cpu(process_times_cycles_accounted_cpu, cpu);
+
+	while (cycles_not_accounted >= cycles_per_jiffy) {
+		no_idle_hz_jiffies++;
+		cycles_not_accounted -= cycles_per_jiffy;
+		per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy;
+	}
+	/* Account time to the idle process. */
+	account_steal_time(idle_task(cpu), jiffies_to_cputime(no_idle_hz_jiffies));
+}
+#endif
+
+/* Update per-cpu stolen time. */
+static void vmi_account_stolen_cycles(int cpu,
+				      unsigned long long cur_real_cycles,
+				      unsigned long long cur_avail_cycles)
+{
+	long long stolen_cycles_not_accounted;
+	unsigned long stolen_jiffies = 0;
+
+	if (cur_real_cycles < cur_avail_cycles)
+		return;
+
+	stolen_cycles_not_accounted = cur_real_cycles - cur_avail_cycles -
+		per_cpu(stolen_cycles_accounted_cpu, cpu);
+
+	while (stolen_cycles_not_accounted >= cycles_per_jiffy) {
+		stolen_jiffies++;
+		stolen_cycles_not_accounted -= cycles_per_jiffy;
+		per_cpu(stolen_cycles_accounted_cpu, cpu) += cycles_per_jiffy;
+	}
+	/* HACK: pass NULL to force time onto cpustat->steal. */
+	account_steal_time(NULL, jiffies_to_cputime(stolen_jiffies));
+}
+
+/* Body of either IRQ0 interrupt handler (UP no local-APIC) or
+ * local-APIC LVTT interrupt handler (UP & local-APIC or SMP). */
+static void vmi_local_timer_interrupt(int cpu)
+{
+	unsigned long long cur_real_cycles, cur_process_times_cycles;
+
+	cur_real_cycles = read_real_cycles();
+	cur_process_times_cycles = read_available_cycles();
+	/* Update system wide (real) time state (xtime, jiffies). */
+	vmi_account_real_cycles(cur_real_cycles);
+	/* Update per-cpu process times. */
+	vmi_account_process_times_cycles(get_irq_regs(), cpu, cur_process_times_cycles);
+	/* Update time stolen from this cpu by the hypervisor. */
+	vmi_account_stolen_cycles(cpu, cur_real_cycles, cur_process_times_cycles);
+}
+
+#ifdef CONFIG_NO_IDLE_HZ
+
+/* Must be called only from idle loop, with interrupts disabled. */
+int vmi_stop_hz_timer(void)
+{
+	/* Note that cpu_set, cpu_clear are (SMP safe) atomic on x86. */
+
+	unsigned long seq, next;
+	unsigned long long real_cycles_expiry;
+	int cpu = smp_processor_id();
+	int idle;
+
+	BUG_ON(!irqs_disabled());
+	if (sysctl_hz_timer != 0)
+		return 0;
+
+	cpu_set(cpu, nohz_cpu_mask);
+	smp_mb();
+	if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
+	    (next = next_timer_interrupt(), time_before_eq(next, jiffies))) {
+		cpu_clear(cpu, nohz_cpu_mask);
+		next = jiffies;
+		idle = 0;
+	} else
+		idle = 1;
+
+	/* Convert jiffies to the real cycle counter. */
+	do {
+		seq = read_seqbegin(&xtime_lock);
+		real_cycles_expiry = real_cycles_accounted_system +
+			(long)(next - jiffies) * cycles_per_jiffy;
+	} while (read_seqretry(&xtime_lock, seq));
+
+	/* This cpu is going idle.  Disable the periodic alarm. */
+	if (idle) {
+		vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE);
+		per_cpu(idle_start_jiffies, cpu) = jiffies;
+	}
+
+	/* Set the real time alarm to expire at the next event. */
+	vmi_timer_ops.set_alarm(
+		      VMI_ALARM_WIRING | VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL,
+		      real_cycles_expiry, 0);
+
+	return idle;
+}
+
+static void vmi_reenable_hz_timer(int cpu)
+{
+	/* For /proc/vmi/info idle_hz stat. */
+	per_cpu(vmi_idle_no_hz_jiffies, cpu) += jiffies - per_cpu(idle_start_jiffies, cpu);
+	per_cpu(vmi_idle_no_hz_irqs, cpu)++;
+
+	/* Don't bother explicitly cancelling the one-shot alarm -- at
+	 * worst we will receive a spurious timer interrupt. */
+	vmi_timer_ops.set_alarm(
+		      VMI_ALARM_WIRING | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
+		      per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm,
+		      cycles_per_alarm);
+	/* Indicate this cpu is no longer nohz idle. */
+	cpu_clear(cpu, nohz_cpu_mask);
+}
+
+/* Called from interrupt handlers when (local) HZ timer is disabled. */
+void vmi_account_time_restart_hz_timer(void)
+{
+	unsigned long long cur_real_cycles, cur_process_times_cycles;
+	int cpu = smp_processor_id();
+
+	BUG_ON(!irqs_disabled());
+	/* Account the time during which the HZ timer was disabled. */
+	cur_real_cycles = read_real_cycles();
+	cur_process_times_cycles = read_available_cycles();
+	/* Update system wide (real) time state (xtime, jiffies). */
+	vmi_account_real_cycles(cur_real_cycles);
+	/* Update per-cpu idle times. */
+	vmi_account_no_hz_idle_cycles(cpu, cur_process_times_cycles);
+	/* Update time stolen from this cpu by the hypervisor. */
+	vmi_account_stolen_cycles(cpu, cur_real_cycles, cur_process_times_cycles);
+	/* Reenable the hz timer. */
+	vmi_reenable_hz_timer(cpu);
+}
+
+#endif /* CONFIG_NO_IDLE_HZ */
+
+/* UP (and no local-APIC) VMI-timer alarm interrupt handler.
+ * Handler for IRQ0.  Not used when SMP or X86_LOCAL_APIC after
+ * APIC setup and vmi_timer_setup_boot_alarm() is called. */
+static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
+{
+	vmi_local_timer_interrupt(smp_processor_id());
+	return IRQ_HANDLED;
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+
+/* SMP VMI-timer alarm interrupt handler.  Handler for LVTT vector.
+ * Also used in UP when CONFIG_X86_LOCAL_APIC.
+ * The wrapper code is from arch/i386/kernel/apic.c#smp_apic_timer_interrupt. */
+void smp_apic_vmi_timer_interrupt(struct pt_regs *regs)
+{
+	struct pt_regs *old_regs = set_irq_regs(regs);
+	int cpu = smp_processor_id();
+
+	/*
+	 * the NMI deadlock-detector uses this.
+	 */
+	per_cpu(irq_stat,cpu).apic_timer_irqs++;
+
+	/*
+	 * NOTE! We'd better ACK the irq immediately,
+	 * because timer handling can be slow.
+	 */
+	ack_APIC_irq();
+
+	/*
+	 * update_process_times() expects us to have done irq_enter().
+	 * Besides, if we don't, timer interrupts ignore the global
+	 * interrupt lock, which is the WrongThing (tm) to do.
+	 */
+	irq_enter();
+	vmi_local_timer_interrupt(cpu);
+	irq_exit();
+	set_irq_regs(old_regs);
+}
+
+#endif /* CONFIG_X86_LOCAL_APIC */
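A worked example of the constants vmi_time_init() derives, under assumed numbers (2.4 GHz cycle frequency, HZ=250; alarm_hz defaults to CONFIG_VMI_ALARM_HZ=100). The kernel uses do_div() because this is 32-bit code; plain 64-bit division below shows the same arithmetic:

/* Worked example of vmi_time_init()'s derived constants (assumed inputs). */
#include <stdio.h>

int main(void)
{
	unsigned long long cycles_per_sec = 2400000000ULL; /* assumed 2.4 GHz */
	unsigned long hz = 250;                 /* assumed CONFIG_HZ    */
	unsigned long alarm_hz = 100;           /* CONFIG_VMI_ALARM_HZ  */

	printf("cycles_per_jiffy = %llu\n", cycles_per_sec / hz);       /*  9600000 */
	printf("cycles_per_alarm = %llu\n", cycles_per_sec / alarm_hz); /* 24000000 */
	printf("cpu_khz          = %llu\n", cycles_per_sec / 1000);     /*  2400000 */
	return 0;
}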
diff --git a/include/asm-i386/apic.h b/include/asm-i386/apic.h
index 41a44319905f..3a61206fd108 100644
--- a/include/asm-i386/apic.h
+++ b/include/asm-i386/apic.h
@@ -43,6 +43,8 @@ extern void generic_apic_probe(void);
 #define apic_write native_apic_write
 #define apic_write_atomic native_apic_write_atomic
 #define apic_read native_apic_read
+#define setup_boot_clock setup_boot_APIC_clock
+#define setup_secondary_clock setup_secondary_APIC_clock
 #endif
 
 static __inline fastcall void native_apic_write(unsigned long reg,
diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h
index 6ccf36499b2a..12ef95924da6 100644
--- a/include/asm-i386/paravirt.h
+++ b/include/asm-i386/paravirt.h
@@ -121,6 +121,8 @@ struct paravirt_ops
 	void (fastcall *apic_write)(unsigned long reg, unsigned long v);
 	void (fastcall *apic_write_atomic)(unsigned long reg, unsigned long v);
 	unsigned long (fastcall *apic_read)(unsigned long reg);
+	void (*setup_boot_clock)(void);
+	void (*setup_secondary_clock)(void);
 #endif
 
 	void (fastcall *flush_tlb_user)(void);
@@ -323,6 +325,16 @@ static inline unsigned long apic_read(unsigned long reg)
 {
 	return paravirt_ops.apic_read(reg);
 }
+
+static inline void setup_boot_clock(void)
+{
+	paravirt_ops.setup_boot_clock();
+}
+
+static inline void setup_secondary_clock(void)
+{
+	paravirt_ops.setup_secondary_clock();
+}
 #endif
 
 #ifdef CONFIG_SMP
diff --git a/include/asm-i386/time.h b/include/asm-i386/time.h
index ea8065af825a..571b4294dc2e 100644
--- a/include/asm-i386/time.h
+++ b/include/asm-i386/time.h
@@ -30,6 +30,7 @@ static inline int native_set_wallclock(unsigned long nowtime)
 
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
+extern unsigned long long native_sched_clock(void);
 #else /* !CONFIG_PARAVIRT */
 
 #define get_wallclock() native_get_wallclock()
diff --git a/include/asm-i386/timer.h b/include/asm-i386/timer.h
index 1ee64e34cd35..4752c3a6a708 100644
--- a/include/asm-i386/timer.h
+++ b/include/asm-i386/timer.h
@@ -9,6 +9,8 @@ void setup_pit_timer(void);
 extern int pit_latch_buggy;
 extern int timer_ack;
 extern int no_timer_check;
+extern unsigned long long (*custom_sched_clock)(void);
+extern int no_sync_cmos_clock;
 extern int recalibrate_cpu_khz(void);
 
 #endif
diff --git a/include/asm-i386/vmi_time.h b/include/asm-i386/vmi_time.h
new file mode 100644
index 000000000000..c12931211007
--- /dev/null
+++ b/include/asm-i386/vmi_time.h
@@ -0,0 +1,103 @@
+/*
+ * VMI Time wrappers
+ *
+ * Copyright (C) 2006, VMware, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to dhecht@vmware.com
+ *
+ */
+
+#ifndef __VMI_TIME_H
+#define __VMI_TIME_H
+
+/*
+ * Raw VMI call indices for timer functions
+ */
+#define VMI_CALL_GetCycleFrequency	66
+#define VMI_CALL_GetCycleCounter	67
+#define VMI_CALL_SetAlarm		68
+#define VMI_CALL_CancelAlarm		69
+#define VMI_CALL_GetWallclockTime	70
+#define VMI_CALL_WallclockUpdated	71
+
+/* Cached VMI timer operations */
+extern struct vmi_timer_ops {
+	u64 (*get_cycle_frequency)(void);
+	u64 (*get_cycle_counter)(int);
+	u64 (*get_wallclock)(void);
+	int (*wallclock_updated)(void);
+	void (*set_alarm)(u32 flags, u64 expiry, u64 period);
+	void (*cancel_alarm)(u32 flags);
+} vmi_timer_ops;
+
+/* Prototypes */
+extern void __init vmi_time_init(void);
+extern unsigned long vmi_get_wallclock(void);
+extern int vmi_set_wallclock(unsigned long now);
+extern unsigned long long vmi_sched_clock(void);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+extern void __init vmi_timer_setup_boot_alarm(void);
+extern void __init vmi_timer_setup_secondary_alarm(void);
+extern void apic_vmi_timer_interrupt(void);
+#endif
+
+#ifdef CONFIG_NO_IDLE_HZ
+extern int vmi_stop_hz_timer(void);
+extern void vmi_account_time_restart_hz_timer(void);
+#endif
+
+/*
+ * When run under a hypervisor, a vcpu is always in one of three states:
+ * running, halted, or ready.  The vcpu is in the 'running' state if it
+ * is executing.  When the vcpu executes the halt interface, the vcpu
+ * enters the 'halted' state and remains halted until there is some work
+ * pending for the vcpu (e.g. an alarm expires, host I/O completes on
+ * behalf of virtual I/O).  At this point, the vcpu enters the 'ready'
+ * state (waiting for the hypervisor to reschedule it).  Finally, at any
+ * time when the vcpu is not in the 'running' state nor the 'halted'
+ * state, it is in the 'ready' state.
+ *
+ * Real time advances while the vcpu is 'running', 'ready', or
+ * 'halted'.  Stolen time is the time in which the vcpu is in the
+ * 'ready' state.  Available time is the remaining time -- the vcpu is
+ * either 'running' or 'halted'.
+ *
+ * All three views of time are accessible through the VMI cycle
+ * counters.
+ */
+
+/* The cycle counters. */
+#define VMI_CYCLES_REAL		0
+#define VMI_CYCLES_AVAILABLE	1
+#define VMI_CYCLES_STOLEN	2
+
+/* The alarm interface 'flags' bits */
+#define VMI_ALARM_COUNTERS	2
+
+#define VMI_ALARM_COUNTER_MASK	0x000000ff
+
+#define VMI_ALARM_WIRED_IRQ0	0x00000000
+#define VMI_ALARM_WIRED_LVTT	0x00010000
+
+#define VMI_ALARM_IS_ONESHOT	0x00000000
+#define VMI_ALARM_IS_PERIODIC	0x00000100
+
+#define CONFIG_VMI_ALARM_HZ	100
+#endif
103#endif