author     Jeremy Fitzhardinge <jeremy@xensource.com>	2007-07-17 21:37:05 -0400
committer  Jeremy Fitzhardinge <jeremy@goop.org>	2007-07-18 11:47:43 -0400
commit     f91a8b447b9af64f589f6e13fec7f09b5927563d (patch)
tree       9e220f5049d9963173464dd51906eb210c744fdb /arch/i386
parent     9a4029fd3409eb224eb62c32d9792071382694ec (diff)
xen: Account for stolen time
This patch accounts for the time stolen from our VCPUs. Stolen time is
time during which a VCPU is runnable and could be running, but all
available physical CPUs are being used for something else.

This accounting is run on each timer interrupt, simply as a way to run
it relatively often, and at times when interesting things are going on.

Stolen time is not used for much in the kernel; it is reported in
/proc/stat, and that's about it.
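
For reference, the accounting below works from the per-VCPU runstate area
that Xen exports to the guest. Its layout, per the Xen 3.x interface header
xen/interface/vcpu.h (reproduced here for reference; comments paraphrased):

/* Layout per xen/interface/vcpu.h (Xen 3.x); reproduced for reference. */
struct vcpu_runstate_info {
	int      state;             /* VCPU's current state (RUNSTATE_*) */
	uint64_t state_entry_time;  /* system time (ns) when state was entered */
	uint64_t time[4];           /* cumulative ns spent in each state */
};

#define RUNSTATE_running   0  /* currently running on a physical CPU */
#define RUNSTATE_runnable  1  /* runnable, but no physical CPU free: "stolen" */
#define RUNSTATE_blocked   2  /* blocked on an event (idle) */
#define RUNSTATE_offline   3  /* not runnable (e.g. paused) */

The patch accounts the runnable and offline deltas as stolen ticks, and the
blocked delta as idle time.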
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Acked-by: Chris Wright <chrisw@sous-sol.org>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Diffstat (limited to 'arch/i386')
-rw-r--r--	arch/i386/xen/time.c	159 ++++++++++++++++++++++++++++++++++----
1 file changed, 150 insertions(+), 9 deletions(-)
diff --git a/arch/i386/xen/time.c b/arch/i386/xen/time.c
index b457980ff3c2..acbfd9969462 100644
--- a/arch/i386/xen/time.c
+++ b/arch/i386/xen/time.c
@@ -11,6 +11,7 @@
 #include <linux/interrupt.h>
 #include <linux/clocksource.h>
 #include <linux/clockchips.h>
+#include <linux/kernel_stat.h>
 
 #include <asm/xen/hypervisor.h>
 #include <asm/xen/hypercall.h>
@@ -25,6 +26,7 @@
 
 /* Xen may fire a timer up to this many ns early */
 #define TIMER_SLOP	100000
+#define NS_PER_TICK	(1000000000LL / HZ)
 
 /* These are perodically updated in shared_info, and then copied here. */
 struct shadow_time_info {
@@ -37,6 +39,139 @@ struct shadow_time_info {
 
 static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
 
+/* runstate info updated by Xen */
+static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
+
+/* snapshots of runstate info */
+static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);
+
+/* unused ns of stolen and blocked time */
+static DEFINE_PER_CPU(u64, residual_stolen);
+static DEFINE_PER_CPU(u64, residual_blocked);
+
+/* return a consistent snapshot of 64-bit time/counter value */
+static u64 get64(const u64 *p)
+{
+	u64 ret;
+
+	if (BITS_PER_LONG < 64) {
+		u32 *p32 = (u32 *)p;
+		u32 h, l;
+
+		/*
+		 * Read high then low, and then make sure high is
+		 * still the same; this will only loop if low wraps
+		 * and carries into high.
+		 * XXX some clean way to make this endian-proof?
+		 */
+		do {
+			h = p32[1];
+			barrier();
+			l = p32[0];
+			barrier();
+		} while (p32[1] != h);
+
+		ret = (((u64)h) << 32) | l;
+	} else
+		ret = *p;
+
+	return ret;
+}
+
+/*
+ * Runstate accounting
+ */
+static void get_runstate_snapshot(struct vcpu_runstate_info *res)
+{
+	u64 state_time;
+	struct vcpu_runstate_info *state;
+
+	preempt_disable();
+
+	state = &__get_cpu_var(runstate);
+
+	/*
+	 * The runstate info is always updated by the hypervisor on
+	 * the current CPU, so there's no need to use anything
+	 * stronger than a compiler barrier when fetching it.
+	 */
+	do {
+		state_time = get64(&state->state_entry_time);
+		barrier();
+		*res = *state;
+		barrier();
+	} while (get64(&state->state_entry_time) != state_time);
+
+	preempt_enable();
+}
+
+static void setup_runstate_info(int cpu)
+{
+	struct vcpu_register_runstate_memory_area area;
+
+	area.addr.v = &per_cpu(runstate, cpu);
+
+	if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
+			       cpu, &area))
+		BUG();
+}
+
+static void do_stolen_accounting(void)
+{
+	struct vcpu_runstate_info state;
+	struct vcpu_runstate_info *snap;
+	s64 blocked, runnable, offline, stolen;
+	cputime_t ticks;
+
+	get_runstate_snapshot(&state);
+
+	WARN_ON(state.state != RUNSTATE_running);
+
+	snap = &__get_cpu_var(runstate_snapshot);
+
+	/* work out how much time the VCPU has not been runn*ing* */
+	blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
+	runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
+	offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];
+
+	*snap = state;
+
+	/* Add the appropriate number of ticks of stolen time,
+	   including any left-overs from last time.  Passing NULL to
+	   account_steal_time accounts the time as stolen. */
+	stolen = runnable + offline + __get_cpu_var(residual_stolen);
+
+	if (stolen < 0)
+		stolen = 0;
+
+	ticks = 0;
+	while (stolen >= NS_PER_TICK) {
+		ticks++;
+		stolen -= NS_PER_TICK;
+	}
+	__get_cpu_var(residual_stolen) = stolen;
+	account_steal_time(NULL, ticks);
+
+	/* Add the appropriate number of ticks of blocked time,
+	   including any left-overs from last time.  Passing idle to
+	   account_steal_time accounts the time as idle/wait. */
+	blocked += __get_cpu_var(residual_blocked);
+
+	if (blocked < 0)
+		blocked = 0;
+
+	ticks = 0;
+	while (blocked >= NS_PER_TICK) {
+		ticks++;
+		blocked -= NS_PER_TICK;
+	}
+	__get_cpu_var(residual_blocked) = blocked;
+	account_steal_time(idle_task(smp_processor_id()), ticks);
+}
+
+
+
+/* Get the CPU speed from Xen */
 unsigned long xen_cpu_khz(void)
 {
 	u64 cpu_khz = 1000000ULL << 32;
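
The tick conversion above never loses sub-tick remainders: whatever does not
amount to a whole tick is parked in residual_stolen/residual_blocked and added
back in on the next interrupt. A standalone sketch of that carry behaviour
(hypothetical deltas, HZ=100 assumed; an illustration, not part of the patch):

#include <stdio.h>
#include <stdint.h>

#define HZ 100
#define NS_PER_TICK (1000000000LL / HZ)	/* 10,000,000 ns per tick at HZ=100 */

int main(void)
{
	/* Standalone illustration, not kernel code; deltas are hypothetical:
	   2.5, 0.4 and 0.2 ticks' worth of stolen ns at successive interrupts. */
	int64_t deltas[] = { 25000000, 4000000, 2000000 };
	int64_t residual = 0;

	for (int i = 0; i < 3; i++) {
		int64_t stolen = deltas[i] + residual;	/* new delta + carry */
		unsigned ticks = 0;

		while (stolen >= NS_PER_TICK) {		/* whole ticks only */
			ticks++;
			stolen -= NS_PER_TICK;
		}
		residual = stolen;			/* sub-tick carry */
		printf("interrupt %d: %u tick(s) accounted, %lld ns carried\n",
		       i, ticks, (long long)stolen);
	}
	return 0;
}

Over the three interrupts this accounts 2, 0 and 1 ticks, carrying 5 ms,
9 ms and 1 ms respectively, so no stolen nanoseconds are ever dropped.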
@@ -56,13 +191,11 @@ unsigned long xen_cpu_khz(void)
  * Reads a consistent set of time-base values from Xen, into a shadow data
  * area.
  */
-static void get_time_values_from_xen(void)
+static unsigned get_time_values_from_xen(void)
 {
 	struct vcpu_time_info *src;
 	struct shadow_time_info *dst;
 
-	preempt_disable();
-
 	/* src is shared memory with the hypervisor, so we need to
 	   make sure we get a consistent snapshot, even in the face of
 	   being preempted. */
@@ -79,7 +212,7 @@ static void get_time_values_from_xen(void)
 		rmb();		/* test version after fetching data */
 	} while ((src->version & 1) | (dst->version ^ src->version));
 
-	preempt_enable();
+	return dst->version;
 }
 
 /*
@@ -123,7 +256,7 @@ static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
 static u64 get_nsec_offset(struct shadow_time_info *shadow)
 {
 	u64 now, delta;
-	rdtscll(now);
+	now = native_read_tsc();
 	delta = now - shadow->tsc_timestamp;
 	return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
 }
@@ -132,10 +265,14 @@
 {
 	struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
 	cycle_t ret;
+	unsigned version;
 
-	get_time_values_from_xen();
-
-	ret = shadow->system_timestamp + get_nsec_offset(shadow);
+	do {
+		version = get_time_values_from_xen();
+		barrier();
+		ret = shadow->system_timestamp + get_nsec_offset(shadow);
+		barrier();
+	} while (version != __get_cpu_var(xen_vcpu)->time.version);
 
 	put_cpu_var(shadow_time);
 
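
This retry loop is the same version-check pattern Xen uses throughout its
time interface (and which get64() above applies to a single 64-bit value):
the writer leaves the version odd while updating, so a read is consistent
only if the same even version is seen on both sides of the copy. A minimal
userspace sketch of the reader side (illustrative only, not the kernel's
implementation; barrier() replaced by a plain compiler barrier):

#include <stdint.h>

struct shared_time {
	volatile uint32_t version;	/* odd while the writer is mid-update */
	uint64_t system_time;		/* example payload */
};

#define compiler_barrier() __asm__ __volatile__("" ::: "memory")

/* Illustrative sketch: retry until a stable, even version brackets the copy. */
uint64_t read_consistent(const struct shared_time *s)
{
	uint32_t ver;
	uint64_t val;

	do {
		ver = s->version;		/* snapshot the version... */
		compiler_barrier();
		val = s->system_time;		/* ...copy the payload... */
		compiler_barrier();
	} while ((ver & 1) || ver != s->version); /* ...retry if it moved */

	return val;
}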
@@ -352,6 +489,8 @@ static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
 		ret = IRQ_HANDLED;
 	}
 
+	do_stolen_accounting();
+
 	return ret;
 }
 
@@ -378,6 +517,8 @@ static void xen_setup_timer(int cpu)
 	evt->irq = irq;
 	clockevents_register_device(evt);
 
+	setup_runstate_info(cpu);
+
 	put_cpu_var(xen_clock_events);
 }
 
@@ -390,7 +531,7 @@ __init void xen_time_init(void)
 	clocksource_register(&xen_clocksource);
 
 	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
-		/* Successfully turned off 100hz tick, so we have the
+		/* Successfully turned off 100Hz tick, so we have the
 		   vcpuop-based timer interface */
 		printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
 		xen_clockevent = &xen_vcpuop_clockevent;