author		Paul Mackerras <paulus@samba.org>	2006-02-23 18:06:59 -0500
committer	Paul Mackerras <paulus@samba.org>	2006-02-23 22:05:56 -0500
commit		c6622f63db86fcbd41bf6fe05ddf2e00c1e51ced
tree		102f3ea0a891212603a3722fece337d6a74d450c /arch/powerpc/kernel/time.c
parent		a00428f5b149e36b8225b2a0812742a6dfb07b8c
powerpc: Implement accurate task and CPU time accounting
This implements accurate task and cpu time accounting for 64-bit
powerpc kernels.  Instead of accounting a whole jiffy of time to a
task on a timer interrupt because that task happened to be running at
the time, we now account time in units of timebase ticks according to
the actual time spent by the task in user mode and kernel mode.  We
also count the time spent processing hardware and software interrupts
accurately.  This is conditional on CONFIG_VIRT_CPU_ACCOUNTING.  If
that is not set, we do tick-based approximate accounting as before.

To get this accurate information, we read either the PURR (processor
utilization of resources register) on POWER5 machines, or the
timebase on other machines, on

* each entry to the kernel from usermode
* each exit to usermode
* transitions between process context, hard irq context and soft irq
  context in kernel mode
* context switches.

On POWER5 systems with shared-processor logical partitioning we also
read both the PURR and the timebase at each timer interrupt and
context switch in order to determine how much time has been taken by
the hypervisor to run other partitions ("steal" time).  Unfortunately,
since we need values of the PURR on both threads at the same time to
accurately calculate the steal time, and since we can only calculate
steal time on a per-core basis, the apportioning of the steal time
between idle time (time which we ceded to the hypervisor in the idle
loop) and actual stolen time is somewhat approximate at the moment.

This is all based quite heavily on what s390 does, and it uses the
generic interfaces that were added by the s390 developers, i.e.
account_system_time(), account_user_time(), etc.

This patch doesn't add any new interfaces between the kernel and
userspace, and doesn't change the units in which time is reported to
userspace by things such as /proc/stat, /proc/<pid>/stat, getrusage(),
times(), etc.  Internally the various task and cpu times are stored in
timebase units, but they are converted to USER_HZ units (1/100th of a
second) when reported to userspace.  Some precision is therefore lost
but there should not be any accumulating error, since the internal
accumulation is at full precision.

Signed-off-by: Paul Mackerras <paulus@samba.org>
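As an illustration of the 0.64 fixed-point scheme described above, here
is a minimal userspace sketch (not part of the patch; make_factor() and
ticks_to_units() are hypothetical names, and GCC/Clang's unsigned
__int128 stands in for the kernel's div128_by_32() and mulhdu()
primitives):

	/*
	 * Sketch, assuming a compiler with unsigned __int128.
	 * make_factor() plays the role of div128_by_32() in
	 * calc_cputime_factors(); ticks_to_units() plays the role of
	 * mulhdu() (high 64 bits of a 64x64-bit multiply).
	 */
	#include <stdint.h>
	#include <stdio.h>

	static uint64_t make_factor(uint64_t units_per_sec, uint64_t tb_per_sec)
	{
		/* factor = floor(2^64 * units_per_sec / tb_per_sec) */
		return (uint64_t)(((unsigned __int128)units_per_sec << 64) /
				  tb_per_sec);
	}

	static uint64_t ticks_to_units(uint64_t ticks, uint64_t factor)
	{
		/* (ticks * factor) >> 64 == ticks * units_per_sec / tb_per_sec */
		return (uint64_t)(((unsigned __int128)ticks * factor) >> 64);
	}

	int main(void)
	{
		/* 2^29 Hz timebase chosen so the arithmetic below is exact */
		uint64_t tb_per_sec = 1ULL << 29;
		uint64_t clockt_factor = make_factor(100, tb_per_sec); /* USER_HZ */

		/* 2.5 seconds of timebase ticks -> prints 250 clock_t ticks */
		printf("%llu\n", (unsigned long long)
		       ticks_to_units(tb_per_sec * 5 / 2, clockt_factor));
		return 0;
	}

Because each factor is a pure binary fraction, a single conversion
truncates by at most one output unit, and since the kernel accumulates
internally in full-precision timebase ticks, there is no accumulating
error -- the property the last paragraph of the message claims.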
Diffstat (limited to 'arch/powerpc/kernel/time.c')
-rw-r--r--	arch/powerpc/kernel/time.c | 236
1 file changed, 234 insertions(+), 2 deletions(-)
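As a companion to the steal-time paragraph in the message above, here
is a hedged sketch (illustrative only; struct core_sample and
new_steal() are not kernel APIs) of the per-core arithmetic that
calculate_steal_time() in the diff below implements: on an SMT2 POWER5
core the two threads' PURR counts partition the core's cycles between
them, so elapsed timebase ticks that appear in neither thread's PURR
were spent by the hypervisor running other partitions.

	#include <stdint.h>

	/*
	 * Illustrative model: tb is the timebase now, t0 the core's
	 * origin timebase, purr[i] each thread's PURR delta since t0,
	 * and stolen the steal time already accounted.
	 */
	struct core_sample {
		uint64_t tb;
		uint64_t purr[2];
		uint64_t stolen;
	};

	static int64_t new_steal(struct core_sample *s, uint64_t t0)
	{
		/* elapsed ticks that neither thread's PURR accounts for */
		int64_t stolen = (int64_t)(s->tb - t0 - s->purr[0] -
					   s->purr[1] - s->stolen);
		if (stolen > 0) {
			s->stolen += (uint64_t)stolen;
			return stolen;
		}
		return 0;	/* skew or rounding; account nothing new */
	}

The patch's real version can only sample the sibling thread's PURR at
that thread's own timer interrupt, which is why the message calls the
idle/stolen apportioning approximate and why smp_space_timers() below
staggers the two threads' timer interrupts by half a jiffy.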
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 2a7ddc579379..0b34db28916f 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -51,6 +51,7 @@
 #include <linux/percpu.h>
 #include <linux/rtc.h>
 #include <linux/jiffies.h>
+#include <linux/posix-timers.h>
 
 #include <asm/io.h>
 #include <asm/processor.h>
@@ -135,6 +136,220 @@ unsigned long tb_last_stamp;
  */
 DEFINE_PER_CPU(unsigned long, last_jiffy);
 
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+/*
+ * Factors for converting from cputime_t (timebase ticks) to
+ * jiffies, milliseconds, seconds, and clock_t (1/USER_HZ seconds).
+ * These are all stored as 0.64 fixed-point binary fractions.
+ */
+u64 __cputime_jiffies_factor;
+u64 __cputime_msec_factor;
+u64 __cputime_sec_factor;
+u64 __cputime_clockt_factor;
+
+static void calc_cputime_factors(void)
+{
+	struct div_result res;
+
+	div128_by_32(HZ, 0, tb_ticks_per_sec, &res);
+	__cputime_jiffies_factor = res.result_low;
+	div128_by_32(1000, 0, tb_ticks_per_sec, &res);
+	__cputime_msec_factor = res.result_low;
+	div128_by_32(1, 0, tb_ticks_per_sec, &res);
+	__cputime_sec_factor = res.result_low;
+	div128_by_32(USER_HZ, 0, tb_ticks_per_sec, &res);
+	__cputime_clockt_factor = res.result_low;
+}
+
+/*
+ * Read the PURR on systems that have it, otherwise the timebase.
+ */
+static u64 read_purr(void)
+{
+	if (cpu_has_feature(CPU_FTR_PURR))
+		return mfspr(SPRN_PURR);
+	return mftb();
+}
+
+/*
+ * Account time for a transition between system, hard irq
+ * or soft irq state.
+ */
+void account_system_vtime(struct task_struct *tsk)
+{
+	u64 now, delta;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	now = read_purr();
+	delta = now - get_paca()->startpurr;
+	get_paca()->startpurr = now;
+	if (!in_interrupt()) {
+		delta += get_paca()->system_time;
+		get_paca()->system_time = 0;
+	}
+	account_system_time(tsk, 0, delta);
+	local_irq_restore(flags);
+}
+
+/*
+ * Transfer the user and system times accumulated in the paca
+ * by the exception entry and exit code to the generic process
+ * user and system time records.
+ * Must be called with interrupts disabled.
+ */
+void account_process_vtime(struct task_struct *tsk)
+{
+	cputime_t utime;
+
+	utime = get_paca()->user_time;
+	get_paca()->user_time = 0;
+	account_user_time(tsk, utime);
+}
+
+static void account_process_time(struct pt_regs *regs)
+{
+	int cpu = smp_processor_id();
+
+	account_process_vtime(current);
+	run_local_timers();
+	if (rcu_pending(cpu))
+		rcu_check_callbacks(cpu, user_mode(regs));
+	scheduler_tick();
+	run_posix_cpu_timers(current);
+}
+
+#ifdef CONFIG_PPC_SPLPAR
+/*
+ * Stuff for accounting stolen time.
+ */
+struct cpu_purr_data {
+	int	initialized;		/* thread is running */
+	u64	tb0;			/* timebase at origin time */
+	u64	purr0;			/* PURR at origin time */
+	u64	tb;			/* last TB value read */
+	u64	purr;			/* last PURR value read */
+	u64	stolen;			/* stolen time so far */
+	spinlock_t lock;
+};
+
+static DEFINE_PER_CPU(struct cpu_purr_data, cpu_purr_data);
+
+static void snapshot_tb_and_purr(void *data)
+{
+	struct cpu_purr_data *p = &__get_cpu_var(cpu_purr_data);
+
+	p->tb0 = mftb();
+	p->purr0 = mfspr(SPRN_PURR);
+	p->tb = p->tb0;
+	p->purr = 0;
+	wmb();
+	p->initialized = 1;
+}
+
+/*
+ * Called during boot when all cpus have come up.
+ */
+void snapshot_timebases(void)
+{
+	int cpu;
+
+	if (!cpu_has_feature(CPU_FTR_PURR))
+		return;
+	for_each_cpu(cpu)
+		spin_lock_init(&per_cpu(cpu_purr_data, cpu).lock);
+	on_each_cpu(snapshot_tb_and_purr, NULL, 0, 1);
+}
+
+void calculate_steal_time(void)
+{
+	u64 tb, purr, t0;
+	s64 stolen;
+	struct cpu_purr_data *p0, *pme, *phim;
+	int cpu;
+
+	if (!cpu_has_feature(CPU_FTR_PURR))
+		return;
+	cpu = smp_processor_id();
+	pme = &per_cpu(cpu_purr_data, cpu);
+	if (!pme->initialized)
+		return;		/* this can happen in early boot */
+	p0 = &per_cpu(cpu_purr_data, cpu & ~1);
+	phim = &per_cpu(cpu_purr_data, cpu ^ 1);
+	spin_lock(&p0->lock);
+	tb = mftb();
+	purr = mfspr(SPRN_PURR) - pme->purr0;
+	if (!phim->initialized || !cpu_online(cpu ^ 1)) {
+		stolen = (tb - pme->tb) - (purr - pme->purr);
+	} else {
+		t0 = pme->tb0;
+		if (phim->tb0 < t0)
+			t0 = phim->tb0;
+		stolen = phim->tb - t0 - phim->purr - purr - p0->stolen;
+	}
+	if (stolen > 0) {
+		account_steal_time(current, stolen);
+		p0->stolen += stolen;
+	}
+	pme->tb = tb;
+	pme->purr = purr;
+	spin_unlock(&p0->lock);
+}
+
+/*
+ * Must be called before the cpu is added to the online map when
+ * a cpu is being brought up at runtime.
+ */
+static void snapshot_purr(void)
+{
+	int cpu;
+	u64 purr;
+	struct cpu_purr_data *p0, *pme, *phim;
+	unsigned long flags;
+
+	if (!cpu_has_feature(CPU_FTR_PURR))
+		return;
+	cpu = smp_processor_id();
+	pme = &per_cpu(cpu_purr_data, cpu);
+	p0 = &per_cpu(cpu_purr_data, cpu & ~1);
+	phim = &per_cpu(cpu_purr_data, cpu ^ 1);
+	spin_lock_irqsave(&p0->lock, flags);
+	pme->tb = pme->tb0 = mftb();
+	purr = mfspr(SPRN_PURR);
+	if (!phim->initialized) {
+		pme->purr = 0;
+		pme->purr0 = purr;
+	} else {
+		/* set p->purr and p->purr0 for no change in p0->stolen */
+		pme->purr = phim->tb - phim->tb0 - phim->purr - p0->stolen;
+		pme->purr0 = purr - pme->purr;
+	}
+	pme->initialized = 1;
+	spin_unlock_irqrestore(&p0->lock, flags);
+}
+
+#endif /* CONFIG_PPC_SPLPAR */
+
+#else /* ! CONFIG_VIRT_CPU_ACCOUNTING */
+#define calc_cputime_factors()
+#define account_process_time(regs)	update_process_times(user_mode(regs))
+#define calculate_steal_time()		do { } while (0)
+#endif
+
+#if !(defined(CONFIG_VIRT_CPU_ACCOUNTING) && defined(CONFIG_PPC_SPLPAR))
+#define snapshot_purr()			do { } while (0)
+#endif
+
+/*
+ * Called when a cpu comes up after the system has finished booting,
+ * i.e. as a result of a hotplug cpu action.
+ */
+void snapshot_timebase(void)
+{
+	__get_cpu_var(last_jiffy) = get_tb();
+	snapshot_purr();
+}
+
 void __delay(unsigned long loops)
 {
 	unsigned long start;
@@ -382,6 +597,7 @@ static void iSeries_tb_recal(void)
 		       new_tb_ticks_per_jiffy, sign, tick_diff );
 	tb_ticks_per_jiffy = new_tb_ticks_per_jiffy;
 	tb_ticks_per_sec = new_tb_ticks_per_sec;
+	calc_cputime_factors();
 	div128_by_32( XSEC_PER_SEC, 0, tb_ticks_per_sec, &divres );
 	do_gtod.tb_ticks_per_sec = tb_ticks_per_sec;
 	tb_to_xs = divres.result_low;
@@ -430,6 +646,7 @@ void timer_interrupt(struct pt_regs * regs)
 	irq_enter();
 
 	profile_tick(CPU_PROFILING, regs);
+	calculate_steal_time();
 
 #ifdef CONFIG_PPC_ISERIES
 	get_lppaca()->int_dword.fields.decr_int = 0;
@@ -451,7 +668,7 @@ void timer_interrupt(struct pt_regs * regs)
 	 * is the case.
 	 */
 	if (!cpu_is_offline(cpu))
-		update_process_times(user_mode(regs));
+		account_process_time(regs);
 
 	/*
 	 * No need to check whether cpu is offline here; boot_cpuid
@@ -508,13 +725,27 @@ void wakeup_decrementer(void)
 void __init smp_space_timers(unsigned int max_cpus)
 {
 	int i;
+	unsigned long half = tb_ticks_per_jiffy / 2;
 	unsigned long offset = tb_ticks_per_jiffy / max_cpus;
 	unsigned long previous_tb = per_cpu(last_jiffy, boot_cpuid);
 
 	/* make sure tb > per_cpu(last_jiffy, cpu) for all cpus always */
 	previous_tb -= tb_ticks_per_jiffy;
+	/*
+	 * The stolen time calculation for POWER5 shared-processor LPAR
+	 * systems works better if the two threads' timebase interrupts
+	 * are staggered by half a jiffy with respect to each other.
+	 */
 	for_each_cpu(i) {
-		if (i != boot_cpuid) {
+		if (i == boot_cpuid)
+			continue;
+		if (i == (boot_cpuid ^ 1))
+			per_cpu(last_jiffy, i) =
+				per_cpu(last_jiffy, boot_cpuid) - half;
+		else if (i & 1)
+			per_cpu(last_jiffy, i) =
+				per_cpu(last_jiffy, i ^ 1) + half;
+		else {
 			previous_tb += offset;
 			per_cpu(last_jiffy, i) = previous_tb;
 		}
@@ -706,6 +937,7 @@ void __init time_init(void)
 	tb_ticks_per_sec = ppc_tb_freq;
 	tb_ticks_per_usec = ppc_tb_freq / 1000000;
 	tb_to_us = mulhwu_scale_factor(ppc_tb_freq, 1000000);
+	calc_cputime_factors();
 
 	/*
 	 * Calculate the length of each tick in ns.  It will not be