aboutsummaryrefslogtreecommitdiffstats
path: root/arch/powerpc/kernel/time.c
diff options
context:
space:
mode:
authorPaul Mackerras <paulus@samba.org>2010-08-26 15:56:43 -0400
committerBenjamin Herrenschmidt <benh@kernel.crashing.org>2010-09-02 00:07:31 -0400
commitcf9efce0ce3136fa076f53e53154e98455229514 (patch)
tree0e110018b160aff4813b81e0e8c3a43a364edd48 /arch/powerpc/kernel/time.c
parent93c22703efa72c7527dbd586d1951c1f4a85fd70 (diff)
powerpc: Account time using timebase rather than PURR
Currently, when CONFIG_VIRT_CPU_ACCOUNTING is enabled, we use the PURR register for measuring the user and system time used by processes, as well as other related times such as hardirq and softirq times. This turns out to be quite confusing for users because it means that a program will often be measured as taking less time when run on a multi-threaded processor (SMT2 or SMT4 mode) than it does when run on a single-threaded processor (ST mode), even though the program takes longer to finish. The discrepancy is accounted for as stolen time, which is also confusing, particularly when there are no other partitions running. This changes the accounting to use the timebase instead, meaning that the reported user and system times are the actual number of real-time seconds that the program was executing on the processor thread, regardless of which SMT mode the processor is in. Thus a program will generally show greater user and system times when run on a multi-threaded processor than on a single-threaded processor. On pSeries systems on POWER5 or later processors, we measure the stolen time (time when this partition wasn't running) using the hypervisor dispatch trace log. We check for new entries in the log on every entry from user mode and on every transition from kernel process context to soft or hard IRQ context (i.e. when account_system_vtime() gets called). So that we can correctly distinguish time stolen from user time and time stolen from system time, without having to check the log on every exit to user mode, we store separate timestamps for exit to user mode and entry from user mode. On systems that have a SPURR (POWER6 and POWER7), we read the SPURR in account_system_vtime() (as before), and then apportion the SPURR ticks since the last time we read it between scaled user time and scaled system time according to the relative proportions of user time and system time over the same interval. This avoids having to read the SPURR on every kernel entry and exit. 
On systems that have PURR but not SPURR (i.e., POWER5), we do the same using the PURR rather than the SPURR. This disables the DTL user interface in /sys/kernel/debug/powerpc/dtl for now since it conflicts with the use of the dispatch trace log by the time accounting code. Signed-off-by: Paul Mackerras <paulus@samba.org> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Diffstat (limited to 'arch/powerpc/kernel/time.c')
-rw-r--r--arch/powerpc/kernel/time.c268
1 files changed, 127 insertions, 141 deletions
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 8533b3b83f5d..fca20643c368 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -164,8 +164,6 @@ unsigned long ppc_proc_freq;
164EXPORT_SYMBOL(ppc_proc_freq); 164EXPORT_SYMBOL(ppc_proc_freq);
165unsigned long ppc_tb_freq; 165unsigned long ppc_tb_freq;
166 166
167static DEFINE_PER_CPU(u64, last_jiffy);
168
169#ifdef CONFIG_VIRT_CPU_ACCOUNTING 167#ifdef CONFIG_VIRT_CPU_ACCOUNTING
170/* 168/*
171 * Factors for converting from cputime_t (timebase ticks) to 169 * Factors for converting from cputime_t (timebase ticks) to
@@ -200,62 +198,151 @@ static void calc_cputime_factors(void)
200} 198}
201 199
202/* 200/*
203 * Read the PURR on systems that have it, otherwise the timebase. 201 * Read the SPURR on systems that have it, otherwise the PURR,
202 * or if that doesn't exist return the timebase value passed in.
204 */ 203 */
205static u64 read_purr(void) 204static u64 read_spurr(u64 tb)
206{ 205{
206 if (cpu_has_feature(CPU_FTR_SPURR))
207 return mfspr(SPRN_SPURR);
207 if (cpu_has_feature(CPU_FTR_PURR)) 208 if (cpu_has_feature(CPU_FTR_PURR))
208 return mfspr(SPRN_PURR); 209 return mfspr(SPRN_PURR);
209 return mftb(); 210 return tb;
210} 211}
211 212
213#ifdef CONFIG_PPC_SPLPAR
214
212/* 215/*
213 * Read the SPURR on systems that have it, otherwise the purr 216 * Scan the dispatch trace log and count up the stolen time.
217 * Should be called with interrupts disabled.
214 */ 218 */
215static u64 read_spurr(u64 purr) 219static u64 scan_dispatch_log(u64 stop_tb)
216{ 220{
217 /* 221 unsigned long i = local_paca->dtl_ridx;
218 * cpus without PURR won't have a SPURR 222 struct dtl_entry *dtl = local_paca->dtl_curr;
219 * We already know the former when we use this, so tell gcc 223 struct dtl_entry *dtl_end = local_paca->dispatch_log_end;
220 */ 224 struct lppaca *vpa = local_paca->lppaca_ptr;
221 if (cpu_has_feature(CPU_FTR_PURR) && cpu_has_feature(CPU_FTR_SPURR)) 225 u64 tb_delta;
222 return mfspr(SPRN_SPURR); 226 u64 stolen = 0;
223 return purr; 227 u64 dtb;
228
229 if (i == vpa->dtl_idx)
230 return 0;
231 while (i < vpa->dtl_idx) {
232 dtb = dtl->timebase;
233 tb_delta = dtl->enqueue_to_dispatch_time +
234 dtl->ready_to_enqueue_time;
235 barrier();
236 if (i + N_DISPATCH_LOG < vpa->dtl_idx) {
237 /* buffer has overflowed */
238 i = vpa->dtl_idx - N_DISPATCH_LOG;
239 dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG);
240 continue;
241 }
242 if (dtb > stop_tb)
243 break;
244 stolen += tb_delta;
245 ++i;
246 ++dtl;
247 if (dtl == dtl_end)
248 dtl = local_paca->dispatch_log;
249 }
250 local_paca->dtl_ridx = i;
251 local_paca->dtl_curr = dtl;
252 return stolen;
224} 253}
225 254
226/* 255/*
256 * Accumulate stolen time by scanning the dispatch trace log.
257 * Called on entry from user mode.
258 */
259void accumulate_stolen_time(void)
260{
261 u64 sst, ust;
262
263 sst = scan_dispatch_log(get_paca()->starttime_user);
264 ust = scan_dispatch_log(get_paca()->starttime);
265 get_paca()->system_time -= sst;
266 get_paca()->user_time -= ust;
267 get_paca()->stolen_time += ust + sst;
268}
269
270static inline u64 calculate_stolen_time(u64 stop_tb)
271{
272 u64 stolen = 0;
273
274 if (get_paca()->dtl_ridx != get_paca()->lppaca_ptr->dtl_idx) {
275 stolen = scan_dispatch_log(stop_tb);
276 get_paca()->system_time -= stolen;
277 }
278
279 stolen += get_paca()->stolen_time;
280 get_paca()->stolen_time = 0;
281 return stolen;
282}
283
284#else /* CONFIG_PPC_SPLPAR */
285static inline u64 calculate_stolen_time(u64 stop_tb)
286{
287 return 0;
288}
289
290#endif /* CONFIG_PPC_SPLPAR */
291
292/*
227 * Account time for a transition between system, hard irq 293 * Account time for a transition between system, hard irq
228 * or soft irq state. 294 * or soft irq state.
229 */ 295 */
230void account_system_vtime(struct task_struct *tsk) 296void account_system_vtime(struct task_struct *tsk)
231{ 297{
232 u64 now, nowscaled, delta, deltascaled, sys_time; 298 u64 now, nowscaled, delta, deltascaled;
233 unsigned long flags; 299 unsigned long flags;
300 u64 stolen, udelta, sys_scaled, user_scaled;
234 301
235 local_irq_save(flags); 302 local_irq_save(flags);
236 now = read_purr(); 303 now = mftb();
237 nowscaled = read_spurr(now); 304 nowscaled = read_spurr(now);
238 delta = now - get_paca()->startpurr; 305 get_paca()->system_time += now - get_paca()->starttime;
306 get_paca()->starttime = now;
239 deltascaled = nowscaled - get_paca()->startspurr; 307 deltascaled = nowscaled - get_paca()->startspurr;
240 get_paca()->startpurr = now;
241 get_paca()->startspurr = nowscaled; 308 get_paca()->startspurr = nowscaled;
242 if (!in_interrupt()) { 309
243 /* deltascaled includes both user and system time. 310 stolen = calculate_stolen_time(now);
244 * Hence scale it based on the purr ratio to estimate 311
245 * the system time */ 312 delta = get_paca()->system_time;
246 sys_time = get_paca()->system_time; 313 get_paca()->system_time = 0;
247 if (get_paca()->user_time) 314 udelta = get_paca()->user_time - get_paca()->utime_sspurr;
248 deltascaled = deltascaled * sys_time / 315 get_paca()->utime_sspurr = get_paca()->user_time;
249 (sys_time + get_paca()->user_time); 316
250 delta += sys_time; 317 /*
251 get_paca()->system_time = 0; 318 * Because we don't read the SPURR on every kernel entry/exit,
319 * deltascaled includes both user and system SPURR ticks.
320 * Apportion these ticks to system SPURR ticks and user
321 * SPURR ticks in the same ratio as the system time (delta)
322 * and user time (udelta) values obtained from the timebase
323 * over the same interval. The system ticks get accounted here;
324 * the user ticks get saved up in paca->user_time_scaled to be
325 * used by account_process_tick.
326 */
327 sys_scaled = delta;
328 user_scaled = udelta;
329 if (deltascaled != delta + udelta) {
330 if (udelta) {
331 sys_scaled = deltascaled * delta / (delta + udelta);
332 user_scaled = deltascaled - sys_scaled;
333 } else {
334 sys_scaled = deltascaled;
335 }
336 }
337 get_paca()->user_time_scaled += user_scaled;
338
339 if (in_irq() || idle_task(smp_processor_id()) != tsk) {
340 account_system_time(tsk, 0, delta, sys_scaled);
341 if (stolen)
342 account_steal_time(stolen);
343 } else {
344 account_idle_time(delta + stolen);
252 } 345 }
253 if (in_irq() || idle_task(smp_processor_id()) != tsk)
254 account_system_time(tsk, 0, delta, deltascaled);
255 else
256 account_idle_time(delta);
257 __get_cpu_var(cputime_last_delta) = delta;
258 __get_cpu_var(cputime_scaled_last_delta) = deltascaled;
259 local_irq_restore(flags); 346 local_irq_restore(flags);
260} 347}
261EXPORT_SYMBOL_GPL(account_system_vtime); 348EXPORT_SYMBOL_GPL(account_system_vtime);
@@ -265,125 +352,26 @@ EXPORT_SYMBOL_GPL(account_system_vtime);
265 * by the exception entry and exit code to the generic process 352 * by the exception entry and exit code to the generic process
266 * user and system time records. 353 * user and system time records.
267 * Must be called with interrupts disabled. 354 * Must be called with interrupts disabled.
355 * Assumes that account_system_vtime() has been called recently
356 * (i.e. since the last entry from usermode) so that
357 * get_paca()->user_time_scaled is up to date.
268 */ 358 */
269void account_process_tick(struct task_struct *tsk, int user_tick) 359void account_process_tick(struct task_struct *tsk, int user_tick)
270{ 360{
271 cputime_t utime, utimescaled; 361 cputime_t utime, utimescaled;
272 362
273 utime = get_paca()->user_time; 363 utime = get_paca()->user_time;
364 utimescaled = get_paca()->user_time_scaled;
274 get_paca()->user_time = 0; 365 get_paca()->user_time = 0;
275 utimescaled = cputime_to_scaled(utime); 366 get_paca()->user_time_scaled = 0;
367 get_paca()->utime_sspurr = 0;
276 account_user_time(tsk, utime, utimescaled); 368 account_user_time(tsk, utime, utimescaled);
277} 369}
278 370
279/*
280 * Stuff for accounting stolen time.
281 */
282struct cpu_purr_data {
283 int initialized; /* thread is running */
284 u64 tb; /* last TB value read */
285 u64 purr; /* last PURR value read */
286 u64 spurr; /* last SPURR value read */
287};
288
289/*
290 * Each entry in the cpu_purr_data array is manipulated only by its
291 * "owner" cpu -- usually in the timer interrupt but also occasionally
292 * in process context for cpu online. As long as cpus do not touch
293 * each others' cpu_purr_data, disabling local interrupts is
294 * sufficient to serialize accesses.
295 */
296static DEFINE_PER_CPU(struct cpu_purr_data, cpu_purr_data);
297
298static void snapshot_tb_and_purr(void *data)
299{
300 unsigned long flags;
301 struct cpu_purr_data *p = &__get_cpu_var(cpu_purr_data);
302
303 local_irq_save(flags);
304 p->tb = get_tb_or_rtc();
305 p->purr = mfspr(SPRN_PURR);
306 wmb();
307 p->initialized = 1;
308 local_irq_restore(flags);
309}
310
311/*
312 * Called during boot when all cpus have come up.
313 */
314void snapshot_timebases(void)
315{
316 if (!cpu_has_feature(CPU_FTR_PURR))
317 return;
318 on_each_cpu(snapshot_tb_and_purr, NULL, 1);
319}
320
321/*
322 * Must be called with interrupts disabled.
323 */
324void calculate_steal_time(void)
325{
326 u64 tb, purr;
327 s64 stolen;
328 struct cpu_purr_data *pme;
329
330 pme = &__get_cpu_var(cpu_purr_data);
331 if (!pme->initialized)
332 return; /* !CPU_FTR_PURR or early in early boot */
333 tb = mftb();
334 purr = mfspr(SPRN_PURR);
335 stolen = (tb - pme->tb) - (purr - pme->purr);
336 if (stolen > 0) {
337 if (idle_task(smp_processor_id()) != current)
338 account_steal_time(stolen);
339 else
340 account_idle_time(stolen);
341 }
342 pme->tb = tb;
343 pme->purr = purr;
344}
345
346#ifdef CONFIG_PPC_SPLPAR
347/*
348 * Must be called before the cpu is added to the online map when
349 * a cpu is being brought up at runtime.
350 */
351static void snapshot_purr(void)
352{
353 struct cpu_purr_data *pme;
354 unsigned long flags;
355
356 if (!cpu_has_feature(CPU_FTR_PURR))
357 return;
358 local_irq_save(flags);
359 pme = &__get_cpu_var(cpu_purr_data);
360 pme->tb = mftb();
361 pme->purr = mfspr(SPRN_PURR);
362 pme->initialized = 1;
363 local_irq_restore(flags);
364}
365
366#endif /* CONFIG_PPC_SPLPAR */
367
368#else /* ! CONFIG_VIRT_CPU_ACCOUNTING */ 371#else /* ! CONFIG_VIRT_CPU_ACCOUNTING */
369#define calc_cputime_factors() 372#define calc_cputime_factors()
370#define calculate_steal_time() do { } while (0)
371#endif 373#endif
372 374
373#if !(defined(CONFIG_VIRT_CPU_ACCOUNTING) && defined(CONFIG_PPC_SPLPAR))
374#define snapshot_purr() do { } while (0)
375#endif
376
377/*
378 * Called when a cpu comes up after the system has finished booting,
379 * i.e. as a result of a hotplug cpu action.
380 */
381void snapshot_timebase(void)
382{
383 __get_cpu_var(last_jiffy) = get_tb_or_rtc();
384 snapshot_purr();
385}
386
387void __delay(unsigned long loops) 375void __delay(unsigned long loops)
388{ 376{
389 unsigned long start; 377 unsigned long start;
@@ -585,8 +573,6 @@ void timer_interrupt(struct pt_regs * regs)
585 old_regs = set_irq_regs(regs); 573 old_regs = set_irq_regs(regs);
586 irq_enter(); 574 irq_enter();
587 575
588 calculate_steal_time();
589
590 if (test_perf_event_pending()) { 576 if (test_perf_event_pending()) {
591 clear_perf_event_pending(); 577 clear_perf_event_pending();
592 perf_event_do_pending(); 578 perf_event_do_pending();