path: root/kernel/perf_counter.c
author		Paul Mackerras <paulus@samba.org>	2009-03-25 07:46:58 -0400
committer	Ingo Molnar <mingo@elte.hu>		2009-04-06 03:30:36 -0400
commit		53cfbf593758916aac41db728f029986a62f1254 (patch)
tree		c58a9c0f6e3cc050235e736e288e268bdb1f37eb /kernel/perf_counter.c
parent		7730d8655880f41f2ea519aca2ca6a1413dfd2c9 (diff)
perf_counter: record time running and time enabled for each counter
Impact: new functionality

Currently, if there are more counters enabled than can fit on the CPU, the kernel will multiplex the counters on to the hardware using round-robin scheduling. That isn't too bad for sampling counters, but for counting counters it means that the value read from a counter represents some unknown fraction of the true count of events that occurred while the counter was enabled.

This remedies the situation by keeping track of how long each counter is enabled for, and how long it is actually on the cpu and counting events. These times are recorded in nanoseconds using the task clock for per-task counters and the cpu clock for per-cpu counters. These values can be supplied to userspace on a read from the counter.

Userspace requests that they be supplied after the counter value by setting the PERF_FORMAT_TOTAL_TIME_ENABLED and/or PERF_FORMAT_TOTAL_TIME_RUNNING bits in the hw_event.read_format field when creating the counter. (There is no way to change the read format after the counter is created, though it would be possible to add some way to do that.)

Using this information it is possible for userspace to scale the count it reads from the counter to get an estimate of the true count:

	true_count_estimate = count * total_time_enabled / total_time_running

This also lets userspace detect the situation where the counter never got to go on the cpu: total_time_running == 0.

This functionality has been requested by the PAPI developers, and will be generally needed for interpreting the count values from counting counters correctly.

In the implementation, this keeps 5 time values (in nanoseconds) for each counter: total_time_enabled and total_time_running are used when the counter is in state OFF or ERROR and for reporting back to userspace. When the counter is in state INACTIVE or ACTIVE, it is the tstamp_enabled, tstamp_running and tstamp_stopped values that are relevant, and total_time_enabled and total_time_running are determined from them. (tstamp_stopped is only used in INACTIVE state.) The reason for doing it like this is that it means that only counters being enabled or disabled at sched-in and sched-out time need to be updated. There are no new loops that iterate over all counters to update total_time_enabled or total_time_running.

This also keeps separate child_total_time_running and child_total_time_enabled fields that get added in when reporting the totals to userspace. They are separate fields so that they can be atomic. We don't want to use atomics for total_time_running, total_time_enabled etc., because then we would have to use atomic sequences to update them, which are slower than regular arithmetic and memory accesses.

It is possible to measure total_time_running by adding a task_clock counter to each group of counters, and total_time_enabled can be measured approximately with a top-level task_clock counter (though inaccuracies will creep in if you need to disable and enable groups since it is not possible in general to disable/enable the top-level task_clock counter simultaneously with another group). However, that adds extra overhead - I measured around 15% increase in the context switch latency reported by lat_ctx (from lmbench) when a task_clock counter was added to each of 2 groups, and around 25% increase when a task_clock counter was added to each of 4 groups. (In both cases a top-level task-clock counter was also added.)

In contrast, the code added in this commit gives better information with no overhead that I could measure (in fact in some cases I measured lower times with this code, but the differences were all less than one standard deviation).

[ v2: address review comments by Andrew Morton. ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andrew Morton <akpm@linux-foundation.org>
Orig-LKML-Reference: <18890.6578.728637.139402@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
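As an illustration of the scaling rule above: a counter that read 1,000,000 events while enabled for 10 ms but running for only 2.5 ms would be scaled to an estimated 1,000,000 * 10 / 2.5 = 4,000,000 events. A minimal userspace sketch of that calculation follows (not part of this patch; it assumes counter_fd refers to a counter created with both PERF_FORMAT_TOTAL_TIME_ENABLED and PERF_FORMAT_TOTAL_TIME_RUNNING set in hw_event.read_format, so a read() returns count, total_time_enabled and total_time_running in that order, as implemented in perf_read_hw() below; the helper name and error handling are hypothetical):

/* Hypothetical userspace helper; not part of the kernel patch. */
#include <stdint.h>
#include <unistd.h>

static int read_scaled_count(int counter_fd, uint64_t *estimate)
{
	/* values[0] = count, values[1] = total_time_enabled, values[2] = total_time_running */
	uint64_t values[3];

	if (read(counter_fd, values, sizeof(values)) != (ssize_t)sizeof(values))
		return -1;

	if (values[2] == 0) {
		/* The counter never got to go on the cpu. */
		*estimate = 0;
		return -1;
	}

	/* true_count_estimate = count * total_time_enabled / total_time_running */
	*estimate = (uint64_t)(values[0] * ((double)values[1] / (double)values[2]));
	return 0;
}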
Diffstat (limited to 'kernel/perf_counter.c')
-rw-r--r--	kernel/perf_counter.c	157
1 file changed, 136 insertions(+), 21 deletions(-)
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 95e02575546b..3b862a7988cd 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -116,6 +116,7 @@ counter_sched_out(struct perf_counter *counter,
 		return;
 
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
+	counter->tstamp_stopped = ctx->time_now;
 	counter->hw_ops->disable(counter);
 	counter->oncpu = -1;
 
@@ -252,6 +253,60 @@ retry:
 }
 
 /*
+ * Get the current time for this context.
+ * If this is a task context, we use the task's task clock,
+ * or for a per-cpu context, we use the cpu clock.
+ */
+static u64 get_context_time(struct perf_counter_context *ctx, int update)
+{
+	struct task_struct *curr = ctx->task;
+
+	if (!curr)
+		return cpu_clock(smp_processor_id());
+
+	return __task_delta_exec(curr, update) + curr->se.sum_exec_runtime;
+}
+
+/*
+ * Update the record of the current time in a context.
+ */
+static void update_context_time(struct perf_counter_context *ctx, int update)
+{
+	ctx->time_now = get_context_time(ctx, update) - ctx->time_lost;
+}
+
+/*
+ * Update the total_time_enabled and total_time_running fields for a counter.
+ */
+static void update_counter_times(struct perf_counter *counter)
+{
+	struct perf_counter_context *ctx = counter->ctx;
+	u64 run_end;
+
+	if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
+		counter->total_time_enabled = ctx->time_now -
+			counter->tstamp_enabled;
+		if (counter->state == PERF_COUNTER_STATE_INACTIVE)
+			run_end = counter->tstamp_stopped;
+		else
+			run_end = ctx->time_now;
+		counter->total_time_running = run_end - counter->tstamp_running;
+	}
+}
+
+/*
+ * Update total_time_enabled and total_time_running for all counters in a group.
+ */
+static void update_group_times(struct perf_counter *leader)
+{
+	struct perf_counter *counter;
+
+	update_counter_times(leader);
+	list_for_each_entry(counter, &leader->sibling_list, list_entry)
+		update_counter_times(counter);
+}
+
+/*
  * Cross CPU call to disable a performance counter
  */
 static void __perf_counter_disable(void *info)
@@ -276,6 +331,8 @@ static void __perf_counter_disable(void *info)
 	 * If it is in error state, leave it in error state.
	 */
 	if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
+		update_context_time(ctx, 1);
+		update_counter_times(counter);
 		if (counter == counter->group_leader)
 			group_sched_out(counter, cpuctx, ctx);
 		else
@@ -320,8 +377,10 @@ static void perf_counter_disable(struct perf_counter *counter)
 	 * Since we have the lock this context can't be scheduled
 	 * in, so we can change the state safely.
 	 */
-	if (counter->state == PERF_COUNTER_STATE_INACTIVE)
+	if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
+		update_counter_times(counter);
 		counter->state = PERF_COUNTER_STATE_OFF;
+	}
 
 	spin_unlock_irq(&ctx->lock);
 }
@@ -366,6 +425,8 @@ counter_sched_in(struct perf_counter *counter,
 		return -EAGAIN;
 	}
 
+	counter->tstamp_running += ctx->time_now - counter->tstamp_stopped;
+
 	if (!is_software_counter(counter))
 		cpuctx->active_oncpu++;
 	ctx->nr_active++;
@@ -425,6 +486,17 @@ static int group_can_go_on(struct perf_counter *counter,
 	return can_add_hw;
 }
 
+static void add_counter_to_ctx(struct perf_counter *counter,
+			       struct perf_counter_context *ctx)
+{
+	list_add_counter(counter, ctx);
+	ctx->nr_counters++;
+	counter->prev_state = PERF_COUNTER_STATE_OFF;
+	counter->tstamp_enabled = ctx->time_now;
+	counter->tstamp_running = ctx->time_now;
+	counter->tstamp_stopped = ctx->time_now;
+}
+
 /*
  * Cross CPU call to install and enable a performance counter
  */
@@ -449,6 +521,7 @@ static void __perf_install_in_context(void *info)
 
 	curr_rq_lock_irq_save(&flags);
 	spin_lock(&ctx->lock);
+	update_context_time(ctx, 1);
 
 	/*
 	 * Protect the list operation against NMI by disabling the
@@ -456,9 +529,7 @@ static void __perf_install_in_context(void *info)
 	 */
 	perf_flags = hw_perf_save_disable();
 
-	list_add_counter(counter, ctx);
-	ctx->nr_counters++;
-	counter->prev_state = PERF_COUNTER_STATE_OFF;
+	add_counter_to_ctx(counter, ctx);
 
 	/*
 	 * Don't put the counter on if it is disabled or if
@@ -486,8 +557,10 @@ static void __perf_install_in_context(void *info)
 		 */
 		if (leader != counter)
 			group_sched_out(leader, cpuctx, ctx);
-		if (leader->hw_event.pinned)
+		if (leader->hw_event.pinned) {
+			update_group_times(leader);
 			leader->state = PERF_COUNTER_STATE_ERROR;
+		}
 	}
 
 	if (!err && !ctx->task && cpuctx->max_pertask)
@@ -548,10 +621,8 @@ retry:
 	 * can add the counter safely, if it the call above did not
 	 * succeed.
 	 */
-	if (list_empty(&counter->list_entry)) {
-		list_add_counter(counter, ctx);
-		ctx->nr_counters++;
-	}
+	if (list_empty(&counter->list_entry))
+		add_counter_to_ctx(counter, ctx);
 	spin_unlock_irq(&ctx->lock);
 }
 
@@ -576,11 +647,13 @@ static void __perf_counter_enable(void *info)
 
 	curr_rq_lock_irq_save(&flags);
 	spin_lock(&ctx->lock);
+	update_context_time(ctx, 1);
 
 	counter->prev_state = counter->state;
 	if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
 		goto unlock;
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
+	counter->tstamp_enabled = ctx->time_now - counter->total_time_enabled;
 
 	/*
 	 * If the counter is in a group and isn't the group leader,
@@ -602,8 +675,10 @@ static void __perf_counter_enable(void *info)
 		 */
 		if (leader != counter)
 			group_sched_out(leader, cpuctx, ctx);
-		if (leader->hw_event.pinned)
+		if (leader->hw_event.pinned) {
+			update_group_times(leader);
 			leader->state = PERF_COUNTER_STATE_ERROR;
+		}
 	}
 
  unlock:
@@ -659,8 +734,11 @@ static void perf_counter_enable(struct perf_counter *counter)
 	 * Since we have the lock this context can't be scheduled
 	 * in, so we can change the state safely.
 	 */
-	if (counter->state == PERF_COUNTER_STATE_OFF)
+	if (counter->state == PERF_COUNTER_STATE_OFF) {
 		counter->state = PERF_COUNTER_STATE_INACTIVE;
+		counter->tstamp_enabled = ctx->time_now -
+			counter->total_time_enabled;
+	}
  out:
 	spin_unlock_irq(&ctx->lock);
 }
@@ -693,6 +771,7 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx,
 	ctx->is_active = 0;
 	if (likely(!ctx->nr_counters))
 		goto out;
+	update_context_time(ctx, 0);
 
 	flags = hw_perf_save_disable();
 	if (ctx->nr_active) {
@@ -797,6 +876,13 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 	if (likely(!ctx->nr_counters))
 		goto out;
 
+	/*
+	 * Add any time since the last sched_out to the lost time
+	 * so it doesn't get included in the total_time_enabled and
+	 * total_time_running measures for counters in the context.
+	 */
+	ctx->time_lost = get_context_time(ctx, 0) - ctx->time_now;
+
 	flags = hw_perf_save_disable();
 
 	/*
@@ -817,8 +903,10 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 		 * If this pinned group hasn't been scheduled,
 		 * put it in error state.
 		 */
-		if (counter->state == PERF_COUNTER_STATE_INACTIVE)
+		if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
+			update_group_times(counter);
 			counter->state = PERF_COUNTER_STATE_ERROR;
+		}
 	}
 
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
@@ -902,8 +990,10 @@ int perf_counter_task_disable(void)
 	perf_flags = hw_perf_save_disable();
 
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
-		if (counter->state != PERF_COUNTER_STATE_ERROR)
+		if (counter->state != PERF_COUNTER_STATE_ERROR) {
+			update_group_times(counter);
 			counter->state = PERF_COUNTER_STATE_OFF;
+		}
 	}
 
 	hw_perf_restore(perf_flags);
@@ -946,6 +1036,8 @@ int perf_counter_task_enable(void)
 		if (counter->state > PERF_COUNTER_STATE_OFF)
 			continue;
 		counter->state = PERF_COUNTER_STATE_INACTIVE;
+		counter->tstamp_enabled = ctx->time_now -
+			counter->total_time_enabled;
 		counter->hw_event.disabled = 0;
 	}
 	hw_perf_restore(perf_flags);
@@ -1009,10 +1101,14 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 static void __read(void *info)
 {
 	struct perf_counter *counter = info;
+	struct perf_counter_context *ctx = counter->ctx;
 	unsigned long flags;
 
 	curr_rq_lock_irq_save(&flags);
+	if (ctx->is_active)
+		update_context_time(ctx, 1);
 	counter->hw_ops->read(counter);
+	update_counter_times(counter);
 	curr_rq_unlock_irq_restore(&flags);
 }
 
@@ -1025,6 +1121,8 @@ static u64 perf_counter_read(struct perf_counter *counter)
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
 		smp_call_function_single(counter->oncpu,
 					 __read, counter, 1);
+	} else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
+		update_counter_times(counter);
 	}
 
 	return atomic64_read(&counter->count);
@@ -1137,10 +1235,8 @@ static int perf_release(struct inode *inode, struct file *file)
 static ssize_t
 perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 {
-	u64 cntval;
-
-	if (count < sizeof(cntval))
-		return -EINVAL;
+	u64 values[3];
+	int n;
 
 	/*
 	 * Return end-of-file for a read on a counter that is in
@@ -1151,10 +1247,24 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 		return 0;
 
 	mutex_lock(&counter->mutex);
-	cntval = perf_counter_read(counter);
+	values[0] = perf_counter_read(counter);
+	n = 1;
+	if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+		values[n++] = counter->total_time_enabled +
+			atomic64_read(&counter->child_total_time_enabled);
+	if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+		values[n++] = counter->total_time_running +
+			atomic64_read(&counter->child_total_time_running);
 	mutex_unlock(&counter->mutex);
 
-	return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
+	if (count < n * sizeof(u64))
+		return -EINVAL;
+	count = n * sizeof(u64);
+
+	if (copy_to_user(buf, values, count))
+		return -EFAULT;
+
+	return count;
 }
 
 static ssize_t
@@ -2290,8 +2400,7 @@ inherit_counter(struct perf_counter *parent_counter,
 	 * Link it up in the child's context:
 	 */
 	child_counter->task = child;
-	list_add_counter(child_counter, child_ctx);
-	child_ctx->nr_counters++;
+	add_counter_to_ctx(child_counter, child_ctx);
 
 	child_counter->parent = parent_counter;
 	/*
@@ -2361,6 +2470,10 @@ static void sync_child_counter(struct perf_counter *child_counter,
 	 * Add back the child's count to the parent's count:
 	 */
 	atomic64_add(child_val, &parent_counter->count);
+	atomic64_add(child_counter->total_time_enabled,
+		     &parent_counter->child_total_time_enabled);
+	atomic64_add(child_counter->total_time_running,
+		     &parent_counter->child_total_time_running);
 
 	/*
 	 * Remove this counter from the parent's list
@@ -2395,6 +2508,7 @@ __perf_counter_exit_task(struct task_struct *child,
 	if (child != current) {
 		wait_task_inactive(child, 0);
 		list_del_init(&child_counter->list_entry);
+		update_counter_times(child_counter);
 	} else {
 		struct perf_cpu_context *cpuctx;
 		unsigned long flags;
@@ -2412,6 +2526,7 @@ __perf_counter_exit_task(struct task_struct *child,
 		cpuctx = &__get_cpu_var(perf_cpu_context);
 
 		group_sched_out(child_counter, cpuctx, child_ctx);
+		update_counter_times(child_counter);
 
 		list_del_init(&child_counter->list_entry);
 