author     Stephane Eranian <eranian@google.com>    2010-10-15 10:54:01 -0400
committer  Ingo Molnar <mingo@elte.hu>              2010-10-18 13:58:49 -0400
commit     8e5fc1a7320baf6076391607515dceb61319b36a
tree       24a5e7ac4bbd8a960fa90ce2a668b9475a3f0b16   /kernel/perf_event.c
parent     ba0cef3d149ce4db293c572bf36ed352b11ce7b9
perf_events: Fix transaction recovery in group_sched_in()
The group_sched_in() function uses a transactional approach to schedule a group of events. In a group, either all events can be scheduled or none are. To schedule each event in, the function calls event_sched_in(). In case of error, event_sched_out() is called on each event in the group.

The problem is that event_sched_out() does not completely cancel the effects of event_sched_in(). Furthermore, event_sched_out() changes the state of the event as if it had run, which is not true in this particular case.

Those inconsistencies impact the time tracking fields and may lead to events in a group not all reporting the same time_enabled and time_running values. This is demonstrated with the example below:

$ task -eunhalted_core_cycles,baclears,baclears -e unhalted_core_cycles,baclears,baclears sleep 5
1946101 unhalted_core_cycles (32.85% scaling, ena=829181, run=556827)
  11423 baclears (32.85% scaling, ena=829181, run=556827)
   7671 baclears (0.00% scaling, ena=556827, run=556827)
2250443 unhalted_core_cycles (57.83% scaling, ena=962822, run=405995)
  11705 baclears (57.83% scaling, ena=962822, run=405995)
  11705 baclears (57.83% scaling, ena=962822, run=405995)

Notice that in the first group, the last baclears event does not report the same timings as its siblings.

This issue comes from the fact that tstamp_stopped is updated by event_sched_out() as if the event had actually run.

To solve the issue, we must ensure that, in case of error, there is no change in the event state whatsoever. That means the timings must remain as they were when entering group_sched_in().

To do this, we defer updating tstamp_running until we know the transaction succeeded. Therefore, we split event_sched_in() into two parts, separating out the update to tstamp_running. Similarly, in case of error, we do not want to update tstamp_stopped, so we split event_sched_out() into two parts as well, separating out the update to tstamp_stopped.

With this patch, we now get the following output:

$ task -eunhalted_core_cycles,baclears,baclears -e unhalted_core_cycles,baclears,baclears sleep 5
2492050 unhalted_core_cycles (71.75% scaling, ena=1093330, run=308841)
  11243 baclears (71.75% scaling, ena=1093330, run=308841)
  11243 baclears (71.75% scaling, ena=1093330, run=308841)
1852746 unhalted_core_cycles (0.00% scaling, ena=784489, run=784489)
   9253 baclears (0.00% scaling, ena=784489, run=784489)
   9253 baclears (0.00% scaling, ena=784489, run=784489)

Note that the uneven timing between groups is a side effect of the process spending most of its time sleeping, i.e., not enough event rotations (but that's a separate issue).

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4cb86b4c.41e9d80a.44e9.3e19@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
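To make the recovery requirement concrete, here is a minimal user-space sketch of the same deferred-commit pattern: each member of the group is scheduled without touching its timestamps, and the current time is only folded into tstamp_running once the whole transaction has committed. The toy_* types and helpers below are invented for illustration only; this is not the kernel implementation, just a sketch of the idea.

/*
 * Minimal user-space sketch (not kernel code) of the pattern described
 * above.  All toy_* names are invented for illustration.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_event {
	bool counter_free;              /* stands in for a free PMU counter */
	bool scheduled;
	unsigned long long tstamp_stopped;
	unsigned long long tstamp_running;
};

/* like __event_sched_in(): try to schedule, but touch no timestamps */
static int toy_sched_in(struct toy_event *e)
{
	if (!e->counter_free)
		return -1;
	e->scheduled = true;
	return 0;
}

/* like __event_sched_out(): undo scheduling, leave tstamp_stopped alone */
static void toy_sched_out(struct toy_event *e)
{
	e->scheduled = false;
}

/* like group_commit_event_sched_in(): commit timings only on success */
static void toy_group_commit(struct toy_event *g, int n, unsigned long long now)
{
	for (int i = 0; i < n; i++)
		g[i].tstamp_running += now - g[i].tstamp_stopped;
}

/* all-or-nothing scheduling of a group, mirroring group_sched_in() */
static int toy_group_sched_in(struct toy_event *g, int n, unsigned long long now)
{
	for (int i = 0; i < n; i++) {
		if (toy_sched_in(&g[i])) {
			while (i-- > 0)         /* undo the partial group */
				toy_sched_out(&g[i]);
			return -1;              /* timings left untouched */
		}
	}
	toy_group_commit(g, n, now);            /* transaction committed */
	return 0;
}

int main(void)
{
	struct toy_event group[2] = {
		{ .counter_free = true,  .tstamp_stopped = 100 },
		{ .counter_free = false, .tstamp_stopped = 100 },
	};

	if (toy_group_sched_in(group, 2, 250))
		printf("group failed, tstamp_running still %llu and %llu\n",
		       group[0].tstamp_running, group[1].tstamp_running);
	return 0;
}

On the failing path the sketch, like the patch, undoes any partially scheduled events without ever writing the timing fields, so a later successful attempt starts from unmodified timings.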
Diffstat (limited to 'kernel/perf_event.c')
-rw-r--r--   kernel/perf_event.c   76
1 file changed, 63 insertions(+), 13 deletions(-)
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index e7eeba1794fd..634f86a4b2f9 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -412,8 +412,8 @@ event_filter_match(struct perf_event *event)
 	return event->cpu == -1 || event->cpu == smp_processor_id();
 }
 
-static void
-event_sched_out(struct perf_event *event,
+static int
+__event_sched_out(struct perf_event *event,
 		  struct perf_cpu_context *cpuctx,
 		  struct perf_event_context *ctx)
 {
@@ -432,14 +432,13 @@ event_sched_out(struct perf_event *event,
 	}
 
 	if (event->state != PERF_EVENT_STATE_ACTIVE)
-		return;
+		return 0;
 
 	event->state = PERF_EVENT_STATE_INACTIVE;
 	if (event->pending_disable) {
 		event->pending_disable = 0;
 		event->state = PERF_EVENT_STATE_OFF;
 	}
-	event->tstamp_stopped = ctx->time;
 	event->pmu->del(event, 0);
 	event->oncpu = -1;
 
@@ -448,6 +447,19 @@ event_sched_out(struct perf_event *event,
 	ctx->nr_active--;
 	if (event->attr.exclusive || !cpuctx->active_oncpu)
 		cpuctx->exclusive = 0;
+	return 1;
+}
+
+static void
+event_sched_out(struct perf_event *event,
+		struct perf_cpu_context *cpuctx,
+		struct perf_event_context *ctx)
+{
+	int ret;
+
+	ret = __event_sched_out(event, cpuctx, ctx);
+	if (ret)
+		event->tstamp_stopped = ctx->time;
 }
 
 static void
@@ -647,7 +659,7 @@ retry:
 }
 
 static int
-event_sched_in(struct perf_event *event,
+__event_sched_in(struct perf_event *event,
 		 struct perf_cpu_context *cpuctx,
 		 struct perf_event_context *ctx)
 {
@@ -667,8 +679,6 @@ event_sched_in(struct perf_event *event,
 		return -EAGAIN;
 	}
 
-	event->tstamp_running += ctx->time - event->tstamp_stopped;
-
 	if (!is_software_event(event))
 		cpuctx->active_oncpu++;
 	ctx->nr_active++;
@@ -679,6 +689,35 @@ event_sched_in(struct perf_event *event,
 	return 0;
 }
 
+static inline int
+event_sched_in(struct perf_event *event,
+		struct perf_cpu_context *cpuctx,
+		struct perf_event_context *ctx)
+{
+	int ret = __event_sched_in(event, cpuctx, ctx);
+	if (ret)
+		return ret;
+	event->tstamp_running += ctx->time - event->tstamp_stopped;
+	return 0;
+}
+
+static void
+group_commit_event_sched_in(struct perf_event *group_event,
+		struct perf_cpu_context *cpuctx,
+		struct perf_event_context *ctx)
+{
+	struct perf_event *event;
+	u64 now = ctx->time;
+
+	group_event->tstamp_running += now - group_event->tstamp_stopped;
+	/*
+	 * Schedule in siblings as one group (if any):
+	 */
+	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
+		event->tstamp_running += now - event->tstamp_stopped;
+	}
+}
+
 static int
 group_sched_in(struct perf_event *group_event,
 	       struct perf_cpu_context *cpuctx,
@@ -692,7 +731,13 @@ group_sched_in(struct perf_event *group_event,
 
 	pmu->start_txn(pmu);
 
-	if (event_sched_in(group_event, cpuctx, ctx)) {
+	/*
+	 * use __event_sched_in() to delay updating tstamp_running
+	 * until the transaction is committed. In case of failure
+	 * we will keep an unmodified tstamp_running which is a
+	 * requirement to get correct timing information
+	 */
+	if (__event_sched_in(group_event, cpuctx, ctx)) {
 		pmu->cancel_txn(pmu);
 		return -EAGAIN;
 	}
@@ -701,26 +746,31 @@ group_sched_in(struct perf_event *group_event,
 	 * Schedule in siblings as one group (if any):
 	 */
 	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
-		if (event_sched_in(event, cpuctx, ctx)) {
+		if (__event_sched_in(event, cpuctx, ctx)) {
 			partial_group = event;
 			goto group_error;
 		}
 	}
 
-	if (!pmu->commit_txn(pmu))
+	if (!pmu->commit_txn(pmu)) {
+		/* commit tstamp_running */
+		group_commit_event_sched_in(group_event, cpuctx, ctx);
 		return 0;
-
+	}
 group_error:
 	/*
 	 * Groups can be scheduled in as one unit only, so undo any
 	 * partial group before returning:
+	 *
+	 * use __event_sched_out() to avoid updating tstamp_stopped
+	 * because the event never actually ran
 	 */
 	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
 		if (event == partial_group)
 			break;
-		event_sched_out(event, cpuctx, ctx);
+		__event_sched_out(event, cpuctx, ctx);
 	}
-	event_sched_out(group_event, cpuctx, ctx);
+	__event_sched_out(group_event, cpuctx, ctx);
 
 	pmu->cancel_txn(pmu);
 