path: root/include/linux/perf_counter.h
author		Paul Mackerras <paulus@samba.org>	2009-03-25 07:46:58 -0400
committer	Ingo Molnar <mingo@elte.hu>	2009-04-06 03:30:36 -0400
commit		53cfbf593758916aac41db728f029986a62f1254 (patch)
tree		c58a9c0f6e3cc050235e736e288e268bdb1f37eb	/include/linux/perf_counter.h
parent		7730d8655880f41f2ea519aca2ca6a1413dfd2c9 (diff)
perf_counter: record time running and time enabled for each counter
Impact: new functionality

Currently, if there are more counters enabled than can fit on the CPU, the kernel will multiplex the counters on to the hardware using round-robin scheduling. That isn't too bad for sampling counters, but for counting counters it means that the value read from a counter represents some unknown fraction of the true count of events that occurred while the counter was enabled.

This remedies the situation by keeping track of how long each counter is enabled for, and how long it is actually on the cpu and counting events. These times are recorded in nanoseconds using the task clock for per-task counters and the cpu clock for per-cpu counters.

These values can be supplied to userspace on a read from the counter. Userspace requests that they be supplied after the counter value by setting the PERF_FORMAT_TOTAL_TIME_ENABLED and/or PERF_FORMAT_TOTAL_TIME_RUNNING bits in the hw_event.read_format field when creating the counter. (There is no way to change the read format after the counter is created, though it would be possible to add some way to do that.)

Using this information it is possible for userspace to scale the count it reads from the counter to get an estimate of the true count:

	true_count_estimate = count * total_time_enabled / total_time_running

This also lets userspace detect the situation where the counter never got to go on the cpu: total_time_running == 0.

This functionality has been requested by the PAPI developers, and will be generally needed for interpreting the count values from counting counters correctly.

In the implementation, this keeps 5 time values (in nanoseconds) for each counter: total_time_enabled and total_time_running are used when the counter is in state OFF or ERROR and for reporting back to userspace. When the counter is in state INACTIVE or ACTIVE, it is the tstamp_enabled, tstamp_running and tstamp_stopped values that are relevant, and total_time_enabled and total_time_running are determined from them. (tstamp_stopped is only used in INACTIVE state.) The reason for doing it like this is that it means that only counters being enabled or disabled at sched-in and sched-out time need to be updated. There are no new loops that iterate over all counters to update total_time_enabled or total_time_running.

This also keeps separate child_total_time_running and child_total_time_enabled fields that get added in when reporting the totals to userspace. They are separate fields so that they can be atomic. We don't want to use atomics for total_time_running, total_time_enabled etc., because then we would have to use atomic sequences to update them, which are slower than regular arithmetic and memory accesses.

It is possible to measure total_time_running by adding a task_clock counter to each group of counters, and total_time_enabled can be measured approximately with a top-level task_clock counter (though inaccuracies will creep in if you need to disable and enable groups, since it is not possible in general to disable/enable the top-level task_clock counter simultaneously with another group). However, that adds extra overhead - I measured around 15% increase in the context switch latency reported by lat_ctx (from lmbench) when a task_clock counter was added to each of 2 groups, and around 25% increase when a task_clock counter was added to each of 4 groups. (In both cases a top-level task-clock counter was also added.)

In contrast, the code added in this commit gives better information with no overhead that I could measure (in fact in some cases I measured lower times with this code, but the differences were all less than one standard deviation).

[ v2: address review comments by Andrew Morton. ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andrew Morton <akpm@linux-foundation.org>
Orig-LKML-Reference: <18890.6578.728637.139402@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
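For illustration only (not part of the patch): a minimal userspace sketch of the read-and-scale step described above, assuming a file descriptor fd for a counting counter that was created with both PERF_FORMAT_TOTAL_TIME_ENABLED and PERF_FORMAT_TOTAL_TIME_RUNNING set in hw_event.read_format. Per the new read_format comment, the two times follow the counter value in the read buffer; the struct layout and helper name below are assumptions made for the example, not part of this commit.

/*
 * Illustrative sketch: scale the value read from a counting counter
 * whose read_format requested total_time_enabled and total_time_running.
 */
#include <stdint.h>
#include <unistd.h>

struct counter_read {
	uint64_t count;			/* raw event count */
	uint64_t total_time_enabled;	/* ns the counter was enabled */
	uint64_t total_time_running;	/* ns it was actually on the CPU */
};

static int read_scaled(int fd, uint64_t *scaled)
{
	struct counter_read r;

	if (read(fd, &r, sizeof(r)) != sizeof(r))
		return -1;

	if (r.total_time_running == 0) {
		/* counter never got to go on the cpu */
		*scaled = 0;
		return 0;
	}

	/* true_count_estimate = count * total_time_enabled / total_time_running;
	 * done in floating point here to sidestep 64-bit overflow in the sketch */
	*scaled = (uint64_t)((double)r.count *
			     r.total_time_enabled / r.total_time_running);
	return 0;
}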
Diffstat (limited to 'include/linux/perf_counter.h')
-rw-r--r--	include/linux/perf_counter.h	53
1 file changed, 53 insertions, 0 deletions
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 7fdbdf8be775..6bf67ce17625 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -103,6 +103,16 @@ enum perf_counter_record_type {
 #define PERF_COUNTER_EVENT_MASK		__PERF_COUNTER_MASK(EVENT)
 
 /*
+ * Bits that can be set in hw_event.read_format to request that
+ * reads on the counter should return the indicated quantities,
+ * in increasing order of bit value, after the counter value.
+ */
+enum perf_counter_read_format {
+	PERF_FORMAT_TOTAL_TIME_ENABLED	=  1,
+	PERF_FORMAT_TOTAL_TIME_RUNNING	=  2,
+};
+
+/*
  * Hardware event to monitor via a performance monitoring counter:
  */
 struct perf_counter_hw_event {
@@ -281,6 +291,32 @@ struct perf_counter {
 	enum perf_counter_active_state	prev_state;
 	atomic64_t			count;
 
+	/*
+	 * These are the total time in nanoseconds that the counter
+	 * has been enabled (i.e. eligible to run, and the task has
+	 * been scheduled in, if this is a per-task counter)
+	 * and running (scheduled onto the CPU), respectively.
+	 *
+	 * They are computed from tstamp_enabled, tstamp_running and
+	 * tstamp_stopped when the counter is in INACTIVE or ACTIVE state.
+	 */
+	u64				total_time_enabled;
+	u64				total_time_running;
+
+	/*
+	 * These are timestamps used for computing total_time_enabled
+	 * and total_time_running when the counter is in INACTIVE or
+	 * ACTIVE state, measured in nanoseconds from an arbitrary point
+	 * in time.
+	 * tstamp_enabled: the notional time when the counter was enabled
+	 * tstamp_running: the notional time when the counter was scheduled on
+	 * tstamp_stopped: in INACTIVE state, the notional time when the
+	 *	counter was scheduled off.
+	 */
+	u64				tstamp_enabled;
+	u64				tstamp_running;
+	u64				tstamp_stopped;
+
 	struct perf_counter_hw_event	hw_event;
 	struct hw_perf_counter		hw;
 
@@ -292,6 +328,13 @@ struct perf_counter {
 	struct list_head		child_list;
 
 	/*
+	 * These accumulate total time (in nanoseconds) that children
+	 * counters have been enabled and running, respectively.
+	 */
+	atomic64_t			child_total_time_enabled;
+	atomic64_t			child_total_time_running;
+
+	/*
 	 * Protect attach/detach and child_list:
 	 */
 	struct mutex			mutex;
@@ -339,6 +382,16 @@ struct perf_counter_context {
 	int			nr_active;
 	int			is_active;
 	struct task_struct	*task;
+
+	/*
+	 * time_now is the current time in nanoseconds since an arbitrary
+	 * point in the past.  For per-task counters, this is based on the
+	 * task clock, and for per-cpu counters it is based on the cpu clock.
+	 * time_lost is an offset from the task/cpu clock, used to make it
+	 * appear that time only passes while the context is scheduled in.
+	 */
+	u64			time_now;
+	u64			time_lost;
 #endif
 };
 
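For illustration only (not the actual kernel implementation): a sketch of how the tstamp_* fields added above can yield the reported totals without any new loop over all counters at sched-in/sched-out time, assuming a helper called with the context's current time (ctx->time_now). The field names come from the diff; the helper name and the PERF_COUNTER_STATE_* constant names are assumptions made for the sketch.

/*
 * Illustrative sketch: derive total_time_enabled/total_time_running
 * from the tstamp_* fields for a counter that is currently enabled.
 * 'now' is the context's current time (ctx->time_now) in nanoseconds.
 */
static void counter_update_times(struct perf_counter *counter, u64 now)
{
	u64 run_end;

	switch (counter->state) {
	case PERF_COUNTER_STATE_ACTIVE:
		/* enabled and on the CPU: both intervals run up to 'now' */
		run_end = now;
		break;
	case PERF_COUNTER_STATE_INACTIVE:
		/* enabled but scheduled off: running stopped at tstamp_stopped */
		run_end = counter->tstamp_stopped;
		break;
	default:
		/* OFF/ERROR: the stored totals are already final */
		return;
	}

	counter->total_time_enabled = now - counter->tstamp_enabled;
	counter->total_time_running = run_end - counter->tstamp_running;
}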