/* include/linux/perf_counter.h */

/*
 *  Performance counters:
 *
 *   Copyright(C) 2008, Thomas Gleixner <tglx@linutronix.de>
 *   Copyright(C) 2008, Red Hat, Inc., Ingo Molnar
 *
 *  Data type definitions, declarations, prototypes.
 *
 *  Started by: Thomas Gleixner and Ingo Molnar
 *
 *  For licencing details see kernel-base/COPYING
 */
#ifndef _LINUX_PERF_COUNTER_H
#define _LINUX_PERF_COUNTER_H

#include <linux/types.h>
#include <linux/ioctl.h>
#include <asm/byteorder.h>

/*
 * User-space ABI bits:
 */

/*
 * attr.type: the major counter class, stored in perf_counter_attr.type.
 *
 * The numeric values are user-space ABI and are therefore assigned
 * explicitly; only the trailing _MAX entry is allowed to move.
 */
enum perf_type_id {
	PERF_TYPE_HARDWARE		= 0,
	PERF_TYPE_SOFTWARE		= 1,
	PERF_TYPE_TRACEPOINT		= 2,
	PERF_TYPE_HW_CACHE		= 3,
	PERF_TYPE_RAW			= 4,

	PERF_TYPE_MAX,			/* non ABI: one past the last type */
};

/*
 * Generalized performance counter event types for the
 * sys_perf_counter_open() syscall.
 *
 * NOTE(review): this comment used to say the ids are passed in
 * "attr.event_id", but struct perf_counter_attr in this header has no
 * such member.  Presumably they are passed in attr.config together with
 * attr.type == PERF_TYPE_HARDWARE -- confirm against the syscall code.
 */
enum perf_hw_id {
	/*
	 * Common hardware events, generalized by the kernel:
	 */
	PERF_COUNT_CPU_CYCLES		= 0,
	PERF_COUNT_INSTRUCTIONS		= 1,
	PERF_COUNT_CACHE_REFERENCES	= 2,
	PERF_COUNT_CACHE_MISSES		= 3,
	PERF_COUNT_BRANCH_INSTRUCTIONS	= 4,
	PERF_COUNT_BRANCH_MISSES	= 5,
	PERF_COUNT_BUS_CYCLES		= 6,

	PERF_HW_EVENTS_MAX,		/* non ABI: one past the last event */
};

/*
 * Generalized hardware cache counters:
 *
 *       { L1-D, L1-I, L2, DTLB, ITLB, BPU } x
 *       { read, write, prefetch } x
 *       { accesses, misses }
 *
 * The three sets are enumerated by enum perf_hw_cache_id,
 * enum perf_hw_cache_op_id and enum perf_hw_cache_op_result_id
 * respectively.
 */
enum perf_hw_cache_id {
	PERF_COUNT_HW_CACHE_L1D		= 0,
	PERF_COUNT_HW_CACHE_L1I		= 1,
	PERF_COUNT_HW_CACHE_L2		= 2,
	PERF_COUNT_HW_CACHE_DTLB	= 3,
	PERF_COUNT_HW_CACHE_ITLB	= 4,
	PERF_COUNT_HW_CACHE_BPU		= 5,

	PERF_COUNT_HW_CACHE_MAX,	/* non ABI: number of cache ids */
};

/*
 * Operation dimension of a generalized hardware cache counter:
 * read, write or prefetch.
 */
enum perf_hw_cache_op_id {
	PERF_COUNT_HW_CACHE_OP_READ	= 0,
	PERF_COUNT_HW_CACHE_OP_WRITE	= 1,
	PERF_COUNT_HW_CACHE_OP_PREFETCH	= 2,

	PERF_COUNT_HW_CACHE_OP_MAX,	/* non ABI: number of operations */
};

/*
 * Result dimension of a generalized hardware cache counter:
 * count accesses or count misses.
 */
enum perf_hw_cache_op_result_id {
	PERF_COUNT_HW_CACHE_RESULT_ACCESS	= 0,
	PERF_COUNT_HW_CACHE_RESULT_MISS		= 1,

	PERF_COUNT_HW_CACHE_RESULT_MAX,		/* non ABI: number of results */
};

/*
 * Special "software" counters provided by the kernel, even if the hardware
 * does not support performance counters. These counters measure various
 * physical and software events of the kernel (and allow them to be
 * profiled as well).
 *
 * The values are assigned explicitly; PERF_SW_EVENTS_MAX is not ABI.
 */
enum perf_sw_ids {
	PERF_COUNT_CPU_CLOCK		= 0,
	PERF_COUNT_TASK_CLOCK		= 1,
	PERF_COUNT_PAGE_FAULTS		= 2,
	PERF_COUNT_CONTEXT_SWITCHES	= 3,
	PERF_COUNT_CPU_MIGRATIONS	= 4,
	PERF_COUNT_PAGE_FAULTS_MIN	= 5,
	PERF_COUNT_PAGE_FAULTS_MAJ	= 6,

	PERF_SW_EVENTS_MAX,		/* non ABI: one past the last event */
};

/*
 * Bits that can be set in attr.sample_type to request information
 * in the overflow packets.
 *
 * Every constant is a distinct single-bit mask (1U << n), so any
 * combination can be OR-ed together.
 */
enum perf_counter_sample_format {
	PERF_SAMPLE_IP			= 1U << 0,
	PERF_SAMPLE_TID			= 1U << 1,
	PERF_SAMPLE_TIME		= 1U << 2,
	PERF_SAMPLE_ADDR		= 1U << 3,
	PERF_SAMPLE_GROUP		= 1U << 4,
	PERF_SAMPLE_CALLCHAIN		= 1U << 5,
	PERF_SAMPLE_ID			= 1U << 6,
	PERF_SAMPLE_CPU			= 1U << 7,
	PERF_SAMPLE_PERIOD		= 1U << 8,
};

/*
 * Bits that can be set in attr.read_format to request that
 * reads on the counter should return the indicated quantities,
 * in increasing order of bit value, after the counter value.
 *
 * Each constant is a single-bit mask, so the bits can be combined.
 */
enum perf_counter_read_format {
	PERF_FORMAT_TOTAL_TIME_ENABLED	=  1U << 0,
	PERF_FORMAT_TOTAL_TIME_RUNNING	=  1U << 1,
	PERF_FORMAT_ID			=  1U << 2,
};

/*
 * Hardware event to monitor via a performance monitoring counter.
 *
 * This structure sits in the user-space ABI part of the header.  The
 * __reserved_* members are given no meaning here.
 */
struct perf_counter_attr {
	/*
	 * Major type: hardware/software/tracepoint/etc.
	 * (one of enum perf_type_id)
	 */
	__u32			type;
	__u32			__reserved_1;

	/*
	 * Type specific configuration information.
	 */
	__u64			config;

	/*
	 * Two names for the same 64-bit slot; the 'freq' flag below is
	 * documented as "use freq, not period".
	 */
	union {
		__u64		sample_period;
		__u64		sample_freq;
	};

	/* PERF_SAMPLE_* bits and PERF_FORMAT_* bits respectively. */
	__u64			sample_type;
	__u64			read_format;

	/*
	 * 11 single-bit flags followed by 53 reserved bits: 64 bits total.
	 */
	__u64			disabled       :  1, /* off by default        */
				inherit	       :  1, /* children inherit it   */
				pinned	       :  1, /* must always be on PMU */
				exclusive      :  1, /* only group on PMU     */
				exclude_user   :  1, /* don't count user      */
				exclude_kernel :  1, /* ditto kernel          */
				exclude_hv     :  1, /* ditto hypervisor      */
				exclude_idle   :  1, /* don't count when idle */
				mmap           :  1, /* include mmap data     */
				comm	       :  1, /* include comm data     */
				freq           :  1, /* use freq, not period  */

				__reserved_2   : 53;

	__u32			wakeup_events;	/* wakeup every n events */
	__u32			__reserved_3;

	__u64			__reserved_4;
};

/*
 * Ioctls that can be done on a perf counter fd.
 *
 * These definitions live in the user-space ABI part of this header
 * (outside __KERNEL__), so the argument type encoded into
 * PERF_COUNTER_IOC_PERIOD has to be the exported __u64 from
 * <linux/types.h>; the kernel-internal u64 typedef does not exist for
 * user-space includers and made the macro unusable there.  The size
 * encoded into the request number is sizeof() of the type, which is
 * 8 bytes either way, so the ioctl number itself does not change.
 */
#define PERF_COUNTER_IOC_ENABLE		_IO ('$', 0)
#define PERF_COUNTER_IOC_DISABLE	_IO ('$', 1)
#define PERF_COUNTER_IOC_REFRESH	_IO ('$', 2)
#define PERF_COUNTER_IOC_RESET		_IO ('$', 3)
#define PERF_COUNTER_IOC_PERIOD		_IOW('$', 4, __u64)

/*
 * Flag bits for the perf counter ioctls.  Only one bit is defined.
 * NOTE(review): presumably passed as the ioctl argument to make the
 * request act on the whole group -- confirm in the ioctl handler.
 */
enum perf_counter_ioc_flags {
	PERF_IOC_FLAG_GROUP		= 1U << 0,
};

/*
 * Structure of the page that can be mapped via mmap.
 *
 * It carries a version pair, the fields needed for the user-space
 * counter read sequence shown below, and the head of the data section.
 */
struct perf_counter_mmap_page {
	__u32	version;		/* version number of this structure */
	__u32	compat_version;		/* lowest version this is compat with */

	/*
	 * Bits needed to read the hw counters in user-space.
	 *
	 *   u32 seq;
	 *   s64 count;
	 *
	 *   do {
	 *     seq = pc->lock;
	 *
	 *     barrier();
	 *     if (pc->index) {
	 *       count = pmc_read(pc->index - 1);
	 *       count += pc->offset;
	 *     } else
	 *       goto regular_read;
	 *
	 *     barrier();
	 *   } while (pc->lock != seq);
	 *
	 * In the sequence above an index of 0 means "no direct read
	 * possible" and a non-zero index is the counter number plus one.
	 *
	 * NOTE: for obvious reasons this only works on self-monitoring
	 *       processes.
	 */
	__u32	lock;			/* seqlock for synchronization */
	__u32	index;			/* hardware counter identifier */
	__s64	offset;			/* add to hardware counter value */

	/*
	 * Control data for the mmap() data buffer.
	 *
	 * User-space reading this value should issue an rmb(), on SMP capable
	 * platforms, after reading this value -- see perf_counter_wakeup().
	 */
	__u64   data_head;		/* head in the data section */
};

/*
 * Values for perf_event_header.misc.
 *
 * Bits 0-1 (PERF_EVENT_MISC_CPUMODE_MASK) hold one of the four cpu-mode
 * codes below; bit 2 is the separate OVERFLOW flag.
 */
#define PERF_EVENT_MISC_CPUMODE_MASK	(3 << 0)
#define PERF_EVENT_MISC_CPUMODE_UNKNOWN	(0 << 0)
#define PERF_EVENT_MISC_KERNEL		(1 << 0)
#define PERF_EVENT_MISC_USER		(2 << 0)
#define PERF_EVENT_MISC_HYPERVISOR	(3 << 0)
#define PERF_EVENT_MISC_OVERFLOW	(1 << 2)

/*
 * Fixed 8-byte header that starts every event record described in
 * enum perf_event_type.
 */
struct perf_event_header {
	__u32	type;	/* record type, see enum perf_event_type */
	__u16	misc;	/* PERF_EVENT_MISC_* bits */
	__u16	size;	/* presumably the record size in bytes -- confirm */
};

/*
 * Record types found in perf_event_header.type.  Each value is followed
 * by a comment giving the layout of the corresponding record.
 */
enum perf_event_type {

	/*
	 * The MMAP events record the PROT_EXEC mappings so that we can
	 * correlate userspace IPs to code. They have the following structure:
	 *
	 * struct {
	 *	struct perf_event_header	header;
	 *
	 *	u32				pid, tid;
	 *	u64				addr;
	 *	u64				len;
	 *	u64				pgoff;
	 *	char				filename[];
	 * };
	 */
	PERF_EVENT_MMAP			= 1,

	/*
	 * (Value 2 is not assigned in this enum.)
	 *
	 * struct {
	 *	struct perf_event_header	header;
	 *
	 *	u32				pid, tid;
	 *	char				comm[];
	 * };
	 */
	PERF_EVENT_COMM			= 3,

	/*
	 * struct {
	 *	struct perf_event_header	header;
	 *	u64				time;
	 *	u64				id;
	 *	u64				sample_period;
	 * };
	 */
	PERF_EVENT_PERIOD		= 4,

	/*
	 * THROTTLE and UNTHROTTLE share this layout:
	 *
	 * struct {
	 *	struct perf_event_header	header;
	 *	u64				time;
	 * };
	 */
	PERF_EVENT_THROTTLE		= 5,
	PERF_EVENT_UNTHROTTLE		= 6,

	/*
	 * struct {
	 *	struct perf_event_header	header;
	 *	u32				pid, ppid;
	 * };
	 */
	PERF_EVENT_FORK			= 7,

	/*
	 * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
	 * (presumably header.type -- confirm) will be PERF_SAMPLE_* bits
	 * from enum perf_counter_sample_format.
	 *
	 * struct {
	 *	struct perf_event_header	header;
	 *
	 *	{ u64			ip;	  } && PERF_SAMPLE_IP
	 *	{ u32			pid, tid; } && PERF_SAMPLE_TID
	 *	{ u64			time;     } && PERF_SAMPLE_TIME
	 *	{ u64			addr;     } && PERF_SAMPLE_ADDR
	 *	{ u64			config;   } && (see NOTE below)
	 *	{ u32			cpu, res; } && PERF_SAMPLE_CPU
	 *
	 *	{ u64			nr;
	 *	  { u64 id, val; }	cnt[nr];  } && PERF_SAMPLE_GROUP
	 *
	 *	{ u16			nr,
	 *				hv,
	 *				kernel,
	 *				user;
	 *	  u64			ips[nr];  } && PERF_SAMPLE_CALLCHAIN
	 * };
	 *
	 * NOTE(review): this layout previously used PERF_RECORD_* names,
	 * which are not defined in this header.  The 'config' entry was
	 * tagged PERF_RECORD_CONFIG, and no PERF_SAMPLE_CONFIG bit exists;
	 * PERF_SAMPLE_ID and PERF_SAMPLE_PERIOD have no entry here.
	 * Verify the field list against the code that writes the record.
	 */
};

#ifdef __KERNEL__
/*
 * Kernel-internal data types and definitions:
 */

#ifdef CONFIG_PERF_COUNTERS
# include <asm/perf_counter.h>
#endif

#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/hrtimer.h>
#include <linux/fs.h>
#include <linux/pid_namespace.h>
#include <asm/atomic.h>

struct task_struct;

/**
 * struct hw_perf_counter - performance counter hardware details:
 *
 * All members exist only when CONFIG_PERF_COUNTERS is set; without it
 * the structure is empty.
 */
struct hw_perf_counter {
#ifdef CONFIG_PERF_COUNTERS
	/*
	 * The hardware view and the software view overlay each other,
	 * so only one of them is meaningful for a given counter.
	 */
	union {
		struct { /* hardware */
			u64				config;
			unsigned long			config_base;
			unsigned long			counter_base;
			int				idx;
		};
		union { /* software */
			atomic64_t			count;
			struct hrtimer			hrtimer;
		};
	};
	/* Sampling state kept alongside either view. */
	atomic64_t			prev_count;
	u64				sample_period;
	u64				last_period;
	atomic64_t			period_left;
	u64				interrupts;

	/* Bookkeeping for the frequency-based mode (freq_* fields). */
	u64				freq_count;
	u64				freq_interrupts;
	u64				freq_stamp;
#endif
};

struct perf_counter;

/**
 * struct pmu - generic performance monitoring unit
 *
 * A table of callbacks, each taking the counter to operate on.  Only
 * enable() reports a result (an int); the other three return nothing.
 * The meaning of enable()'s return value is not defined in this header.
 */
struct pmu {
	int (*enable)			(struct perf_counter *counter);
	void (*disable)			(struct perf_counter *counter);
	void (*read)			(struct perf_counter *counter);
	void (*unthrottle)		(struct perf_counter *counter);
};

/**
 * enum perf_counter_active_state - the states of a counter
 *
 * ERROR and OFF are negative, INACTIVE is zero and ACTIVE is positive,
 * so the states are ordered ERROR < OFF < INACTIVE < ACTIVE.
 */
enum perf_counter_active_state {
	PERF_COUNTER_STATE_ERROR	= -2,
	PERF_COUNTER_STATE_OFF		= -1,
	PERF_COUNTER_STATE_INACTIVE	=  0,
	PERF_COUNTER_STATE_ACTIVE	=  1,
};

struct file;

/*
 * Bookkeeping for the buffer a counter exposes through mmap(): the
 * user-visible control page plus a trailing table of data page pointers.
 *
 * data_pages is declared as a C99 flexible array member rather than the
 * GNU zero-length array form; sizeof(struct perf_mmap_data) is the same
 * either way, and the member stays last in the structure.  The number of
 * valid entries is presumably nr_pages -- confirm in the allocator.
 */
struct perf_mmap_data {
	struct rcu_head			rcu_head;
	int				nr_pages;	/* nr of data pages  */
	int				nr_locked;	/* nr pages mlocked  */

	atomic_t			poll;		/* POLL_ for wakeups */
	atomic_t			events;		/* event limit       */

	atomic_long_t			head;		/* write position    */
	atomic_long_t			done_head;	/* completed head    */

	atomic_t			lock;		/* concurrent writes */

	atomic_t			wakeup;		/* needs a wakeup    */

	struct perf_counter_mmap_page   *user_page;
	void				*data_pages[];	/* must stay last    */
};

/*
 * One node of a singly linked list of deferred callbacks: 'next' links
 * to another entry and 'func' is called with the entry itself.
 */
struct perf_pending_entry {
	struct perf_pending_entry *next;
	void (*func)(struct perf_pending_entry *);
};

/**
 * struct perf_counter - performance counter kernel representation:
 *
 * All members exist only when CONFIG_PERF_COUNTERS is set; without it
 * the structure is empty.
 */
struct perf_counter {
#ifdef CONFIG_PERF_COUNTERS
	/* List linkage and group membership. */
	struct list_head		list_entry;
	struct list_head		event_entry;
	struct list_head		sibling_list;
	int				nr_siblings;
	struct perf_counter		*group_leader;
	const struct pmu		*pmu;

	enum perf_counter_active_state	state;
	atomic64_t			count;

	/*
	 * These are the total time in nanoseconds that the counter
	 * has been enabled (i.e. eligible to run, and the task has
	 * been scheduled in, if this is a per-task counter)
	 * and running (scheduled onto the CPU), respectively.
	 *
	 * They are computed from tstamp_enabled, tstamp_running and
	 * tstamp_stopped when the counter is in INACTIVE or ACTIVE state.
	 */
	u64				total_time_enabled;
	u64				total_time_running;

	/*
	 * These are timestamps used for computing total_time_enabled
	 * and total_time_running when the counter is in INACTIVE or
	 * ACTIVE state, measured in nanoseconds from an arbitrary point
	 * in time.
	 * tstamp_enabled: the notional time when the counter was enabled
	 * tstamp_running: the notional time when the counter was scheduled on
	 * tstamp_stopped: in INACTIVE state, the notional time when the
	 *	counter was scheduled off.
	 */
	u64				tstamp_enabled;
	u64				tstamp_running;
	u64				tstamp_stopped;

	/* User-supplied attributes and the per-counter hardware state. */
	struct perf_counter_attr	attr;
	struct hw_perf_counter		hw;

	struct perf_counter_context	*ctx;
	struct file			*filp;

	/*
	 * These accumulate total time (in nanoseconds) that children
	 * counters have been enabled and running, respectively.
	 */
	atomic64_t			child_total_time_enabled;
	atomic64_t			child_total_time_running;

	/*
	 * Protect attach/detach and child_list:
	 */
	struct mutex			child_mutex;
	struct list_head		child_list;
	struct perf_counter		*parent;

	int				oncpu;
	int				cpu;

	struct list_head		owner_entry;
	struct task_struct		*owner;

	/* mmap bits */
	struct mutex			mmap_mutex;
	atomic_t			mmap_count;
	struct perf_mmap_data		*data;

	/* poll related */
	wait_queue_head_t		waitq;
	struct fasync_struct		*fasync;

	/* delayed work for NMIs and such */
	int				pending_wakeup;
	int				pending_kill;
	int				pending_disable;
	struct perf_pending_entry	pending;

	atomic_t			event_limit;

	/* Optional teardown callback and RCU head for this counter. */
	void (*destroy)(struct perf_counter *);
	struct rcu_head			rcu_head;

	struct pid_namespace		*ns;
	u64				id;
#endif
};

/**
 * struct perf_counter_context - counter context structure
 *
 * Used as a container for task counters and CPU counters as well:
 * struct perf_cpu_context embeds one and also points at a task one.
 */
struct perf_counter_context {
	/*
	 * Protect the states of the counters in the list,
	 * nr_active, and the list:
	 */
	spinlock_t		lock;
	/*
	 * Protect the list of counters.  Locking either mutex or lock
	 * is sufficient to ensure the list doesn't change; to change
	 * the list you need to lock both the mutex and the spinlock.
	 */
	struct mutex		mutex;

	struct list_head	counter_list;
	struct list_head	event_list;
	int			nr_counters;
	int			nr_active;
	int			is_active;
	atomic_t		refcount;
	struct task_struct	*task;

	/*
	 * Context clock, runs when context enabled.
	 */
	u64			time;
	u64			timestamp;

	/*
	 * These fields let us detect when two contexts have both
	 * been cloned (inherited) from a common ancestor.
	 */
	struct perf_counter_context *parent_ctx;
	u64			parent_gen;
	u64			generation;
	int			pin_count;
	struct rcu_head		rcu_head;
};

/**
 * struct perf_cpu_context - per cpu counter context structure
 *
 * Embeds one counter context (@ctx) and additionally points at a
 * second one (@task_ctx).
 */
struct perf_cpu_context {
	struct perf_counter_context	ctx;
	struct perf_counter_context	*task_ctx;
	int				active_oncpu;
	int				max_pertask;
	int				exclusive;

	/*
	 * Recursion avoidance:
	 *
	 * task, softirq, irq, nmi context
	 *
	 * One slot per context listed above (presumably indexed in that
	 * order -- confirm against the users of this array).
	 */
	int				recursion[4];
};

#ifdef CONFIG_PERF_COUNTERS

/*
 * Set by architecture code:
 */
extern int perf_max_counters;

/*
 * Returns a const struct pmu pointer for @counter.  The failure
 * convention (NULL vs. error pointer) is not visible in this header.
 */
extern const struct pmu *hw_perf_counter_init(struct perf_counter *counter);

/*
 * Core entry points.  Most of them have an empty (or 0 / -EINVAL
 * returning) inline stand-in in the #else branch further down, used when
 * CONFIG_PERF_COUNTERS is off.  __perf_disable(), __perf_enable(),
 * hw_perf_group_sched_in() and perf_counter_update_userpage() have no
 * such stand-in.
 */
extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
extern void perf_counter_task_sched_out(struct task_struct *task,
					struct task_struct *next, int cpu);
extern void perf_counter_task_tick(struct task_struct *task, int cpu);
extern int perf_counter_init_task(struct task_struct *child);
extern void perf_counter_exit_task(struct task_struct *child);
extern void perf_counter_free_task(struct task_struct *task);
extern void perf_counter_do_pending(void);
extern void perf_counter_print_debug(void);
extern void __perf_disable(void);
extern bool __perf_enable(void);
extern void perf_disable(void);
extern void perf_enable(void);
extern int perf_counter_task_disable(void);
extern int perf_counter_task_enable(void);
extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
	       struct perf_cpu_context *cpuctx,
	       struct perf_counter_context *ctx, int cpu);
extern void perf_counter_update_userpage(struct perf_counter *counter);

/*
 * Per-sample inputs handed to perf_counter_overflow(): a register
 * snapshot pointer, an address and a period.
 */
struct perf_sample_data {
	struct pt_regs		*regs;
	u64			addr;
	u64			period;
};

/*
 * Overflow entry point.  @nmi is an int flag; the meaning of the int
 * return value is not defined in this header.
 */
extern int perf_counter_overflow(struct perf_counter *counter, int nmi,
				 struct perf_sample_data *data);

/*
 * Classify a counter by attr.type: PERF_TYPE_RAW and PERF_TYPE_HARDWARE
 * are the hardware types, every other type is treated as software.
 *
 * Returns 1 for a software counter, 0 for a hardware counter.
 */
static inline int is_software_counter(struct perf_counter *counter)
{
	switch (counter->attr.type) {
	case PERF_TYPE_RAW:
	case PERF_TYPE_HARDWARE:
		return 0;
	default:
		return 1;
	}
}

/*
 * The prototype omits parameter names; the !CONFIG_PERF_COUNTERS stub
 * further down names them (event, nr, nmi, regs, addr) in this order.
 */
extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64);

/* Out-of-line part of perf_counter_mmap(); called only for VM_EXEC vmas. */
extern void __perf_counter_mmap(struct vm_area_struct *vma);

/*
 * Hand a mapping to __perf_counter_mmap(), but only when the vma has
 * VM_EXEC set; any other mapping is ignored here.
 */
static inline void perf_counter_mmap(struct vm_area_struct *vma)
{
	if (!(vma->vm_flags & VM_EXEC))
		return;

	__perf_counter_mmap(vma);
}

/*
 * Entry points that take the task concerned.  All three have empty
 * inline stand-ins when CONFIG_PERF_COUNTERS is off.
 */
extern void perf_counter_comm(struct task_struct *tsk);
extern void perf_counter_fork(struct task_struct *tsk);

extern void perf_counter_task_migration(struct task_struct *task, int cpu);

/* Capacity of perf_callchain_entry.ip[]. */
#define MAX_STACK_DEPTH		255

/*
 * A captured call chain: four u16 counters followed by the instruction
 * pointers.  The counter names match the { nr, hv, kernel, user } header
 * shown for the callchain part of the overflow record in
 * enum perf_event_type.
 */
struct perf_callchain_entry {
	u16	nr, hv, kernel, user;
	u64	ip[MAX_STACK_DEPTH];
};

/*
 * Returns a pointer to a call chain entry for @regs.  Ownership and
 * lifetime of the returned entry are not visible in this header.
 */
extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);

/*
 * Integer tunables defined elsewhere; their units and semantics are not
 * described in this header.
 */
extern int sysctl_perf_counter_paranoid;
extern int sysctl_perf_counter_mlock;
extern int sysctl_perf_counter_sample_rate;

/* Has an empty inline stand-in when CONFIG_PERF_COUNTERS is off. */
extern void perf_counter_init(void);

/*
 * Default definitions, used only when perf_misc_flags is not already
 * defined at this point (presumably by <asm/perf_counter.h>, included
 * above -- confirm per architecture).  Both macros sit under the single
 * perf_misc_flags guard, so whoever overrides perf_misc_flags must also
 * provide perf_instruction_pointer.
 *
 * The default reports USER or KERNEL mode based on user_mode(regs).
 */
#ifndef perf_misc_flags
#define perf_misc_flags(regs)	(user_mode(regs) ? PERF_EVENT_MISC_USER : \
				 PERF_EVENT_MISC_KERNEL)
#define perf_instruction_pointer(regs)	instruction_pointer(regs)
#endif

#else
/*
 * !CONFIG_PERF_COUNTERS stand-ins for the task and enable/disable entry
 * points: the void ones do nothing, perf_counter_init_task() returns 0
 * and the task enable/disable pair returns -EINVAL.
 */
static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu)
{
}

static inline void perf_counter_task_sched_out(struct task_struct *task,
					       struct task_struct *next,
					       int cpu)
{
}

static inline void perf_counter_task_tick(struct task_struct *task, int cpu)
{
}

static inline int perf_counter_init_task(struct task_struct *child)
{
	return 0;
}

static inline void perf_counter_exit_task(struct task_struct *child)
{
}

static inline void perf_counter_free_task(struct task_struct *task)
{
}

static inline void perf_counter_do_pending(void)
{
}

static inline void perf_counter_print_debug(void)
{
}

static inline void perf_disable(void)
{
}

static inline void perf_enable(void)
{
}

static inline int perf_counter_task_disable(void)
{
	return -EINVAL;
}

static inline int perf_counter_task_enable(void)
{
	return -EINVAL;
}

/*
 * !CONFIG_PERF_COUNTERS stand-ins for the event entry points: every one
 * of them is an empty function.
 */
static inline void perf_swcounter_event(u32 event, u64 nr, int nmi,
					struct pt_regs *regs, u64 addr)
{
}

static inline void perf_counter_mmap(struct vm_area_struct *vma)
{
}

static inline void perf_counter_comm(struct task_struct *tsk)
{
}

static inline void perf_counter_fork(struct task_struct *tsk)
{
}

static inline void perf_counter_init(void)
{
}

static inline void perf_counter_task_migration(struct task_struct *task, int cpu)
{
}
#endif

#endif /* __KERNEL__ */
#endif /* _LINUX_PERF_COUNTER_H */