From 27ac905b8f88d28779b0661809286b5ba2817d37 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Tue, 4 Nov 2014 21:55:57 -0500
Subject: perf/x86/intel: Reduce lbr_sel_map[] size

The index of lbr_sel_map is bit value of perf branch_sample_type.
PERF_SAMPLE_BRANCH_MAX is 1024 at present, so each lbr_sel_map uses
4096 bytes. By using bit shift as index, we can reduce lbr_sel_map
size to 40 bytes. This patch defines 'bit shift' for branch types,
and use 'bit shift' to define lbr_sel_maps.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: jolsa@redhat.com
Cc: linux-api@vger.kernel.org
Link: http://lkml.kernel.org/r/1415156173-10035-2-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/uapi/linux/perf_event.h | 49 +++++++++++++++++++++++++++++------------
 1 file changed, 35 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 9b79abbd1ab8..e46b93279e3d 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -152,21 +152,42 @@ enum perf_event_sample_format {
  * The branch types can be combined, however BRANCH_ANY covers all types
  * of branches and therefore it supersedes all the other types.
  */
+enum perf_branch_sample_type_shift {
+	PERF_SAMPLE_BRANCH_USER_SHIFT		= 0, /* user branches */
+	PERF_SAMPLE_BRANCH_KERNEL_SHIFT		= 1, /* kernel branches */
+	PERF_SAMPLE_BRANCH_HV_SHIFT		= 2, /* hypervisor branches */
+
+	PERF_SAMPLE_BRANCH_ANY_SHIFT		= 3, /* any branch types */
+	PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT	= 4, /* any call branch */
+	PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT	= 5, /* any return branch */
+	PERF_SAMPLE_BRANCH_IND_CALL_SHIFT	= 6, /* indirect calls */
+	PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT	= 7, /* transaction aborts */
+	PERF_SAMPLE_BRANCH_IN_TX_SHIFT		= 8, /* in transaction */
+	PERF_SAMPLE_BRANCH_NO_TX_SHIFT		= 9, /* not in transaction */
+	PERF_SAMPLE_BRANCH_COND_SHIFT		= 10, /* conditional branches */
+
+	PERF_SAMPLE_BRANCH_MAX_SHIFT		/* non-ABI */
+};
+
 enum perf_branch_sample_type {
-	PERF_SAMPLE_BRANCH_USER		= 1U << 0, /* user branches */
-	PERF_SAMPLE_BRANCH_KERNEL	= 1U << 1, /* kernel branches */
-	PERF_SAMPLE_BRANCH_HV		= 1U << 2, /* hypervisor branches */
-
-	PERF_SAMPLE_BRANCH_ANY		= 1U << 3, /* any branch types */
-	PERF_SAMPLE_BRANCH_ANY_CALL	= 1U << 4, /* any call branch */
-	PERF_SAMPLE_BRANCH_ANY_RETURN	= 1U << 5, /* any return branch */
-	PERF_SAMPLE_BRANCH_IND_CALL	= 1U << 6, /* indirect calls */
-	PERF_SAMPLE_BRANCH_ABORT_TX	= 1U << 7, /* transaction aborts */
-	PERF_SAMPLE_BRANCH_IN_TX	= 1U << 8, /* in transaction */
-	PERF_SAMPLE_BRANCH_NO_TX	= 1U << 9, /* not in transaction */
-	PERF_SAMPLE_BRANCH_COND		= 1U << 10, /* conditional branches */
-
-	PERF_SAMPLE_BRANCH_MAX		= 1U << 11, /* non-ABI */
+	PERF_SAMPLE_BRANCH_USER		= 1U << PERF_SAMPLE_BRANCH_USER_SHIFT,
+	PERF_SAMPLE_BRANCH_KERNEL	= 1U << PERF_SAMPLE_BRANCH_KERNEL_SHIFT,
+	PERF_SAMPLE_BRANCH_HV		= 1U << PERF_SAMPLE_BRANCH_HV_SHIFT,
+
+	PERF_SAMPLE_BRANCH_ANY		= 1U << PERF_SAMPLE_BRANCH_ANY_SHIFT,
+	PERF_SAMPLE_BRANCH_ANY_CALL	=
+				1U << PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT,
+	PERF_SAMPLE_BRANCH_ANY_RETURN	=
+				1U << PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT,
+	PERF_SAMPLE_BRANCH_IND_CALL	=
+				1U << PERF_SAMPLE_BRANCH_IND_CALL_SHIFT,
+	PERF_SAMPLE_BRANCH_ABORT_TX	=
+				1U << PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT,
+	PERF_SAMPLE_BRANCH_IN_TX	= 1U << PERF_SAMPLE_BRANCH_IN_TX_SHIFT,
+	PERF_SAMPLE_BRANCH_NO_TX	= 1U << PERF_SAMPLE_BRANCH_NO_TX_SHIFT,
+	PERF_SAMPLE_BRANCH_COND		= 1U << PERF_SAMPLE_BRANCH_COND_SHIFT,
+
+	PERF_SAMPLE_BRANCH_MAX		= 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT,
 };
 
 #define PERF_SAMPLE_BRANCH_PLM_ALL \
-- 
cgit v1.2.2


From ba532500c5651a4be4108acc64ed99a95cb005b3 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Tue, 4 Nov 2014 21:55:58 -0500
Subject: perf: Introduce pmu context switch callback

The callback is invoked when process is scheduled in or out.
It provides mechanism for later patches to save/store the LBR
stack. For the schedule in case, the callback is invoked at
the same place that flush branch stack callback is invoked.
So it also can replace the flush branch stack callback. To
avoid unnecessary overhead, the callback is enabled only when
there are events use the LBR stack.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: eranian@google.com
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1415156173-10035-3-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 33262004c310..fbab6235d053 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -265,6 +265,13 @@ struct pmu {
 	 * flush branch stack on context-switches (needed in cpu-wide mode)
 	 */
 	void (*flush_branch_stack)	(void);
+
+	/*
+	 * context-switches callback
+	 */
+	void (*sched_task)		(struct perf_event_context *ctx,
+					bool sched_in);
+
 };
 
 /**
@@ -558,6 +565,8 @@ extern void perf_event_delayed_put(struct task_struct *task);
 extern void perf_event_print_debug(void);
 extern void perf_pmu_disable(struct pmu *pmu);
 extern void perf_pmu_enable(struct pmu *pmu);
+extern void perf_sched_cb_dec(struct pmu *pmu);
+extern void perf_sched_cb_inc(struct pmu *pmu);
 extern int perf_event_task_disable(void);
 extern int perf_event_task_enable(void);
 extern int perf_event_refresh(struct perf_event *event, int refresh);
-- 
cgit v1.2.2


From 2a0ad3b326a9024ba86dca4028499d31fa0c6c4d Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Tue, 4 Nov 2014 21:55:59 -0500
Subject: perf/x86/intel: Use context switch callback to flush LBR stack

Previous commit introduces context switch callback, its function
overlaps with the flush branch stack callback. So we can use the
context switch callback to flush LBR stack.

This patch adds code that uses the flush branch callback to
flush the LBR stack when task is being scheduled in. The callback
is enabled only when there are events use the LBR hardware. This
patch also removes all old flush branch stack code.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: eranian@google.com
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1415156173-10035-4-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fbab6235d053..c7007a564440 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -511,7 +511,6 @@ struct perf_event_context {
 	u64				generation;
 	int				pin_count;
 	int				nr_cgroups;	 /* cgroup evts */
-	int				nr_branch_stack; /* branch_stack evt */
 	struct rcu_head			rcu_head;
 
 	struct delayed_work		orphans_remove;
-- 
cgit v1.2.2


From 4af57ef28c2c1047fda9e1a5be02aa7a6a69cf9e Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Tue, 4 Nov 2014 21:56:01 -0500
Subject: perf: Add pmu specific data for perf task context

Introduce a new flag PERF_ATTACH_TASK_DATA for perf event's attach
stata. The flag is set by PMU's event_init() callback, it indicates
that perf event needs PMU specific data.

The PMU specific data are initialized to zeros. Later patches will
use PMU specific data to save LBR stack.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: eranian@google.com
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1415156173-10035-6-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index c7007a564440..270cd0173e61 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -271,6 +271,10 @@ struct pmu {
 	 */
 	void (*sched_task)		(struct perf_event_context *ctx,
 					bool sched_in);
+	/*
+	 * PMU specific data size
+	 */
+	size_t				task_ctx_size;
 
 };
 
@@ -307,6 +311,7 @@ struct swevent_hlist {
 #define PERF_ATTACH_CONTEXT	0x01
 #define PERF_ATTACH_GROUP	0x02
 #define PERF_ATTACH_TASK	0x04
+#define PERF_ATTACH_TASK_DATA	0x08
 
 struct perf_cgroup;
 struct ring_buffer;
@@ -511,6 +516,7 @@ struct perf_event_context {
 	u64				generation;
 	int				pin_count;
 	int				nr_cgroups;	 /* cgroup evts */
+	void				*task_ctx_data; /* pmu specific data */
 	struct rcu_head			rcu_head;
 
 	struct delayed_work		orphans_remove;
-- 
cgit v1.2.2


From a46a23000198d929391aa9dac8de68734efa2703 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Tue, 4 Nov 2014 21:56:06 -0500
Subject: perf: Simplify the branch stack check

Use event->attr.branch_sample_type to replace
intel_pmu_needs_lbr_smpl() for avoiding duplicated code that
implicitly enables the LBR.

Currently, branch stack can be enabled by user explicitly requesting
branch sampling or implicit branch sampling to correct PEBS skid.

For user explicitly requested branch sampling, the branch_sample_type
is explicitly set by user. For PEBS case, the branch_sample_type is also
implicitly set to PERF_SAMPLE_BRANCH_ANY in x86_pmu_hw_config.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: eranian@google.com
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1415156173-10035-11-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 270cd0173e61..43cc158487e6 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -814,6 +814,11 @@ static inline bool has_branch_stack(struct perf_event *event)
 	return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK;
 }
 
+static inline bool needs_branch_stack(struct perf_event *event)
+{
+	return event->attr.branch_sample_type != 0;
+}
+
 extern int perf_output_begin(struct perf_output_handle *handle,
 			     struct perf_event *event, unsigned int size);
 extern void perf_output_end(struct perf_output_handle *handle);
-- 
cgit v1.2.2


From 2c44b1936bb3b135a3fac8b3493394d42e51cf70 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 5 Nov 2014 10:36:45 +0100
Subject: perf/x86/intel: Expose LBR callstack to user space tooling

With LBR call stack feature enable, there are three callchain options.
Enable the 3rd callchain option (LBR callstack) to user space tooling.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: linux-api@vger.kernel.org
Link: http://lkml.kernel.org/r/20141105093759.GQ10501@worktop.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/uapi/linux/perf_event.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index e46b93279e3d..1e3cd07cf76e 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -166,6 +166,8 @@ enum perf_branch_sample_type_shift {
 	PERF_SAMPLE_BRANCH_NO_TX_SHIFT		= 9, /* not in transaction */
 	PERF_SAMPLE_BRANCH_COND_SHIFT		= 10, /* conditional branches */
 
+	PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT	= 11, /* call/ret stack */
+
 	PERF_SAMPLE_BRANCH_MAX_SHIFT		/* non-ABI */
 };
 
@@ -175,18 +177,16 @@ enum perf_branch_sample_type {
 	PERF_SAMPLE_BRANCH_HV		= 1U << PERF_SAMPLE_BRANCH_HV_SHIFT,
 
 	PERF_SAMPLE_BRANCH_ANY		= 1U << PERF_SAMPLE_BRANCH_ANY_SHIFT,
-	PERF_SAMPLE_BRANCH_ANY_CALL	=
-				1U << PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT,
-	PERF_SAMPLE_BRANCH_ANY_RETURN	=
-				1U << PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT,
-	PERF_SAMPLE_BRANCH_IND_CALL	=
-				1U << PERF_SAMPLE_BRANCH_IND_CALL_SHIFT,
-	PERF_SAMPLE_BRANCH_ABORT_TX	=
-				1U << PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT,
+	PERF_SAMPLE_BRANCH_ANY_CALL	= 1U << PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT,
+	PERF_SAMPLE_BRANCH_ANY_RETURN	= 1U << PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT,
+	PERF_SAMPLE_BRANCH_IND_CALL	= 1U << PERF_SAMPLE_BRANCH_IND_CALL_SHIFT,
+	PERF_SAMPLE_BRANCH_ABORT_TX	= 1U << PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT,
 	PERF_SAMPLE_BRANCH_IN_TX	= 1U << PERF_SAMPLE_BRANCH_IN_TX_SHIFT,
 	PERF_SAMPLE_BRANCH_NO_TX	= 1U << PERF_SAMPLE_BRANCH_NO_TX_SHIFT,
 	PERF_SAMPLE_BRANCH_COND		= 1U << PERF_SAMPLE_BRANCH_COND_SHIFT,
 
+	PERF_SAMPLE_BRANCH_CALL_STACK	= 1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT,
+
 	PERF_SAMPLE_BRANCH_MAX		= 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT,
 };
 
-- 
cgit v1.2.2


From acba3c7e4652ca5fcb2fd9376d58c2dffd8ddf2a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 14 Jan 2015 14:15:39 +0100
Subject: perf, powerpc: Fix up flush_branch_stack() users

The recent LBR rework for x86 left a stray flush_branch_stack() user in
the PowerPC code, fix that up.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Cc: Anton Blanchard <anton@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Joel Stanley <joel@jms.id.au>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michael Neuling <mikey@neuling.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 43cc158487e6..724d3720c9b1 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -261,11 +261,6 @@ struct pmu {
 	 */
 	int (*event_idx)		(struct perf_event *event); /*optional */
 
-	/*
-	 * flush branch stack on context-switches (needed in cpu-wide mode)
-	 */
-	void (*flush_branch_stack)	(void);
-
 	/*
 	 * context-switches callback
 	 */
-- 
cgit v1.2.2


From 39bed6cbb842d8edf5a26b01122b391d36775b5e Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Fri, 23 Jan 2015 18:45:40 +0000
Subject: perf: Make perf_cgroup_from_task() global

Move perf_cgroup_from_task() from kernel/events/ to include/linux/ along
with the necessary struct definitions, so that it can be used by the PMU
code.

When the upcoming Intel Cache Monitoring PMU driver assigns monitoring
IDs to perf events, it needs to be able to check whether any two
monitoring events overlap (say, a cgroup and task event), which means we
need to be able to lookup the cgroup associated with a task (if any).

Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kanaka Juvva <kanaka.d.juvva@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com>
Link: http://lkml.kernel.org/r/1422038748-21397-2-git-send-email-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'include')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 724d3720c9b1..cae4a9481777 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -53,6 +53,7 @@ struct perf_guest_info_callbacks {
 #include <linux/sysfs.h>
 #include <linux/perf_regs.h>
 #include <linux/workqueue.h>
+#include <linux/cgroup.h>
 #include <asm/local.h>
 
 struct perf_callchain_entry {
@@ -547,6 +548,35 @@ struct perf_output_handle {
 	int				page;
 };
 
+#ifdef CONFIG_CGROUP_PERF
+
+/*
+ * perf_cgroup_info keeps track of time_enabled for a cgroup.
+ * This is a per-cpu dynamically allocated data structure.
+ */
+struct perf_cgroup_info {
+	u64				time;
+	u64				timestamp;
+};
+
+struct perf_cgroup {
+	struct cgroup_subsys_state	css;
+	struct perf_cgroup_info	__percpu *info;
+};
+
+/*
+ * Must ensure cgroup is pinned (css_get) before calling
+ * this function. In other words, we cannot call this function
+ * if there is no cgroup event for the current CPU context.
+ */
+static inline struct perf_cgroup *
+perf_cgroup_from_task(struct task_struct *task)
+{
+	return container_of(task_css(task, perf_event_cgrp_id),
+			    struct perf_cgroup, css);
+}
+#endif /* CONFIG_CGROUP_PERF */
+
 #ifdef CONFIG_PERF_EVENTS
 
 extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
-- 
cgit v1.2.2


From eacd3ecc34472ce3751eedfc94e44c7cc6eb6305 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Fri, 23 Jan 2015 18:45:41 +0000
Subject: perf: Add ->count() function to read per-package counters

For PMU drivers that record per-package counters, the ->count variable
cannot be used to record an accurate aggregated value, since it's not
possible to perform SMP cross-calls to cpus on other packages from the
context in which we update ->count.

Introduce a new optional ->count() accessor function that can be used to
customize how values are collected. If a PMU driver doesn't provide a
->count() function, we fallback to the existing code.

There is necessarily a window of staleness with this approach because
the task that generated the counter value may not have been scheduled by
the cpu recently.

An alternative and more complex approach would be to use a hrtimer to
periodically refresh the values from a more permissive scheduling
context. So, we're trading off complexity for accuracy.

Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kanaka Juvva <kanaka.d.juvva@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com>
Link: http://lkml.kernel.org/r/1422038748-21397-3-git-send-email-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index cae4a9481777..9fc9b0d31442 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -272,6 +272,11 @@ struct pmu {
 	 */
 	size_t				task_ctx_size;
 
+
+	/*
+	 * Return the count value for a counter.
+	 */
+	u64 (*count)			(struct perf_event *event); /*optional*/
 };
 
 /**
@@ -770,6 +775,11 @@ static inline void perf_event_task_sched_out(struct task_struct *prev,
 		__perf_event_task_sched_out(prev, next);
 }
 
+static inline u64 __perf_event_count(struct perf_event *event)
+{
+	return local64_read(&event->count) + atomic64_read(&event->child_count);
+}
+
 extern void perf_event_mmap(struct vm_area_struct *vma);
 extern struct perf_guest_info_callbacks *perf_guest_cbs;
 extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
-- 
cgit v1.2.2


From 4afbb24ce5e723c8a093a6674a3c33062175078a Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Fri, 23 Jan 2015 18:45:44 +0000
Subject: perf/x86/intel: Add Intel Cache QoS Monitoring support

Future Intel Xeon processors support a Cache QoS Monitoring feature that
allows tracking of the LLC occupancy for a task or task group, i.e. the
amount of data in pulled into the LLC for the task (group).

Currently the PMU only supports per-cpu events. We create an event for
each cpu and read out all the LLC occupancy values.

Because this results in duplicate values being written out to userspace,
we also export a .per-pkg event file so that the perf tools only
accumulate values for one cpu per package.

Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kanaka Juvva <kanaka.d.juvva@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com>
Link: http://lkml.kernel.org/r/1422038748-21397-6-git-send-email-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 9fc9b0d31442..ca5504c48f4f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -123,6 +123,13 @@ struct hw_perf_event {
 			/* for tp_event->class */
 			struct list_head	tp_list;
 		};
+		struct { /* intel_cqm */
+			int			cqm_state;
+			int			cqm_rmid;
+			struct list_head	cqm_events_entry;
+			struct list_head	cqm_groups_entry;
+			struct list_head	cqm_group_entry;
+		};
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 		struct { /* breakpoint */
 			/*
-- 
cgit v1.2.2


From bfe1fcd2688f557a6b6a88f59ea7619228728bd7 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Fri, 23 Jan 2015 18:45:46 +0000
Subject: perf/x86/intel: Support task events with Intel CQM

Add support for task events as well as system-wide events. This change
has a big impact on the way that we gather LLC occupancy values in
intel_cqm_event_read().

Currently, for system-wide (per-cpu) events we defer processing to
userspace which knows how to discard all but one cpu result per package.

Things aren't so simple for task events because we need to do the value
aggregation ourselves. To do this, we defer updating the LLC occupancy
value in event->count from intel_cqm_event_read() and do an SMP
cross-call to read values for all packages in intel_cqm_event_count().
We need to ensure that we only do this for one task event per cache
group, otherwise we'll report duplicate values.

If we're a system-wide event we want to fallback to the default
perf_event_count() implementation. Refactor this into a common function
so that we don't duplicate the code.

Also, introduce PERF_TYPE_INTEL_CQM, since we need a way to track an
event's task (if the event isn't per-cpu) inside of the Intel CQM PMU
driver.  This task information is only availble in the upper layers of
the perf infrastructure.

Other perf backends stash the target task in event->hw.*target so we
need to do something similar. The task is used to determine whether
events should share a cache group and an RMID.

Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kanaka Juvva <kanaka.d.juvva@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com>
Cc: linux-api@vger.kernel.org
Link: http://lkml.kernel.org/r/1422038748-21397-8-git-send-email-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h      | 1 +
 include/uapi/linux/perf_event.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index ca5504c48f4f..dac4c2831d82 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -129,6 +129,7 @@ struct hw_perf_event {
 			struct list_head	cqm_events_entry;
 			struct list_head	cqm_groups_entry;
 			struct list_head	cqm_group_entry;
+			struct task_struct	*cqm_target;
 		};
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 		struct { /* breakpoint */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 1e3cd07cf76e..3c8b45de57ec 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -32,6 +32,7 @@ enum perf_type_id {
 	PERF_TYPE_HW_CACHE			= 3,
 	PERF_TYPE_RAW				= 4,
 	PERF_TYPE_BREAKPOINT			= 5,
+	PERF_TYPE_INTEL_CQM			= 6,
 
 	PERF_TYPE_MAX,				/* non-ABI */
 };
-- 
cgit v1.2.2


From 50f16a8bf9d7a92c437ed1867d0f7e1dc6a9aca9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 5 Mar 2015 22:10:19 +0100
Subject: perf: Remove type specific target pointers

The only reason CQM had to use a hard-coded pmu type was so it could use
cqm_target in hw_perf_event.

Do away with the {tp,bp,cqm}_target pointers and provide a non type
specific one.

This allows us to do away with that silly pmu type as well.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Vince Weaver <vince@deater.net>
Cc: acme@kernel.org
Cc: acme@redhat.com
Cc: hpa@zytor.com
Cc: jolsa@redhat.com
Cc: kanaka.d.juvva@intel.com
Cc: matt.fleming@intel.com
Cc: tglx@linutronix.de
Cc: torvalds@linux-foundation.org
Cc: vikas.shivappa@linux.intel.com
Link: http://lkml.kernel.org/r/20150305211019.GU21418@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h      | 4 +---
 include/uapi/linux/perf_event.h | 1 -
 2 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index dac4c2831d82..5aa49d7bfd07 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -119,7 +119,6 @@ struct hw_perf_event {
 			struct hrtimer	hrtimer;
 		};
 		struct { /* tracepoint */
-			struct task_struct	*tp_target;
 			/* for tp_event->class */
 			struct list_head	tp_list;
 		};
@@ -129,7 +128,6 @@ struct hw_perf_event {
 			struct list_head	cqm_events_entry;
 			struct list_head	cqm_groups_entry;
 			struct list_head	cqm_group_entry;
-			struct task_struct	*cqm_target;
 		};
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 		struct { /* breakpoint */
@@ -138,12 +136,12 @@ struct hw_perf_event {
 			 * problem hw_breakpoint has with context
 			 * creation and event initalization.
 			 */
-			struct task_struct		*bp_target;
 			struct arch_hw_breakpoint	info;
 			struct list_head		bp_list;
 		};
 #endif
 	};
+	struct task_struct		*target;
 	int				state;
 	local64_t			prev_count;
 	u64				sample_period;
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 3c8b45de57ec..1e3cd07cf76e 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -32,7 +32,6 @@ enum perf_type_id {
 	PERF_TYPE_HW_CACHE			= 3,
 	PERF_TYPE_RAW				= 4,
 	PERF_TYPE_BREAKPOINT			= 5,
-	PERF_TYPE_INTEL_CQM			= 6,
 
 	PERF_TYPE_MAX,				/* non-ABI */
 };
-- 
cgit v1.2.2


From 34f439278cef7b1177f8ce24f9fc81dfc6221d3b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 20 Feb 2015 14:05:38 +0100
Subject: perf: Add per event clockid support

While thinking on the whole clock discussion it occurred to me we have
two distinct uses of time:

 1) the tracking of event/ctx/cgroup enabled/running/stopped times
    which includes the self-monitoring support in struct
    perf_event_mmap_page.

 2) the actual timestamps visible in the data records.

And we've been conflating them.

The first is all about tracking time deltas, nobody should really care
in what time base that happens, its all relative information, as long
as its internally consistent it works.

The second however is what people are worried about when having to
merge their data with external sources. And here we have the
discussion on MONOTONIC vs MONOTONIC_RAW etc..

Where MONOTONIC is good for correlating between machines (static
offset), MONOTNIC_RAW is required for correlating against a fixed rate
hardware clock.

This means configurability; now 1) makes that hard because it needs to
be internally consistent across groups of unrelated events; which is
why we had to have a global perf_clock().

However, for 2) it doesn't really matter, perf itself doesn't care
what it writes into the buffer.

The below patch makes the distinction between these two cases by
adding perf_event_clock() which is used for the second case. It
further makes this configurable on a per-event basis, but adds a few
sanity checks such that we cannot combine events with different clocks
in confusing ways.

And since we then have per-event configurability we might as well
retain the 'legacy' behaviour as a default.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h      | 2 ++
 include/uapi/linux/perf_event.h | 6 +++---
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index b16eac5f54ce..401554074de9 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -173,6 +173,7 @@ struct perf_event;
  * pmu::capabilities flags
  */
 #define PERF_PMU_CAP_NO_INTERRUPT		0x01
+#define PERF_PMU_CAP_NO_NMI			0x02
 
 /**
  * struct pmu - generic performance monitoring unit
@@ -457,6 +458,7 @@ struct perf_event {
 	struct pid_namespace		*ns;
 	u64				id;
 
+	u64				(*clock)(void);
 	perf_overflow_handler_t		overflow_handler;
 	void				*overflow_handler_context;
 
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 1e3cd07cf76e..3bb40ddadbe5 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -326,7 +326,8 @@ struct perf_event_attr {
 				exclude_callchain_user   : 1, /* exclude user callchains */
 				mmap2          :  1, /* include mmap with inode data     */
 				comm_exec      :  1, /* flag comm events that are due to an exec */
-				__reserved_1   : 39;
+				use_clockid    :  1, /* use @clockid for time fields */
+				__reserved_1   : 38;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
@@ -355,8 +356,7 @@ struct perf_event_attr {
 	 */
 	__u32	sample_stack_user;
 
-	/* Align to u64. */
-	__u32	__reserved_2;
+	__s32	clockid;
 	/*
 	 * Defines set of regs to dump for each sample
 	 * state captured on:
-- 
cgit v1.2.2


From 4e537f7fbdce5e8ae7c33ebaa8a1956c7727d5a7 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 25 Mar 2015 12:49:18 -0700
Subject: bpf: Make internal bpf API independent of CONFIG_BPF_SYSCALL #ifdefs

Socket filter code and other subsystems with upcoming eBPF
support should not need to deal with the fact that we have
CONFIG_BPF_SYSCALL defined or not.

Having the bpf syscall as a config option is a nice thing and
I'd expect it to stay that way for expert users (I presume one
day the default setting of it might change, though), but code
making use of it should not care if it's actually enabled or
not.

Instead, hide this via header files and let the rest deal with it.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/1427312966-8434-2-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/bpf.h | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index bbfceb756452..c2e21113ecc0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -113,8 +113,6 @@ struct bpf_prog_type_list {
 	enum bpf_prog_type type;
 };
 
-void bpf_register_prog_type(struct bpf_prog_type_list *tl);
-
 struct bpf_prog;
 
 struct bpf_prog_aux {
@@ -129,11 +127,25 @@ struct bpf_prog_aux {
 };
 
 #ifdef CONFIG_BPF_SYSCALL
+void bpf_register_prog_type(struct bpf_prog_type_list *tl);
+
 void bpf_prog_put(struct bpf_prog *prog);
+struct bpf_prog *bpf_prog_get(u32 ufd);
 #else
-static inline void bpf_prog_put(struct bpf_prog *prog) {}
+static inline void bpf_register_prog_type(struct bpf_prog_type_list *tl)
+{
+}
+
+static inline struct bpf_prog *bpf_prog_get(u32 ufd)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline void bpf_prog_put(struct bpf_prog *prog)
+{
+}
 #endif
-struct bpf_prog *bpf_prog_get(u32 ufd);
+
 /* verify correctness of eBPF program */
 int bpf_check(struct bpf_prog *fp, union bpf_attr *attr);
 
-- 
cgit v1.2.2


From 72cbbc8994242b5b43753738c01bf07bf29cb70d Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Wed, 25 Mar 2015 12:49:19 -0700
Subject: tracing: Add kprobe flag

add TRACE_EVENT_FL_KPROBE flag to differentiate kprobe type of
tracepoints, since bpf programs can only be attached to kprobe
type of PERF_TYPE_TRACEPOINT perf events.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-3-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/ftrace_event.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index c674ee8f7fca..77325e1a1816 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -252,6 +252,7 @@ enum {
 	TRACE_EVENT_FL_WAS_ENABLED_BIT,
 	TRACE_EVENT_FL_USE_CALL_FILTER_BIT,
 	TRACE_EVENT_FL_TRACEPOINT_BIT,
+	TRACE_EVENT_FL_KPROBE_BIT,
 };
 
 /*
@@ -265,6 +266,7 @@ enum {
  *                     it is best to clear the buffers that used it).
  *  USE_CALL_FILTER - For ftrace internal events, don't use file filter
  *  TRACEPOINT    - Event is a tracepoint
+ *  KPROBE        - Event is a kprobe
  */
 enum {
 	TRACE_EVENT_FL_FILTERED		= (1 << TRACE_EVENT_FL_FILTERED_BIT),
@@ -274,6 +276,7 @@ enum {
 	TRACE_EVENT_FL_WAS_ENABLED	= (1 << TRACE_EVENT_FL_WAS_ENABLED_BIT),
 	TRACE_EVENT_FL_USE_CALL_FILTER	= (1 << TRACE_EVENT_FL_USE_CALL_FILTER_BIT),
 	TRACE_EVENT_FL_TRACEPOINT	= (1 << TRACE_EVENT_FL_TRACEPOINT_BIT),
+	TRACE_EVENT_FL_KPROBE		= (1 << TRACE_EVENT_FL_KPROBE_BIT),
 };
 
 struct ftrace_event_call {
-- 
cgit v1.2.2


From 2541517c32be2531e0da59dfd7efc1ce844644f5 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Wed, 25 Mar 2015 12:49:20 -0700
Subject: tracing, perf: Implement BPF programs attached to kprobes

BPF programs, attached to kprobes, provide a safe way to execute
user-defined BPF byte-code programs without being able to crash or
hang the kernel in any way. The BPF engine makes sure that such
programs have a finite execution time and that they cannot break
out of their sandbox.

The user interface is to attach to a kprobe via the perf syscall:

	struct perf_event_attr attr = {
		.type	= PERF_TYPE_TRACEPOINT,
		.config	= event_id,
		...
	};

	event_fd = perf_event_open(&attr,...);
	ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);

'prog_fd' is a file descriptor associated with BPF program
previously loaded.

'event_id' is an ID of the kprobe created.

Closing 'event_fd':

	close(event_fd);

... automatically detaches BPF program from it.

BPF programs can call in-kernel helper functions to:

  - lookup/update/delete elements in maps

  - probe_read - wraper of probe_kernel_read() used to access any
    kernel data structures

BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is
architecture dependent) and return 0 to ignore the event and 1 to store
kprobe event into the ring buffer.

Note, kprobes are a fundamentally _not_ a stable kernel ABI,
so BPF programs attached to kprobes must be recompiled for
every kernel version and user must supply correct LINUX_VERSION_CODE
in attr.kern_version during bpf_prog_load() call.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/ftrace_event.h    | 11 +++++++++++
 include/uapi/linux/bpf.h        |  3 +++
 include/uapi/linux/perf_event.h |  1 +
 3 files changed, 15 insertions(+)

(limited to 'include')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 77325e1a1816..0aa535bc9f05 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -13,6 +13,7 @@ struct trace_array;
 struct trace_buffer;
 struct tracer;
 struct dentry;
+struct bpf_prog;
 
 struct trace_print_flags {
 	unsigned long		mask;
@@ -306,6 +307,7 @@ struct ftrace_event_call {
 #ifdef CONFIG_PERF_EVENTS
 	int				perf_refcount;
 	struct hlist_head __percpu	*perf_events;
+	struct bpf_prog			*prog;
 
 	int	(*perf_perm)(struct ftrace_event_call *,
 			     struct perf_event *);
@@ -551,6 +553,15 @@ event_trigger_unlock_commit_regs(struct ftrace_event_file *file,
 		event_triggers_post_call(file, tt);
 }
 
+#ifdef CONFIG_BPF_SYSCALL
+unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx);
+#else
+static inline unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+{
+	return 1;
+}
+#endif
+
 enum {
 	FILTER_OTHER = 0,
 	FILTER_STATIC_STRING,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 45da7ec7d274..b2948feeb70b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -118,6 +118,7 @@ enum bpf_map_type {
 enum bpf_prog_type {
 	BPF_PROG_TYPE_UNSPEC,
 	BPF_PROG_TYPE_SOCKET_FILTER,
+	BPF_PROG_TYPE_KPROBE,
 };
 
 /* flags for BPF_MAP_UPDATE_ELEM command */
@@ -151,6 +152,7 @@ union bpf_attr {
 		__u32		log_level;	/* verbosity level of verifier */
 		__u32		log_size;	/* size of user buffer */
 		__aligned_u64	log_buf;	/* user supplied buffer */
+		__u32		kern_version;	/* checked when prog_type=kprobe */
 	};
 } __attribute__((aligned(8)));
 
@@ -162,6 +164,7 @@ enum bpf_func_id {
 	BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */
 	BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */
 	BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */
+	BPF_FUNC_probe_read,      /* int bpf_probe_read(void *dst, int size, void *src) */
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 3bb40ddadbe5..91803e54ee73 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -381,6 +381,7 @@ struct perf_event_attr {
 #define PERF_EVENT_IOC_SET_OUTPUT	_IO ('$', 5)
 #define PERF_EVENT_IOC_SET_FILTER	_IOW('$', 6, char *)
 #define PERF_EVENT_IOC_ID		_IOR('$', 7, __u64 *)
+#define PERF_EVENT_IOC_SET_BPF		_IOW('$', 8, __u32)
 
 enum perf_event_ioc_flags {
 	PERF_IOC_FLAG_GROUP		= 1U << 0,
-- 
cgit v1.2.2


From d9847d310ab4003725e6ed1822682e24bd406908 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Wed, 25 Mar 2015 12:49:21 -0700
Subject: tracing: Allow BPF programs to call bpf_ktime_get_ns()

bpf_ktime_get_ns() is used by programs to compute time delta
between events or as a timestamp

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-5-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/uapi/linux/bpf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index b2948feeb70b..238c6883877b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -165,6 +165,7 @@ enum bpf_func_id {
 	BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */
 	BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */
 	BPF_FUNC_probe_read,      /* int bpf_probe_read(void *dst, int size, void *src) */
+	BPF_FUNC_ktime_get_ns,    /* u64 bpf_ktime_get_ns(void) */
 	__BPF_FUNC_MAX_ID,
 };
 
-- 
cgit v1.2.2


From 9c959c863f8217a2ff3d7c296e8223654d240569 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Wed, 25 Mar 2015 12:49:22 -0700
Subject: tracing: Allow BPF programs to call bpf_trace_printk()

Debugging of BPF programs needs some form of printk from the
program, so let programs call limited trace_printk() with %d %u
%x %p modifiers only.

Similar to kernel modules, during program load verifier checks
whether program is calling bpf_trace_printk() and if so, kernel
allocates trace_printk buffers and emits big 'this is debug
only' banner.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1427312966-8434-6-git-send-email-ast@plumgrid.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/uapi/linux/bpf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 238c6883877b..cc47ef41076a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -166,6 +166,7 @@ enum bpf_func_id {
 	BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */
 	BPF_FUNC_probe_read,      /* int bpf_probe_read(void *dst, int size, void *src) */
 	BPF_FUNC_ktime_get_ns,    /* u64 bpf_ktime_get_ns(void) */
+	BPF_FUNC_trace_printk,    /* int bpf_trace_printk(const char *fmt, int fmt_size, ...) */
 	__BPF_FUNC_MAX_ID,
 };
 
-- 
cgit v1.2.2


From e8c6deac69629c0cb97c3d3272f8631ef17f8f0f Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Wed, 14 Jan 2015 14:18:10 +0200
Subject: perf: Add data_{offset,size} to user_page

Currently, the actual perf ring buffer is one page into the mmap area,
following the user page and the userspace follows this convention. This
patch adds data_{offset,size} fields to user_page that can be used by
userspace instead for locating perf data in the mmap area. This is also
helpful when mapping existing or shared buffers if their size is not
known in advance.

Right now, it is made to follow the existing convention that

	data_offset == PAGE_SIZE and
	data_offset + data_size == mmap_size.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-2-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/uapi/linux/perf_event.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 91803e54ee73..86c44ae66d43 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -522,9 +522,14 @@ struct perf_event_mmap_page {
 	 * In this case the kernel will not over-write unread data.
 	 *
 	 * See perf_output_put_handle() for the data ordering.
+	 *
+	 * data_{offset,size} indicate the location and size of the perf record
+	 * buffer within the mmapped area.
 	 */
 	__u64   data_head;		/* head in the data section */
 	__u64	data_tail;		/* user-space written tail */
+	__u64	data_offset;		/* where the buffer starts */
+	__u64	data_size;		/* data buffer size */
 };
 
 #define PERF_RECORD_MISC_CPUMODE_MASK		(7 << 0)
-- 
cgit v1.2.2


From 45bfb2e50471abbbfd83d40d28c986078b0d24ff Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 14 Jan 2015 14:18:11 +0200
Subject: perf: Add AUX area to ring buffer for raw data streams

This patch introduces "AUX space" in the perf mmap buffer, intended for
exporting high bandwidth data streams to userspace, such as instruction
flow traces.

AUX space is a ring buffer, defined by aux_{offset,size} fields in the
user_page structure, and read/write pointers aux_{head,tail}, which abide
by the same rules as data_* counterparts of the main perf buffer.

In order to allocate/mmap AUX, userspace needs to set up aux_offset to
such an offset that will be greater than data_offset+data_size and
aux_size to be the desired buffer size. Both need to be page aligned.
Then, same aux_offset and aux_size should be passed to mmap() call and
if everything adds up, you should have an AUX buffer as a result.

Pages that are mapped into this buffer also come out of user's mlock
rlimit plus perf_event_mlock_kb allowance.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-3-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h      | 17 +++++++++++++++++
 include/uapi/linux/perf_event.h | 16 ++++++++++++++++
 2 files changed, 33 insertions(+)

(limited to 'include')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 401554074de9..5a94f6d6fa91 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -284,6 +284,18 @@ struct pmu {
 	 * Return the count value for a counter.
 	 */
 	u64 (*count)			(struct perf_event *event); /*optional*/
+
+	/*
+	 * Set up pmu-private data structures for an AUX area
+	 */
+	void *(*setup_aux)		(int cpu, void **pages,
+					 int nr_pages, bool overwrite);
+					/* optional */
+
+	/*
+	 * Free pmu-private AUX data structures
+	 */
+	void (*free_aux)		(void *aux); /* optional */
 };
 
 /**
@@ -862,6 +874,11 @@ static inline bool needs_branch_stack(struct perf_event *event)
 	return event->attr.branch_sample_type != 0;
 }
 
+static inline bool has_aux(struct perf_event *event)
+{
+	return event->pmu->setup_aux;
+}
+
 extern int perf_output_begin(struct perf_output_handle *handle,
 			     struct perf_event *event, unsigned int size);
 extern void perf_output_end(struct perf_output_handle *handle);
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 86c44ae66d43..6c5013a71714 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -530,6 +530,22 @@ struct perf_event_mmap_page {
 	__u64	data_tail;		/* user-space written tail */
 	__u64	data_offset;		/* where the buffer starts */
 	__u64	data_size;		/* data buffer size */
+
+	/*
+	 * AUX area is defined by aux_{offset,size} fields that should be set
+	 * by the userspace, so that
+	 *
+	 *   aux_offset >= data_offset + data_size
+	 *
+	 * prior to mmap()ing it. Size of the mmap()ed area should be aux_size.
+	 *
+	 * Ring buffer pointers aux_{head,tail} have the same semantics as
+	 * data_{head,tail} and same ordering rules apply.
+	 */
+	__u64	aux_head;
+	__u64	aux_tail;
+	__u64	aux_offset;
+	__u64	aux_size;
 };
 
 #define PERF_RECORD_MISC_CPUMODE_MASK		(7 << 0)
-- 
cgit v1.2.2


From 0a4e38e64f5e91ce131cc42ee5bb3925377ec840 Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Wed, 14 Jan 2015 14:18:12 +0200
Subject: perf: Support high-order allocations for AUX space

Some pmus (such as BTS or Intel PT without multiple-entry ToPA capability)
don't support scatter-gather and will prefer larger contiguous areas for
their output regions.

This patch adds a new pmu capability to request higher order allocations.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-4-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 5a94f6d6fa91..d5a4a8e95808 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -174,6 +174,7 @@ struct perf_event;
  */
 #define PERF_PMU_CAP_NO_INTERRUPT		0x01
 #define PERF_PMU_CAP_NO_NMI			0x02
+#define PERF_PMU_CAP_AUX_NO_SG			0x04
 
 /**
  * struct pmu - generic performance monitoring unit
-- 
cgit v1.2.2


From 6a279230391b63130070e0219b0ad09d34d28c89 Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Wed, 14 Jan 2015 14:18:13 +0200
Subject: perf: Add a capability for AUX_NO_SG pmus to do software double
 buffering

For pmus that don't support scatter-gather for AUX data in hardware, it
might still make sense to implement software double buffering to avoid
losing data while the user is reading data out. For this purpose, add
a pmu capability that guarantees multiple high-order chunks for AUX buffer,
so that the pmu driver can do switchover tricks.

To make use of this feature, add PERF_PMU_CAP_AUX_SW_DOUBLEBUF to your
pmu's capability mask. This will make the ring buffer AUX allocation code
ensure that the biggest high order allocation for the aux buffer pages is
no bigger than half of the total requested buffer size, thus making sure
that the buffer has at least two high order allocations.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-5-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index d5a4a8e95808..13a1eb3a2a2d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -175,6 +175,7 @@ struct perf_event;
 #define PERF_PMU_CAP_NO_INTERRUPT		0x01
 #define PERF_PMU_CAP_NO_NMI			0x02
 #define PERF_PMU_CAP_AUX_NO_SG			0x04
+#define PERF_PMU_CAP_AUX_SW_DOUBLEBUF		0x08
 
 /**
  * struct pmu - generic performance monitoring unit
-- 
cgit v1.2.2


From bed5b25ad9c8a2f5d735ef0bc746ec870c01c1b0 Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Fri, 30 Jan 2015 12:31:06 +0200
Subject: perf: Add a pmu capability for "exclusive" events

Usually, pmus that do, for example, instruction tracing, would only ever
be able to have one event per task per cpu (or per perf_event_context). For
such pmus it makes sense to disallow creating conflicting events early on,
so as to provide consistent behavior for the user.

This patch adds a pmu capability that indicates such constraint on event
creation.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1422613866-113186-1-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 13a1eb3a2a2d..f936a1e51f29 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -176,6 +176,7 @@ struct perf_event;
 #define PERF_PMU_CAP_NO_NMI			0x02
 #define PERF_PMU_CAP_AUX_NO_SG			0x04
 #define PERF_PMU_CAP_AUX_SW_DOUBLEBUF		0x08
+#define PERF_PMU_CAP_EXCLUSIVE			0x10
 
 /**
  * struct pmu - generic performance monitoring unit
@@ -196,6 +197,7 @@ struct pmu {
 
 	int * __percpu			pmu_disable_count;
 	struct perf_cpu_context * __percpu pmu_cpu_context;
+	atomic_t			exclusive_cnt; /* < 0: cpu; > 0: tsk */
 	int				task_ctx_nr;
 	int				hrtimer_interval_ms;
 
-- 
cgit v1.2.2


From 68db7e98c3a6ebe7284b6cf14906ed7c55f3f7f0 Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Wed, 14 Jan 2015 14:18:15 +0200
Subject: perf: Add AUX record

When there's new data in the AUX space, output a record indicating its
offset and size and a set of flags, such as PERF_AUX_FLAG_TRUNCATED, to
mean the described data was truncated to fit in the ring buffer.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-7-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/uapi/linux/perf_event.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 6c5013a71714..8904ad3a850b 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -768,6 +768,20 @@ enum perf_event_type {
 	 */
 	PERF_RECORD_MMAP2			= 10,
 
+	/*
+	 * Records that new data landed in the AUX buffer part.
+	 *
+	 * struct {
+	 * 	struct perf_event_header	header;
+	 *
+	 * 	u64				aux_offset;
+	 * 	u64				aux_size;
+	 *	u64				flags;
+	 * 	struct sample_id		sample_id;
+	 * };
+	 */
+	PERF_RECORD_AUX				= 11,
+
 	PERF_RECORD_MAX,			/* non-ABI */
 };
 
@@ -785,6 +799,11 @@ enum perf_callchain_context {
 	PERF_CONTEXT_MAX		= (__u64)-4095,
 };
 
+/**
+ * PERF_RECORD_AUX::flags bits
+ */
+#define PERF_AUX_FLAG_TRUNCATED		0x01	/* record was truncated to fit */
+
 #define PERF_FLAG_FD_NO_GROUP		(1UL << 0)
 #define PERF_FLAG_FD_OUTPUT		(1UL << 1)
 #define PERF_FLAG_PID_CGROUP		(1UL << 2) /* pid=cgroup id, per-cpu mode only */
-- 
cgit v1.2.2


From fdc2670666f40ab3e03143f04d1ebf4a05e2c24a Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Wed, 14 Jan 2015 14:18:16 +0200
Subject: perf: Add API for PMUs to write to the AUX area

For pmus that wish to write data to ring buffer's AUX area, provide
perf_aux_output_{begin,end}() calls to initiate/commit data writes,
similarly to perf_output_{begin,end}. These also use the same output
handle structure. Also, similarly to software counterparts, these
will direct inherited events' output to parents' ring buffers.

After the perf_aux_output_begin() returns successfully, handle->size
is set to the maximum amount of data that can be written wrt aux_tail
pointer, so that no data that the user hasn't seen will be overwritten,
therefore this should always be called before hardware writing is
enabled. On success, this will return the pointer to pmu driver's
private structure allocated for this aux area by pmu::setup_aux. Same
pointer can also be retrieved using perf_get_aux() while hardware
writing is enabled.

PMU driver should pass the actual amount of data written as a parameter
to perf_aux_output_end(). All hardware writes should be completed and
visible before this one is called.

Additionally, perf_aux_output_skip() will adjust output handle and
aux_head in case some part of the buffer has to be skipped over to
maintain hardware's alignment constraints.

Nested writers are forbidden and guards are in place to catch such
attempts.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-8-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index f936a1e51f29..45c5873ad9b3 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -573,7 +573,10 @@ struct perf_output_handle {
 	struct ring_buffer		*rb;
 	unsigned long			wakeup;
 	unsigned long			size;
-	void				*addr;
+	union {
+		void			*addr;
+		unsigned long		head;
+	};
 	int				page;
 };
 
@@ -608,6 +611,14 @@ perf_cgroup_from_task(struct task_struct *task)
 
 #ifdef CONFIG_PERF_EVENTS
 
+extern void *perf_aux_output_begin(struct perf_output_handle *handle,
+				   struct perf_event *event);
+extern void perf_aux_output_end(struct perf_output_handle *handle,
+				unsigned long size, bool truncated);
+extern int perf_aux_output_skip(struct perf_output_handle *handle,
+				unsigned long size);
+extern void *perf_get_aux(struct perf_output_handle *handle);
+
 extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
 extern void perf_pmu_unregister(struct pmu *pmu);
 
@@ -898,6 +909,17 @@ extern void perf_event_disable(struct perf_event *event);
 extern int __perf_event_disable(void *info);
 extern void perf_event_task_tick(void);
 #else /* !CONFIG_PERF_EVENTS: */
+static inline void *
+perf_aux_output_begin(struct perf_output_handle *handle,
+		      struct perf_event *event)				{ return NULL; }
+static inline void
+perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
+		    bool truncated)					{ }
+static inline int
+perf_aux_output_skip(struct perf_output_handle *handle,
+		     unsigned long size)				{ return -EINVAL; }
+static inline void *
+perf_get_aux(struct perf_output_handle *handle)				{ return NULL; }
 static inline void
 perf_event_task_sched_in(struct task_struct *prev,
 			 struct task_struct *task)			{ }
-- 
cgit v1.2.2


From 2023a0d2829e521fe6ad6b9907f3f90bfbf57142 Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Wed, 14 Jan 2015 14:18:17 +0200
Subject: perf: Support overwrite mode for the AUX area

This adds support for overwrite mode in the AUX area, which means "keep
collecting data till you're stopped", turning AUX area into a circular
buffer, where new data overwrites old data. It does not depend on data
buffer's overwrite mode, so that it doesn't lose sideband data that is
instrumental for processing AUX data.

Overwrite mode is enabled at mapping AUX area read only. Even though
aux_tail in the buffer's user page might be user writable, it will be
ignored in this mode.

A PERF_RECORD_AUX with PERF_AUX_FLAG_OVERWRITE set is written to the perf
data stream every time an event writes new data to the AUX area. The pmu
driver might not be able to infer the exact beginning of the new data in
each snapshot, some drivers will only provide the tail, which is
aux_offset + aux_size in the AUX record. Consumer has to be able to tell
the new data from the old one, for example, by means of time stamps if
such are provided in the trace.

Consumer is also responsible for disabling any events that might write
to the AUX area (thus potentially racing with the consumer) before
collecting the data.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-9-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/uapi/linux/perf_event.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 8904ad3a850b..29ef2f73bb4a 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -803,6 +803,7 @@ enum perf_callchain_context {
  * PERF_RECORD_AUX::flags bits
  */
 #define PERF_AUX_FLAG_TRUNCATED		0x01	/* record was truncated to fit */
+#define PERF_AUX_FLAG_OVERWRITE		0x02	/* snapshot from overwrite mode */
 
 #define PERF_FLAG_FD_NO_GROUP		(1UL << 0)
 #define PERF_FLAG_FD_OUTPUT		(1UL << 1)
-- 
cgit v1.2.2


From 1a5941312414c71dece6717da9a0fa1303127afa Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Wed, 14 Jan 2015 14:18:18 +0200
Subject: perf: Add wakeup watermark control to the AUX area

When AUX area gets a certain amount of new data, we want to wake up
userspace to collect it. This adds a new control to specify how much
data will cause a wakeup. This is then passed down to pmu drivers via
output handle's "wakeup" field, so that the driver can find the nearest
point where it can generate an interrupt.

We repurpose __reserved_2 in the event attribute for this, even though
it was never checked to be zero before, aux_watermark will only matter
for new AUX-aware code, so the old code should still be fine.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-10-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/uapi/linux/perf_event.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 29ef2f73bb4a..84819546c8ce 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -261,6 +261,7 @@ enum perf_event_read_format {
 #define PERF_ATTR_SIZE_VER3	96	/* add: sample_regs_user */
 					/* add: sample_stack_user */
 #define PERF_ATTR_SIZE_VER4	104	/* add: sample_regs_intr */
+#define PERF_ATTR_SIZE_VER5	112	/* add: aux_watermark */
 
 /*
  * Hardware event_id to monitor via a performance monitoring event:
@@ -366,6 +367,12 @@ struct perf_event_attr {
 	 * See asm/perf_regs.h for details.
 	 */
 	__u64	sample_regs_intr;
+
+	/*
+	 * Wakeup watermark for AUX area
+	 */
+	__u32	aux_watermark;
+	__u32	__reserved_2;	/* align to __u64 */
 };
 
 #define perf_flags(attr)	(*(&(attr)->read_format + 1))
-- 
cgit v1.2.2


From ec0d7729bbaed4b9d2d3fada693278e13a3d1368 Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Wed, 14 Jan 2015 14:18:23 +0200
Subject: perf: Add ITRACE_START record to indicate that tracing has started

For counters that generate AUX data that is bound to the context of a
running task, such as instruction tracing, the decoder needs to know
exactly which task is running when the event is first scheduled in,
before the first sched_switch. The decoder's need to know this stems
from the fact that instruction flow trace decoding will almost always
require program's object code in order to reconstruct said flow and
for that we need at least its pid/tid in the perf stream.

To single out such instruction tracing pmus, this patch introduces
ITRACE PMU capability. The reason this is not part of RECORD_AUX
record is that not all pmus capable of generating AUX data need this,
and the opposite is *probably* also true.

While sched_switch covers for most cases, there are two problems with it:
the consumer will need to process events out of order (that is, having
found RECORD_AUX, it will have to skip forward to the nearest sched_switch
to figure out which task it was, then go back to the actual trace to
decode it) and it completely misses the case when the tracing is enabled
and disabled before sched_switch, for example, via PERF_EVENT_IOC_DISABLE.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-15-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h      |  4 ++++
 include/uapi/linux/perf_event.h | 11 +++++++++++
 2 files changed, 15 insertions(+)

(limited to 'include')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 45c5873ad9b3..61992cf2e977 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -129,6 +129,9 @@ struct hw_perf_event {
 			struct list_head	cqm_groups_entry;
 			struct list_head	cqm_group_entry;
 		};
+		struct { /* itrace */
+			int			itrace_started;
+		};
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 		struct { /* breakpoint */
 			/*
@@ -177,6 +180,7 @@ struct perf_event;
 #define PERF_PMU_CAP_AUX_NO_SG			0x04
 #define PERF_PMU_CAP_AUX_SW_DOUBLEBUF		0x08
 #define PERF_PMU_CAP_EXCLUSIVE			0x10
+#define PERF_PMU_CAP_ITRACE			0x20
 
 /**
  * struct pmu - generic performance monitoring unit
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 84819546c8ce..309211b3eb67 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -789,6 +789,17 @@ enum perf_event_type {
 	 */
 	PERF_RECORD_AUX				= 11,
 
+	/*
+	 * Indicates that instruction trace has started
+	 *
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u32				pid;
+	 *	u32				tid;
+	 * };
+	 */
+	PERF_RECORD_ITRACE_START		= 12,
+
 	PERF_RECORD_MAX,			/* non-ABI */
 };
 
-- 
cgit v1.2.2


From b3738d29323344da3017a91010530cf3a58590fc Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Mon, 17 Nov 2014 20:07:03 +0100
Subject: watchdog: Add watchdog enable/disable all functions

This patch adds two new functions to enable/disable
the watchdog across all CPUs.

This will be used by the HT PMU bug workaround code to
disable/enable the NMI watchdog across quirk enablement.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Cc: maria.n.dimakopoulou@gmail.com
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/1416251225-17721-12-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/watchdog.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h
index 395b70e0eccf..a746bf5216f8 100644
--- a/include/linux/watchdog.h
+++ b/include/linux/watchdog.h
@@ -137,4 +137,12 @@ extern int watchdog_init_timeout(struct watchdog_device *wdd,
 extern int watchdog_register_device(struct watchdog_device *);
 extern void watchdog_unregister_device(struct watchdog_device *);
 
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+void watchdog_nmi_disable_all(void);
+void watchdog_nmi_enable_all(void);
+#else
+static inline void watchdog_nmi_disable_all(void) {}
+static inline void watchdog_nmi_enable_all(void) {}
+#endif
+
 #endif  /* ifndef _LINUX_WATCHDOG_H */
-- 
cgit v1.2.2