/*
* Performance events core code:
*
* Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
* Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
* Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
* Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
*
* For licensing details see kernel-base/COPYING
*/
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/idr.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/hash.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/reboot.h>
#include <linux/vmstat.h>
#include <linux/device.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/perf_event.h>
#include <linux/ftrace_event.h>
#include <linux/hw_breakpoint.h>
#include "internal.h"
#include <asm/irq_regs.h>
struct remote_function_call {
struct task_struct *p;
int (*func)(void *info);
void *info;
int ret;
};
static void remote_function(void *data)
{
struct remote_function_call *tfc = data;
struct task_struct *p = tfc->p;
if (p) {
tfc->ret = -EAGAIN;
if (task_cpu(p) != smp_processor_id() || !task_curr(p))
return;
}
tfc->ret = tfc->func(tfc->info);
}
/**
* task_function_call - call a function on the cpu on which a task runs
* @p: the task to evaluate
* @func: the function to be called
* @info: the function call argument
*
* Calls the function @func when the task is currently running. This might
* be on the current CPU, which just calls the function directly
*
* returns: @func return value, or
* -ESRCH - when the process isn't running
* -EAGAIN - when the process moved away
*/
static int
task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
{
struct remote_function_call data = {
.p = p,
.func = func,
.info = info,
.ret = -ESRCH, /* No such (running) process */
};
if (task_curr(p))
smp_call_function_single(task_cpu(p), remote_function, &data, 1);
return data.ret;
}
/**
* cpu_function_call - call a function on the cpu
* @func: the function to be called
* @info: the function call argument
*
* Calls the function @func on the remote cpu.
*
* returns: @func return value or -ENXIO when the cpu is offline
*/
static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
{
struct remote_function_call data = {
.p = NULL,
.func = func,
.info = info,
.ret = -ENXIO, /* No such CPU */
};
smp_call_function_single(cpu, remote_function, &data, 1);
return data.ret;
}
#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
PERF_FLAG_FD_OUTPUT |\
PERF_FLAG_PID_CGROUP)
enum event_type_t {
EVENT_FLEXIBLE = 0x1,
EVENT_PINNED = 0x2,
EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};
/*
* perf_sched_events : >0 events exist
* perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
*/
struct jump_label_key perf_sched_events __read_mostly;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;
/*
* perf event paranoia level:
* -1 - not paranoid at all
* 0 - disallow raw tracepoint access for unpriv
* 1 - disallow cpu events for unpriv
* 2 - disallow kernel profiling for unpriv
*/
int sysctl_perf_event_paranoid __read_mostly = 1;
/* Minimum for 512 kiB + 1 user control page */
int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
/*
* max perf event sample rate
*/
#define DEFAULT_MAX_SAMPLE_RATE 100000
int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
static int max_samples_per_tick __read_mostly =
DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
int perf_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (ret || !write)
return ret;
max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
return 0;
}
static atomic64_t perf_event_id;
static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
enum event_type_t event_type);
static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
enum event_type_t event_type,
struct task_struct *task);
static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);
void __weak perf_event_print_debug(void) { }
extern __weak const char *perf_pmu_name(void)
{
return "pmu";
}
static inline u64 perf_clock(void)
{
return local_clock();
}
static inline struct perf_cpu_context *
__get_cpu_context(struct perf_event_context *ctx)
{
return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
}
static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
raw_spin_lock(&cpuctx->ctx.lock);
if (ctx)
raw_spin_lock(&ctx->lock);
}
static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
if (ctx)
raw_spin_unlock(&ctx->lock);
raw_spin_unlock(&cpuctx->ctx.lock);
}
#ifdef CONFIG_CGROUP_PERF
/*
* Must ensure cgroup is pinned (css_get) before calling
* this function. In other words, we cannot call this function
* if there is no cgroup event for the current CPU context.
*/
static inline struct perf_cgroup *
perf_cgroup_from_task(struct task_struct *task)
{
return container_of(task_subsys_state(task, perf_subsys_id),
struct perf_cgroup, css);
}
static inline bool
perf_cgroup_match(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
return !event->cgrp || event->cgrp == cpuctx->cgrp;
}
static inline void perf_get_cgroup(struct perf_event *event)
{
css_get(&event->cgrp->css);
}
static inline void perf_put_cgroup(struct perf_event *event)
{
css_put(&event->cgrp->css);
}
static inline void perf_detach_cgroup(struct perf_event *event)
{
perf_put_cgroup(event);
event->cgrp = NULL;
}
static inline int is_cgroup_event(struct perf_event *event)
{
return event->cgrp != NULL;
}
static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
struct perf_cgroup_info *t;
t = per_cpu_ptr(event->cgrp->info, event->cpu);
return t->time;
}
static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
{
struct perf_cgroup_info *info;
u64 now;
now = perf_clock();
info = this_cpu_ptr(cgrp->info);
info->time += now - info->timestamp;
info->timestamp = now;
}
static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
{
struct perf_cgroup *cgrp_out = cpuctx->cgrp;
if (cgrp_out)
__update_cgrp_time(cgrp_out);
}
static inline void update_cgrp_time_from_event(struct perf_event *event)
{
struct perf_cgroup *cgrp;
/*
* ensure we access cgroup data only when needed and
* when we know the cgroup is pinned (css_get)
*/
if (!is_cgroup_event(event))
return;
cgrp = perf_cgroup_from_task(current);
/*
* Do not update time when cgroup is not active
*/
if (cgrp == event->cgrp)
__update_cgrp_time(event->cgrp);
}
static inline void
perf_cgroup_set_timestamp(struct task_struct *task,
struct perf_event_context *ctx)
{
struct perf_cgroup *cgrp;
struct perf_cgroup_info *info;
/*
* ctx->lock held by caller
* ensure we do not access cgroup data
* unless we have the cgroup pinned (css_get)
*/
if (!task || !ctx->nr_cgroups)
return;
cgrp = perf_cgroup_from_task(task);
info = this_cpu_ptr(cgrp->info);
info->timestamp = ctx->timestamp;
}
#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
/*
* reschedule events based on the cgroup constraint of task.
*
* mode SWOUT : schedule out everything
* mode SWIN : schedule in based on cgroup for next
*/
void perf_cgroup_switch(struct task_struct *task, int mode)
{
struct perf_cpu_context *cpuctx;
struct pmu *pmu;
unsigned long flags;
/*
* disable interrupts to avoid geting nr_cgroup
* changes via __perf_event_disable(). Also
* avoids preemption.
*/
local_irq_save(flags);
/*
* we reschedule only in the presence of cgroup
* constrained events.
*/
rcu_read_lock();
list_for_each_entry_rcu(pmu, &pmus, entry) {
cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
/*
* perf_cgroup_events says at least one
* context on this CPU has cgroup events.
*
* ctx->nr_cgroups reports the number of cgroup
* events for a context.
*/
if (cpuctx->ctx.nr_cgroups > 0) {
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_pmu_disable(cpuctx->ctx.pmu);
if (mode & PERF_CGROUP_SWOUT) {
cpu_ctx_sched_out(cpuctx, EVENT_ALL);
/*
* must not be done before ctxswout due
* to event_filter_match() in event_sched_out()
*/
cpuctx->cgrp = NULL;
}
if (mode & PERF_CGROUP_SWIN) {
WARN_ON_ONCE(cpuctx->cgrp);
/* set cgrp before ctxsw in to
* allow event_filter_match() to not
* have to pass task around
*/
cpuctx->cgrp = perf_cgroup_from_task(task);
cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
}
perf_pmu_enable(cpuctx->ctx.pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
}
rcu_read_unlock();
local_irq_restore(flags);
}
static inline void perf_cgroup_sched_out(struct task_struct *task,
struct task_struct *next)
{
struct perf_cgroup *cgrp1;
struct perf_cgroup *cgrp2 = NULL;
/*
* we come here when we know perf_cgroup_events > 0
*/
cgrp1 = perf_cgroup_from_task(task);
/*
* next is NULL when called from perf_event_enable_on_exec()
* that will systematically cause a cgroup_switch()
*/
if (next)
cgrp2 = perf_cgroup_from_task(next);
/*
* only schedule out current cgroup events if we know
* that we are switching to a different cgroup. Otherwise,
* do no touch the cgroup events.
*/
if (cgrp1 != cgrp2)
perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
}
static inline void perf_cgroup_sched_in(struct task_struct *prev,
struct task_struct *task)
{
struct perf_cgroup *cgrp1;
struct perf_cgroup *cgrp2 = NULL;
/*
* we come here when we know perf_cgroup_events > 0
*/
cgrp1 = perf_cgroup_from_task(task);
/* prev can never be NULL */
cgrp2 = perf_cgroup_from_task(prev);
/*
* only need to schedule in cgroup events if we are changing
* cgroup during ctxsw. Cgroup events were not scheduled
* out of ctxsw out if that was not the case.
*/
if (cgrp1 != cgrp2)
perf_cgroup_switch(task, PERF_CGROUP_SWIN);
}
static inline int perf_cgroup_connect(int fd, struct perf_event *event,
struct perf_event_attr *attr,
struct perf_event *group_leader)
{
struct perf_cgroup *cgrp;
struct cgroup_subsys_state *css;
struct file *file;
int ret = 0, fput_needed;
file = fget_light(fd, &fput_needed);
if (!file)
return -EBADF;
css = cgroup_css_from_dir(file, perf_subsys_id);
if (IS_ERR(css)) {
ret = PTR_ERR(css);
goto out;
}
cgrp = container_of(css, struct perf_cgroup, css);
event->cgrp = cgrp;
/* must be done before we fput() the file */
perf_get_cgroup(event);
/*
* all events in a group must monitor
* the same cgroup because a task belongs
* to only one perf cgroup at a time
*/
if (group_leader && group_leader->cgrp != cgrp) {
perf_detach_cgroup(event);
ret = -EINVAL;
}
out:
fput_light(file, fput_needed);
return ret;
}
static inline void
perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
{
struct perf_cgroup_info *t;
t = per_cpu_ptr(event->cgrp->info, event->cpu);
event->shadow_ctx_time = now - t->timestamp;
}
static inline void
perf_cgroup_defer_enabled(struct perf_event *event)
{
/*
* when the current task's perf cgroup does not match
* the event's, we need to remember to call the
* perf_mark_enable() function the first time a task with
* a matching perf cgroup is scheduled in.
*/
if (is_cgroup_event(event) && !perf_cgroup_match(event))
event->cgrp_defer_enabled = 1;
}
static inline void
perf_cgroup_mark_enabled(struct perf_event *event,
struct perf_event_context *ctx)
{
struct perf_event *sub;
u64 tstamp = perf_event_time(event);
if (!event->cgrp_defer_enabled)
return;
event->cgrp_defer_enabled = 0;
event->tstamp_enabled = tstamp - event->total_time_enabled;
list_for_each_entry(sub, &event->sibling_list, group_entry) {
if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
sub->tstamp_enabled = tstamp - sub->total_time_enabled;
sub->cgrp_defer_enabled = 0;
}
}
}
#else /* !CONFIG_CGROUP_PERF */
static inline bool
perf_cgroup_match(struct perf_event *event)
{
return true;
}
static inline void perf_detach_cgroup(struct perf_event *event)
{}
static inline int is_cgroup_event(struct perf_event *event)
{
return 0;
}
static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
{
return 0;
}
static inline void update_cgrp_time_from_event(struct perf_event *event)
{
}
static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
{
}
static inline void perf_cgroup_sched_out(struct task_struct *task,
struct task_struct *next)
{
}
static inline void perf_cgroup_sched_in(struct task_struct *prev,
struct task_struct *task)
{
}
static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
struct perf_event_attr *attr,
struct perf_event *group_leader)
{
return -EINVAL;
}
static inline void
perf_cgroup_set_timestamp(struct task_struct *task,
struct perf_event_context *ctx)
{
}
void
perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
{
}
static inline void
perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
{
}
static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
return 0;
}
static inline void
perf_cgroup_defer_enabled(struct perf_event *event)
{
}
static inline void
perf_cgroup_mark_enabled(struct perf_event *event,
struct perf_event_context *ctx)
{
}
#endif
void perf_pmu_disable(struct pmu *pmu)
{
int *count = this_cpu_ptr(pmu->pmu_disable_count);
if (!(*count)++)
pmu->pmu_disable(pmu);
}
void perf_pmu_enable(struct pmu *pmu)
{
int *count = this_cpu_ptr(pmu->pmu_disable_count);
if (!--(*count))
pmu->pmu_enable(pmu);
}
static DEFINE_PER_CPU(struct list_head, rotation_list);
/*
* perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
* because they're strictly cpu affine and rotate_start is called with IRQs
* disabled, while rotate_context is called from IRQ context.
*/
static void perf_pmu_rotate_start(struct pmu *pmu)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
struct list_head *head = &__get_cpu_var(rotation_list);
WARN_ON(!irqs_disabled());
if (list_empty(&cpuctx->rotation_list))
list_add(&cpuctx->rotation_list, head);
}
static void get_ctx(struct perf_event_context *ctx)
{
WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
}
static void put_ctx(struct perf_event_context *ctx)
{
if (atomic_dec_and_test(&ctx->refcount)) {
if (ctx->parent_ctx)
put_ctx(ctx->parent_ctx);
if (ctx->task)
put_task_struct(ctx->task);
kfree_rcu(ctx, rcu_head);
}
}
static void unclone_ctx(struct perf_event_context *ctx)
{
if (ctx->parent_ctx) {
put_ctx(ctx->parent_ctx);
ctx->parent_ctx = NULL;
}
}
static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
{
/*
* only top level events have the pid namespace they were created in
*/
if (event->parent)
event = event->parent;
return task_tgid_nr_ns(p, event->ns);
}
static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
{
/*
* only top level events have the pid namespace they were created in
*/
if (event->parent)
event = event->parent;
return task_pid_nr_ns(p, event->ns);
}
/*
* If we inherit events we want to return the parent event id
* to userspace.
*/
static u64 primary_event_id(struct perf_event *event)
{
u64 id = event->id;
if (event->parent)
id = event->parent->id;
return id;
}
/*
* Get the perf_event_context for a task and lock it.
* This has to cope with with the fact that until it is locked,
* the context could get moved to another task.
*/
static struct perf_event_context *
perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
{
struct perf_event_context *ctx;
rcu_read_lock();
retry:
ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
if (ctx) {
/*
* If this context is a clone of another, it might
* get swapped for another underneath us by
* perf_event_task_sched_out, though the
* rcu_read_lock() protects us from any context
* getting freed. Lock the context and check if it
* got swapped before we could get the lock, and retry
* if so. If we locked the right context, then it
* can't get swapped on us any more.
*/
raw_spin_lock_irqsave(&ctx->lock, *flags);
if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
raw_spin_unlock_irqrestore(&ctx->lock, *flags);
goto retry;
}
if (!atomic_inc_not_zero(&ctx->refcount)) {
raw_spin_unlock_irqrestore(&ctx->lock, *flags);
ctx = NULL;
}
}
rcu_read_unlock();
return ctx;
}
/*
* Get the context for a task and increment its pin_count so it
* can't get swapped to another task. This also increments its
* reference count so that the context can't get freed.
*/
static struct perf_event_context *
perf_pin_task_context(struct task_struct *task, int ctxn)
{
struct perf_event_context *ctx;
unsigned long flags;
ctx = perf_lock_task_context(task, ctxn, &flags);
if (ctx) {
++ctx->pin_count;
raw_spin_unlock_irqrestore(&ctx->lock, flags);
}
return ctx;
}
static void perf_unpin_context(struct perf_event_context *ctx)
{
unsigned long flags;
raw_spin_lock_irqsave(&ctx->lock, flags);
--ctx->pin_count;
raw_spin_unlock_irqrestore(&ctx->lock, flags);
}
/*
* Update the record of the current time in a context.
*/
static void update_context_time(struct perf_event_context *ctx)
{
u64 now = perf_clock();
ctx->time += now - ctx->timestamp;
ctx->timestamp = now;
}
static u64 perf_event_time(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
if (is_cgroup_event(event))
return perf_cgroup_event_time(event);
return ctx ? ctx->time : 0;
}
/*
* Update the total_time_enabled and total_time_running fields for a event.
* The caller of this function needs to hold the ctx->lock.
*/
static void update_event_times(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
u64 run_end;
if (event->state < PERF_EVENT_STATE_INACTIVE ||
event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
return;
/*
* in cgroup mode, time_enabled represents
* the time the event was enabled AND active
* tasks were in the monitored cgroup. This is
* independent of the activity of the context as
* there may be a mix of cgroup and non-cgroup events.
*
* That is why we treat cgroup events differently
* here.
*/
if (is_cgroup_event(event))
run_end = perf_event_time(event);
else if (ctx->is_active)
run_end = ctx->time;
else
run_end = event->tstamp_stopped;
event->total_time_enabled = run_end - event->tstamp_enabled;
if (event->state == PERF_EVENT_STATE_INACTIVE)
run_end = event->tstamp_stopped;
else
run_end = perf_event_time(event);
event->total_time_running = run_end - event->tstamp_running;
}
/*
* Update total_time_enabled and total_time_running for all events in a group.
*/
static void update_group_times(struct perf_event *leader)
{
struct perf_event *event;
update_event_times(leader);
list_for_each_entry(event, &leader->sibling_list, group_entry)
update_event_times(event);
}
static struct list_head *
ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
{
if (event->attr.pinned)
return &ctx->pinned_groups;
else
return &ctx->flexible_groups;
}
/*
* Add a event from the lists for its context.
* Must be called with ctx->mutex and ctx->lock held.
*/
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
event->attach_state |= PERF_ATTACH_CONTEXT;
/*
* If we're a stand alone event or group leader, we go to the context
* list, group events are kept attached to the group so that
* perf_group_detach can, at all times, locate all siblings.
*/
if (event->group_leader == event) {
struct list_head *list;
if (is_software_event(event))
event->group_flags |= PERF_GROUP_SOFTWARE;
list = ctx_group_list(event, ctx);
list_add_tail(&event->group_entry, list);
}
if (is_cgroup_event(event))
ctx->nr_cgroups++;
list_add_rcu(&event->event_entry, &ctx->event_list);
if (!ctx->nr_events)
perf_pmu_rotate_start(ctx->pmu);
ctx->nr_events++;
if (event->attr.inherit_stat)
ctx->nr_stat++;
}
/*
* Called at perf_event creation and when events are attached/detached from a
* group.
*/
static void perf_event__read_size(struct perf_event *event)
{
int entry = sizeof(u64); /* value */
int size = 0;
int nr = 1;
if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
size += sizeof(u64);
if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
size += sizeof(u64);
if (event->attr.read_format & PERF_FORMAT_ID)
entry += sizeof(u64);
if (event->attr.read_format & PERF_FORMAT_GROUP) {
nr += event->group_leader->nr_siblings;
size += sizeof(u64);
}
size += entry * nr;
event->read_size = size;
}
static void perf_event__header_size(struct perf_event *event)
{
struct perf_sample_data *data;
u64 sample_type = event->attr.sample_type;
u16 size = 0;
perf_event__read_size(event);
if (sample_type & PERF_SAMPLE_IP)
size += sizeof(data->ip);
if (sample_type & PERF_SAMPLE_ADDR)
size += sizeof(data->addr);
if (sample_type & PERF_SAMPLE_PERIOD)
size += sizeof(data->period);
if (sample_type & PERF_SAMPLE_READ)
size += event->read_size;
event->header_size = size;
}
static void perf_event__id_header_size(struct perf_event *event)
{
struct perf_sample_data *data;
u64 sample_type = event->attr.sample_type;
u16 size = 0;
if (sample_type & PERF_SAMPLE_TID)
size += sizeof(data->tid_entry);
if (sample_type & PERF_SAMPLE_TIME)
size += sizeof(data->time);
if (sample_type & PERF_SAMPLE_ID)
size += sizeof(data->id);
if (sample_type & PERF_SAMPLE_STREAM_ID)
size += sizeof(data->stream_id);
if (sample_type & PERF_SAMPLE_CPU)
size += sizeof(data->cpu_entry);
event->id_header_size = size;
}
static void perf_group_attach(struct perf_event *event)
{
struct perf_event *group_leader = event->group_leader, *pos;
/*
* We can have double attach due to group movement in perf_event_open.
*/
if (event->attach_state & PERF_ATTACH_GROUP)
return;
event->attach_state |= PERF_ATTACH_GROUP;
if (group_leader == event)
return;
if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
!is_software_event(event))
group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
list_add_tail(&event->group_entry, &group_leader->sibling_list);
group_leader->nr_siblings++;
perf_event__header_size(group_leader);
list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
perf_event__header_size(pos);
}
/*
* Remove a event from the lists for its context.
* Must be called with ctx->mutex and ctx->lock held.
*/
static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
struct perf_cpu_context *cpuctx;
/*
* We can have double detach due to exit/hot-unplug + close.
*/
if (!(event->attach_state & PERF_ATTACH_CONTEXT))
return;
event->attach_state &= ~PERF_ATTACH_CONTEXT;
if (is_cgroup_event(event)) {
ctx->nr_cgroups--;
cpuctx = __get_cpu_context(ctx);
/*
* if there are no more cgroup events
* then cler cgrp to avoid stale pointer
* in update_cgrp_time_from_cpuctx()
*/
if (!ctx->nr_cgroups)
cpuctx->cgrp = NULL;
}
ctx->nr_events--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
list_del_rcu(&event->event_entry);
if (event->group_leader == event)
list_del_init(&event->group_entry);
update_group_times(event);
/*
* If event was in error state, then keep it
* that way, otherwise bogus counts will be
* returned on read(). The only way to get out
* of error state is by explicit re-enabling
* of the event
*/
if (event->state > PERF_EVENT_STATE_OFF)
event->state = PERF_EVENT_STATE_OFF;
}
static void perf_group_detach(struct perf_event *event)
{
struct perf_event *sibling, *tmp;
struct list_head *list = NULL;
/*
* We can have double detach due to exit/hot-unplug + close.
*/
if (!(event->attach_state & PERF_ATTACH_GROUP))
return;
event->attach_state &= ~PERF_ATTACH_GROUP;
/*
* If this is a sibling, remove it from its group.
*/
if (event->group_leader != event) {
list_del_init(&event->group_entry);
event->group_leader->nr_siblings--;
goto out;
}
if (!list_empty(&event->group_entry))
list = &event->group_entry;
/*
* If this was a group event with sibling events then
* upgrade the siblings to singleton events by adding them
* to whatever list we are on.
*/
list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
if (list)
list_move_tail(&sibling->group_entry, list);
sibling->group_leader = sibling;
/* Inherit group flags from the previous leader */
sibling->group_flags = event->group_flags;
}
out:
perf_event__header_size(event->group_leader);
list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
perf_event__header_size(tmp);
}
static inline int
event_filter_match(struct perf_event *event)
{
return (event->cpu == -1 || event->cpu == smp_processor_id())
&& perf_cgroup_match(event);
}
static void
event_sched_out(struct perf_event *event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
u64 tstamp = perf_event_time(event);
u64 delta;
/*
* An event which could not be activated because of
* filter mismatch still needs to have its timings
* maintained, otherwise bogus information is return
* via read() for time_enabled, time_running:
*/
if (event->state == PERF_EVENT_STATE_INACTIVE
&& !event_filter_match(event)) {
delta = tstamp - event->tstamp_stopped;
event->tstamp_running += delta;
event->tstamp_stopped = tstamp;
}
if (event->state != PERF_EVENT_STATE_ACTIVE)
return;
event->state = PERF_EVENT_STATE_INACTIVE;
if (event->pending_disable) {
event->pending_disable = 0;
event->state = PERF_EVENT_STATE_OFF;
}
event->tstamp_stopped = tstamp;
event->pmu->del(event, 0);
event->oncpu = -1;
if (!is_software_event(event))
cpuctx->active_oncpu--;
ctx->nr_active--;
if (event->attr.exclusive || !cpuctx->active_oncpu)
cpuctx->exclusive = 0;
}
static void
group_sched_out(struct perf_event *group_event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
struct perf_event *event;
int state = group_event->state;
event_sched_out(group_event, cpuctx, ctx);
/*
* Schedule out siblings (if any):
*/
list_for_each_entry(event, &group_event->sibling_list, group_entry)
event_sched_out(event, cpuctx, ctx);
if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
cpuctx->exclusive = 0;
}
/*
* Cross CPU call to remove a performance event
*
* We disable the event on the hardware level first. After that we
* remove it from the context list.
*/
static int __perf_remove_from_context(void *info)
{
struct perf_event *event = info;
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
raw_spin_lock(&ctx->lock);
event_sched_out(event, cpuctx, ctx);
list_del_event(event, ctx);
if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
ctx->is_active = 0;
cpuctx->task_ctx = NULL;
}
raw_spin_unlock(&ctx->lock);
return 0;
}
/*
* Remove the event from a task's (or a CPU's) list of events.
*
* CPU events are removed with a smp call. For task events we only
* call when the task is on a CPU.
*
* If event->ctx is a cloned context, callers must make sure that
* every task struct that event->ctx->task could possibly point to
* remains valid. This is OK when called from perf_release since
* that only calls us on the top-level context, which can't be a clone.
* When called from perf_event_exit_task, it's OK because the
* context has been detached from its task.
*/
static void perf_remove_from_context(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
struct task_struct *task = ctx->task;
lockdep_assert_held(&ctx->mutex);
if (!task) {
/*
* Per cpu events are removed via an smp call and
* the removal is always successful.
*/
cpu_function_call(event->cpu, __perf_remove_from_context, event);
return;
}
retry:
if (!task_function_call(task, __perf_remove_from_context, event))
return;
raw_spin_lock_irq(&ctx->lock);
/*
* If we failed to find a running task, but find the context active now
* that we've acquired the ctx->lock, retry.
*/
if (ctx->is_active) {
raw_spin_unlock_irq(&ctx->lock);
goto retry;
}
/*
* Since the task isn't running, its safe to remove the event, us
* holding the ctx->lock ensures the task won't get scheduled in.
*/
list_del_event(event, ctx);
raw_spin_unlock_irq(&ctx->lock);
}
/*
* Cross CPU call to disable a performance event
*/
static int __perf_event_disable(void *info)
{
struct perf_event *event = info;
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
/*
* If this is a per-task event, need to check whether this
* event's task is the current task on this cpu.
*
* Can trigger due to concurrent perf_event_context_sched_out()
* flipping contexts around.
*/
if (ctx->task && cpuctx->task_ctx != ctx)
return -EINVAL;
raw_spin_lock(&ctx->lock);
/*
* If the event is on, turn it off.
* If it is in error state, leave it in error state.
*/
if (event->state >= PERF_EVENT_STATE_INACTIVE) {
update_context_time(ctx);
update_cgrp_time_from_event(event);
update_group_times(event);
if (event == event->group_leader)
group_sched_out(event, cpuctx, ctx);
else
event_sched_out(event, cpuctx, ctx);
event->state = PERF_EVENT_STATE_OFF;
}
raw_spin_unlock(&ctx->lock);
return 0;
}
/*
|