author     Linus Torvalds <torvalds@linux-foundation.org>   2009-12-05 18:30:21 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2009-12-05 18:30:21 -0500
commit     c3fa27d1367fac63ac8533d6f20ea851d0d70a10 (patch)
tree       e7731554085e22b6b63411b1ebb401079f3e0bbb /kernel
parent     96fa2b508d2d3fe040cf4ef2fffb955f0a537ea1 (diff)
parent     d103d01e4b19f185d3c85f77402b605534c32e89 (diff)
Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (470 commits)
  x86: Fix comments of register/stack access functions
  perf tools: Replace %m with %a in sscanf
  hw-breakpoints: Keep track of user disabled breakpoints
  tracing/syscalls: Make syscall events print callbacks static
  tracing: Add DEFINE_EVENT(), DEFINE_SINGLE_EVENT() support to docbook
  perf: Don't free perf_mmap_data until work has been done
  perf_event: Fix compile error
  perf tools: Fix _GNU_SOURCE macro related strndup() build error
  trace_syscalls: Remove unused syscall_name_to_nr()
  trace_syscalls: Simplify syscall profile
  trace_syscalls: Remove duplicate init_enter_##sname()
  trace_syscalls: Add syscall_nr field to struct syscall_metadata
  trace_syscalls: Remove enter_id exit_id
  trace_syscalls: Set event_enter_##sname->data to its metadata
  trace_syscalls: Remove unused event_syscall_enter and event_syscall_exit
  perf_event: Initialize data.period in perf_swevent_hrtimer()
  perf probe: Simplify event naming
  perf probe: Add --list option for listing current probe events
  perf probe: Add argv_split() from lib/argv_split.c
  perf probe: Move probe event utility functions to probe-event.c
  ...
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile                     |    2
-rw-r--r--  kernel/exit.c                       |    5
-rw-r--r--  kernel/hw_breakpoint.c              |  423
-rw-r--r--  kernel/kallsyms.c                   |    1
-rw-r--r--  kernel/kprobes.c                    |   68
-rw-r--r--  kernel/lockdep.c                    |    2
-rw-r--r--  kernel/notifier.c                   |    2
-rw-r--r--  kernel/perf_event.c                 |  627
-rw-r--r--  kernel/signal.c                     |   27
-rw-r--r--  kernel/trace/Kconfig                |   38
-rw-r--r--  kernel/trace/Makefile               |    2
-rw-r--r--  kernel/trace/ring_buffer.c          |   15
-rw-r--r--  kernel/trace/trace.h                |   38
-rw-r--r--  kernel/trace/trace_entries.h        |   16
-rw-r--r--  kernel/trace/trace_event_profile.c  |   43
-rw-r--r--  kernel/trace/trace_events.c         |  168
-rw-r--r--  kernel/trace/trace_events_filter.c  |  310
-rw-r--r--  kernel/trace/trace_export.c         |   39
-rw-r--r--  kernel/trace/trace_kprobe.c         | 1523
-rw-r--r--  kernel/trace/trace_ksym.c           |  550
-rw-r--r--  kernel/trace/trace_selftest.c       |   55
-rw-r--r--  kernel/trace/trace_syscalls.c       |  195
22 files changed, 3574 insertions, 575 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index dcf6789bf547..982c50e2ce53 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -21,6 +21,7 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
 CFLAGS_REMOVE_rtmutex-debug.o = -pg
 CFLAGS_REMOVE_cgroup-debug.o = -pg
 CFLAGS_REMOVE_sched_clock.o = -pg
+CFLAGS_REMOVE_perf_event.o = -pg
 endif
 
 obj-$(CONFIG_FREEZER) += freezer.o
@@ -97,6 +98,7 @@ obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_SLOW_WORK) += slow-work.o
 obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
 obj-$(CONFIG_PERF_EVENTS) += perf_event.o
+obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/exit.c b/kernel/exit.c
index f7864ac2ecc1..3f45e3cf931d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -49,6 +49,7 @@
 #include <linux/init_task.h>
 #include <linux/perf_event.h>
 #include <trace/events/sched.h>
+#include <linux/hw_breakpoint.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -978,6 +979,10 @@ NORET_TYPE void do_exit(long code)
         proc_exit_connector(tsk);
 
         /*
+         * FIXME: do that only when needed, using sched_exit tracepoint
+         */
+        flush_ptrace_hw_breakpoint(tsk);
+        /*
          * Flush inherited counters to the parent - before the parent
          * gets woken up by child-exit notifications.
          */
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
new file mode 100644
index 000000000000..cf5ee1628411
--- /dev/null
+++ b/kernel/hw_breakpoint.c
@@ -0,0 +1,423 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) IBM Corporation, 2009
+ * Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ * Thanks to Ingo Molnar for his many suggestions.
+ *
+ * Authors: Alan Stern <stern@rowland.harvard.edu>
+ *          K.Prasad <prasad@linux.vnet.ibm.com>
+ *          Frederic Weisbecker <fweisbec@gmail.com>
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ * This file contains the arch-independent routines.
+ */
+
+#include <linux/irqflags.h>
+#include <linux/kallsyms.h>
+#include <linux/notifier.h>
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <linux/hw_breakpoint.h>
+
+/*
+ * Constraints data
+ */
+
+/* Number of pinned cpu breakpoints in a cpu */
+static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned);
+
+/* Number of pinned task breakpoints in a cpu */
+static DEFINE_PER_CPU(unsigned int, task_bp_pinned[HBP_NUM]);
+
+/* Number of non-pinned cpu/task breakpoints in a cpu */
+static DEFINE_PER_CPU(unsigned int, nr_bp_flexible);
+
+/* Gather the number of total pinned and un-pinned bp in a cpuset */
+struct bp_busy_slots {
+        unsigned int pinned;
+        unsigned int flexible;
+};
+
+/* Serialize accesses to the above constraints */
+static DEFINE_MUTEX(nr_bp_mutex);
+
+/*
+ * Report the maximum number of pinned breakpoints a task
+ * have in this cpu
+ */
+static unsigned int max_task_bp_pinned(int cpu)
+{
+        int i;
+        unsigned int *tsk_pinned = per_cpu(task_bp_pinned, cpu);
+
+        for (i = HBP_NUM -1; i >= 0; i--) {
+                if (tsk_pinned[i] > 0)
+                        return i + 1;
+        }
+
+        return 0;
+}
+
+/*
+ * Report the number of pinned/un-pinned breakpoints we have in
+ * a given cpu (cpu > -1) or in all of them (cpu = -1).
+ */
+static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu)
+{
+        if (cpu >= 0) {
+                slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu);
+                slots->pinned += max_task_bp_pinned(cpu);
+                slots->flexible = per_cpu(nr_bp_flexible, cpu);
+
+                return;
+        }
+
+        for_each_online_cpu(cpu) {
+                unsigned int nr;
+
+                nr = per_cpu(nr_cpu_bp_pinned, cpu);
+                nr += max_task_bp_pinned(cpu);
+
+                if (nr > slots->pinned)
+                        slots->pinned = nr;
+
+                nr = per_cpu(nr_bp_flexible, cpu);
+
+                if (nr > slots->flexible)
+                        slots->flexible = nr;
+        }
+}
+
+/*
+ * Add a pinned breakpoint for the given task in our constraint table
+ */
+static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable)
+{
+        int count = 0;
+        struct perf_event *bp;
+        struct perf_event_context *ctx = tsk->perf_event_ctxp;
+        unsigned int *tsk_pinned;
+        struct list_head *list;
+        unsigned long flags;
+
+        if (WARN_ONCE(!ctx, "No perf context for this task"))
+                return;
+
+        list = &ctx->event_list;
+
+        spin_lock_irqsave(&ctx->lock, flags);
+
+        /*
+         * The current breakpoint counter is not included in the list
+         * at the open() callback time
+         */
+        list_for_each_entry(bp, list, event_entry) {
+                if (bp->attr.type == PERF_TYPE_BREAKPOINT)
+                        count++;
+        }
+
+        spin_unlock_irqrestore(&ctx->lock, flags);
+
+        if (WARN_ONCE(count < 0, "No breakpoint counter found in the counter list"))
+                return;
+
+        tsk_pinned = per_cpu(task_bp_pinned, cpu);
+        if (enable) {
+                tsk_pinned[count]++;
+                if (count > 0)
+                        tsk_pinned[count-1]--;
+        } else {
+                tsk_pinned[count]--;
+                if (count > 0)
+                        tsk_pinned[count-1]++;
+        }
+}
+
+/*
+ * Add/remove the given breakpoint in our constraint table
+ */
+static void toggle_bp_slot(struct perf_event *bp, bool enable)
+{
+        int cpu = bp->cpu;
+        struct task_struct *tsk = bp->ctx->task;
+
+        /* Pinned counter task profiling */
+        if (tsk) {
+                if (cpu >= 0) {
+                        toggle_bp_task_slot(tsk, cpu, enable);
+                        return;
+                }
+
+                for_each_online_cpu(cpu)
+                        toggle_bp_task_slot(tsk, cpu, enable);
+                return;
+        }
+
+        /* Pinned counter cpu profiling */
+        if (enable)
+                per_cpu(nr_cpu_bp_pinned, bp->cpu)++;
+        else
+                per_cpu(nr_cpu_bp_pinned, bp->cpu)--;
+}
+
+/*
+ * Contraints to check before allowing this new breakpoint counter:
+ *
+ *  == Non-pinned counter == (Considered as pinned for now)
+ *
+ *   - If attached to a single cpu, check:
+ *
+ *       (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu)
+ *           + max(per_cpu(task_bp_pinned, cpu)))) < HBP_NUM
+ *
+ *       -> If there are already non-pinned counters in this cpu, it means
+ *          there is already a free slot for them.
+ *          Otherwise, we check that the maximum number of per task
+ *          breakpoints (for this cpu) plus the number of per cpu breakpoint
+ *          (for this cpu) doesn't cover every registers.
+ *
+ *   - If attached to every cpus, check:
+ *
+ *       (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *))
+ *           + max(per_cpu(task_bp_pinned, *)))) < HBP_NUM
+ *
+ *       -> This is roughly the same, except we check the number of per cpu
+ *          bp for every cpu and we keep the max one. Same for the per tasks
+ *          breakpoints.
+ *
+ *
+ * == Pinned counter ==
+ *
+ *   - If attached to a single cpu, check:
+ *
+ *       ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu)
+ *            + max(per_cpu(task_bp_pinned, cpu))) < HBP_NUM
+ *
+ *       -> Same checks as before. But now the nr_bp_flexible, if any, must keep
+ *          one register at least (or they will never be fed).
+ *
+ *   - If attached to every cpus, check:
+ *
+ *       ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *))
+ *            + max(per_cpu(task_bp_pinned, *))) < HBP_NUM
+ */
+int reserve_bp_slot(struct perf_event *bp)
+{
+        struct bp_busy_slots slots = {0};
+        int ret = 0;
+
+        mutex_lock(&nr_bp_mutex);
+
+        fetch_bp_busy_slots(&slots, bp->cpu);
+
+        /* Flexible counters need to keep at least one slot */
+        if (slots.pinned + (!!slots.flexible) == HBP_NUM) {
+                ret = -ENOSPC;
+                goto end;
+        }
+
+        toggle_bp_slot(bp, true);
+
+end:
+        mutex_unlock(&nr_bp_mutex);
+
+        return ret;
+}
+
+void release_bp_slot(struct perf_event *bp)
+{
+        mutex_lock(&nr_bp_mutex);
+
+        toggle_bp_slot(bp, false);
+
+        mutex_unlock(&nr_bp_mutex);
+}
+
+
+int __register_perf_hw_breakpoint(struct perf_event *bp)
+{
+        int ret;
+
+        ret = reserve_bp_slot(bp);
+        if (ret)
+                return ret;
+
+        /*
+         * Ptrace breakpoints can be temporary perf events only
+         * meant to reserve a slot. In this case, it is created disabled and
+         * we don't want to check the params right now (as we put a null addr)
+         * But perf tools create events as disabled and we want to check
+         * the params for them.
+         * This is a quick hack that will be removed soon, once we remove
+         * the tmp breakpoints from ptrace
+         */
+        if (!bp->attr.disabled || bp->callback == perf_bp_event)
+                ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
+
+        return ret;
+}
+
+int register_perf_hw_breakpoint(struct perf_event *bp)
+{
+        bp->callback = perf_bp_event;
+
+        return __register_perf_hw_breakpoint(bp);
+}
+
+/**
+ * register_user_hw_breakpoint - register a hardware breakpoint for user space
+ * @attr: breakpoint attributes
+ * @triggered: callback to trigger when we hit the breakpoint
+ * @tsk: pointer to 'task_struct' of the process to which the address belongs
+ */
+struct perf_event *
+register_user_hw_breakpoint(struct perf_event_attr *attr,
+                            perf_callback_t triggered,
+                            struct task_struct *tsk)
+{
+        return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered);
+}
+EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
+
+/**
+ * modify_user_hw_breakpoint - modify a user-space hardware breakpoint
+ * @bp: the breakpoint structure to modify
+ * @attr: new breakpoint attributes
+ * @triggered: callback to trigger when we hit the breakpoint
+ * @tsk: pointer to 'task_struct' of the process to which the address belongs
+ */
+struct perf_event *
+modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr,
+                          perf_callback_t triggered,
+                          struct task_struct *tsk)
+{
+        /*
+         * FIXME: do it without unregistering
+         * - We don't want to lose our slot
+         * - If the new bp is incorrect, don't lose the older one
+         */
+        unregister_hw_breakpoint(bp);
+
+        return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered);
+}
+EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
+
+/**
+ * unregister_hw_breakpoint - unregister a user-space hardware breakpoint
+ * @bp: the breakpoint structure to unregister
+ */
+void unregister_hw_breakpoint(struct perf_event *bp)
+{
+        if (!bp)
+                return;
+        perf_event_release_kernel(bp);
+}
+EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
+
+/**
+ * register_wide_hw_breakpoint - register a wide breakpoint in the kernel
+ * @attr: breakpoint attributes
+ * @triggered: callback to trigger when we hit the breakpoint
+ *
+ * @return a set of per_cpu pointers to perf events
+ */
+struct perf_event **
+register_wide_hw_breakpoint(struct perf_event_attr *attr,
+                            perf_callback_t triggered)
+{
+        struct perf_event **cpu_events, **pevent, *bp;
+        long err;
+        int cpu;
+
+        cpu_events = alloc_percpu(typeof(*cpu_events));
+        if (!cpu_events)
+                return ERR_PTR(-ENOMEM);
+
+        for_each_possible_cpu(cpu) {
+                pevent = per_cpu_ptr(cpu_events, cpu);
+                bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered);
+
+                *pevent = bp;
+
+                if (IS_ERR(bp)) {
+                        err = PTR_ERR(bp);
+                        goto fail;
+                }
+        }
+
+        return cpu_events;
+
+fail:
+        for_each_possible_cpu(cpu) {
+                pevent = per_cpu_ptr(cpu_events, cpu);
+                if (IS_ERR(*pevent))
+                        break;
+                unregister_hw_breakpoint(*pevent);
+        }
+        free_percpu(cpu_events);
+        /* return the error if any */
+        return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
+
+/**
+ * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel
+ * @cpu_events: the per cpu set of events to unregister
+ */
+void unregister_wide_hw_breakpoint(struct perf_event **cpu_events)
+{
+        int cpu;
+        struct perf_event **pevent;
+
+        for_each_possible_cpu(cpu) {
+                pevent = per_cpu_ptr(cpu_events, cpu);
+                unregister_hw_breakpoint(*pevent);
+        }
+        free_percpu(cpu_events);
+}
+EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
+
+static struct notifier_block hw_breakpoint_exceptions_nb = {
+        .notifier_call = hw_breakpoint_exceptions_notify,
+        /* we need to be notified first */
+        .priority = 0x7fffffff
+};
+
+static int __init init_hw_breakpoint(void)
+{
+        return register_die_notifier(&hw_breakpoint_exceptions_nb);
+}
+core_initcall(init_hw_breakpoint);
+
+
+struct pmu perf_ops_bp = {
+        .enable         = arch_install_hw_breakpoint,
+        .disable        = arch_uninstall_hw_breakpoint,
+        .read           = hw_breakpoint_pmu_read,
+        .unthrottle     = hw_breakpoint_pmu_unthrottle
+};
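The new file above is the arch-independent half of the unified breakpoint layer. As a rough, hypothetical illustration only (not part of this commit), a kernel module might consume the wide-breakpoint API roughly as follows; the perf_event_attr breakpoint fields (bp_addr, bp_len, bp_type), the HW_BREAKPOINT_* constants and the exact perf_callback_t signature are assumptions taken from elsewhere in this series, not from the hunk above:

/* Hypothetical sketch: a write breakpoint on a kernel symbol, on every CPU. */
#include <linux/module.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
#include <linux/kallsyms.h>

static struct perf_event **wp;

/* assumed callback signature, matching perf_bp_event() later in this merge */
static void wp_triggered(struct perf_event *bp, void *data)
{
        pr_info("hw_breakpoint: watched symbol was written\n");
}

static int __init wp_init(void)
{
        struct perf_event_attr attr = {
                .type           = PERF_TYPE_BREAKPOINT,
                .size           = sizeof(attr),
                .bp_addr        = kallsyms_lookup_name("jiffies"), /* example target */
                .bp_len         = HW_BREAKPOINT_LEN_4,
                .bp_type        = HW_BREAKPOINT_W,
                .sample_period  = 1,
        };

        wp = register_wide_hw_breakpoint(&attr, wp_triggered);
        return IS_ERR(wp) ? PTR_ERR(wp) : 0;
}

static void __exit wp_exit(void)
{
        unregister_wide_hw_breakpoint(wp);
}

module_init(wp_init);
module_exit(wp_exit);
MODULE_LICENSE("GPL");

This per-CPU register/unregister pattern is roughly what the ksym tracer added elsewhere in this merge does internally.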
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 8b6b8b697c68..8e5288a8a355 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -181,6 +181,7 @@ unsigned long kallsyms_lookup_name(const char *name)
         }
         return module_kallsyms_lookup_name(name);
 }
+EXPORT_SYMBOL_GPL(kallsyms_lookup_name);
 
 int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
                                       unsigned long),
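This one-line export lets modular code elsewhere in this merge resolve kernel symbol addresses at runtime (for example to aim a hardware breakpoint at a named variable). A minimal, hypothetical use from module code:

#include <linux/kallsyms.h>

/* Illustration only: returns the address of a kernel symbol, or 0 if unknown. */
static unsigned long resolve_symbol(const char *name)
{
        return kallsyms_lookup_name(name);
}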
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1494e85b35f2..e5342a344c43 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -90,6 +90,9 @@ static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
  */
 static struct kprobe_blackpoint kprobe_blacklist[] = {
         {"preempt_schedule",},
+        {"native_get_debugreg",},
+        {"irq_entries_start",},
+        {"common_interrupt",},
         {NULL}    /* Terminator */
 };
 
@@ -673,6 +676,40 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
         return (kprobe_opcode_t *)(((char *)addr) + p->offset);
 }
 
+/* Check passed kprobe is valid and return kprobe in kprobe_table. */
+static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
+{
+        struct kprobe *old_p, *list_p;
+
+        old_p = get_kprobe(p->addr);
+        if (unlikely(!old_p))
+                return NULL;
+
+        if (p != old_p) {
+                list_for_each_entry_rcu(list_p, &old_p->list, list)
+                        if (list_p == p)
+                        /* kprobe p is a valid probe */
+                                goto valid;
+                return NULL;
+        }
+valid:
+        return old_p;
+}
+
+/* Return error if the kprobe is being re-registered */
+static inline int check_kprobe_rereg(struct kprobe *p)
+{
+        int ret = 0;
+        struct kprobe *old_p;
+
+        mutex_lock(&kprobe_mutex);
+        old_p = __get_valid_kprobe(p);
+        if (old_p)
+                ret = -EINVAL;
+        mutex_unlock(&kprobe_mutex);
+        return ret;
+}
+
 int __kprobes register_kprobe(struct kprobe *p)
 {
         int ret = 0;
@@ -685,6 +722,10 @@ int __kprobes register_kprobe(struct kprobe *p)
                 return -EINVAL;
         p->addr = addr;
 
+        ret = check_kprobe_rereg(p);
+        if (ret)
+                return ret;
+
         preempt_disable();
         if (!kernel_text_address((unsigned long) p->addr) ||
             in_kprobes_functions((unsigned long) p->addr)) {
@@ -754,26 +795,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(register_kprobe);
 
-/* Check passed kprobe is valid and return kprobe in kprobe_table. */
-static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
-{
-        struct kprobe *old_p, *list_p;
-
-        old_p = get_kprobe(p->addr);
-        if (unlikely(!old_p))
-                return NULL;
-
-        if (p != old_p) {
-                list_for_each_entry_rcu(list_p, &old_p->list, list)
-                        if (list_p == p)
-                        /* kprobe p is a valid probe */
-                                goto valid;
-                return NULL;
-        }
-valid:
-        return old_p;
-}
-
 /*
  * Unregister a kprobe without a scheduler synchronization.
  */
@@ -1141,6 +1162,13 @@ static void __kprobes kill_kprobe(struct kprobe *p)
         arch_remove_kprobe(p);
 }
 
+void __kprobes dump_kprobe(struct kprobe *kp)
+{
+        printk(KERN_WARNING "Dumping kprobe:\n");
+        printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n",
+               kp->symbol_name, kp->addr, kp->offset);
+}
+
 /* Module notifier call back, checking kprobes on the module */
 static int __kprobes kprobes_module_callback(struct notifier_block *nb,
                                              unsigned long val, void *data)
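With check_kprobe_rereg() wired into register_kprobe() above, registering the same kprobe object twice now fails cleanly with -EINVAL instead of silently re-adding the probe. A hypothetical sketch (not from the commit) of what a module would now observe:

#include <linux/module.h>
#include <linux/kprobes.h>

static int __kprobes demo_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
        return 0;       /* just observe, let execution continue */
}

static struct kprobe kp = {
        .symbol_name    = "do_fork",    /* example probe point */
        .pre_handler    = demo_pre_handler,
};

static int __init kp_demo_init(void)
{
        int ret = register_kprobe(&kp);         /* first registration succeeds */

        if (!ret && register_kprobe(&kp) == -EINVAL)
                pr_info("kprobe re-registration correctly refused\n");

        return ret;
}

module_init(kp_demo_init);
MODULE_LICENSE("GPL");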
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 9af56723c096..f5dcd36d3151 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -49,7 +49,7 @@
 #include "lockdep_internals.h"
 
 #define CREATE_TRACE_POINTS
-#include <trace/events/lockdep.h>
+#include <trace/events/lock.h>
 
 #ifdef CONFIG_PROVE_LOCKING
 int prove_locking = 1;
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 61d5aa5eced3..acd24e7643eb 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -558,7 +558,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
 
 static ATOMIC_NOTIFIER_HEAD(die_chain);
 
-int notrace notify_die(enum die_val val, const char *str,
+int notrace __kprobes notify_die(enum die_val val, const char *str,
                struct pt_regs *regs, long err, int trap, int sig)
 {
         struct die_args args = {
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 7f29643c8985..6b7ddba1dd64 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -28,6 +28,8 @@
28#include <linux/anon_inodes.h> 28#include <linux/anon_inodes.h>
29#include <linux/kernel_stat.h> 29#include <linux/kernel_stat.h>
30#include <linux/perf_event.h> 30#include <linux/perf_event.h>
31#include <linux/ftrace_event.h>
32#include <linux/hw_breakpoint.h>
31 33
32#include <asm/irq_regs.h> 34#include <asm/irq_regs.h>
33 35
@@ -244,6 +246,49 @@ static void perf_unpin_context(struct perf_event_context *ctx)
244 put_ctx(ctx); 246 put_ctx(ctx);
245} 247}
246 248
249static inline u64 perf_clock(void)
250{
251 return cpu_clock(smp_processor_id());
252}
253
254/*
255 * Update the record of the current time in a context.
256 */
257static void update_context_time(struct perf_event_context *ctx)
258{
259 u64 now = perf_clock();
260
261 ctx->time += now - ctx->timestamp;
262 ctx->timestamp = now;
263}
264
265/*
266 * Update the total_time_enabled and total_time_running fields for a event.
267 */
268static void update_event_times(struct perf_event *event)
269{
270 struct perf_event_context *ctx = event->ctx;
271 u64 run_end;
272
273 if (event->state < PERF_EVENT_STATE_INACTIVE ||
274 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
275 return;
276
277 if (ctx->is_active)
278 run_end = ctx->time;
279 else
280 run_end = event->tstamp_stopped;
281
282 event->total_time_enabled = run_end - event->tstamp_enabled;
283
284 if (event->state == PERF_EVENT_STATE_INACTIVE)
285 run_end = event->tstamp_stopped;
286 else
287 run_end = ctx->time;
288
289 event->total_time_running = run_end - event->tstamp_running;
290}
291
247/* 292/*
248 * Add a event from the lists for its context. 293 * Add a event from the lists for its context.
249 * Must be called with ctx->mutex and ctx->lock held. 294 * Must be called with ctx->mutex and ctx->lock held.
@@ -292,6 +337,18 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
292 if (event->group_leader != event) 337 if (event->group_leader != event)
293 event->group_leader->nr_siblings--; 338 event->group_leader->nr_siblings--;
294 339
340 update_event_times(event);
341
342 /*
343 * If event was in error state, then keep it
344 * that way, otherwise bogus counts will be
345 * returned on read(). The only way to get out
346 * of error state is by explicit re-enabling
347 * of the event
348 */
349 if (event->state > PERF_EVENT_STATE_OFF)
350 event->state = PERF_EVENT_STATE_OFF;
351
295 /* 352 /*
296 * If this was a group event with sibling events then 353 * If this was a group event with sibling events then
297 * upgrade the siblings to singleton events by adding them 354 * upgrade the siblings to singleton events by adding them
@@ -445,50 +502,11 @@ retry:
445 * can remove the event safely, if the call above did not 502 * can remove the event safely, if the call above did not
446 * succeed. 503 * succeed.
447 */ 504 */
448 if (!list_empty(&event->group_entry)) { 505 if (!list_empty(&event->group_entry))
449 list_del_event(event, ctx); 506 list_del_event(event, ctx);
450 }
451 spin_unlock_irq(&ctx->lock); 507 spin_unlock_irq(&ctx->lock);
452} 508}
453 509
454static inline u64 perf_clock(void)
455{
456 return cpu_clock(smp_processor_id());
457}
458
459/*
460 * Update the record of the current time in a context.
461 */
462static void update_context_time(struct perf_event_context *ctx)
463{
464 u64 now = perf_clock();
465
466 ctx->time += now - ctx->timestamp;
467 ctx->timestamp = now;
468}
469
470/*
471 * Update the total_time_enabled and total_time_running fields for a event.
472 */
473static void update_event_times(struct perf_event *event)
474{
475 struct perf_event_context *ctx = event->ctx;
476 u64 run_end;
477
478 if (event->state < PERF_EVENT_STATE_INACTIVE ||
479 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
480 return;
481
482 event->total_time_enabled = ctx->time - event->tstamp_enabled;
483
484 if (event->state == PERF_EVENT_STATE_INACTIVE)
485 run_end = event->tstamp_stopped;
486 else
487 run_end = ctx->time;
488
489 event->total_time_running = run_end - event->tstamp_running;
490}
491
492/* 510/*
493 * Update total_time_enabled and total_time_running for all events in a group. 511 * Update total_time_enabled and total_time_running for all events in a group.
494 */ 512 */
@@ -1031,10 +1049,10 @@ void __perf_event_sched_out(struct perf_event_context *ctx,
1031 update_context_time(ctx); 1049 update_context_time(ctx);
1032 1050
1033 perf_disable(); 1051 perf_disable();
1034 if (ctx->nr_active) 1052 if (ctx->nr_active) {
1035 list_for_each_entry(event, &ctx->group_list, group_entry) 1053 list_for_each_entry(event, &ctx->group_list, group_entry)
1036 group_sched_out(event, cpuctx, ctx); 1054 group_sched_out(event, cpuctx, ctx);
1037 1055 }
1038 perf_enable(); 1056 perf_enable();
1039 out: 1057 out:
1040 spin_unlock(&ctx->lock); 1058 spin_unlock(&ctx->lock);
@@ -1059,8 +1077,6 @@ static int context_equiv(struct perf_event_context *ctx1,
1059 && !ctx1->pin_count && !ctx2->pin_count; 1077 && !ctx1->pin_count && !ctx2->pin_count;
1060} 1078}
1061 1079
1062static void __perf_event_read(void *event);
1063
1064static void __perf_event_sync_stat(struct perf_event *event, 1080static void __perf_event_sync_stat(struct perf_event *event,
1065 struct perf_event *next_event) 1081 struct perf_event *next_event)
1066{ 1082{
@@ -1078,8 +1094,8 @@ static void __perf_event_sync_stat(struct perf_event *event,
1078 */ 1094 */
1079 switch (event->state) { 1095 switch (event->state) {
1080 case PERF_EVENT_STATE_ACTIVE: 1096 case PERF_EVENT_STATE_ACTIVE:
1081 __perf_event_read(event); 1097 event->pmu->read(event);
1082 break; 1098 /* fall-through */
1083 1099
1084 case PERF_EVENT_STATE_INACTIVE: 1100 case PERF_EVENT_STATE_INACTIVE:
1085 update_event_times(event); 1101 update_event_times(event);
@@ -1118,6 +1134,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1118 if (!ctx->nr_stat) 1134 if (!ctx->nr_stat)
1119 return; 1135 return;
1120 1136
1137 update_context_time(ctx);
1138
1121 event = list_first_entry(&ctx->event_list, 1139 event = list_first_entry(&ctx->event_list,
1122 struct perf_event, event_entry); 1140 struct perf_event, event_entry);
1123 1141
@@ -1161,8 +1179,6 @@ void perf_event_task_sched_out(struct task_struct *task,
1161 if (likely(!ctx || !cpuctx->task_ctx)) 1179 if (likely(!ctx || !cpuctx->task_ctx))
1162 return; 1180 return;
1163 1181
1164 update_context_time(ctx);
1165
1166 rcu_read_lock(); 1182 rcu_read_lock();
1167 parent = rcu_dereference(ctx->parent_ctx); 1183 parent = rcu_dereference(ctx->parent_ctx);
1168 next_ctx = next->perf_event_ctxp; 1184 next_ctx = next->perf_event_ctxp;
@@ -1515,7 +1531,6 @@ static void __perf_event_read(void *info)
1515 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1531 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1516 struct perf_event *event = info; 1532 struct perf_event *event = info;
1517 struct perf_event_context *ctx = event->ctx; 1533 struct perf_event_context *ctx = event->ctx;
1518 unsigned long flags;
1519 1534
1520 /* 1535 /*
1521 * If this is a task context, we need to check whether it is 1536 * If this is a task context, we need to check whether it is
@@ -1527,12 +1542,12 @@ static void __perf_event_read(void *info)
1527 if (ctx->task && cpuctx->task_ctx != ctx) 1542 if (ctx->task && cpuctx->task_ctx != ctx)
1528 return; 1543 return;
1529 1544
1530 local_irq_save(flags); 1545 spin_lock(&ctx->lock);
1531 if (ctx->is_active) 1546 update_context_time(ctx);
1532 update_context_time(ctx);
1533 event->pmu->read(event);
1534 update_event_times(event); 1547 update_event_times(event);
1535 local_irq_restore(flags); 1548 spin_unlock(&ctx->lock);
1549
1550 event->pmu->read(event);
1536} 1551}
1537 1552
1538static u64 perf_event_read(struct perf_event *event) 1553static u64 perf_event_read(struct perf_event *event)
@@ -1545,7 +1560,13 @@ static u64 perf_event_read(struct perf_event *event)
1545 smp_call_function_single(event->oncpu, 1560 smp_call_function_single(event->oncpu,
1546 __perf_event_read, event, 1); 1561 __perf_event_read, event, 1);
1547 } else if (event->state == PERF_EVENT_STATE_INACTIVE) { 1562 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
1563 struct perf_event_context *ctx = event->ctx;
1564 unsigned long flags;
1565
1566 spin_lock_irqsave(&ctx->lock, flags);
1567 update_context_time(ctx);
1548 update_event_times(event); 1568 update_event_times(event);
1569 spin_unlock_irqrestore(&ctx->lock, flags);
1549 } 1570 }
1550 1571
1551 return atomic64_read(&event->count); 1572 return atomic64_read(&event->count);
@@ -1658,6 +1679,8 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1658 return ERR_PTR(err); 1679 return ERR_PTR(err);
1659} 1680}
1660 1681
1682static void perf_event_free_filter(struct perf_event *event);
1683
1661static void free_event_rcu(struct rcu_head *head) 1684static void free_event_rcu(struct rcu_head *head)
1662{ 1685{
1663 struct perf_event *event; 1686 struct perf_event *event;
@@ -1665,6 +1688,7 @@ static void free_event_rcu(struct rcu_head *head)
1665 event = container_of(head, struct perf_event, rcu_head); 1688 event = container_of(head, struct perf_event, rcu_head);
1666 if (event->ns) 1689 if (event->ns)
1667 put_pid_ns(event->ns); 1690 put_pid_ns(event->ns);
1691 perf_event_free_filter(event);
1668 kfree(event); 1692 kfree(event);
1669} 1693}
1670 1694
@@ -1696,16 +1720,10 @@ static void free_event(struct perf_event *event)
1696 call_rcu(&event->rcu_head, free_event_rcu); 1720 call_rcu(&event->rcu_head, free_event_rcu);
1697} 1721}
1698 1722
1699/* 1723int perf_event_release_kernel(struct perf_event *event)
1700 * Called when the last reference to the file is gone.
1701 */
1702static int perf_release(struct inode *inode, struct file *file)
1703{ 1724{
1704 struct perf_event *event = file->private_data;
1705 struct perf_event_context *ctx = event->ctx; 1725 struct perf_event_context *ctx = event->ctx;
1706 1726
1707 file->private_data = NULL;
1708
1709 WARN_ON_ONCE(ctx->parent_ctx); 1727 WARN_ON_ONCE(ctx->parent_ctx);
1710 mutex_lock(&ctx->mutex); 1728 mutex_lock(&ctx->mutex);
1711 perf_event_remove_from_context(event); 1729 perf_event_remove_from_context(event);
@@ -1720,6 +1738,19 @@ static int perf_release(struct inode *inode, struct file *file)
1720 1738
1721 return 0; 1739 return 0;
1722} 1740}
1741EXPORT_SYMBOL_GPL(perf_event_release_kernel);
1742
1743/*
1744 * Called when the last reference to the file is gone.
1745 */
1746static int perf_release(struct inode *inode, struct file *file)
1747{
1748 struct perf_event *event = file->private_data;
1749
1750 file->private_data = NULL;
1751
1752 return perf_event_release_kernel(event);
1753}
1723 1754
1724static int perf_event_read_size(struct perf_event *event) 1755static int perf_event_read_size(struct perf_event *event)
1725{ 1756{
@@ -1746,91 +1777,94 @@ static int perf_event_read_size(struct perf_event *event)
1746 return size; 1777 return size;
1747} 1778}
1748 1779
1749static u64 perf_event_read_value(struct perf_event *event) 1780u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
1750{ 1781{
1751 struct perf_event *child; 1782 struct perf_event *child;
1752 u64 total = 0; 1783 u64 total = 0;
1753 1784
1785 *enabled = 0;
1786 *running = 0;
1787
1788 mutex_lock(&event->child_mutex);
1754 total += perf_event_read(event); 1789 total += perf_event_read(event);
1755 list_for_each_entry(child, &event->child_list, child_list) 1790 *enabled += event->total_time_enabled +
1791 atomic64_read(&event->child_total_time_enabled);
1792 *running += event->total_time_running +
1793 atomic64_read(&event->child_total_time_running);
1794
1795 list_for_each_entry(child, &event->child_list, child_list) {
1756 total += perf_event_read(child); 1796 total += perf_event_read(child);
1797 *enabled += child->total_time_enabled;
1798 *running += child->total_time_running;
1799 }
1800 mutex_unlock(&event->child_mutex);
1757 1801
1758 return total; 1802 return total;
1759} 1803}
1760 1804EXPORT_SYMBOL_GPL(perf_event_read_value);
1761static int perf_event_read_entry(struct perf_event *event,
1762 u64 read_format, char __user *buf)
1763{
1764 int n = 0, count = 0;
1765 u64 values[2];
1766
1767 values[n++] = perf_event_read_value(event);
1768 if (read_format & PERF_FORMAT_ID)
1769 values[n++] = primary_event_id(event);
1770
1771 count = n * sizeof(u64);
1772
1773 if (copy_to_user(buf, values, count))
1774 return -EFAULT;
1775
1776 return count;
1777}
1778 1805
1779static int perf_event_read_group(struct perf_event *event, 1806static int perf_event_read_group(struct perf_event *event,
1780 u64 read_format, char __user *buf) 1807 u64 read_format, char __user *buf)
1781{ 1808{
1782 struct perf_event *leader = event->group_leader, *sub; 1809 struct perf_event *leader = event->group_leader, *sub;
1783 int n = 0, size = 0, err = -EFAULT; 1810 int n = 0, size = 0, ret = -EFAULT;
1784 u64 values[3]; 1811 struct perf_event_context *ctx = leader->ctx;
1812 u64 values[5];
1813 u64 count, enabled, running;
1814
1815 mutex_lock(&ctx->mutex);
1816 count = perf_event_read_value(leader, &enabled, &running);
1785 1817
1786 values[n++] = 1 + leader->nr_siblings; 1818 values[n++] = 1 + leader->nr_siblings;
1787 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 1819 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1788 values[n++] = leader->total_time_enabled + 1820 values[n++] = enabled;
1789 atomic64_read(&leader->child_total_time_enabled); 1821 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1790 } 1822 values[n++] = running;
1791 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { 1823 values[n++] = count;
1792 values[n++] = leader->total_time_running + 1824 if (read_format & PERF_FORMAT_ID)
1793 atomic64_read(&leader->child_total_time_running); 1825 values[n++] = primary_event_id(leader);
1794 }
1795 1826
1796 size = n * sizeof(u64); 1827 size = n * sizeof(u64);
1797 1828
1798 if (copy_to_user(buf, values, size)) 1829 if (copy_to_user(buf, values, size))
1799 return -EFAULT; 1830 goto unlock;
1800
1801 err = perf_event_read_entry(leader, read_format, buf + size);
1802 if (err < 0)
1803 return err;
1804 1831
1805 size += err; 1832 ret = size;
1806 1833
1807 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 1834 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1808 err = perf_event_read_entry(sub, read_format, 1835 n = 0;
1809 buf + size); 1836
1810 if (err < 0) 1837 values[n++] = perf_event_read_value(sub, &enabled, &running);
1811 return err; 1838 if (read_format & PERF_FORMAT_ID)
1839 values[n++] = primary_event_id(sub);
1840
1841 size = n * sizeof(u64);
1812 1842
1813 size += err; 1843 if (copy_to_user(buf + ret, values, size)) {
1844 ret = -EFAULT;
1845 goto unlock;
1846 }
1847
1848 ret += size;
1814 } 1849 }
1850unlock:
1851 mutex_unlock(&ctx->mutex);
1815 1852
1816 return size; 1853 return ret;
1817} 1854}
1818 1855
1819static int perf_event_read_one(struct perf_event *event, 1856static int perf_event_read_one(struct perf_event *event,
1820 u64 read_format, char __user *buf) 1857 u64 read_format, char __user *buf)
1821{ 1858{
1859 u64 enabled, running;
1822 u64 values[4]; 1860 u64 values[4];
1823 int n = 0; 1861 int n = 0;
1824 1862
1825 values[n++] = perf_event_read_value(event); 1863 values[n++] = perf_event_read_value(event, &enabled, &running);
1826 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 1864 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1827 values[n++] = event->total_time_enabled + 1865 values[n++] = enabled;
1828 atomic64_read(&event->child_total_time_enabled); 1866 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1829 } 1867 values[n++] = running;
1830 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1831 values[n++] = event->total_time_running +
1832 atomic64_read(&event->child_total_time_running);
1833 }
1834 if (read_format & PERF_FORMAT_ID) 1868 if (read_format & PERF_FORMAT_ID)
1835 values[n++] = primary_event_id(event); 1869 values[n++] = primary_event_id(event);
1836 1870
@@ -1861,12 +1895,10 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
1861 return -ENOSPC; 1895 return -ENOSPC;
1862 1896
1863 WARN_ON_ONCE(event->ctx->parent_ctx); 1897 WARN_ON_ONCE(event->ctx->parent_ctx);
1864 mutex_lock(&event->child_mutex);
1865 if (read_format & PERF_FORMAT_GROUP) 1898 if (read_format & PERF_FORMAT_GROUP)
1866 ret = perf_event_read_group(event, read_format, buf); 1899 ret = perf_event_read_group(event, read_format, buf);
1867 else 1900 else
1868 ret = perf_event_read_one(event, read_format, buf); 1901 ret = perf_event_read_one(event, read_format, buf);
1869 mutex_unlock(&event->child_mutex);
1870 1902
1871 return ret; 1903 return ret;
1872} 1904}
@@ -1974,7 +2006,8 @@ unlock:
1974 return ret; 2006 return ret;
1975} 2007}
1976 2008
1977int perf_event_set_output(struct perf_event *event, int output_fd); 2009static int perf_event_set_output(struct perf_event *event, int output_fd);
2010static int perf_event_set_filter(struct perf_event *event, void __user *arg);
1978 2011
1979static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 2012static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1980{ 2013{
@@ -2002,6 +2035,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2002 case PERF_EVENT_IOC_SET_OUTPUT: 2035 case PERF_EVENT_IOC_SET_OUTPUT:
2003 return perf_event_set_output(event, arg); 2036 return perf_event_set_output(event, arg);
2004 2037
2038 case PERF_EVENT_IOC_SET_FILTER:
2039 return perf_event_set_filter(event, (void __user *)arg);
2040
2005 default: 2041 default:
2006 return -ENOTTY; 2042 return -ENOTTY;
2007 } 2043 }
@@ -2174,6 +2210,7 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
2174 perf_mmap_free_page((unsigned long)data->user_page); 2210 perf_mmap_free_page((unsigned long)data->user_page);
2175 for (i = 0; i < data->nr_pages; i++) 2211 for (i = 0; i < data->nr_pages; i++)
2176 perf_mmap_free_page((unsigned long)data->data_pages[i]); 2212 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2213 kfree(data);
2177} 2214}
2178 2215
2179#else 2216#else
@@ -2214,6 +2251,7 @@ static void perf_mmap_data_free_work(struct work_struct *work)
2214 perf_mmap_unmark_page(base + (i * PAGE_SIZE)); 2251 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
2215 2252
2216 vfree(base); 2253 vfree(base);
2254 kfree(data);
2217} 2255}
2218 2256
2219static void perf_mmap_data_free(struct perf_mmap_data *data) 2257static void perf_mmap_data_free(struct perf_mmap_data *data)
@@ -2307,7 +2345,7 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2307 } 2345 }
2308 2346
2309 if (!data->watermark) 2347 if (!data->watermark)
2310 data->watermark = max_t(long, PAGE_SIZE, max_size / 2); 2348 data->watermark = max_size / 2;
2311 2349
2312 2350
2313 rcu_assign_pointer(event->data, data); 2351 rcu_assign_pointer(event->data, data);
@@ -2319,7 +2357,6 @@ static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
2319 2357
2320 data = container_of(rcu_head, struct perf_mmap_data, rcu_head); 2358 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2321 perf_mmap_data_free(data); 2359 perf_mmap_data_free(data);
2322 kfree(data);
2323} 2360}
2324 2361
2325static void perf_mmap_data_release(struct perf_event *event) 2362static void perf_mmap_data_release(struct perf_event *event)
@@ -2666,20 +2703,21 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
2666static void perf_output_lock(struct perf_output_handle *handle) 2703static void perf_output_lock(struct perf_output_handle *handle)
2667{ 2704{
2668 struct perf_mmap_data *data = handle->data; 2705 struct perf_mmap_data *data = handle->data;
2669 int cpu; 2706 int cur, cpu = get_cpu();
2670 2707
2671 handle->locked = 0; 2708 handle->locked = 0;
2672 2709
2673 local_irq_save(handle->flags); 2710 for (;;) {
2674 cpu = smp_processor_id(); 2711 cur = atomic_cmpxchg(&data->lock, -1, cpu);
2675 2712 if (cur == -1) {
2676 if (in_nmi() && atomic_read(&data->lock) == cpu) 2713 handle->locked = 1;
2677 return; 2714 break;
2715 }
2716 if (cur == cpu)
2717 break;
2678 2718
2679 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2680 cpu_relax(); 2719 cpu_relax();
2681 2720 }
2682 handle->locked = 1;
2683} 2721}
2684 2722
2685static void perf_output_unlock(struct perf_output_handle *handle) 2723static void perf_output_unlock(struct perf_output_handle *handle)
@@ -2725,7 +2763,7 @@ again:
2725 if (atomic_xchg(&data->wakeup, 0)) 2763 if (atomic_xchg(&data->wakeup, 0))
2726 perf_output_wakeup(handle); 2764 perf_output_wakeup(handle);
2727out: 2765out:
2728 local_irq_restore(handle->flags); 2766 put_cpu();
2729} 2767}
2730 2768
2731void perf_output_copy(struct perf_output_handle *handle, 2769void perf_output_copy(struct perf_output_handle *handle,
@@ -3236,15 +3274,10 @@ static void perf_event_task_ctx(struct perf_event_context *ctx,
3236{ 3274{
3237 struct perf_event *event; 3275 struct perf_event *event;
3238 3276
3239 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3240 return;
3241
3242 rcu_read_lock();
3243 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3277 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3244 if (perf_event_task_match(event)) 3278 if (perf_event_task_match(event))
3245 perf_event_task_output(event, task_event); 3279 perf_event_task_output(event, task_event);
3246 } 3280 }
3247 rcu_read_unlock();
3248} 3281}
3249 3282
3250static void perf_event_task_event(struct perf_task_event *task_event) 3283static void perf_event_task_event(struct perf_task_event *task_event)
@@ -3252,11 +3285,11 @@ static void perf_event_task_event(struct perf_task_event *task_event)
3252 struct perf_cpu_context *cpuctx; 3285 struct perf_cpu_context *cpuctx;
3253 struct perf_event_context *ctx = task_event->task_ctx; 3286 struct perf_event_context *ctx = task_event->task_ctx;
3254 3287
3288 rcu_read_lock();
3255 cpuctx = &get_cpu_var(perf_cpu_context); 3289 cpuctx = &get_cpu_var(perf_cpu_context);
3256 perf_event_task_ctx(&cpuctx->ctx, task_event); 3290 perf_event_task_ctx(&cpuctx->ctx, task_event);
3257 put_cpu_var(perf_cpu_context); 3291 put_cpu_var(perf_cpu_context);
3258 3292
3259 rcu_read_lock();
3260 if (!ctx) 3293 if (!ctx)
3261 ctx = rcu_dereference(task_event->task->perf_event_ctxp); 3294 ctx = rcu_dereference(task_event->task->perf_event_ctxp);
3262 if (ctx) 3295 if (ctx)
@@ -3348,15 +3381,10 @@ static void perf_event_comm_ctx(struct perf_event_context *ctx,
3348{ 3381{
3349 struct perf_event *event; 3382 struct perf_event *event;
3350 3383
3351 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3352 return;
3353
3354 rcu_read_lock();
3355 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3384 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3356 if (perf_event_comm_match(event)) 3385 if (perf_event_comm_match(event))
3357 perf_event_comm_output(event, comm_event); 3386 perf_event_comm_output(event, comm_event);
3358 } 3387 }
3359 rcu_read_unlock();
3360} 3388}
3361 3389
3362static void perf_event_comm_event(struct perf_comm_event *comm_event) 3390static void perf_event_comm_event(struct perf_comm_event *comm_event)
@@ -3367,7 +3395,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3367 char comm[TASK_COMM_LEN]; 3395 char comm[TASK_COMM_LEN];
3368 3396
3369 memset(comm, 0, sizeof(comm)); 3397 memset(comm, 0, sizeof(comm));
3370 strncpy(comm, comm_event->task->comm, sizeof(comm)); 3398 strlcpy(comm, comm_event->task->comm, sizeof(comm));
3371 size = ALIGN(strlen(comm)+1, sizeof(u64)); 3399 size = ALIGN(strlen(comm)+1, sizeof(u64));
3372 3400
3373 comm_event->comm = comm; 3401 comm_event->comm = comm;
@@ -3375,11 +3403,11 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3375 3403
3376 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 3404 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3377 3405
3406 rcu_read_lock();
3378 cpuctx = &get_cpu_var(perf_cpu_context); 3407 cpuctx = &get_cpu_var(perf_cpu_context);
3379 perf_event_comm_ctx(&cpuctx->ctx, comm_event); 3408 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3380 put_cpu_var(perf_cpu_context); 3409 put_cpu_var(perf_cpu_context);
3381 3410
3382 rcu_read_lock();
3383 /* 3411 /*
3384 * doesn't really matter which of the child contexts the 3412 * doesn't really matter which of the child contexts the
3385 * events ends up in. 3413 * events ends up in.
@@ -3472,15 +3500,10 @@ static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3472{ 3500{
3473 struct perf_event *event; 3501 struct perf_event *event;
3474 3502
3475 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3476 return;
3477
3478 rcu_read_lock();
3479 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3503 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3480 if (perf_event_mmap_match(event, mmap_event)) 3504 if (perf_event_mmap_match(event, mmap_event))
3481 perf_event_mmap_output(event, mmap_event); 3505 perf_event_mmap_output(event, mmap_event);
3482 } 3506 }
3483 rcu_read_unlock();
3484} 3507}
3485 3508
3486static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) 3509static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
@@ -3536,11 +3559,11 @@ got_name:
3536 3559
3537 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 3560 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3538 3561
3562 rcu_read_lock();
3539 cpuctx = &get_cpu_var(perf_cpu_context); 3563 cpuctx = &get_cpu_var(perf_cpu_context);
3540 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); 3564 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
3541 put_cpu_var(perf_cpu_context); 3565 put_cpu_var(perf_cpu_context);
3542 3566
3543 rcu_read_lock();
3544 /* 3567 /*
3545 * doesn't really matter which of the child contexts the 3568 * doesn't really matter which of the child contexts the
3546 * events ends up in. 3569 * events ends up in.
@@ -3679,7 +3702,11 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3679 perf_event_disable(event); 3702 perf_event_disable(event);
3680 } 3703 }
3681 3704
3682 perf_event_output(event, nmi, data, regs); 3705 if (event->overflow_handler)
3706 event->overflow_handler(event, nmi, data, regs);
3707 else
3708 perf_event_output(event, nmi, data, regs);
3709
3683 return ret; 3710 return ret;
3684} 3711}
3685 3712
@@ -3724,16 +3751,16 @@ again:
3724 return nr; 3751 return nr;
3725} 3752}
3726 3753
3727static void perf_swevent_overflow(struct perf_event *event, 3754static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
3728 int nmi, struct perf_sample_data *data, 3755 int nmi, struct perf_sample_data *data,
3729 struct pt_regs *regs) 3756 struct pt_regs *regs)
3730{ 3757{
3731 struct hw_perf_event *hwc = &event->hw; 3758 struct hw_perf_event *hwc = &event->hw;
3732 int throttle = 0; 3759 int throttle = 0;
3733 u64 overflow;
3734 3760
3735 data->period = event->hw.last_period; 3761 data->period = event->hw.last_period;
3736 overflow = perf_swevent_set_period(event); 3762 if (!overflow)
3763 overflow = perf_swevent_set_period(event);
3737 3764
3738 if (hwc->interrupts == MAX_INTERRUPTS) 3765 if (hwc->interrupts == MAX_INTERRUPTS)
3739 return; 3766 return;
@@ -3766,14 +3793,19 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
3766 3793
3767 atomic64_add(nr, &event->count); 3794 atomic64_add(nr, &event->count);
3768 3795
3796 if (!regs)
3797 return;
3798
3769 if (!hwc->sample_period) 3799 if (!hwc->sample_period)
3770 return; 3800 return;
3771 3801
3772 if (!regs) 3802 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
3803 return perf_swevent_overflow(event, 1, nmi, data, regs);
3804
3805 if (atomic64_add_negative(nr, &hwc->period_left))
3773 return; 3806 return;
3774 3807
3775 if (!atomic64_add_negative(nr, &hwc->period_left)) 3808 perf_swevent_overflow(event, 0, nmi, data, regs);
3776 perf_swevent_overflow(event, nmi, data, regs);
3777} 3809}
3778 3810
3779static int perf_swevent_is_counting(struct perf_event *event) 3811static int perf_swevent_is_counting(struct perf_event *event)
@@ -3806,25 +3838,44 @@ static int perf_swevent_is_counting(struct perf_event *event)
3806 return 1; 3838 return 1;
3807} 3839}
3808 3840
3841static int perf_tp_event_match(struct perf_event *event,
3842 struct perf_sample_data *data);
3843
3844static int perf_exclude_event(struct perf_event *event,
3845 struct pt_regs *regs)
3846{
3847 if (regs) {
3848 if (event->attr.exclude_user && user_mode(regs))
3849 return 1;
3850
3851 if (event->attr.exclude_kernel && !user_mode(regs))
3852 return 1;
3853 }
3854
3855 return 0;
3856}
3857
3809static int perf_swevent_match(struct perf_event *event, 3858static int perf_swevent_match(struct perf_event *event,
3810 enum perf_type_id type, 3859 enum perf_type_id type,
3811 u32 event_id, struct pt_regs *regs) 3860 u32 event_id,
3861 struct perf_sample_data *data,
3862 struct pt_regs *regs)
3812{ 3863{
3813 if (!perf_swevent_is_counting(event)) 3864 if (!perf_swevent_is_counting(event))
3814 return 0; 3865 return 0;
3815 3866
3816 if (event->attr.type != type) 3867 if (event->attr.type != type)
3817 return 0; 3868 return 0;
3869
3818 if (event->attr.config != event_id) 3870 if (event->attr.config != event_id)
3819 return 0; 3871 return 0;
3820 3872
3821 if (regs) { 3873 if (perf_exclude_event(event, regs))
3822 if (event->attr.exclude_user && user_mode(regs)) 3874 return 0;
3823 return 0;
3824 3875
3825 if (event->attr.exclude_kernel && !user_mode(regs)) 3876 if (event->attr.type == PERF_TYPE_TRACEPOINT &&
3826 return 0; 3877 !perf_tp_event_match(event, data))
3827 } 3878 return 0;
3828 3879
3829 return 1; 3880 return 1;
3830} 3881}
@@ -3837,49 +3888,59 @@ static void perf_swevent_ctx_event(struct perf_event_context *ctx,
3837{ 3888{
3838 struct perf_event *event; 3889 struct perf_event *event;
3839 3890
3840 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3841 return;
3842
3843 rcu_read_lock();
3844 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3891 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3845 if (perf_swevent_match(event, type, event_id, regs)) 3892 if (perf_swevent_match(event, type, event_id, data, regs))
3846 perf_swevent_add(event, nr, nmi, data, regs); 3893 perf_swevent_add(event, nr, nmi, data, regs);
3847 } 3894 }
3848 rcu_read_unlock();
3849} 3895}
3850 3896
3851static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx) 3897int perf_swevent_get_recursion_context(void)
3852{ 3898{
3899 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3900 int rctx;
3901
3853 if (in_nmi()) 3902 if (in_nmi())
3854 return &cpuctx->recursion[3]; 3903 rctx = 3;
3904 else if (in_irq())
3905 rctx = 2;
3906 else if (in_softirq())
3907 rctx = 1;
3908 else
3909 rctx = 0;
3910
3911 if (cpuctx->recursion[rctx]) {
3912 put_cpu_var(perf_cpu_context);
3913 return -1;
3914 }
3855 3915
3856 if (in_irq()) 3916 cpuctx->recursion[rctx]++;
3857 return &cpuctx->recursion[2]; 3917 barrier();
3858 3918
3859 if (in_softirq()) 3919 return rctx;
3860 return &cpuctx->recursion[1]; 3920}
3921EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
3861 3922
3862 return &cpuctx->recursion[0]; 3923void perf_swevent_put_recursion_context(int rctx)
3924{
3925 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
3926 barrier();
3927 cpuctx->recursion[rctx]--;
3928 put_cpu_var(perf_cpu_context);
3863} 3929}
3930EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
3864 3931
3865static void do_perf_sw_event(enum perf_type_id type, u32 event_id, 3932static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
3866 u64 nr, int nmi, 3933 u64 nr, int nmi,
3867 struct perf_sample_data *data, 3934 struct perf_sample_data *data,
3868 struct pt_regs *regs) 3935 struct pt_regs *regs)
3869{ 3936{
3870 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); 3937 struct perf_cpu_context *cpuctx;
3871 int *recursion = perf_swevent_recursion_context(cpuctx);
3872 struct perf_event_context *ctx; 3938 struct perf_event_context *ctx;
3873 3939
3874 if (*recursion) 3940 cpuctx = &__get_cpu_var(perf_cpu_context);
3875 goto out; 3941 rcu_read_lock();
3876
3877 (*recursion)++;
3878 barrier();
3879
3880 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id, 3942 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
3881 nr, nmi, data, regs); 3943 nr, nmi, data, regs);
3882 rcu_read_lock();
3883 /* 3944 /*
3884 * doesn't really matter which of the child contexts the 3945 * doesn't really matter which of the child contexts the
3885 * events ends up in. 3946 * events ends up in.
@@ -3888,23 +3949,24 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
3888 if (ctx) 3949 if (ctx)
3889 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs); 3950 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
3890 rcu_read_unlock(); 3951 rcu_read_unlock();
3891
3892 barrier();
3893 (*recursion)--;
3894
3895out:
3896 put_cpu_var(perf_cpu_context);
3897} 3952}
3898 3953
3899void __perf_sw_event(u32 event_id, u64 nr, int nmi, 3954void __perf_sw_event(u32 event_id, u64 nr, int nmi,
3900 struct pt_regs *regs, u64 addr) 3955 struct pt_regs *regs, u64 addr)
3901{ 3956{
3902 struct perf_sample_data data = { 3957 struct perf_sample_data data;
3903 .addr = addr, 3958 int rctx;
3904 };
3905 3959
3906 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, 3960 rctx = perf_swevent_get_recursion_context();
3907 &data, regs); 3961 if (rctx < 0)
3962 return;
3963
3964 data.addr = addr;
3965 data.raw = NULL;
3966
3967 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
3968
3969 perf_swevent_put_recursion_context(rctx);
3908} 3970}
3909 3971
3910static void perf_swevent_read(struct perf_event *event) 3972static void perf_swevent_read(struct perf_event *event)
@@ -3949,6 +4011,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
3949 event->pmu->read(event); 4011 event->pmu->read(event);
3950 4012
3951 data.addr = 0; 4013 data.addr = 0;
4014 data.period = event->hw.last_period;
3952 regs = get_irq_regs(); 4015 regs = get_irq_regs();
3953 /* 4016 /*
3954 * In case we exclude kernel IPs or are somehow not in interrupt 4017 * In case we exclude kernel IPs or are somehow not in interrupt
@@ -4108,6 +4171,7 @@ static const struct pmu perf_ops_task_clock = {
4108}; 4171};
4109 4172
4110#ifdef CONFIG_EVENT_PROFILE 4173#ifdef CONFIG_EVENT_PROFILE
4174
4111void perf_tp_event(int event_id, u64 addr, u64 count, void *record, 4175void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4112 int entry_size) 4176 int entry_size)
4113{ 4177{
@@ -4126,13 +4190,21 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4126 if (!regs) 4190 if (!regs)
4127 regs = task_pt_regs(current); 4191 regs = task_pt_regs(current);
4128 4192
4193 /* Trace events already protected against recursion */
4129 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, 4194 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
4130 &data, regs); 4195 &data, regs);
4131} 4196}
4132EXPORT_SYMBOL_GPL(perf_tp_event); 4197EXPORT_SYMBOL_GPL(perf_tp_event);
4133 4198
4134extern int ftrace_profile_enable(int); 4199static int perf_tp_event_match(struct perf_event *event,
4135extern void ftrace_profile_disable(int); 4200 struct perf_sample_data *data)
4201{
4202 void *record = data->raw->data;
4203
4204 if (likely(!event->filter) || filter_match_preds(event->filter, record))
4205 return 1;
4206 return 0;
4207}
4136 4208
4137static void tp_perf_event_destroy(struct perf_event *event) 4209static void tp_perf_event_destroy(struct perf_event *event)
4138{ 4210{
@@ -4157,11 +4229,99 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
4157 4229
4158 return &perf_ops_generic; 4230 return &perf_ops_generic;
4159} 4231}
4232
4233static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4234{
4235 char *filter_str;
4236 int ret;
4237
4238 if (event->attr.type != PERF_TYPE_TRACEPOINT)
4239 return -EINVAL;
4240
4241 filter_str = strndup_user(arg, PAGE_SIZE);
4242 if (IS_ERR(filter_str))
4243 return PTR_ERR(filter_str);
4244
4245 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
4246
4247 kfree(filter_str);
4248 return ret;
4249}
4250
4251static void perf_event_free_filter(struct perf_event *event)
4252{
4253 ftrace_profile_free_filter(event);
4254}
4255
4160#else 4256#else
4257
4258static int perf_tp_event_match(struct perf_event *event,
4259 struct perf_sample_data *data)
4260{
4261 return 1;
4262}
4263
4161static const struct pmu *tp_perf_event_init(struct perf_event *event) 4264static const struct pmu *tp_perf_event_init(struct perf_event *event)
4162{ 4265{
4163 return NULL; 4266 return NULL;
4164} 4267}
4268
4269static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4270{
4271 return -ENOENT;
4272}
4273
4274static void perf_event_free_filter(struct perf_event *event)
4275{
4276}
4277
4278#endif /* CONFIG_EVENT_PROFILE */
4279
4280#ifdef CONFIG_HAVE_HW_BREAKPOINT
4281static void bp_perf_event_destroy(struct perf_event *event)
4282{
4283 release_bp_slot(event);
4284}
4285
4286static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4287{
4288 int err;
4289 /*
 4290 * The breakpoint is already filled in if we haven't created the counter
 4291 * through the perf syscall.
 4292 * FIXME: manage to get the callback set to NULL if it comes from syscalls
4293 */
4294 if (!bp->callback)
4295 err = register_perf_hw_breakpoint(bp);
4296 else
4297 err = __register_perf_hw_breakpoint(bp);
4298 if (err)
4299 return ERR_PTR(err);
4300
4301 bp->destroy = bp_perf_event_destroy;
4302
4303 return &perf_ops_bp;
4304}
4305
4306void perf_bp_event(struct perf_event *bp, void *data)
4307{
4308 struct perf_sample_data sample;
4309 struct pt_regs *regs = data;
4310
4311 sample.addr = bp->attr.bp_addr;
4312
4313 if (!perf_exclude_event(bp, regs))
4314 perf_swevent_add(bp, 1, 1, &sample, regs);
4315}
4316#else
4317static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4318{
4319 return NULL;
4320}
4321
4322void perf_bp_event(struct perf_event *bp, void *regs)
4323{
4324}
4165#endif 4325#endif
4166 4326
4167atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; 4327atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
@@ -4208,6 +4368,8 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
4208 case PERF_COUNT_SW_PAGE_FAULTS_MAJ: 4368 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
4209 case PERF_COUNT_SW_CONTEXT_SWITCHES: 4369 case PERF_COUNT_SW_CONTEXT_SWITCHES:
4210 case PERF_COUNT_SW_CPU_MIGRATIONS: 4370 case PERF_COUNT_SW_CPU_MIGRATIONS:
4371 case PERF_COUNT_SW_ALIGNMENT_FAULTS:
4372 case PERF_COUNT_SW_EMULATION_FAULTS:
4211 if (!event->parent) { 4373 if (!event->parent) {
4212 atomic_inc(&perf_swevent_enabled[event_id]); 4374 atomic_inc(&perf_swevent_enabled[event_id]);
4213 event->destroy = sw_perf_event_destroy; 4375 event->destroy = sw_perf_event_destroy;
@@ -4228,6 +4390,7 @@ perf_event_alloc(struct perf_event_attr *attr,
4228 struct perf_event_context *ctx, 4390 struct perf_event_context *ctx,
4229 struct perf_event *group_leader, 4391 struct perf_event *group_leader,
4230 struct perf_event *parent_event, 4392 struct perf_event *parent_event,
4393 perf_callback_t callback,
4231 gfp_t gfpflags) 4394 gfp_t gfpflags)
4232{ 4395{
4233 const struct pmu *pmu; 4396 const struct pmu *pmu;
@@ -4270,6 +4433,11 @@ perf_event_alloc(struct perf_event_attr *attr,
4270 4433
4271 event->state = PERF_EVENT_STATE_INACTIVE; 4434 event->state = PERF_EVENT_STATE_INACTIVE;
4272 4435
4436 if (!callback && parent_event)
4437 callback = parent_event->callback;
4438
4439 event->callback = callback;
4440
4273 if (attr->disabled) 4441 if (attr->disabled)
4274 event->state = PERF_EVENT_STATE_OFF; 4442 event->state = PERF_EVENT_STATE_OFF;
4275 4443
@@ -4304,6 +4472,11 @@ perf_event_alloc(struct perf_event_attr *attr,
4304 pmu = tp_perf_event_init(event); 4472 pmu = tp_perf_event_init(event);
4305 break; 4473 break;
4306 4474
4475 case PERF_TYPE_BREAKPOINT:
4476 pmu = bp_perf_event_init(event);
4477 break;
4478
4479
4307 default: 4480 default:
4308 break; 4481 break;
4309 } 4482 }
@@ -4416,7 +4589,7 @@ err_size:
4416 goto out; 4589 goto out;
4417} 4590}
4418 4591
4419int perf_event_set_output(struct perf_event *event, int output_fd) 4592static int perf_event_set_output(struct perf_event *event, int output_fd)
4420{ 4593{
4421 struct perf_event *output_event = NULL; 4594 struct perf_event *output_event = NULL;
4422 struct file *output_file = NULL; 4595 struct file *output_file = NULL;
@@ -4546,7 +4719,7 @@ SYSCALL_DEFINE5(perf_event_open,
4546 } 4719 }
4547 4720
4548 event = perf_event_alloc(&attr, cpu, ctx, group_leader, 4721 event = perf_event_alloc(&attr, cpu, ctx, group_leader,
4549 NULL, GFP_KERNEL); 4722 NULL, NULL, GFP_KERNEL);
4550 err = PTR_ERR(event); 4723 err = PTR_ERR(event);
4551 if (IS_ERR(event)) 4724 if (IS_ERR(event))
4552 goto err_put_context; 4725 goto err_put_context;
@@ -4594,6 +4767,60 @@ err_put_context:
4594 return err; 4767 return err;
4595} 4768}
4596 4769
4770/**
4771 * perf_event_create_kernel_counter
4772 *
4773 * @attr: attributes of the counter to create
 4774 * @cpu: cpu to which the counter is bound
4775 * @pid: task to profile
4776 */
4777struct perf_event *
4778perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
4779 pid_t pid, perf_callback_t callback)
4780{
4781 struct perf_event *event;
4782 struct perf_event_context *ctx;
4783 int err;
4784
4785 /*
4786 * Get the target context (task or percpu):
4787 */
4788
4789 ctx = find_get_context(pid, cpu);
4790 if (IS_ERR(ctx)) {
4791 err = PTR_ERR(ctx);
4792 goto err_exit;
4793 }
4794
4795 event = perf_event_alloc(attr, cpu, ctx, NULL,
4796 NULL, callback, GFP_KERNEL);
4797 if (IS_ERR(event)) {
4798 err = PTR_ERR(event);
4799 goto err_put_context;
4800 }
4801
4802 event->filp = NULL;
4803 WARN_ON_ONCE(ctx->parent_ctx);
4804 mutex_lock(&ctx->mutex);
4805 perf_install_in_context(ctx, event, cpu);
4806 ++ctx->generation;
4807 mutex_unlock(&ctx->mutex);
4808
4809 event->owner = current;
4810 get_task_struct(current);
4811 mutex_lock(&current->perf_event_mutex);
4812 list_add_tail(&event->owner_entry, &current->perf_event_list);
4813 mutex_unlock(&current->perf_event_mutex);
4814
4815 return event;
4816
4817 err_put_context:
4818 put_ctx(ctx);
4819 err_exit:
4820 return ERR_PTR(err);
4821}
4822EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
4823
4597/* 4824/*
4598 * inherit an event from parent task to child task: 4825 * inherit an event from parent task to child task:
4599 */ 4826 */
@@ -4619,7 +4846,7 @@ inherit_event(struct perf_event *parent_event,
4619 child_event = perf_event_alloc(&parent_event->attr, 4846 child_event = perf_event_alloc(&parent_event->attr,
4620 parent_event->cpu, child_ctx, 4847 parent_event->cpu, child_ctx,
4621 group_leader, parent_event, 4848 group_leader, parent_event,
4622 GFP_KERNEL); 4849 NULL, GFP_KERNEL);
4623 if (IS_ERR(child_event)) 4850 if (IS_ERR(child_event))
4624 return child_event; 4851 return child_event;
4625 get_ctx(child_ctx); 4852 get_ctx(child_ctx);
@@ -4637,6 +4864,8 @@ inherit_event(struct perf_event *parent_event,
4637 if (parent_event->attr.freq) 4864 if (parent_event->attr.freq)
4638 child_event->hw.sample_period = parent_event->hw.sample_period; 4865 child_event->hw.sample_period = parent_event->hw.sample_period;
4639 4866
4867 child_event->overflow_handler = parent_event->overflow_handler;
4868
4640 /* 4869 /*
4641 * Link it up in the child's context: 4870 * Link it up in the child's context:
4642 */ 4871 */
@@ -4726,7 +4955,6 @@ __perf_event_exit_task(struct perf_event *child_event,
4726{ 4955{
4727 struct perf_event *parent_event; 4956 struct perf_event *parent_event;
4728 4957
4729 update_event_times(child_event);
4730 perf_event_remove_from_context(child_event); 4958 perf_event_remove_from_context(child_event);
4731 4959
4732 parent_event = child_event->parent; 4960 parent_event = child_event->parent;
@@ -4778,6 +5006,7 @@ void perf_event_exit_task(struct task_struct *child)
4778 * the events from it. 5006 * the events from it.
4779 */ 5007 */
4780 unclone_ctx(child_ctx); 5008 unclone_ctx(child_ctx);
5009 update_context_time(child_ctx);
4781 spin_unlock_irqrestore(&child_ctx->lock, flags); 5010 spin_unlock_irqrestore(&child_ctx->lock, flags);
4782 5011
4783 /* 5012 /*
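
A note on the kernel/perf_event.c changes above: perf_event_create_kernel_counter() is now exported so kernel code (the hw-breakpoint layer, for one) can set up counters without going through the syscall. The sketch below is not part of the patch; it only shows roughly how a module might call the four-argument signature visible in this diff. The names my_event and my_counter_* are invented for illustration, a NULL callback keeps the default output path, and teardown is omitted because the matching release path is outside the hunks shown here.

/*
 * Illustrative module sketch: create one cpu-bound software counter
 * through perf_event_create_kernel_counter() as added above.
 */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/err.h>
#include <linux/perf_event.h>

static struct perf_event *my_event;	/* hypothetical name */

static int __init my_counter_init(void)
{
	struct perf_event_attr attr = {
		.type	= PERF_TYPE_SOFTWARE,
		.config	= PERF_COUNT_SW_CPU_CLOCK,
		.size	= sizeof(attr),
	};

	/* cpu 0, pid -1: bind the counter to a cpu rather than a task */
	my_event = perf_event_create_kernel_counter(&attr, 0, -1, NULL);
	if (IS_ERR(my_event))
		return PTR_ERR(my_event);

	pr_info("kernel counter installed on cpu 0\n");
	return 0;
}

static void __exit my_counter_exit(void)
{
	/* Teardown omitted: the release path is not in the hunks shown. */
}

module_init(my_counter_init);
module_exit(my_counter_exit);
MODULE_LICENSE("GPL");
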
diff --git a/kernel/signal.c b/kernel/signal.c
index fe08008133da..6b982f2cf524 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -28,7 +28,8 @@
28#include <linux/freezer.h> 28#include <linux/freezer.h>
29#include <linux/pid_namespace.h> 29#include <linux/pid_namespace.h>
30#include <linux/nsproxy.h> 30#include <linux/nsproxy.h>
31#include <trace/events/sched.h> 31#define CREATE_TRACE_POINTS
32#include <trace/events/signal.h>
32 33
33#include <asm/param.h> 34#include <asm/param.h>
34#include <asm/uaccess.h> 35#include <asm/uaccess.h>
@@ -856,7 +857,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
856 struct sigqueue *q; 857 struct sigqueue *q;
857 int override_rlimit; 858 int override_rlimit;
858 859
859 trace_sched_signal_send(sig, t); 860 trace_signal_generate(sig, info, t);
860 861
861 assert_spin_locked(&t->sighand->siglock); 862 assert_spin_locked(&t->sighand->siglock);
862 863
@@ -918,12 +919,21 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
918 break; 919 break;
919 } 920 }
920 } else if (!is_si_special(info)) { 921 } else if (!is_si_special(info)) {
921 if (sig >= SIGRTMIN && info->si_code != SI_USER) 922 if (sig >= SIGRTMIN && info->si_code != SI_USER) {
922 /* 923 /*
923 * Queue overflow, abort. We may abort if the signal was rt 924 * Queue overflow, abort. We may abort if the
924 * and sent by user using something other than kill(). 925 * signal was rt and sent by user using something
925 */ 926 * other than kill().
927 */
928 trace_signal_overflow_fail(sig, group, info);
926 return -EAGAIN; 929 return -EAGAIN;
930 } else {
931 /*
932 * This is a silent loss of information. We still
933 * send the signal, but the *info bits are lost.
934 */
935 trace_signal_lose_info(sig, group, info);
936 }
927 } 937 }
928 938
929out_set: 939out_set:
@@ -1859,6 +1869,9 @@ relock:
1859 ka = &sighand->action[signr-1]; 1869 ka = &sighand->action[signr-1];
1860 } 1870 }
1861 1871
1872 /* Trace actually delivered signals. */
1873 trace_signal_deliver(signr, info, ka);
1874
1862 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ 1875 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */
1863 continue; 1876 continue;
1864 if (ka->sa.sa_handler != SIG_DFL) { 1877 if (ka->sa.sa_handler != SIG_DFL) {
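
The signal.c hunks above replace the old sched_signal_send tracepoint with a family of signal events (signal_generate, signal_deliver, plus the two overflow variants). As an illustration only, the small user-space program below enables two of them through the event directories that event tracing creates; the debugfs mount point is assumed to be /sys/kernel/debug, and the event directory names are assumed to follow the tracepoint names.

/* Enable the new signal trace events and stream the trace output. */
#include <stdio.h>

#define TRACE_DIR "/sys/kernel/debug/tracing"

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fputs(val, f);
	fclose(f);
	return 0;
}

int main(void)
{
	char buf[4096];
	FILE *pipe;

	write_str(TRACE_DIR "/events/signal/signal_generate/enable", "1");
	write_str(TRACE_DIR "/events/signal/signal_deliver/enable", "1");

	pipe = fopen(TRACE_DIR "/trace_pipe", "r");
	if (!pipe) {
		perror("trace_pipe");
		return 1;
	}
	while (fgets(buf, sizeof(buf), pipe))	/* stream until interrupted */
		fputs(buf, stdout);
	fclose(pipe);
	return 0;
}
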
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index b416512ad17f..d006554888dc 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -339,6 +339,27 @@ config POWER_TRACER
339 power management decisions, specifically the C-state and P-state 339 power management decisions, specifically the C-state and P-state
340 behavior. 340 behavior.
341 341
342config KSYM_TRACER
343 bool "Trace read and write access on kernel memory locations"
344 depends on HAVE_HW_BREAKPOINT
345 select TRACING
346 help
347 This tracer helps find read and write operations on any given kernel
348 symbol, i.e. one listed in /proc/kallsyms.
349
350config PROFILE_KSYM_TRACER
351 bool "Profile all kernel memory accesses on 'watched' variables"
352 depends on KSYM_TRACER
353 help
354 This tracer profiles kernel accesses to variables watched through the
355 ksym tracer ftrace plugin. Depending upon the hardware, both read
356 and write accesses to the watched kernel variables can be
357 profiled.
358
359 The results will be displayed in:
360 /debugfs/tracing/profile_ksym
361
362 Say N if unsure.
342 363
343config STACK_TRACER 364config STACK_TRACER
344 bool "Trace max stack" 365 bool "Trace max stack"
@@ -428,6 +449,23 @@ config BLK_DEV_IO_TRACE
428 449
429 If unsure, say N. 450 If unsure, say N.
430 451
452config KPROBE_EVENT
453 depends on KPROBES
454 depends on X86
455 bool "Enable kprobes-based dynamic events"
456 select TRACING
457 default y
458 help
459 This allows the user to add tracing events (similar to tracepoints) on the fly
460 via the ftrace interface. See Documentation/trace/kprobetrace.txt
461 for more details.
462
463 Those events can be inserted wherever kprobes can probe, and record
464 various register and memory values.
465
466 This option is also required by the perf-probe subcommand of perf tools.
467 If you want to use perf tools, this option is strongly recommended.
468
431config DYNAMIC_FTRACE 469config DYNAMIC_FTRACE
432 bool "enable/disable ftrace tracepoints dynamically" 470 bool "enable/disable ftrace tracepoints dynamically"
433 depends on FUNCTION_TRACER 471 depends on FUNCTION_TRACER
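
To make the KPROBE_EVENT help above concrete: once the option is enabled, probes are defined by writing to the kprobe_events control file and then show up as ordinary trace events under the "kprobes" system. The program below is an illustration only; the probe name "myopen" is invented, the debugfs mount point is assumed, and the definition syntax is the one described in Documentation/trace/kprobetrace.txt.

/* Install and enable a kprobe event via the ftrace interface. */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

#define TRACE_DIR "/sys/kernel/debug/tracing"

static int write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return -1;
	}
	if (write(fd, val, strlen(val)) < 0) {
		perror(path);
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}

int main(void)
{
	/* Probe the entry of do_sys_open; no fetch arguments for brevity. */
	if (write_str(TRACE_DIR "/kprobe_events", "p:myopen do_sys_open\n"))
		return 1;

	/* New probes appear under events/kprobes/ (KPROBE_EVENT_SYSTEM). */
	write_str(TRACE_DIR "/events/kprobes/myopen/enable", "1\n");

	printf("probe installed; cat %s/trace to see hits\n", TRACE_DIR);
	return 0;
}
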
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 26f03ac07c2b..cd9ecd89ec77 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -53,6 +53,8 @@ obj-$(CONFIG_EVENT_TRACING) += trace_export.o
53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o 53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
54obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o 54obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
57obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
56obj-$(CONFIG_EVENT_TRACING) += power-traces.o 58obj-$(CONFIG_EVENT_TRACING) += power-traces.o
57 59
58libftrace-y := ftrace.o 60libftrace-y := ftrace.o
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a72c6e03deec..a1ca4956ab5e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -397,18 +397,21 @@ int ring_buffer_print_page_header(struct trace_seq *s)
397 int ret; 397 int ret;
398 398
399 ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t" 399 ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t"
400 "offset:0;\tsize:%u;\n", 400 "offset:0;\tsize:%u;\tsigned:%u;\n",
401 (unsigned int)sizeof(field.time_stamp)); 401 (unsigned int)sizeof(field.time_stamp),
402 (unsigned int)is_signed_type(u64));
402 403
403 ret = trace_seq_printf(s, "\tfield: local_t commit;\t" 404 ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
404 "offset:%u;\tsize:%u;\n", 405 "offset:%u;\tsize:%u;\tsigned:%u;\n",
405 (unsigned int)offsetof(typeof(field), commit), 406 (unsigned int)offsetof(typeof(field), commit),
406 (unsigned int)sizeof(field.commit)); 407 (unsigned int)sizeof(field.commit),
408 (unsigned int)is_signed_type(long));
407 409
408 ret = trace_seq_printf(s, "\tfield: char data;\t" 410 ret = trace_seq_printf(s, "\tfield: char data;\t"
409 "offset:%u;\tsize:%u;\n", 411 "offset:%u;\tsize:%u;\tsigned:%u;\n",
410 (unsigned int)offsetof(typeof(field), data), 412 (unsigned int)offsetof(typeof(field), data),
411 (unsigned int)BUF_PAGE_SIZE); 413 (unsigned int)BUF_PAGE_SIZE,
414 (unsigned int)is_signed_type(char));
412 415
413 return ret; 416 return ret;
414} 417}
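
The format lines above gain a signed:%u column so user-space parsers can tell signed from unsigned fields; the value comes from the is_signed_type() helper used in the hunk (defined elsewhere in the tree), which tests whether -1 cast to the type is negative. A stand-alone illustration of the same trick; the macro below mirrors the idea rather than quoting the kernel's exact definition.

/* Probe the signedness of a type the way the new signed:%u field does. */
#include <stdio.h>

#define is_signed_type(type)	(((type)(-1)) < (type)0)

int main(void)
{
	/*
	 * Plain char is implementation-defined, which is exactly why the
	 * exported format has to state the signedness explicitly.
	 */
	printf("char               signed:%u\n", (unsigned int)is_signed_type(char));
	printf("long               signed:%u\n", (unsigned int)is_signed_type(long));
	printf("unsigned long      signed:%u\n", (unsigned int)is_signed_type(unsigned long));
	printf("unsigned long long signed:%u\n", (unsigned int)is_signed_type(unsigned long long));
	return 0;
}
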
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index acef8b4636f0..1d7f4830a80d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -11,6 +11,7 @@
11#include <linux/ftrace.h> 11#include <linux/ftrace.h>
12#include <trace/boot.h> 12#include <trace/boot.h>
13#include <linux/kmemtrace.h> 13#include <linux/kmemtrace.h>
14#include <linux/hw_breakpoint.h>
14 15
15#include <linux/trace_seq.h> 16#include <linux/trace_seq.h>
16#include <linux/ftrace_event.h> 17#include <linux/ftrace_event.h>
@@ -37,6 +38,7 @@ enum trace_type {
37 TRACE_KMEM_ALLOC, 38 TRACE_KMEM_ALLOC,
38 TRACE_KMEM_FREE, 39 TRACE_KMEM_FREE,
39 TRACE_BLK, 40 TRACE_BLK,
41 TRACE_KSYM,
40 42
41 __TRACE_LAST_TYPE, 43 __TRACE_LAST_TYPE,
42}; 44};
@@ -98,9 +100,32 @@ struct syscall_trace_enter {
98struct syscall_trace_exit { 100struct syscall_trace_exit {
99 struct trace_entry ent; 101 struct trace_entry ent;
100 int nr; 102 int nr;
101 unsigned long ret; 103 long ret;
102}; 104};
103 105
106struct kprobe_trace_entry {
107 struct trace_entry ent;
108 unsigned long ip;
109 int nargs;
110 unsigned long args[];
111};
112
113#define SIZEOF_KPROBE_TRACE_ENTRY(n) \
114 (offsetof(struct kprobe_trace_entry, args) + \
115 (sizeof(unsigned long) * (n)))
116
117struct kretprobe_trace_entry {
118 struct trace_entry ent;
119 unsigned long func;
120 unsigned long ret_ip;
121 int nargs;
122 unsigned long args[];
123};
124
125#define SIZEOF_KRETPROBE_TRACE_ENTRY(n) \
126 (offsetof(struct kretprobe_trace_entry, args) + \
127 (sizeof(unsigned long) * (n)))
128
104/* 129/*
105 * trace_flag_type is an enumeration that holds different 130 * trace_flag_type is an enumeration that holds different
106 * states when a trace occurs. These are: 131 * states when a trace occurs. These are:
@@ -209,6 +234,7 @@ extern void __ftrace_bad_type(void);
209 TRACE_KMEM_ALLOC); \ 234 TRACE_KMEM_ALLOC); \
210 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ 235 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
211 TRACE_KMEM_FREE); \ 236 TRACE_KMEM_FREE); \
237 IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\
212 __ftrace_bad_type(); \ 238 __ftrace_bad_type(); \
213 } while (0) 239 } while (0)
214 240
@@ -364,6 +390,8 @@ int register_tracer(struct tracer *type);
364void unregister_tracer(struct tracer *type); 390void unregister_tracer(struct tracer *type);
365int is_tracing_stopped(void); 391int is_tracing_stopped(void);
366 392
393extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr);
394
367extern unsigned long nsecs_to_usecs(unsigned long nsecs); 395extern unsigned long nsecs_to_usecs(unsigned long nsecs);
368 396
369#ifdef CONFIG_TRACER_MAX_TRACE 397#ifdef CONFIG_TRACER_MAX_TRACE
@@ -438,6 +466,8 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
438 struct trace_array *tr); 466 struct trace_array *tr);
439extern int trace_selftest_startup_hw_branches(struct tracer *trace, 467extern int trace_selftest_startup_hw_branches(struct tracer *trace,
440 struct trace_array *tr); 468 struct trace_array *tr);
469extern int trace_selftest_startup_ksym(struct tracer *trace,
470 struct trace_array *tr);
441#endif /* CONFIG_FTRACE_STARTUP_TEST */ 471#endif /* CONFIG_FTRACE_STARTUP_TEST */
442 472
443extern void *head_page(struct trace_array_cpu *data); 473extern void *head_page(struct trace_array_cpu *data);
@@ -683,7 +713,6 @@ struct event_filter {
683 int n_preds; 713 int n_preds;
684 struct filter_pred **preds; 714 struct filter_pred **preds;
685 char *filter_string; 715 char *filter_string;
686 bool no_reset;
687}; 716};
688 717
689struct event_subsystem { 718struct event_subsystem {
@@ -703,7 +732,7 @@ typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event,
703typedef int (*regex_match_func)(char *str, struct regex *r, int len); 732typedef int (*regex_match_func)(char *str, struct regex *r, int len);
704 733
705enum regex_type { 734enum regex_type {
706 MATCH_FULL, 735 MATCH_FULL = 0,
707 MATCH_FRONT_ONLY, 736 MATCH_FRONT_ONLY,
708 MATCH_MIDDLE_ONLY, 737 MATCH_MIDDLE_ONLY,
709 MATCH_END_ONLY, 738 MATCH_END_ONLY,
@@ -744,7 +773,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
744 struct ring_buffer *buffer, 773 struct ring_buffer *buffer,
745 struct ring_buffer_event *event) 774 struct ring_buffer_event *event)
746{ 775{
747 if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) { 776 if (unlikely(call->filter_active) &&
777 !filter_match_preds(call->filter, rec)) {
748 ring_buffer_discard_commit(buffer, event); 778 ring_buffer_discard_commit(buffer, event);
749 return 1; 779 return 1;
750 } 780 }
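
The new kprobe_trace_entry and kretprobe_trace_entry above end in a C99 flexible array member, so a record's size has to be computed from offsetof() plus the argument count, which is what the SIZEOF_*_TRACE_ENTRY() macros do. Below is a stand-alone sketch of the same sizing scheme, using a mock struct in place of the real trace_entry header.

/* Mock of the flexible-array sizing used by SIZEOF_KPROBE_TRACE_ENTRY(). */
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

struct mock_entry {
	unsigned short type;		/* stands in for struct trace_entry */
	unsigned long ip;
	int nargs;
	unsigned long args[];		/* flexible array member */
};

#define SIZEOF_MOCK_ENTRY(n) \
	(offsetof(struct mock_entry, args) + sizeof(unsigned long) * (n))

int main(void)
{
	int i, nargs = 3;
	struct mock_entry *e = malloc(SIZEOF_MOCK_ENTRY(nargs));

	if (!e)
		return 1;
	e->nargs = nargs;
	for (i = 0; i < nargs; i++)
		e->args[i] = i;

	/* sizeof() ignores the flexible array; offsetof() sizes it exactly. */
	printf("sizeof=%zu alloc=%zu\n",
	       sizeof(struct mock_entry), SIZEOF_MOCK_ENTRY(nargs));
	free(e);
	return 0;
}
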
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index ead3d724599d..c16a08f399df 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -364,3 +364,19 @@ FTRACE_ENTRY(kmem_free, kmemtrace_free_entry,
364 F_printk("type:%u call_site:%lx ptr:%p", 364 F_printk("type:%u call_site:%lx ptr:%p",
365 __entry->type_id, __entry->call_site, __entry->ptr) 365 __entry->type_id, __entry->call_site, __entry->ptr)
366); 366);
367
368FTRACE_ENTRY(ksym_trace, ksym_trace_entry,
369
370 TRACE_KSYM,
371
372 F_STRUCT(
373 __field( unsigned long, ip )
374 __field( unsigned char, type )
375 __array( char , cmd, TASK_COMM_LEN )
376 __field( unsigned long, addr )
377 ),
378
379 F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s",
380 (void *)__entry->ip, (unsigned int)__entry->type,
381 (void *)__entry->addr, __entry->cmd)
382);
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 8d5c171cc998..d9c60f80aa0d 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -8,17 +8,14 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include "trace.h" 9#include "trace.h"
10 10
11/*
12 * We can't use a size but a type in alloc_percpu()
13 * So let's create a dummy type that matches the desired size
14 */
15typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t;
16 11
17char *trace_profile_buf; 12char *perf_trace_buf;
18EXPORT_SYMBOL_GPL(trace_profile_buf); 13EXPORT_SYMBOL_GPL(perf_trace_buf);
14
15char *perf_trace_buf_nmi;
16EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
19 17
20char *trace_profile_buf_nmi; 18typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t;
21EXPORT_SYMBOL_GPL(trace_profile_buf_nmi);
22 19
23/* Count the events in use (per event id, not per instance) */ 20/* Count the events in use (per event id, not per instance) */
24static int total_profile_count; 21static int total_profile_count;
@@ -32,20 +29,20 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event)
32 return 0; 29 return 0;
33 30
34 if (!total_profile_count) { 31 if (!total_profile_count) {
35 buf = (char *)alloc_percpu(profile_buf_t); 32 buf = (char *)alloc_percpu(perf_trace_t);
36 if (!buf) 33 if (!buf)
37 goto fail_buf; 34 goto fail_buf;
38 35
39 rcu_assign_pointer(trace_profile_buf, buf); 36 rcu_assign_pointer(perf_trace_buf, buf);
40 37
41 buf = (char *)alloc_percpu(profile_buf_t); 38 buf = (char *)alloc_percpu(perf_trace_t);
42 if (!buf) 39 if (!buf)
43 goto fail_buf_nmi; 40 goto fail_buf_nmi;
44 41
45 rcu_assign_pointer(trace_profile_buf_nmi, buf); 42 rcu_assign_pointer(perf_trace_buf_nmi, buf);
46 } 43 }
47 44
48 ret = event->profile_enable(); 45 ret = event->profile_enable(event);
49 if (!ret) { 46 if (!ret) {
50 total_profile_count++; 47 total_profile_count++;
51 return 0; 48 return 0;
@@ -53,10 +50,10 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event)
53 50
54fail_buf_nmi: 51fail_buf_nmi:
55 if (!total_profile_count) { 52 if (!total_profile_count) {
56 free_percpu(trace_profile_buf_nmi); 53 free_percpu(perf_trace_buf_nmi);
57 free_percpu(trace_profile_buf); 54 free_percpu(perf_trace_buf);
58 trace_profile_buf_nmi = NULL; 55 perf_trace_buf_nmi = NULL;
59 trace_profile_buf = NULL; 56 perf_trace_buf = NULL;
60 } 57 }
61fail_buf: 58fail_buf:
62 atomic_dec(&event->profile_count); 59 atomic_dec(&event->profile_count);
@@ -89,14 +86,14 @@ static void ftrace_profile_disable_event(struct ftrace_event_call *event)
89 if (!atomic_add_negative(-1, &event->profile_count)) 86 if (!atomic_add_negative(-1, &event->profile_count))
90 return; 87 return;
91 88
92 event->profile_disable(); 89 event->profile_disable(event);
93 90
94 if (!--total_profile_count) { 91 if (!--total_profile_count) {
95 buf = trace_profile_buf; 92 buf = perf_trace_buf;
96 rcu_assign_pointer(trace_profile_buf, NULL); 93 rcu_assign_pointer(perf_trace_buf, NULL);
97 94
98 nmi_buf = trace_profile_buf_nmi; 95 nmi_buf = perf_trace_buf_nmi;
99 rcu_assign_pointer(trace_profile_buf_nmi, NULL); 96 rcu_assign_pointer(perf_trace_buf_nmi, NULL);
100 97
101 /* 98 /*
102 * Ensure all events in profiling have finished before 99 * Ensure all events in profiling have finished before
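
The rename above also replaces the old dummy struct with a plain array typedef because alloc_percpu() takes a type, not a size. Below is a minimal sketch of that pattern, with invented names and an arbitrary buffer size; it is not kernel code from the patch.

/* Sketch of the alloc_percpu()-takes-a-type pattern used for perf_trace_buf. */
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/errno.h>

#define MY_BUF_SIZE 8192			/* illustrative size */

typedef char my_buf_t[MY_BUF_SIZE];		/* named array type for alloc_percpu() */

static char *my_buf;

static int __init my_buf_init(void)
{
	my_buf = (char *)alloc_percpu(my_buf_t);
	return my_buf ? 0 : -ENOMEM;
}

static void __exit my_buf_exit(void)
{
	free_percpu(my_buf);
}

module_init(my_buf_init);
module_exit(my_buf_exit);
MODULE_LICENSE("GPL");
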
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 5e9ffc33f6db..1d18315dc836 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -93,9 +93,7 @@ int trace_define_common_fields(struct ftrace_event_call *call)
93} 93}
94EXPORT_SYMBOL_GPL(trace_define_common_fields); 94EXPORT_SYMBOL_GPL(trace_define_common_fields);
95 95
96#ifdef CONFIG_MODULES 96void trace_destroy_fields(struct ftrace_event_call *call)
97
98static void trace_destroy_fields(struct ftrace_event_call *call)
99{ 97{
100 struct ftrace_event_field *field, *next; 98 struct ftrace_event_field *field, *next;
101 99
@@ -107,8 +105,6 @@ static void trace_destroy_fields(struct ftrace_event_call *call)
107 } 105 }
108} 106}
109 107
110#endif /* CONFIG_MODULES */
111
112static void ftrace_event_enable_disable(struct ftrace_event_call *call, 108static void ftrace_event_enable_disable(struct ftrace_event_call *call,
113 int enable) 109 int enable)
114{ 110{
@@ -117,14 +113,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
117 if (call->enabled) { 113 if (call->enabled) {
118 call->enabled = 0; 114 call->enabled = 0;
119 tracing_stop_cmdline_record(); 115 tracing_stop_cmdline_record();
120 call->unregfunc(call->data); 116 call->unregfunc(call);
121 } 117 }
122 break; 118 break;
123 case 1: 119 case 1:
124 if (!call->enabled) { 120 if (!call->enabled) {
125 call->enabled = 1; 121 call->enabled = 1;
126 tracing_start_cmdline_record(); 122 tracing_start_cmdline_record();
127 call->regfunc(call->data); 123 call->regfunc(call);
128 } 124 }
129 break; 125 break;
130 } 126 }
@@ -507,7 +503,7 @@ extern char *__bad_type_size(void);
507#define FIELD(type, name) \ 503#define FIELD(type, name) \
508 sizeof(type) != sizeof(field.name) ? __bad_type_size() : \ 504 sizeof(type) != sizeof(field.name) ? __bad_type_size() : \
509 #type, "common_" #name, offsetof(typeof(field), name), \ 505 #type, "common_" #name, offsetof(typeof(field), name), \
510 sizeof(field.name) 506 sizeof(field.name), is_signed_type(type)
511 507
512static int trace_write_header(struct trace_seq *s) 508static int trace_write_header(struct trace_seq *s)
513{ 509{
@@ -515,17 +511,17 @@ static int trace_write_header(struct trace_seq *s)
515 511
516 /* struct trace_entry */ 512 /* struct trace_entry */
517 return trace_seq_printf(s, 513 return trace_seq_printf(s,
518 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 514 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
519 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 515 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
520 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 516 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
521 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 517 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
522 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 518 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
523 "\n", 519 "\n",
524 FIELD(unsigned short, type), 520 FIELD(unsigned short, type),
525 FIELD(unsigned char, flags), 521 FIELD(unsigned char, flags),
526 FIELD(unsigned char, preempt_count), 522 FIELD(unsigned char, preempt_count),
527 FIELD(int, pid), 523 FIELD(int, pid),
528 FIELD(int, lock_depth)); 524 FIELD(int, lock_depth));
529} 525}
530 526
531static ssize_t 527static ssize_t
@@ -937,27 +933,46 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
937 return 0; 933 return 0;
938} 934}
939 935
940#define for_each_event(event, start, end) \ 936static int __trace_add_event_call(struct ftrace_event_call *call)
941 for (event = start; \ 937{
942 (unsigned long)event < (unsigned long)end; \ 938 struct dentry *d_events;
943 event++) 939 int ret;
944 940
945#ifdef CONFIG_MODULES 941 if (!call->name)
942 return -EINVAL;
946 943
947static LIST_HEAD(ftrace_module_file_list); 944 if (call->raw_init) {
945 ret = call->raw_init(call);
946 if (ret < 0) {
947 if (ret != -ENOSYS)
948 pr_warning("Could not initialize trace "
949 "events/%s\n", call->name);
950 return ret;
951 }
952 }
948 953
949/* 954 d_events = event_trace_events_dir();
950 * Modules must own their file_operations to keep up with 955 if (!d_events)
951 * reference counting. 956 return -ENOENT;
952 */ 957
953struct ftrace_module_file_ops { 958 ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
954 struct list_head list; 959 &ftrace_enable_fops, &ftrace_event_filter_fops,
955 struct module *mod; 960 &ftrace_event_format_fops);
956 struct file_operations id; 961 if (!ret)
957 struct file_operations enable; 962 list_add(&call->list, &ftrace_events);
958 struct file_operations format; 963
959 struct file_operations filter; 964 return ret;
960}; 965}
966
967/* Add an additional event_call dynamically */
968int trace_add_event_call(struct ftrace_event_call *call)
969{
970 int ret;
971 mutex_lock(&event_mutex);
972 ret = __trace_add_event_call(call);
973 mutex_unlock(&event_mutex);
974 return ret;
975}
961 976
962static void remove_subsystem_dir(const char *name) 977static void remove_subsystem_dir(const char *name)
963{ 978{
@@ -985,6 +1000,53 @@ static void remove_subsystem_dir(const char *name)
985 } 1000 }
986} 1001}
987 1002
1003/*
 1004 * Must be called with both event_mutex and trace_event_mutex held.
1005 */
1006static void __trace_remove_event_call(struct ftrace_event_call *call)
1007{
1008 ftrace_event_enable_disable(call, 0);
1009 if (call->event)
1010 __unregister_ftrace_event(call->event);
1011 debugfs_remove_recursive(call->dir);
1012 list_del(&call->list);
1013 trace_destroy_fields(call);
1014 destroy_preds(call);
1015 remove_subsystem_dir(call->system);
1016}
1017
1018/* Remove an event_call */
1019void trace_remove_event_call(struct ftrace_event_call *call)
1020{
1021 mutex_lock(&event_mutex);
1022 down_write(&trace_event_mutex);
1023 __trace_remove_event_call(call);
1024 up_write(&trace_event_mutex);
1025 mutex_unlock(&event_mutex);
1026}
1027
1028#define for_each_event(event, start, end) \
1029 for (event = start; \
1030 (unsigned long)event < (unsigned long)end; \
1031 event++)
1032
1033#ifdef CONFIG_MODULES
1034
1035static LIST_HEAD(ftrace_module_file_list);
1036
1037/*
1038 * Modules must own their file_operations to keep up with
1039 * reference counting.
1040 */
1041struct ftrace_module_file_ops {
1042 struct list_head list;
1043 struct module *mod;
1044 struct file_operations id;
1045 struct file_operations enable;
1046 struct file_operations format;
1047 struct file_operations filter;
1048};
1049
988static struct ftrace_module_file_ops * 1050static struct ftrace_module_file_ops *
989trace_create_file_ops(struct module *mod) 1051trace_create_file_ops(struct module *mod)
990{ 1052{
@@ -1042,7 +1104,7 @@ static void trace_module_add_events(struct module *mod)
1042 if (!call->name) 1104 if (!call->name)
1043 continue; 1105 continue;
1044 if (call->raw_init) { 1106 if (call->raw_init) {
1045 ret = call->raw_init(); 1107 ret = call->raw_init(call);
1046 if (ret < 0) { 1108 if (ret < 0) {
1047 if (ret != -ENOSYS) 1109 if (ret != -ENOSYS)
1048 pr_warning("Could not initialize trace " 1110 pr_warning("Could not initialize trace "
@@ -1060,10 +1122,11 @@ static void trace_module_add_events(struct module *mod)
1060 return; 1122 return;
1061 } 1123 }
1062 call->mod = mod; 1124 call->mod = mod;
1063 list_add(&call->list, &ftrace_events); 1125 ret = event_create_dir(call, d_events,
1064 event_create_dir(call, d_events, 1126 &file_ops->id, &file_ops->enable,
1065 &file_ops->id, &file_ops->enable, 1127 &file_ops->filter, &file_ops->format);
1066 &file_ops->filter, &file_ops->format); 1128 if (!ret)
1129 list_add(&call->list, &ftrace_events);
1067 } 1130 }
1068} 1131}
1069 1132
@@ -1077,14 +1140,7 @@ static void trace_module_remove_events(struct module *mod)
1077 list_for_each_entry_safe(call, p, &ftrace_events, list) { 1140 list_for_each_entry_safe(call, p, &ftrace_events, list) {
1078 if (call->mod == mod) { 1141 if (call->mod == mod) {
1079 found = true; 1142 found = true;
1080 ftrace_event_enable_disable(call, 0); 1143 __trace_remove_event_call(call);
1081 if (call->event)
1082 __unregister_ftrace_event(call->event);
1083 debugfs_remove_recursive(call->dir);
1084 list_del(&call->list);
1085 trace_destroy_fields(call);
1086 destroy_preds(call);
1087 remove_subsystem_dir(call->system);
1088 } 1144 }
1089 } 1145 }
1090 1146
@@ -1202,7 +1258,7 @@ static __init int event_trace_init(void)
1202 if (!call->name) 1258 if (!call->name)
1203 continue; 1259 continue;
1204 if (call->raw_init) { 1260 if (call->raw_init) {
1205 ret = call->raw_init(); 1261 ret = call->raw_init(call);
1206 if (ret < 0) { 1262 if (ret < 0) {
1207 if (ret != -ENOSYS) 1263 if (ret != -ENOSYS)
1208 pr_warning("Could not initialize trace " 1264 pr_warning("Could not initialize trace "
@@ -1210,10 +1266,12 @@ static __init int event_trace_init(void)
1210 continue; 1266 continue;
1211 } 1267 }
1212 } 1268 }
1213 list_add(&call->list, &ftrace_events); 1269 ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
1214 event_create_dir(call, d_events, &ftrace_event_id_fops, 1270 &ftrace_enable_fops,
1215 &ftrace_enable_fops, &ftrace_event_filter_fops, 1271 &ftrace_event_filter_fops,
1216 &ftrace_event_format_fops); 1272 &ftrace_event_format_fops);
1273 if (!ret)
1274 list_add(&call->list, &ftrace_events);
1217 } 1275 }
1218 1276
1219 while (true) { 1277 while (true) {
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 92672016da28..50504cb228de 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -21,6 +21,7 @@
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/ctype.h> 22#include <linux/ctype.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/perf_event.h>
24 25
25#include "trace.h" 26#include "trace.h"
26#include "trace_output.h" 27#include "trace_output.h"
@@ -29,6 +30,7 @@ enum filter_op_ids
29{ 30{
30 OP_OR, 31 OP_OR,
31 OP_AND, 32 OP_AND,
33 OP_GLOB,
32 OP_NE, 34 OP_NE,
33 OP_EQ, 35 OP_EQ,
34 OP_LT, 36 OP_LT,
@@ -46,16 +48,17 @@ struct filter_op {
46}; 48};
47 49
48static struct filter_op filter_ops[] = { 50static struct filter_op filter_ops[] = {
49 { OP_OR, "||", 1 }, 51 { OP_OR, "||", 1 },
50 { OP_AND, "&&", 2 }, 52 { OP_AND, "&&", 2 },
51 { OP_NE, "!=", 4 }, 53 { OP_GLOB, "~", 4 },
52 { OP_EQ, "==", 4 }, 54 { OP_NE, "!=", 4 },
53 { OP_LT, "<", 5 }, 55 { OP_EQ, "==", 4 },
54 { OP_LE, "<=", 5 }, 56 { OP_LT, "<", 5 },
55 { OP_GT, ">", 5 }, 57 { OP_LE, "<=", 5 },
56 { OP_GE, ">=", 5 }, 58 { OP_GT, ">", 5 },
57 { OP_NONE, "OP_NONE", 0 }, 59 { OP_GE, ">=", 5 },
58 { OP_OPEN_PAREN, "(", 0 }, 60 { OP_NONE, "OP_NONE", 0 },
61 { OP_OPEN_PAREN, "(", 0 },
59}; 62};
60 63
61enum { 64enum {
@@ -329,22 +332,18 @@ enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not)
329 return type; 332 return type;
330} 333}
331 334
332static int filter_build_regex(struct filter_pred *pred) 335static void filter_build_regex(struct filter_pred *pred)
333{ 336{
334 struct regex *r = &pred->regex; 337 struct regex *r = &pred->regex;
335 char *search, *dup; 338 char *search;
336 enum regex_type type; 339 enum regex_type type = MATCH_FULL;
337 int not; 340 int not = 0;
338 341
339 type = filter_parse_regex(r->pattern, r->len, &search, &not); 342 if (pred->op == OP_GLOB) {
340 dup = kstrdup(search, GFP_KERNEL); 343 type = filter_parse_regex(r->pattern, r->len, &search, &not);
341 if (!dup) 344 r->len = strlen(search);
342 return -ENOMEM; 345 memmove(r->pattern, search, r->len+1);
343 346 }
344 strcpy(r->pattern, dup);
345 kfree(dup);
346
347 r->len = strlen(r->pattern);
348 347
349 switch (type) { 348 switch (type) {
350 case MATCH_FULL: 349 case MATCH_FULL:
@@ -362,14 +361,11 @@ static int filter_build_regex(struct filter_pred *pred)
362 } 361 }
363 362
364 pred->not ^= not; 363 pred->not ^= not;
365
366 return 0;
367} 364}
368 365
369/* return 1 if event matches, 0 otherwise (discard) */ 366/* return 1 if event matches, 0 otherwise (discard) */
370int filter_match_preds(struct ftrace_event_call *call, void *rec) 367int filter_match_preds(struct event_filter *filter, void *rec)
371{ 368{
372 struct event_filter *filter = call->filter;
373 int match, top = 0, val1 = 0, val2 = 0; 369 int match, top = 0, val1 = 0, val2 = 0;
374 int stack[MAX_FILTER_PRED]; 370 int stack[MAX_FILTER_PRED];
375 struct filter_pred *pred; 371 struct filter_pred *pred;
@@ -542,9 +538,8 @@ static void filter_disable_preds(struct ftrace_event_call *call)
542 filter->preds[i]->fn = filter_pred_none; 538 filter->preds[i]->fn = filter_pred_none;
543} 539}
544 540
545void destroy_preds(struct ftrace_event_call *call) 541static void __free_preds(struct event_filter *filter)
546{ 542{
547 struct event_filter *filter = call->filter;
548 int i; 543 int i;
549 544
550 if (!filter) 545 if (!filter)
@@ -557,21 +552,24 @@ void destroy_preds(struct ftrace_event_call *call)
557 kfree(filter->preds); 552 kfree(filter->preds);
558 kfree(filter->filter_string); 553 kfree(filter->filter_string);
559 kfree(filter); 554 kfree(filter);
555}
556
557void destroy_preds(struct ftrace_event_call *call)
558{
559 __free_preds(call->filter);
560 call->filter = NULL; 560 call->filter = NULL;
561 call->filter_active = 0;
561} 562}
562 563
563static int init_preds(struct ftrace_event_call *call) 564static struct event_filter *__alloc_preds(void)
564{ 565{
565 struct event_filter *filter; 566 struct event_filter *filter;
566 struct filter_pred *pred; 567 struct filter_pred *pred;
567 int i; 568 int i;
568 569
569 if (call->filter) 570 filter = kzalloc(sizeof(*filter), GFP_KERNEL);
570 return 0; 571 if (!filter)
571 572 return ERR_PTR(-ENOMEM);
572 filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
573 if (!call->filter)
574 return -ENOMEM;
575 573
576 filter->n_preds = 0; 574 filter->n_preds = 0;
577 575
@@ -587,12 +585,24 @@ static int init_preds(struct ftrace_event_call *call)
587 filter->preds[i] = pred; 585 filter->preds[i] = pred;
588 } 586 }
589 587
590 return 0; 588 return filter;
591 589
592oom: 590oom:
593 destroy_preds(call); 591 __free_preds(filter);
592 return ERR_PTR(-ENOMEM);
593}
594
595static int init_preds(struct ftrace_event_call *call)
596{
597 if (call->filter)
598 return 0;
594 599
595 return -ENOMEM; 600 call->filter_active = 0;
601 call->filter = __alloc_preds();
602 if (IS_ERR(call->filter))
603 return PTR_ERR(call->filter);
604
605 return 0;
596} 606}
597 607
598static int init_subsystem_preds(struct event_subsystem *system) 608static int init_subsystem_preds(struct event_subsystem *system)
@@ -615,14 +625,7 @@ static int init_subsystem_preds(struct event_subsystem *system)
615 return 0; 625 return 0;
616} 626}
617 627
618enum { 628static void filter_free_subsystem_preds(struct event_subsystem *system)
619 FILTER_DISABLE_ALL,
620 FILTER_INIT_NO_RESET,
621 FILTER_SKIP_NO_RESET,
622};
623
624static void filter_free_subsystem_preds(struct event_subsystem *system,
625 int flag)
626{ 629{
627 struct ftrace_event_call *call; 630 struct ftrace_event_call *call;
628 631
@@ -633,14 +636,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system,
633 if (strcmp(call->system, system->name) != 0) 636 if (strcmp(call->system, system->name) != 0)
634 continue; 637 continue;
635 638
636 if (flag == FILTER_INIT_NO_RESET) {
637 call->filter->no_reset = false;
638 continue;
639 }
640
641 if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset)
642 continue;
643
644 filter_disable_preds(call); 639 filter_disable_preds(call);
645 remove_filter_string(call->filter); 640 remove_filter_string(call->filter);
646 } 641 }
@@ -648,10 +643,10 @@ static void filter_free_subsystem_preds(struct event_subsystem *system,
648 643
649static int filter_add_pred_fn(struct filter_parse_state *ps, 644static int filter_add_pred_fn(struct filter_parse_state *ps,
650 struct ftrace_event_call *call, 645 struct ftrace_event_call *call,
646 struct event_filter *filter,
651 struct filter_pred *pred, 647 struct filter_pred *pred,
652 filter_pred_fn_t fn) 648 filter_pred_fn_t fn)
653{ 649{
654 struct event_filter *filter = call->filter;
655 int idx, err; 650 int idx, err;
656 651
657 if (filter->n_preds == MAX_FILTER_PRED) { 652 if (filter->n_preds == MAX_FILTER_PRED) {
@@ -666,7 +661,6 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
666 return err; 661 return err;
667 662
668 filter->n_preds++; 663 filter->n_preds++;
669 call->filter_active = 1;
670 664
671 return 0; 665 return 0;
672} 666}
@@ -691,7 +685,10 @@ static bool is_string_field(struct ftrace_event_field *field)
691 685
692static int is_legal_op(struct ftrace_event_field *field, int op) 686static int is_legal_op(struct ftrace_event_field *field, int op)
693{ 687{
694 if (is_string_field(field) && (op != OP_EQ && op != OP_NE)) 688 if (is_string_field(field) &&
689 (op != OP_EQ && op != OP_NE && op != OP_GLOB))
690 return 0;
691 if (!is_string_field(field) && op == OP_GLOB)
695 return 0; 692 return 0;
696 693
697 return 1; 694 return 1;
@@ -742,6 +739,7 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
742 739
743static int filter_add_pred(struct filter_parse_state *ps, 740static int filter_add_pred(struct filter_parse_state *ps,
744 struct ftrace_event_call *call, 741 struct ftrace_event_call *call,
742 struct event_filter *filter,
745 struct filter_pred *pred, 743 struct filter_pred *pred,
746 bool dry_run) 744 bool dry_run)
747{ 745{
@@ -776,15 +774,13 @@ static int filter_add_pred(struct filter_parse_state *ps,
776 } 774 }
777 775
778 if (is_string_field(field)) { 776 if (is_string_field(field)) {
779 ret = filter_build_regex(pred); 777 filter_build_regex(pred);
780 if (ret)
781 return ret;
782 778
783 if (field->filter_type == FILTER_STATIC_STRING) { 779 if (field->filter_type == FILTER_STATIC_STRING) {
784 fn = filter_pred_string; 780 fn = filter_pred_string;
785 pred->regex.field_len = field->size; 781 pred->regex.field_len = field->size;
786 } else if (field->filter_type == FILTER_DYN_STRING) 782 } else if (field->filter_type == FILTER_DYN_STRING)
787 fn = filter_pred_strloc; 783 fn = filter_pred_strloc;
788 else { 784 else {
789 fn = filter_pred_pchar; 785 fn = filter_pred_pchar;
790 pred->regex.field_len = strlen(pred->regex.pattern); 786 pred->regex.field_len = strlen(pred->regex.pattern);
@@ -813,45 +809,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
813 809
814add_pred_fn: 810add_pred_fn:
815 if (!dry_run) 811 if (!dry_run)
816 return filter_add_pred_fn(ps, call, pred, fn); 812 return filter_add_pred_fn(ps, call, filter, pred, fn);
817 return 0;
818}
819
820static int filter_add_subsystem_pred(struct filter_parse_state *ps,
821 struct event_subsystem *system,
822 struct filter_pred *pred,
823 char *filter_string,
824 bool dry_run)
825{
826 struct ftrace_event_call *call;
827 int err = 0;
828 bool fail = true;
829
830 list_for_each_entry(call, &ftrace_events, list) {
831
832 if (!call->define_fields)
833 continue;
834
835 if (strcmp(call->system, system->name))
836 continue;
837
838 if (call->filter->no_reset)
839 continue;
840
841 err = filter_add_pred(ps, call, pred, dry_run);
842 if (err)
843 call->filter->no_reset = true;
844 else
845 fail = false;
846
847 if (!dry_run)
848 replace_filter_string(call->filter, filter_string);
849 }
850
851 if (fail) {
852 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
853 return err;
854 }
855 return 0; 813 return 0;
856} 814}
857 815
@@ -1209,8 +1167,8 @@ static int check_preds(struct filter_parse_state *ps)
1209 return 0; 1167 return 0;
1210} 1168}
1211 1169
1212static int replace_preds(struct event_subsystem *system, 1170static int replace_preds(struct ftrace_event_call *call,
1213 struct ftrace_event_call *call, 1171 struct event_filter *filter,
1214 struct filter_parse_state *ps, 1172 struct filter_parse_state *ps,
1215 char *filter_string, 1173 char *filter_string,
1216 bool dry_run) 1174 bool dry_run)
@@ -1257,11 +1215,7 @@ static int replace_preds(struct event_subsystem *system,
1257add_pred: 1215add_pred:
1258 if (!pred) 1216 if (!pred)
1259 return -ENOMEM; 1217 return -ENOMEM;
1260 if (call) 1218 err = filter_add_pred(ps, call, filter, pred, dry_run);
1261 err = filter_add_pred(ps, call, pred, false);
1262 else
1263 err = filter_add_subsystem_pred(ps, system, pred,
1264 filter_string, dry_run);
1265 filter_free_pred(pred); 1219 filter_free_pred(pred);
1266 if (err) 1220 if (err)
1267 return err; 1221 return err;
@@ -1272,10 +1226,50 @@ add_pred:
1272 return 0; 1226 return 0;
1273} 1227}
1274 1228
1275int apply_event_filter(struct ftrace_event_call *call, char *filter_string) 1229static int replace_system_preds(struct event_subsystem *system,
1230 struct filter_parse_state *ps,
1231 char *filter_string)
1276{ 1232{
1233 struct ftrace_event_call *call;
1234 bool fail = true;
1277 int err; 1235 int err;
1278 1236
1237 list_for_each_entry(call, &ftrace_events, list) {
1238 struct event_filter *filter = call->filter;
1239
1240 if (!call->define_fields)
1241 continue;
1242
1243 if (strcmp(call->system, system->name) != 0)
1244 continue;
1245
1246 /* try to see if the filter can be applied */
1247 err = replace_preds(call, filter, ps, filter_string, true);
1248 if (err)
1249 continue;
1250
1251 /* really apply the filter */
1252 filter_disable_preds(call);
1253 err = replace_preds(call, filter, ps, filter_string, false);
1254 if (err)
1255 filter_disable_preds(call);
1256 else {
1257 call->filter_active = 1;
1258 replace_filter_string(filter, filter_string);
1259 }
1260 fail = false;
1261 }
1262
1263 if (fail) {
1264 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1265 return -EINVAL;
1266 }
1267 return 0;
1268}
1269
1270int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1271{
1272 int err;
1279 struct filter_parse_state *ps; 1273 struct filter_parse_state *ps;
1280 1274
1281 mutex_lock(&event_mutex); 1275 mutex_lock(&event_mutex);
@@ -1287,8 +1281,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1287 if (!strcmp(strstrip(filter_string), "0")) { 1281 if (!strcmp(strstrip(filter_string), "0")) {
1288 filter_disable_preds(call); 1282 filter_disable_preds(call);
1289 remove_filter_string(call->filter); 1283 remove_filter_string(call->filter);
1290 mutex_unlock(&event_mutex); 1284 goto out_unlock;
1291 return 0;
1292 } 1285 }
1293 1286
1294 err = -ENOMEM; 1287 err = -ENOMEM;
@@ -1306,10 +1299,11 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1306 goto out; 1299 goto out;
1307 } 1300 }
1308 1301
1309 err = replace_preds(NULL, call, ps, filter_string, false); 1302 err = replace_preds(call, call->filter, ps, filter_string, false);
1310 if (err) 1303 if (err)
1311 append_filter_err(ps, call->filter); 1304 append_filter_err(ps, call->filter);
1312 1305 else
1306 call->filter_active = 1;
1313out: 1307out:
1314 filter_opstack_clear(ps); 1308 filter_opstack_clear(ps);
1315 postfix_clear(ps); 1309 postfix_clear(ps);
@@ -1324,7 +1318,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1324 char *filter_string) 1318 char *filter_string)
1325{ 1319{
1326 int err; 1320 int err;
1327
1328 struct filter_parse_state *ps; 1321 struct filter_parse_state *ps;
1329 1322
1330 mutex_lock(&event_mutex); 1323 mutex_lock(&event_mutex);
@@ -1334,10 +1327,9 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1334 goto out_unlock; 1327 goto out_unlock;
1335 1328
1336 if (!strcmp(strstrip(filter_string), "0")) { 1329 if (!strcmp(strstrip(filter_string), "0")) {
1337 filter_free_subsystem_preds(system, FILTER_DISABLE_ALL); 1330 filter_free_subsystem_preds(system);
1338 remove_filter_string(system->filter); 1331 remove_filter_string(system->filter);
1339 mutex_unlock(&event_mutex); 1332 goto out_unlock;
1340 return 0;
1341 } 1333 }
1342 1334
1343 err = -ENOMEM; 1335 err = -ENOMEM;
@@ -1354,31 +1346,87 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1354 goto out; 1346 goto out;
1355 } 1347 }
1356 1348
1357 filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET); 1349 err = replace_system_preds(system, ps, filter_string);
1358 1350 if (err)
1359 /* try to see the filter can be applied to which events */
1360 err = replace_preds(system, NULL, ps, filter_string, true);
1361 if (err) {
1362 append_filter_err(ps, system->filter); 1351 append_filter_err(ps, system->filter);
1363 goto out; 1352
1353out:
1354 filter_opstack_clear(ps);
1355 postfix_clear(ps);
1356 kfree(ps);
1357out_unlock:
1358 mutex_unlock(&event_mutex);
1359
1360 return err;
1361}
1362
1363#ifdef CONFIG_EVENT_PROFILE
1364
1365void ftrace_profile_free_filter(struct perf_event *event)
1366{
1367 struct event_filter *filter = event->filter;
1368
1369 event->filter = NULL;
1370 __free_preds(filter);
1371}
1372
1373int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1374 char *filter_str)
1375{
1376 int err;
1377 struct event_filter *filter;
1378 struct filter_parse_state *ps;
1379 struct ftrace_event_call *call = NULL;
1380
1381 mutex_lock(&event_mutex);
1382
1383 list_for_each_entry(call, &ftrace_events, list) {
1384 if (call->id == event_id)
1385 break;
1364 } 1386 }
1365 1387
1366 filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET); 1388 err = -EINVAL;
1389 if (!call)
1390 goto out_unlock;
1367 1391
1368 /* really apply the filter to the events */ 1392 err = -EEXIST;
1369 err = replace_preds(system, NULL, ps, filter_string, false); 1393 if (event->filter)
1370 if (err) { 1394 goto out_unlock;
1371 append_filter_err(ps, system->filter); 1395
1372 filter_free_subsystem_preds(system, 2); 1396 filter = __alloc_preds();
1397 if (IS_ERR(filter)) {
1398 err = PTR_ERR(filter);
1399 goto out_unlock;
1373 } 1400 }
1374 1401
1375out: 1402 err = -ENOMEM;
1403 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1404 if (!ps)
1405 goto free_preds;
1406
1407 parse_init(ps, filter_ops, filter_str);
1408 err = filter_parse(ps);
1409 if (err)
1410 goto free_ps;
1411
1412 err = replace_preds(call, filter, ps, filter_str, false);
1413 if (!err)
1414 event->filter = filter;
1415
1416free_ps:
1376 filter_opstack_clear(ps); 1417 filter_opstack_clear(ps);
1377 postfix_clear(ps); 1418 postfix_clear(ps);
1378 kfree(ps); 1419 kfree(ps);
1420
1421free_preds:
1422 if (err)
1423 __free_preds(filter);
1424
1379out_unlock: 1425out_unlock:
1380 mutex_unlock(&event_mutex); 1426 mutex_unlock(&event_mutex);
1381 1427
1382 return err; 1428 return err;
1383} 1429}
1384 1430
1431#endif /* CONFIG_EVENT_PROFILE */
1432
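
The filter changes above do two user-visible things: string fields can now be matched with the '~' glob operator, and a perf tracepoint counter can carry its own filter via ftrace_profile_set_filter(), which is reached from user space through the PERF_EVENT_IOC_SET_FILTER ioctl handled by perf_event_set_filter() earlier in this diff. The user-space program below is an illustration only: it assumes an exported linux/perf_event.h and takes the event id file and the filter expression (which may use '~') on the command line rather than hard-coding field names.

/*
 * Count a tracepoint for the current task with a filter attached, e.g.:
 *   ./tp_filter /sys/kernel/debug/tracing/events/sched/sched_switch/id \
 *               'next_comm ~ "kthread*"'
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static long sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
				int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(int argc, char **argv)
{
	struct perf_event_attr attr;
	unsigned long long id, count;
	FILE *f;
	int fd;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <event-id-file> <filter>\n", argv[0]);
		return 1;
	}

	f = fopen(argv[1], "r");
	if (!f || fscanf(f, "%llu", &id) != 1) {
		perror(argv[1]);
		return 1;
	}
	fclose(f);

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_TRACEPOINT;
	attr.size = sizeof(attr);
	attr.config = id;				/* tracepoint id from debugfs */

	fd = sys_perf_event_open(&attr, 0, -1, -1, 0);	/* this task, any cpu */
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	if (ioctl(fd, PERF_EVENT_IOC_SET_FILTER, argv[2]) < 0) {
		perror("PERF_EVENT_IOC_SET_FILTER");
		return 1;
	}

	sleep(1);					/* let some events accumulate */
	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("filtered count: %llu\n", count);
	close(fd);
	return 0;
}
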
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index c74848ddb85a..dff8c84ddf17 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -66,44 +66,47 @@ static void __always_unused ____ftrace_check_##name(void) \
66#undef __field 66#undef __field
67#define __field(type, item) \ 67#define __field(type, item) \
68 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ 68 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
69 "offset:%zu;\tsize:%zu;\n", \ 69 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
70 offsetof(typeof(field), item), \ 70 offsetof(typeof(field), item), \
71 sizeof(field.item)); \ 71 sizeof(field.item), is_signed_type(type)); \
72 if (!ret) \ 72 if (!ret) \
73 return 0; 73 return 0;
74 74
75#undef __field_desc 75#undef __field_desc
76#define __field_desc(type, container, item) \ 76#define __field_desc(type, container, item) \
77 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ 77 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
78 "offset:%zu;\tsize:%zu;\n", \ 78 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
79 offsetof(typeof(field), container.item), \ 79 offsetof(typeof(field), container.item), \
80 sizeof(field.container.item)); \ 80 sizeof(field.container.item), \
81 is_signed_type(type)); \
81 if (!ret) \ 82 if (!ret) \
82 return 0; 83 return 0;
83 84
84#undef __array 85#undef __array
85#define __array(type, item, len) \ 86#define __array(type, item, len) \
86 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ 87 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
87 "offset:%zu;\tsize:%zu;\n", \ 88 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
88 offsetof(typeof(field), item), \ 89 offsetof(typeof(field), item), \
89 sizeof(field.item)); \ 90 sizeof(field.item), is_signed_type(type)); \
90 if (!ret) \ 91 if (!ret) \
91 return 0; 92 return 0;
92 93
93#undef __array_desc 94#undef __array_desc
94#define __array_desc(type, container, item, len) \ 95#define __array_desc(type, container, item, len) \
95 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ 96 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
96 "offset:%zu;\tsize:%zu;\n", \ 97 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
97 offsetof(typeof(field), container.item), \ 98 offsetof(typeof(field), container.item), \
98 sizeof(field.container.item)); \ 99 sizeof(field.container.item), \
100 is_signed_type(type)); \
99 if (!ret) \ 101 if (!ret) \
100 return 0; 102 return 0;
101 103
102#undef __dynamic_array 104#undef __dynamic_array
103#define __dynamic_array(type, item) \ 105#define __dynamic_array(type, item) \
104 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ 106 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
105 "offset:%zu;\tsize:0;\n", \ 107 "offset:%zu;\tsize:0;\tsigned:%u;\n", \
106 offsetof(typeof(field), item)); \ 108 offsetof(typeof(field), item), \
109 is_signed_type(type)); \
107 if (!ret) \ 110 if (!ret) \
108 return 0; 111 return 0;
109 112
@@ -131,7 +134,6 @@ ftrace_format_##name(struct ftrace_event_call *unused, \
131 134
132#include "trace_entries.h" 135#include "trace_entries.h"
133 136
134
135#undef __field 137#undef __field
136#define __field(type, item) \ 138#define __field(type, item) \
137 ret = trace_define_field(event_call, #type, #item, \ 139 ret = trace_define_field(event_call, #type, #item, \
@@ -193,6 +195,11 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
193 195
194#include "trace_entries.h" 196#include "trace_entries.h"
195 197
198static int ftrace_raw_init_event(struct ftrace_event_call *call)
199{
200 INIT_LIST_HEAD(&call->fields);
201 return 0;
202}
196 203
197#undef __field 204#undef __field
198#define __field(type, item) 205#define __field(type, item)
@@ -211,7 +218,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
211 218
212#undef FTRACE_ENTRY 219#undef FTRACE_ENTRY
213#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ 220#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \
214static int ftrace_raw_init_event_##call(void); \
215 \ 221 \
216struct ftrace_event_call __used \ 222struct ftrace_event_call __used \
217__attribute__((__aligned__(4))) \ 223__attribute__((__aligned__(4))) \
@@ -219,14 +225,9 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
219 .name = #call, \ 225 .name = #call, \
220 .id = type, \ 226 .id = type, \
221 .system = __stringify(TRACE_SYSTEM), \ 227 .system = __stringify(TRACE_SYSTEM), \
222 .raw_init = ftrace_raw_init_event_##call, \ 228 .raw_init = ftrace_raw_init_event, \
223 .show_format = ftrace_format_##call, \ 229 .show_format = ftrace_format_##call, \
224 .define_fields = ftrace_define_fields_##call, \ 230 .define_fields = ftrace_define_fields_##call, \
225}; \ 231}; \
226static int ftrace_raw_init_event_##call(void) \
227{ \
228 INIT_LIST_HEAD(&event_##call.fields); \
229 return 0; \
230} \
231 232
232#include "trace_entries.h" 233#include "trace_entries.h"
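The hunks above add a "signed:%u" attribute to every field line emitted by the ftrace_format_##name() helpers, so readers of the event format files can tell whether a field needs sign extension. As an illustration only (the real definition lives in the tracing headers, not in this diff), is_signed_type() can be expected to reduce to a compile-time comparison along these lines, and a format line then carries the extra attribute as sketched; the offsets and sizes shown are made up for the example.

/* Sketch: a plausible is_signed_type() implementation, not taken from this diff. */
#define is_signed_type(type)	(((type)(-1)) < 0)

/*
 * Example of the resulting format output (illustrative offsets/sizes):
 *   field:unsigned long ip;	offset:8;	size:8;	signed:0;
 *   field:int nargs;		offset:16;	size:4;	signed:1;
 */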
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
new file mode 100644
index 000000000000..aff5f80b59b8
--- /dev/null
+++ b/kernel/trace/trace_kprobe.c
@@ -0,0 +1,1523 @@
1/*
2 * Kprobes-based tracing events
3 *
4 * Created by Masami Hiramatsu <mhiramat@redhat.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/module.h>
21#include <linux/uaccess.h>
22#include <linux/kprobes.h>
23#include <linux/seq_file.h>
24#include <linux/slab.h>
25#include <linux/smp.h>
26#include <linux/debugfs.h>
27#include <linux/types.h>
28#include <linux/string.h>
29#include <linux/ctype.h>
30#include <linux/ptrace.h>
31#include <linux/perf_event.h>
32
33#include "trace.h"
34#include "trace_output.h"
35
36#define MAX_TRACE_ARGS 128
37#define MAX_ARGSTR_LEN 63
38#define MAX_EVENT_NAME_LEN 64
39#define KPROBE_EVENT_SYSTEM "kprobes"
40
41/* Reserved field names */
42#define FIELD_STRING_IP "__probe_ip"
43#define FIELD_STRING_NARGS "__probe_nargs"
44#define FIELD_STRING_RETIP "__probe_ret_ip"
45#define FIELD_STRING_FUNC "__probe_func"
46
47const char *reserved_field_names[] = {
48 "common_type",
49 "common_flags",
50 "common_preempt_count",
51 "common_pid",
52 "common_tgid",
53 "common_lock_depth",
54 FIELD_STRING_IP,
55 FIELD_STRING_NARGS,
56 FIELD_STRING_RETIP,
57 FIELD_STRING_FUNC,
58};
59
60struct fetch_func {
61 unsigned long (*func)(struct pt_regs *, void *);
62 void *data;
63};
64
65static __kprobes unsigned long call_fetch(struct fetch_func *f,
66 struct pt_regs *regs)
67{
68 return f->func(regs, f->data);
69}
70
71/* fetch handlers */
72static __kprobes unsigned long fetch_register(struct pt_regs *regs,
73 void *offset)
74{
75 return regs_get_register(regs, (unsigned int)((unsigned long)offset));
76}
77
78static __kprobes unsigned long fetch_stack(struct pt_regs *regs,
79 void *num)
80{
81 return regs_get_kernel_stack_nth(regs,
82 (unsigned int)((unsigned long)num));
83}
84
85static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr)
86{
87 unsigned long retval;
88
89 if (probe_kernel_address(addr, retval))
90 return 0;
91 return retval;
92}
93
94static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num)
95{
96 return regs_get_argument_nth(regs, (unsigned int)((unsigned long)num));
97}
98
99static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs,
100 void *dummy)
101{
102 return regs_return_value(regs);
103}
104
105static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs,
106 void *dummy)
107{
108 return kernel_stack_pointer(regs);
109}
110
111/* Memory fetching by symbol */
112struct symbol_cache {
113 char *symbol;
114 long offset;
115 unsigned long addr;
116};
117
118static unsigned long update_symbol_cache(struct symbol_cache *sc)
119{
120 sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
121 if (sc->addr)
122 sc->addr += sc->offset;
123 return sc->addr;
124}
125
126static void free_symbol_cache(struct symbol_cache *sc)
127{
128 kfree(sc->symbol);
129 kfree(sc);
130}
131
132static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
133{
134 struct symbol_cache *sc;
135
136 if (!sym || strlen(sym) == 0)
137 return NULL;
138 sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
139 if (!sc)
140 return NULL;
141
142 sc->symbol = kstrdup(sym, GFP_KERNEL);
143 if (!sc->symbol) {
144 kfree(sc);
145 return NULL;
146 }
147 sc->offset = offset;
148
149 update_symbol_cache(sc);
150 return sc;
151}
152
153static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data)
154{
155 struct symbol_cache *sc = data;
156
157 if (sc->addr)
158 return fetch_memory(regs, (void *)sc->addr);
159 else
160 return 0;
161}
162
163/* Special indirect memory access interface */
164struct indirect_fetch_data {
165 struct fetch_func orig;
166 long offset;
167};
168
169static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data)
170{
171 struct indirect_fetch_data *ind = data;
172 unsigned long addr;
173
174 addr = call_fetch(&ind->orig, regs);
175 if (addr) {
176 addr += ind->offset;
177 return fetch_memory(regs, (void *)addr);
178 } else
179 return 0;
180}
181
182static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data)
183{
184 if (data->orig.func == fetch_indirect)
185 free_indirect_fetch_data(data->orig.data);
186 else if (data->orig.func == fetch_symbol)
187 free_symbol_cache(data->orig.data);
188 kfree(data);
189}
190
191/**
192 * Kprobe event core functions
193 */
194
195struct probe_arg {
196 struct fetch_func fetch;
197 const char *name;
198};
199
200/* Flags for trace_probe */
201#define TP_FLAG_TRACE 1
202#define TP_FLAG_PROFILE 2
203
204struct trace_probe {
205 struct list_head list;
206 struct kretprobe rp; /* Use rp.kp for kprobe use */
207 unsigned long nhit;
208 unsigned int flags; /* For TP_FLAG_* */
209 const char *symbol; /* symbol name */
210 struct ftrace_event_call call;
211 struct trace_event event;
212 unsigned int nr_args;
213 struct probe_arg args[];
214};
215
216#define SIZEOF_TRACE_PROBE(n) \
217 (offsetof(struct trace_probe, args) + \
218 (sizeof(struct probe_arg) * (n)))
219
220static __kprobes int probe_is_return(struct trace_probe *tp)
221{
222 return tp->rp.handler != NULL;
223}
224
225static __kprobes const char *probe_symbol(struct trace_probe *tp)
226{
227 return tp->symbol ? tp->symbol : "unknown";
228}
229
230static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
231{
232 int ret = -EINVAL;
233
234 if (ff->func == fetch_argument)
235 ret = snprintf(buf, n, "$arg%lu", (unsigned long)ff->data);
236 else if (ff->func == fetch_register) {
237 const char *name;
238 name = regs_query_register_name((unsigned int)((long)ff->data));
239 ret = snprintf(buf, n, "%%%s", name);
240 } else if (ff->func == fetch_stack)
241 ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data);
242 else if (ff->func == fetch_memory)
243 ret = snprintf(buf, n, "@0x%p", ff->data);
244 else if (ff->func == fetch_symbol) {
245 struct symbol_cache *sc = ff->data;
246 if (sc->offset)
247 ret = snprintf(buf, n, "@%s%+ld", sc->symbol,
248 sc->offset);
249 else
250 ret = snprintf(buf, n, "@%s", sc->symbol);
251 } else if (ff->func == fetch_retvalue)
252 ret = snprintf(buf, n, "$retval");
253 else if (ff->func == fetch_stack_address)
254 ret = snprintf(buf, n, "$stack");
255 else if (ff->func == fetch_indirect) {
256 struct indirect_fetch_data *id = ff->data;
257 size_t l = 0;
258 ret = snprintf(buf, n, "%+ld(", id->offset);
259 if (ret >= n)
260 goto end;
261 l += ret;
262 ret = probe_arg_string(buf + l, n - l, &id->orig);
263 if (ret < 0)
264 goto end;
265 l += ret;
266 ret = snprintf(buf + l, n - l, ")");
267 ret += l;
268 }
269end:
270 if (ret >= n)
271 return -ENOSPC;
272 return ret;
273}
274
275static int register_probe_event(struct trace_probe *tp);
276static void unregister_probe_event(struct trace_probe *tp);
277
278static DEFINE_MUTEX(probe_lock);
279static LIST_HEAD(probe_list);
280
281static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
282static int kretprobe_dispatcher(struct kretprobe_instance *ri,
283 struct pt_regs *regs);
284
285/*
286 * Allocate new trace_probe and initialize it (including kprobes).
287 */
288static struct trace_probe *alloc_trace_probe(const char *group,
289 const char *event,
290 void *addr,
291 const char *symbol,
292 unsigned long offs,
293 int nargs, int is_return)
294{
295 struct trace_probe *tp;
296
297 tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL);
298 if (!tp)
299 return ERR_PTR(-ENOMEM);
300
301 if (symbol) {
302 tp->symbol = kstrdup(symbol, GFP_KERNEL);
303 if (!tp->symbol)
304 goto error;
305 tp->rp.kp.symbol_name = tp->symbol;
306 tp->rp.kp.offset = offs;
307 } else
308 tp->rp.kp.addr = addr;
309
310 if (is_return)
311 tp->rp.handler = kretprobe_dispatcher;
312 else
313 tp->rp.kp.pre_handler = kprobe_dispatcher;
314
315 if (!event)
316 goto error;
317 tp->call.name = kstrdup(event, GFP_KERNEL);
318 if (!tp->call.name)
319 goto error;
320
321 if (!group)
322 goto error;
323 tp->call.system = kstrdup(group, GFP_KERNEL);
324 if (!tp->call.system)
325 goto error;
326
327 INIT_LIST_HEAD(&tp->list);
328 return tp;
329error:
330 kfree(tp->call.name);
331 kfree(tp->symbol);
332 kfree(tp);
333 return ERR_PTR(-ENOMEM);
334}
335
336static void free_probe_arg(struct probe_arg *arg)
337{
338 if (arg->fetch.func == fetch_symbol)
339 free_symbol_cache(arg->fetch.data);
340 else if (arg->fetch.func == fetch_indirect)
341 free_indirect_fetch_data(arg->fetch.data);
342 kfree(arg->name);
343}
344
345static void free_trace_probe(struct trace_probe *tp)
346{
347 int i;
348
349 for (i = 0; i < tp->nr_args; i++)
350 free_probe_arg(&tp->args[i]);
351
352 kfree(tp->call.system);
353 kfree(tp->call.name);
354 kfree(tp->symbol);
355 kfree(tp);
356}
357
358static struct trace_probe *find_probe_event(const char *event,
359 const char *group)
360{
361 struct trace_probe *tp;
362
363 list_for_each_entry(tp, &probe_list, list)
364 if (strcmp(tp->call.name, event) == 0 &&
365 strcmp(tp->call.system, group) == 0)
366 return tp;
367 return NULL;
368}
369
370/* Unregister a trace_probe and probe_event: call with locking probe_lock */
371static void unregister_trace_probe(struct trace_probe *tp)
372{
373 if (probe_is_return(tp))
374 unregister_kretprobe(&tp->rp);
375 else
376 unregister_kprobe(&tp->rp.kp);
377 list_del(&tp->list);
378 unregister_probe_event(tp);
379}
380
381/* Register a trace_probe and probe_event */
382static int register_trace_probe(struct trace_probe *tp)
383{
384 struct trace_probe *old_tp;
385 int ret;
386
387 mutex_lock(&probe_lock);
388
389 /* register as an event */
390 old_tp = find_probe_event(tp->call.name, tp->call.system);
391 if (old_tp) {
392 /* delete old event */
393 unregister_trace_probe(old_tp);
394 free_trace_probe(old_tp);
395 }
396 ret = register_probe_event(tp);
397 if (ret) {
398 pr_warning("Faild to register probe event(%d)\n", ret);
399 goto end;
400 }
401
402 tp->rp.kp.flags |= KPROBE_FLAG_DISABLED;
403 if (probe_is_return(tp))
404 ret = register_kretprobe(&tp->rp);
405 else
406 ret = register_kprobe(&tp->rp.kp);
407
408 if (ret) {
409 pr_warning("Could not insert probe(%d)\n", ret);
410 if (ret == -EILSEQ) {
411 pr_warning("Probing address(0x%p) is not an "
412 "instruction boundary.\n",
413 tp->rp.kp.addr);
414 ret = -EINVAL;
415 }
416 unregister_probe_event(tp);
417 } else
418 list_add_tail(&tp->list, &probe_list);
419end:
420 mutex_unlock(&probe_lock);
421 return ret;
422}
423
424/* Split symbol and offset. */
425static int split_symbol_offset(char *symbol, unsigned long *offset)
426{
427 char *tmp;
428 int ret;
429
430 if (!offset)
431 return -EINVAL;
432
433 tmp = strchr(symbol, '+');
434 if (tmp) {
435 /* skip sign because strict_strtol doesn't accept '+' */
436 ret = strict_strtoul(tmp + 1, 0, offset);
437 if (ret)
438 return ret;
439 *tmp = '\0';
440 } else
441 *offset = 0;
442 return 0;
443}
444
445#define PARAM_MAX_ARGS 16
446#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
447
448static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
449{
450 int ret = 0;
451 unsigned long param;
452
453 if (strcmp(arg, "retval") == 0) {
454 if (is_return) {
455 ff->func = fetch_retvalue;
456 ff->data = NULL;
457 } else
458 ret = -EINVAL;
459 } else if (strncmp(arg, "stack", 5) == 0) {
460 if (arg[5] == '\0') {
461 ff->func = fetch_stack_address;
462 ff->data = NULL;
463 } else if (isdigit(arg[5])) {
464 ret = strict_strtoul(arg + 5, 10, &param);
465 if (ret || param > PARAM_MAX_STACK)
466 ret = -EINVAL;
467 else {
468 ff->func = fetch_stack;
469 ff->data = (void *)param;
470 }
471 } else
472 ret = -EINVAL;
473 } else if (strncmp(arg, "arg", 3) == 0 && isdigit(arg[3])) {
474 ret = strict_strtoul(arg + 3, 10, &param);
475 if (ret || param > PARAM_MAX_ARGS)
476 ret = -EINVAL;
477 else {
478 ff->func = fetch_argument;
479 ff->data = (void *)param;
480 }
481 } else
482 ret = -EINVAL;
483 return ret;
484}
485
486/* Recursive argument parser */
487static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
488{
489 int ret = 0;
490 unsigned long param;
491 long offset;
492 char *tmp;
493
494 switch (arg[0]) {
495 case '$':
496 ret = parse_probe_vars(arg + 1, ff, is_return);
497 break;
498 case '%': /* named register */
499 ret = regs_query_register_offset(arg + 1);
500 if (ret >= 0) {
501 ff->func = fetch_register;
502 ff->data = (void *)(unsigned long)ret;
503 ret = 0;
504 }
505 break;
506 case '@': /* memory or symbol */
507 if (isdigit(arg[1])) {
508 ret = strict_strtoul(arg + 1, 0, &param);
509 if (ret)
510 break;
511 ff->func = fetch_memory;
512 ff->data = (void *)param;
513 } else {
514 ret = split_symbol_offset(arg + 1, &offset);
515 if (ret)
516 break;
517 ff->data = alloc_symbol_cache(arg + 1, offset);
518 if (ff->data)
519 ff->func = fetch_symbol;
520 else
521 ret = -EINVAL;
522 }
523 break;
524 case '+': /* indirect memory */
525 case '-':
526 tmp = strchr(arg, '(');
527 if (!tmp) {
528 ret = -EINVAL;
529 break;
530 }
531 *tmp = '\0';
532 ret = strict_strtol(arg + 1, 0, &offset);
533 if (ret)
534 break;
535 if (arg[0] == '-')
536 offset = -offset;
537 arg = tmp + 1;
538 tmp = strrchr(arg, ')');
539 if (tmp) {
540 struct indirect_fetch_data *id;
541 *tmp = '\0';
542 id = kzalloc(sizeof(struct indirect_fetch_data),
543 GFP_KERNEL);
544 if (!id)
545 return -ENOMEM;
546 id->offset = offset;
547 ret = __parse_probe_arg(arg, &id->orig, is_return);
548 if (ret)
549 kfree(id);
550 else {
551 ff->func = fetch_indirect;
552 ff->data = (void *)id;
553 }
554 } else
555 ret = -EINVAL;
556 break;
557 default:
558 /* TODO: support custom handler */
559 ret = -EINVAL;
560 }
561 return ret;
562}
563
564/* String length checking wrapper */
565static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
566{
567 if (strlen(arg) > MAX_ARGSTR_LEN) {
568 pr_info("Argument is too long.: %s\n", arg);
569 return -ENOSPC;
570 }
571 return __parse_probe_arg(arg, ff, is_return);
572}
573
574/* Return 1 if name is reserved or already used by another argument */
575static int conflict_field_name(const char *name,
576 struct probe_arg *args, int narg)
577{
578 int i;
579 for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++)
580 if (strcmp(reserved_field_names[i], name) == 0)
581 return 1;
582 for (i = 0; i < narg; i++)
583 if (strcmp(args[i].name, name) == 0)
584 return 1;
585 return 0;
586}
587
588static int create_trace_probe(int argc, char **argv)
589{
590 /*
591 * Argument syntax:
592 * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS]
593 * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS]
594 * Fetch args:
595 * $argN : fetch Nth of function argument. (N:0-)
596 * $retval : fetch return value
597 * $stack : fetch stack address
598 * $stackN : fetch Nth of stack (N:0-)
599 * @ADDR : fetch memory at ADDR (ADDR should be in kernel)
600 * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
601 * %REG : fetch register REG
602 * Indirect memory fetch:
603 * +|-offs(ARG) : fetch memory at ARG +|- offs address.
604 * Alias name of args:
605 * NAME=FETCHARG : set NAME as alias of FETCHARG.
606 */
607 struct trace_probe *tp;
608 int i, ret = 0;
609 int is_return = 0;
610 char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL;
611 unsigned long offset = 0;
612 void *addr = NULL;
613 char buf[MAX_EVENT_NAME_LEN];
614
615 if (argc < 2) {
616 pr_info("Probe point is not specified.\n");
617 return -EINVAL;
618 }
619
620 if (argv[0][0] == 'p')
621 is_return = 0;
622 else if (argv[0][0] == 'r')
623 is_return = 1;
624 else {
625 pr_info("Probe definition must be started with 'p' or 'r'.\n");
626 return -EINVAL;
627 }
628
629 if (argv[0][1] == ':') {
630 event = &argv[0][2];
631 if (strchr(event, '/')) {
632 group = event;
633 event = strchr(group, '/') + 1;
634 event[-1] = '\0';
635 if (strlen(group) == 0) {
636 pr_info("Group name is not specifiled\n");
637 return -EINVAL;
638 }
639 }
640 if (strlen(event) == 0) {
641 pr_info("Event name is not specifiled\n");
642 return -EINVAL;
643 }
644 }
645
646 if (isdigit(argv[1][0])) {
647 if (is_return) {
648 pr_info("Return probe point must be a symbol.\n");
649 return -EINVAL;
650 }
651 /* an address specified */
652 ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr);
653 if (ret) {
654 pr_info("Failed to parse address.\n");
655 return ret;
656 }
657 } else {
658 /* a symbol specified */
659 symbol = argv[1];
660 /* TODO: support .init module functions */
661 ret = split_symbol_offset(symbol, &offset);
662 if (ret) {
663 pr_info("Failed to parse symbol.\n");
664 return ret;
665 }
666 if (offset && is_return) {
667 pr_info("Return probe must be used without offset.\n");
668 return -EINVAL;
669 }
670 }
671 argc -= 2; argv += 2;
672
673 /* setup a probe */
674 if (!group)
675 group = KPROBE_EVENT_SYSTEM;
676 if (!event) {
677 /* Make a new event name */
678 if (symbol)
679 snprintf(buf, MAX_EVENT_NAME_LEN, "%c@%s%+ld",
680 is_return ? 'r' : 'p', symbol, offset);
681 else
682 snprintf(buf, MAX_EVENT_NAME_LEN, "%c@0x%p",
683 is_return ? 'r' : 'p', addr);
684 event = buf;
685 }
686 tp = alloc_trace_probe(group, event, addr, symbol, offset, argc,
687 is_return);
688 if (IS_ERR(tp)) {
689 pr_info("Failed to allocate trace_probe.(%d)\n",
690 (int)PTR_ERR(tp));
691 return PTR_ERR(tp);
692 }
693
694 /* parse arguments */
695 ret = 0;
696 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
697 /* Parse argument name */
698 arg = strchr(argv[i], '=');
699 if (arg)
700 *arg++ = '\0';
701 else
702 arg = argv[i];
703
704 if (conflict_field_name(argv[i], tp->args, i)) {
705 pr_info("Argument%d name '%s' conflicts with "
706 "another field.\n", i, argv[i]);
707 ret = -EINVAL;
708 goto error;
709 }
710
711 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
712 if (!tp->args[i].name) {
713 pr_info("Failed to allocate argument%d name '%s'.\n",
714 i, argv[i]);
715 ret = -ENOMEM;
716 goto error;
717 }
718
719 /* Parse fetch argument */
720 ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return);
721 if (ret) {
722 pr_info("Parse error at argument%d. (%d)\n", i, ret);
723 kfree(tp->args[i].name);
724 goto error;
725 }
726
727 tp->nr_args++;
728 }
729
730 ret = register_trace_probe(tp);
731 if (ret)
732 goto error;
733 return 0;
734
735error:
736 free_trace_probe(tp);
737 return ret;
738}
739
740static void cleanup_all_probes(void)
741{
742 struct trace_probe *tp;
743
744 mutex_lock(&probe_lock);
745 /* TODO: Use batch unregistration */
746 while (!list_empty(&probe_list)) {
747 tp = list_entry(probe_list.next, struct trace_probe, list);
748 unregister_trace_probe(tp);
749 free_trace_probe(tp);
750 }
751 mutex_unlock(&probe_lock);
752}
753
754
755/* Probes listing interfaces */
756static void *probes_seq_start(struct seq_file *m, loff_t *pos)
757{
758 mutex_lock(&probe_lock);
759 return seq_list_start(&probe_list, *pos);
760}
761
762static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos)
763{
764 return seq_list_next(v, &probe_list, pos);
765}
766
767static void probes_seq_stop(struct seq_file *m, void *v)
768{
769 mutex_unlock(&probe_lock);
770}
771
772static int probes_seq_show(struct seq_file *m, void *v)
773{
774 struct trace_probe *tp = v;
775 int i, ret;
776 char buf[MAX_ARGSTR_LEN + 1];
777
778 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
779 seq_printf(m, ":%s/%s", tp->call.system, tp->call.name);
780
781 if (!tp->symbol)
782 seq_printf(m, " 0x%p", tp->rp.kp.addr);
783 else if (tp->rp.kp.offset)
784 seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset);
785 else
786 seq_printf(m, " %s", probe_symbol(tp));
787
788 for (i = 0; i < tp->nr_args; i++) {
789 ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch);
790 if (ret < 0) {
791 pr_warning("Argument%d decoding error(%d).\n", i, ret);
792 return ret;
793 }
794 seq_printf(m, " %s=%s", tp->args[i].name, buf);
795 }
796 seq_printf(m, "\n");
797 return 0;
798}
799
800static const struct seq_operations probes_seq_op = {
801 .start = probes_seq_start,
802 .next = probes_seq_next,
803 .stop = probes_seq_stop,
804 .show = probes_seq_show
805};
806
807static int probes_open(struct inode *inode, struct file *file)
808{
809 if ((file->f_mode & FMODE_WRITE) &&
810 (file->f_flags & O_TRUNC))
811 cleanup_all_probes();
812
813 return seq_open(file, &probes_seq_op);
814}
815
816static int command_trace_probe(const char *buf)
817{
818 char **argv;
819 int argc = 0, ret = 0;
820
821 argv = argv_split(GFP_KERNEL, buf, &argc);
822 if (!argv)
823 return -ENOMEM;
824
825 if (argc)
826 ret = create_trace_probe(argc, argv);
827
828 argv_free(argv);
829 return ret;
830}
831
832#define WRITE_BUFSIZE 128
833
834static ssize_t probes_write(struct file *file, const char __user *buffer,
835 size_t count, loff_t *ppos)
836{
837 char *kbuf, *tmp;
838 int ret;
839 size_t done;
840 size_t size;
841
842 kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
843 if (!kbuf)
844 return -ENOMEM;
845
846 ret = done = 0;
847 while (done < count) {
848 size = count - done;
849 if (size >= WRITE_BUFSIZE)
850 size = WRITE_BUFSIZE - 1;
851 if (copy_from_user(kbuf, buffer + done, size)) {
852 ret = -EFAULT;
853 goto out;
854 }
855 kbuf[size] = '\0';
856 tmp = strchr(kbuf, '\n');
857 if (tmp) {
858 *tmp = '\0';
859 size = tmp - kbuf + 1;
860 } else if (done + size < count) {
861 pr_warning("Line length is too long: "
862 "Should be less than %d.", WRITE_BUFSIZE);
863 ret = -EINVAL;
864 goto out;
865 }
866 done += size;
867 /* Remove comments */
868 tmp = strchr(kbuf, '#');
869 if (tmp)
870 *tmp = '\0';
871
872 ret = command_trace_probe(kbuf);
873 if (ret)
874 goto out;
875 }
876 ret = done;
877out:
878 kfree(kbuf);
879 return ret;
880}
881
882static const struct file_operations kprobe_events_ops = {
883 .owner = THIS_MODULE,
884 .open = probes_open,
885 .read = seq_read,
886 .llseek = seq_lseek,
887 .release = seq_release,
888 .write = probes_write,
889};
890
891/* Probes profiling interfaces */
892static int probes_profile_seq_show(struct seq_file *m, void *v)
893{
894 struct trace_probe *tp = v;
895
896 seq_printf(m, " %-44s %15lu %15lu\n", tp->call.name, tp->nhit,
897 tp->rp.kp.nmissed);
898
899 return 0;
900}
901
902static const struct seq_operations profile_seq_op = {
903 .start = probes_seq_start,
904 .next = probes_seq_next,
905 .stop = probes_seq_stop,
906 .show = probes_profile_seq_show
907};
908
909static int profile_open(struct inode *inode, struct file *file)
910{
911 return seq_open(file, &profile_seq_op);
912}
913
914static const struct file_operations kprobe_profile_ops = {
915 .owner = THIS_MODULE,
916 .open = profile_open,
917 .read = seq_read,
918 .llseek = seq_lseek,
919 .release = seq_release,
920};
921
922/* Kprobe handler */
923static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
924{
925 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
926 struct kprobe_trace_entry *entry;
927 struct ring_buffer_event *event;
928 struct ring_buffer *buffer;
929 int size, i, pc;
930 unsigned long irq_flags;
931 struct ftrace_event_call *call = &tp->call;
932
933 tp->nhit++;
934
935 local_save_flags(irq_flags);
936 pc = preempt_count();
937
938 size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
939
940 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
941 irq_flags, pc);
942 if (!event)
943 return 0;
944
945 entry = ring_buffer_event_data(event);
946 entry->nargs = tp->nr_args;
947 entry->ip = (unsigned long)kp->addr;
948 for (i = 0; i < tp->nr_args; i++)
949 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
950
951 if (!filter_current_check_discard(buffer, call, entry, event))
952 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
953 return 0;
954}
955
956/* Kretprobe handler */
957static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
958 struct pt_regs *regs)
959{
960 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
961 struct kretprobe_trace_entry *entry;
962 struct ring_buffer_event *event;
963 struct ring_buffer *buffer;
964 int size, i, pc;
965 unsigned long irq_flags;
966 struct ftrace_event_call *call = &tp->call;
967
968 local_save_flags(irq_flags);
969 pc = preempt_count();
970
971 size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
972
973 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
974 irq_flags, pc);
975 if (!event)
976 return 0;
977
978 entry = ring_buffer_event_data(event);
979 entry->nargs = tp->nr_args;
980 entry->func = (unsigned long)tp->rp.kp.addr;
981 entry->ret_ip = (unsigned long)ri->ret_addr;
982 for (i = 0; i < tp->nr_args; i++)
983 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
984
985 if (!filter_current_check_discard(buffer, call, entry, event))
986 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
987
988 return 0;
989}
990
991/* Event entry printers */
992enum print_line_t
993print_kprobe_event(struct trace_iterator *iter, int flags)
994{
995 struct kprobe_trace_entry *field;
996 struct trace_seq *s = &iter->seq;
997 struct trace_event *event;
998 struct trace_probe *tp;
999 int i;
1000
1001 field = (struct kprobe_trace_entry *)iter->ent;
1002 event = ftrace_find_event(field->ent.type);
1003 tp = container_of(event, struct trace_probe, event);
1004
1005 if (!trace_seq_printf(s, "%s: (", tp->call.name))
1006 goto partial;
1007
1008 if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
1009 goto partial;
1010
1011 if (!trace_seq_puts(s, ")"))
1012 goto partial;
1013
1014 for (i = 0; i < field->nargs; i++)
1015 if (!trace_seq_printf(s, " %s=%lx",
1016 tp->args[i].name, field->args[i]))
1017 goto partial;
1018
1019 if (!trace_seq_puts(s, "\n"))
1020 goto partial;
1021
1022 return TRACE_TYPE_HANDLED;
1023partial:
1024 return TRACE_TYPE_PARTIAL_LINE;
1025}
1026
1027enum print_line_t
1028print_kretprobe_event(struct trace_iterator *iter, int flags)
1029{
1030 struct kretprobe_trace_entry *field;
1031 struct trace_seq *s = &iter->seq;
1032 struct trace_event *event;
1033 struct trace_probe *tp;
1034 int i;
1035
1036 field = (struct kretprobe_trace_entry *)iter->ent;
1037 event = ftrace_find_event(field->ent.type);
1038 tp = container_of(event, struct trace_probe, event);
1039
1040 if (!trace_seq_printf(s, "%s: (", tp->call.name))
1041 goto partial;
1042
1043 if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
1044 goto partial;
1045
1046 if (!trace_seq_puts(s, " <- "))
1047 goto partial;
1048
1049 if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET))
1050 goto partial;
1051
1052 if (!trace_seq_puts(s, ")"))
1053 goto partial;
1054
1055 for (i = 0; i < field->nargs; i++)
1056 if (!trace_seq_printf(s, " %s=%lx",
1057 tp->args[i].name, field->args[i]))
1058 goto partial;
1059
1060 if (!trace_seq_puts(s, "\n"))
1061 goto partial;
1062
1063 return TRACE_TYPE_HANDLED;
1064partial:
1065 return TRACE_TYPE_PARTIAL_LINE;
1066}
1067
1068static int probe_event_enable(struct ftrace_event_call *call)
1069{
1070 struct trace_probe *tp = (struct trace_probe *)call->data;
1071
1072 tp->flags |= TP_FLAG_TRACE;
1073 if (probe_is_return(tp))
1074 return enable_kretprobe(&tp->rp);
1075 else
1076 return enable_kprobe(&tp->rp.kp);
1077}
1078
1079static void probe_event_disable(struct ftrace_event_call *call)
1080{
1081 struct trace_probe *tp = (struct trace_probe *)call->data;
1082
1083 tp->flags &= ~TP_FLAG_TRACE;
1084 if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) {
1085 if (probe_is_return(tp))
1086 disable_kretprobe(&tp->rp);
1087 else
1088 disable_kprobe(&tp->rp.kp);
1089 }
1090}
1091
1092static int probe_event_raw_init(struct ftrace_event_call *event_call)
1093{
1094 INIT_LIST_HEAD(&event_call->fields);
1095
1096 return 0;
1097}
1098
1099#undef DEFINE_FIELD
1100#define DEFINE_FIELD(type, item, name, is_signed) \
1101 do { \
1102 ret = trace_define_field(event_call, #type, name, \
1103 offsetof(typeof(field), item), \
1104 sizeof(field.item), is_signed, \
1105 FILTER_OTHER); \
1106 if (ret) \
1107 return ret; \
1108 } while (0)
1109
1110static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
1111{
1112 int ret, i;
1113 struct kprobe_trace_entry field;
1114 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1115
1116 ret = trace_define_common_fields(event_call);
 1117 if (ret)
1118 return ret;
1119
1120 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
1121 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1122 /* Set argument names as fields */
1123 for (i = 0; i < tp->nr_args; i++)
1124 DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0);
1125 return 0;
1126}
1127
1128static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1129{
1130 int ret, i;
1131 struct kretprobe_trace_entry field;
1132 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1133
1134 ret = trace_define_common_fields(event_call);
 1135 if (ret)
1136 return ret;
1137
1138 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
1139 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
1140 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1141 /* Set argument names as fields */
1142 for (i = 0; i < tp->nr_args; i++)
1143 DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0);
1144 return 0;
1145}
1146
1147static int __probe_event_show_format(struct trace_seq *s,
1148 struct trace_probe *tp, const char *fmt,
1149 const char *arg)
1150{
1151 int i;
1152
1153 /* Show format */
1154 if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt))
1155 return 0;
1156
1157 for (i = 0; i < tp->nr_args; i++)
1158 if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name))
1159 return 0;
1160
1161 if (!trace_seq_printf(s, "\", %s", arg))
1162 return 0;
1163
1164 for (i = 0; i < tp->nr_args; i++)
1165 if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name))
1166 return 0;
1167
1168 return trace_seq_puts(s, "\n");
1169}
1170
1171#undef SHOW_FIELD
1172#define SHOW_FIELD(type, item, name) \
1173 do { \
1174 ret = trace_seq_printf(s, "\tfield: " #type " %s;\t" \
1175 "offset:%u;\tsize:%u;\n", name, \
1176 (unsigned int)offsetof(typeof(field), item),\
1177 (unsigned int)sizeof(type)); \
1178 if (!ret) \
1179 return 0; \
1180 } while (0)
1181
1182static int kprobe_event_show_format(struct ftrace_event_call *call,
1183 struct trace_seq *s)
1184{
1185 struct kprobe_trace_entry field __attribute__((unused));
1186 int ret, i;
1187 struct trace_probe *tp = (struct trace_probe *)call->data;
1188
1189 SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP);
1190 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS);
1191
1192 /* Show fields */
1193 for (i = 0; i < tp->nr_args; i++)
1194 SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
1195 trace_seq_puts(s, "\n");
1196
1197 return __probe_event_show_format(s, tp, "(%lx)",
1198 "REC->" FIELD_STRING_IP);
1199}
1200
1201static int kretprobe_event_show_format(struct ftrace_event_call *call,
1202 struct trace_seq *s)
1203{
1204 struct kretprobe_trace_entry field __attribute__((unused));
1205 int ret, i;
1206 struct trace_probe *tp = (struct trace_probe *)call->data;
1207
1208 SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC);
1209 SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP);
1210 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS);
1211
1212 /* Show fields */
1213 for (i = 0; i < tp->nr_args; i++)
1214 SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
1215 trace_seq_puts(s, "\n");
1216
1217 return __probe_event_show_format(s, tp, "(%lx <- %lx)",
1218 "REC->" FIELD_STRING_FUNC
1219 ", REC->" FIELD_STRING_RETIP);
1220}
1221
1222#ifdef CONFIG_EVENT_PROFILE
1223
1224/* Kprobe profile handler */
1225static __kprobes int kprobe_profile_func(struct kprobe *kp,
1226 struct pt_regs *regs)
1227{
1228 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1229 struct ftrace_event_call *call = &tp->call;
1230 struct kprobe_trace_entry *entry;
1231 struct trace_entry *ent;
1232 int size, __size, i, pc, __cpu;
1233 unsigned long irq_flags;
1234 char *trace_buf;
1235 char *raw_data;
1236 int rctx;
1237
1238 pc = preempt_count();
1239 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
1240 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1241 size -= sizeof(u32);
1242 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
1243 "profile buffer not large enough"))
1244 return 0;
1245
1246 /*
1247 * Protect the non nmi buffer
1248 * This also protects the rcu read side
1249 */
1250 local_irq_save(irq_flags);
1251
1252 rctx = perf_swevent_get_recursion_context();
1253 if (rctx < 0)
1254 goto end_recursion;
1255
1256 __cpu = smp_processor_id();
1257
1258 if (in_nmi())
1259 trace_buf = rcu_dereference(perf_trace_buf_nmi);
1260 else
1261 trace_buf = rcu_dereference(perf_trace_buf);
1262
1263 if (!trace_buf)
1264 goto end;
1265
1266 raw_data = per_cpu_ptr(trace_buf, __cpu);
1267
1268 /* Zero dead bytes from alignment to avoid buffer leak to userspace */
1269 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
1270 entry = (struct kprobe_trace_entry *)raw_data;
1271 ent = &entry->ent;
1272
1273 tracing_generic_entry_update(ent, irq_flags, pc);
1274 ent->type = call->id;
1275 entry->nargs = tp->nr_args;
1276 entry->ip = (unsigned long)kp->addr;
1277 for (i = 0; i < tp->nr_args; i++)
1278 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1279 perf_tp_event(call->id, entry->ip, 1, entry, size);
1280
1281end:
1282 perf_swevent_put_recursion_context(rctx);
1283end_recursion:
1284 local_irq_restore(irq_flags);
1285
1286 return 0;
1287}
1288
1289/* Kretprobe profile handler */
1290static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
1291 struct pt_regs *regs)
1292{
1293 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1294 struct ftrace_event_call *call = &tp->call;
1295 struct kretprobe_trace_entry *entry;
1296 struct trace_entry *ent;
1297 int size, __size, i, pc, __cpu;
1298 unsigned long irq_flags;
1299 char *trace_buf;
1300 char *raw_data;
1301 int rctx;
1302
1303 pc = preempt_count();
1304 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
1305 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1306 size -= sizeof(u32);
1307 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
1308 "profile buffer not large enough"))
1309 return 0;
1310
1311 /*
1312 * Protect the non nmi buffer
1313 * This also protects the rcu read side
1314 */
1315 local_irq_save(irq_flags);
1316
1317 rctx = perf_swevent_get_recursion_context();
1318 if (rctx < 0)
1319 goto end_recursion;
1320
1321 __cpu = smp_processor_id();
1322
1323 if (in_nmi())
1324 trace_buf = rcu_dereference(perf_trace_buf_nmi);
1325 else
1326 trace_buf = rcu_dereference(perf_trace_buf);
1327
1328 if (!trace_buf)
1329 goto end;
1330
1331 raw_data = per_cpu_ptr(trace_buf, __cpu);
1332
1333 /* Zero dead bytes from alignment to avoid buffer leak to userspace */
1334 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
1335 entry = (struct kretprobe_trace_entry *)raw_data;
1336 ent = &entry->ent;
1337
1338 tracing_generic_entry_update(ent, irq_flags, pc);
1339 ent->type = call->id;
1340 entry->nargs = tp->nr_args;
1341 entry->func = (unsigned long)tp->rp.kp.addr;
1342 entry->ret_ip = (unsigned long)ri->ret_addr;
1343 for (i = 0; i < tp->nr_args; i++)
1344 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1345 perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
1346
1347end:
1348 perf_swevent_put_recursion_context(rctx);
1349end_recursion:
1350 local_irq_restore(irq_flags);
1351
1352 return 0;
1353}
1354
1355static int probe_profile_enable(struct ftrace_event_call *call)
1356{
1357 struct trace_probe *tp = (struct trace_probe *)call->data;
1358
1359 tp->flags |= TP_FLAG_PROFILE;
1360
1361 if (probe_is_return(tp))
1362 return enable_kretprobe(&tp->rp);
1363 else
1364 return enable_kprobe(&tp->rp.kp);
1365}
1366
1367static void probe_profile_disable(struct ftrace_event_call *call)
1368{
1369 struct trace_probe *tp = (struct trace_probe *)call->data;
1370
1371 tp->flags &= ~TP_FLAG_PROFILE;
1372
1373 if (!(tp->flags & TP_FLAG_TRACE)) {
1374 if (probe_is_return(tp))
1375 disable_kretprobe(&tp->rp);
1376 else
1377 disable_kprobe(&tp->rp.kp);
1378 }
1379}
1380#endif /* CONFIG_EVENT_PROFILE */
1381
1382
1383static __kprobes
1384int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1385{
1386 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1387
1388 if (tp->flags & TP_FLAG_TRACE)
1389 kprobe_trace_func(kp, regs);
1390#ifdef CONFIG_EVENT_PROFILE
1391 if (tp->flags & TP_FLAG_PROFILE)
1392 kprobe_profile_func(kp, regs);
1393#endif /* CONFIG_EVENT_PROFILE */
 1394 return 0; /* We don't tweak the kernel, so just return 0 */
1395}
1396
1397static __kprobes
1398int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1399{
1400 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1401
1402 if (tp->flags & TP_FLAG_TRACE)
1403 kretprobe_trace_func(ri, regs);
1404#ifdef CONFIG_EVENT_PROFILE
1405 if (tp->flags & TP_FLAG_PROFILE)
1406 kretprobe_profile_func(ri, regs);
1407#endif /* CONFIG_EVENT_PROFILE */
 1408 return 0; /* We don't tweak the kernel, so just return 0 */
1409}
1410
1411static int register_probe_event(struct trace_probe *tp)
1412{
1413 struct ftrace_event_call *call = &tp->call;
1414 int ret;
1415
1416 /* Initialize ftrace_event_call */
1417 if (probe_is_return(tp)) {
1418 tp->event.trace = print_kretprobe_event;
1419 call->raw_init = probe_event_raw_init;
1420 call->show_format = kretprobe_event_show_format;
1421 call->define_fields = kretprobe_event_define_fields;
1422 } else {
1423 tp->event.trace = print_kprobe_event;
1424 call->raw_init = probe_event_raw_init;
1425 call->show_format = kprobe_event_show_format;
1426 call->define_fields = kprobe_event_define_fields;
1427 }
1428 call->event = &tp->event;
1429 call->id = register_ftrace_event(&tp->event);
1430 if (!call->id)
1431 return -ENODEV;
1432 call->enabled = 0;
1433 call->regfunc = probe_event_enable;
1434 call->unregfunc = probe_event_disable;
1435
1436#ifdef CONFIG_EVENT_PROFILE
1437 atomic_set(&call->profile_count, -1);
1438 call->profile_enable = probe_profile_enable;
1439 call->profile_disable = probe_profile_disable;
1440#endif
1441 call->data = tp;
1442 ret = trace_add_event_call(call);
1443 if (ret) {
1444 pr_info("Failed to register kprobe event: %s\n", call->name);
1445 unregister_ftrace_event(&tp->event);
1446 }
1447 return ret;
1448}
1449
1450static void unregister_probe_event(struct trace_probe *tp)
1451{
1452 /* tp->event is unregistered in trace_remove_event_call() */
1453 trace_remove_event_call(&tp->call);
1454}
1455
 1456/* Make a debugfs interface for controlling probe points */
1457static __init int init_kprobe_trace(void)
1458{
1459 struct dentry *d_tracer;
1460 struct dentry *entry;
1461
1462 d_tracer = tracing_init_dentry();
1463 if (!d_tracer)
1464 return 0;
1465
1466 entry = debugfs_create_file("kprobe_events", 0644, d_tracer,
1467 NULL, &kprobe_events_ops);
1468
1469 /* Event list interface */
1470 if (!entry)
1471 pr_warning("Could not create debugfs "
1472 "'kprobe_events' entry\n");
1473
1474 /* Profile interface */
1475 entry = debugfs_create_file("kprobe_profile", 0444, d_tracer,
1476 NULL, &kprobe_profile_ops);
1477
1478 if (!entry)
1479 pr_warning("Could not create debugfs "
1480 "'kprobe_profile' entry\n");
1481 return 0;
1482}
1483fs_initcall(init_kprobe_trace);
1484
1485
1486#ifdef CONFIG_FTRACE_STARTUP_TEST
1487
1488static int kprobe_trace_selftest_target(int a1, int a2, int a3,
1489 int a4, int a5, int a6)
1490{
1491 return a1 + a2 + a3 + a4 + a5 + a6;
1492}
1493
1494static __init int kprobe_trace_self_tests_init(void)
1495{
1496 int ret;
1497 int (*target)(int, int, int, int, int, int);
1498
1499 target = kprobe_trace_selftest_target;
1500
1501 pr_info("Testing kprobe tracing: ");
1502
1503 ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target "
1504 "$arg1 $arg2 $arg3 $arg4 $stack $stack0");
1505 if (WARN_ON_ONCE(ret))
1506 pr_warning("error enabling function entry\n");
1507
1508 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
1509 "$retval");
1510 if (WARN_ON_ONCE(ret))
1511 pr_warning("error enabling function return\n");
1512
1513 ret = target(1, 2, 3, 4, 5, 6);
1514
1515 cleanup_all_probes();
1516
1517 pr_cont("OK\n");
1518 return 0;
1519}
1520
1521late_initcall(kprobe_trace_self_tests_init);
1522
1523#endif
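The new trace_kprobe.c above is controlled through the kprobe_events debugfs file created in init_kprobe_trace(), using the definition syntax documented in create_trace_probe(). Below is a minimal user-space sketch of driving that interface; the event name "myprobe", the probed symbol do_sys_open, the fetch arguments and the /sys/kernel/debug mount point are illustrative assumptions, not part of the patch.

/* Sketch only: define and enable a kprobe event through the files added above. */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    /* Syntax per create_trace_probe(): p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] */
    const char *def = "p:myprobe do_sys_open $arg0 $arg1\n";
    int fd;

    fd = open("/sys/kernel/debug/tracing/kprobe_events", O_WRONLY | O_APPEND);
    if (fd < 0)
        return 1;
    if (write(fd, def, strlen(def)) < 0) {
        close(fd);
        return 1;
    }
    close(fd);

    /* The event then shows up under events/kprobes/ and is enabled as usual. */
    fd = open("/sys/kernel/debug/tracing/events/kprobes/myprobe/enable", O_WRONLY);
    if (fd >= 0) {
        write(fd, "1", 1);
        close(fd);
    }
    return 0;
}

Each hit is rendered by print_kprobe_event() as a line of the form "myprobe: (do_sys_open+0x0/0x...) arg0=... arg1=...", and opening kprobe_events with O_TRUNC (a plain shell '>' redirect) removes all existing probes, as probes_open() shows.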
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
new file mode 100644
index 000000000000..ddfa0fd43bc0
--- /dev/null
+++ b/kernel/trace/trace_ksym.c
@@ -0,0 +1,550 @@
1/*
2 * trace_ksym.c - Kernel Symbol Tracer
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2009
19 */
20
21#include <linux/kallsyms.h>
22#include <linux/uaccess.h>
23#include <linux/debugfs.h>
24#include <linux/ftrace.h>
25#include <linux/module.h>
26#include <linux/fs.h>
27
28#include "trace_output.h"
29#include "trace_stat.h"
30#include "trace.h"
31
32#include <linux/hw_breakpoint.h>
33#include <asm/hw_breakpoint.h>
34
35/*
 36 * For now, restrict the number of symbols traced simultaneously to the
 37 * number of available hardware breakpoint registers.
38 */
39#define KSYM_TRACER_MAX HBP_NUM
40
41#define KSYM_TRACER_OP_LEN 3 /* rw- */
42
43struct trace_ksym {
44 struct perf_event **ksym_hbp;
45 struct perf_event_attr attr;
46#ifdef CONFIG_PROFILE_KSYM_TRACER
47 unsigned long counter;
48#endif
49 struct hlist_node ksym_hlist;
50};
51
52static struct trace_array *ksym_trace_array;
53
54static unsigned int ksym_filter_entry_count;
55static unsigned int ksym_tracing_enabled;
56
57static HLIST_HEAD(ksym_filter_head);
58
59static DEFINE_MUTEX(ksym_tracer_mutex);
60
61#ifdef CONFIG_PROFILE_KSYM_TRACER
62
63#define MAX_UL_INT 0xffffffff
64
65void ksym_collect_stats(unsigned long hbp_hit_addr)
66{
67 struct hlist_node *node;
68 struct trace_ksym *entry;
69
70 rcu_read_lock();
71 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
72 if ((entry->attr.bp_addr == hbp_hit_addr) &&
73 (entry->counter <= MAX_UL_INT)) {
74 entry->counter++;
75 break;
76 }
77 }
78 rcu_read_unlock();
79}
80#endif /* CONFIG_PROFILE_KSYM_TRACER */
81
82void ksym_hbp_handler(struct perf_event *hbp, void *data)
83{
84 struct ring_buffer_event *event;
85 struct ksym_trace_entry *entry;
86 struct pt_regs *regs = data;
87 struct ring_buffer *buffer;
88 int pc;
89
90 if (!ksym_tracing_enabled)
91 return;
92
93 buffer = ksym_trace_array->buffer;
94
95 pc = preempt_count();
96
97 event = trace_buffer_lock_reserve(buffer, TRACE_KSYM,
98 sizeof(*entry), 0, pc);
99 if (!event)
100 return;
101
102 entry = ring_buffer_event_data(event);
103 entry->ip = instruction_pointer(regs);
104 entry->type = hw_breakpoint_type(hbp);
105 entry->addr = hw_breakpoint_addr(hbp);
106 strlcpy(entry->cmd, current->comm, TASK_COMM_LEN);
107
108#ifdef CONFIG_PROFILE_KSYM_TRACER
109 ksym_collect_stats(hw_breakpoint_addr(hbp));
110#endif /* CONFIG_PROFILE_KSYM_TRACER */
111
112 trace_buffer_unlock_commit(buffer, event, 0, pc);
113}
114
115/* Valid access types are represented as
116 *
117 * rw- : Set Read/Write Access Breakpoint
118 * -w- : Set Write Access Breakpoint
119 * --- : Clear Breakpoints
120 * --x : Set Execution Break points (Not available yet)
121 *
122 */
123static int ksym_trace_get_access_type(char *str)
124{
125 int access = 0;
126
127 if (str[0] == 'r')
128 access |= HW_BREAKPOINT_R;
129
130 if (str[1] == 'w')
131 access |= HW_BREAKPOINT_W;
132
133 if (str[2] == 'x')
134 access |= HW_BREAKPOINT_X;
135
136 switch (access) {
137 case HW_BREAKPOINT_R:
138 case HW_BREAKPOINT_W:
139 case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
140 return access;
141 default:
142 return -EINVAL;
143 }
144}
145
146/*
147 * There can be several possible malformed requests and we attempt to capture
148 * all of them. We enumerate some of the rules
 149 * 1. Kernel symbols containing ':' are not allowed, since ':' is used as the
 150 * delimiter; i.e. multiple ':' symbols are disallowed. A possible form is
 151 * <module>:<ksym_name>:<op>.
152 * 2. No delimiter symbol ':' in the input string
153 * 3. Spurious operator symbols or symbols not in their respective positions
154 * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file
155 * 5. Kernel symbol not a part of /proc/kallsyms
156 * 6. Duplicate requests
157 */
158static int parse_ksym_trace_str(char *input_string, char **ksymname,
159 unsigned long *addr)
160{
161 int ret;
162
163 *ksymname = strsep(&input_string, ":");
164 *addr = kallsyms_lookup_name(*ksymname);
165
166 /* Check for malformed request: (2), (1) and (5) */
167 if ((!input_string) ||
168 (strlen(input_string) != KSYM_TRACER_OP_LEN) ||
169 (*addr == 0))
 170 return -EINVAL;
171
172 ret = ksym_trace_get_access_type(input_string);
173
174 return ret;
175}
176
177int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
178{
179 struct trace_ksym *entry;
180 int ret = -ENOMEM;
181
182 if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
183 printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
184 " new requests for tracing can be accepted now.\n",
185 KSYM_TRACER_MAX);
186 return -ENOSPC;
187 }
188
189 entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
190 if (!entry)
191 return -ENOMEM;
192
193 hw_breakpoint_init(&entry->attr);
194
195 entry->attr.bp_type = op;
196 entry->attr.bp_addr = addr;
197 entry->attr.bp_len = HW_BREAKPOINT_LEN_4;
198
199 ret = -EAGAIN;
200 entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr,
201 ksym_hbp_handler);
202
203 if (IS_ERR(entry->ksym_hbp)) {
204 ret = PTR_ERR(entry->ksym_hbp);
205 printk(KERN_INFO "ksym_tracer request failed. Try again"
206 " later!!\n");
207 goto err;
208 }
209
210 hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
211 ksym_filter_entry_count++;
212
213 return 0;
214
215err:
216 kfree(entry);
217
218 return ret;
219}
220
221static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
222 size_t count, loff_t *ppos)
223{
224 struct trace_ksym *entry;
225 struct hlist_node *node;
226 struct trace_seq *s;
227 ssize_t cnt = 0;
228 int ret;
229
230 s = kmalloc(sizeof(*s), GFP_KERNEL);
231 if (!s)
232 return -ENOMEM;
233 trace_seq_init(s);
234
235 mutex_lock(&ksym_tracer_mutex);
236
237 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
238 ret = trace_seq_printf(s, "%pS:", (void *)entry->attr.bp_addr);
239 if (entry->attr.bp_type == HW_BREAKPOINT_R)
240 ret = trace_seq_puts(s, "r--\n");
241 else if (entry->attr.bp_type == HW_BREAKPOINT_W)
242 ret = trace_seq_puts(s, "-w-\n");
243 else if (entry->attr.bp_type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R))
244 ret = trace_seq_puts(s, "rw-\n");
245 WARN_ON_ONCE(!ret);
246 }
247
248 cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
249
250 mutex_unlock(&ksym_tracer_mutex);
251
252 kfree(s);
253
254 return cnt;
255}
256
257static void __ksym_trace_reset(void)
258{
259 struct trace_ksym *entry;
260 struct hlist_node *node, *node1;
261
262 mutex_lock(&ksym_tracer_mutex);
263 hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
264 ksym_hlist) {
265 unregister_wide_hw_breakpoint(entry->ksym_hbp);
266 ksym_filter_entry_count--;
267 hlist_del_rcu(&(entry->ksym_hlist));
268 synchronize_rcu();
269 kfree(entry);
270 }
271 mutex_unlock(&ksym_tracer_mutex);
272}
273
274static ssize_t ksym_trace_filter_write(struct file *file,
275 const char __user *buffer,
276 size_t count, loff_t *ppos)
277{
278 struct trace_ksym *entry;
279 struct hlist_node *node;
280 char *input_string, *ksymname = NULL;
281 unsigned long ksym_addr = 0;
282 int ret, op, changed = 0;
283
284 input_string = kzalloc(count + 1, GFP_KERNEL);
285 if (!input_string)
286 return -ENOMEM;
287
288 if (copy_from_user(input_string, buffer, count)) {
289 kfree(input_string);
290 return -EFAULT;
291 }
292 input_string[count] = '\0';
293
294 strstrip(input_string);
295
296 /*
297 * Clear all breakpoints if:
298 * 1: echo > ksym_trace_filter
299 * 2: echo 0 > ksym_trace_filter
300 * 3: echo "*:---" > ksym_trace_filter
301 */
302 if (!input_string[0] || !strcmp(input_string, "0") ||
303 !strcmp(input_string, "*:---")) {
304 __ksym_trace_reset();
305 kfree(input_string);
306 return count;
307 }
308
309 ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
310 if (ret < 0) {
311 kfree(input_string);
312 return ret;
313 }
314
315 mutex_lock(&ksym_tracer_mutex);
316
317 ret = -EINVAL;
318 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
319 if (entry->attr.bp_addr == ksym_addr) {
320 /* Check for malformed request: (6) */
321 if (entry->attr.bp_type != op)
322 changed = 1;
323 else
324 goto out;
325 break;
326 }
327 }
328 if (changed) {
329 unregister_wide_hw_breakpoint(entry->ksym_hbp);
330 entry->attr.bp_type = op;
331 ret = 0;
332 if (op > 0) {
333 entry->ksym_hbp =
334 register_wide_hw_breakpoint(&entry->attr,
335 ksym_hbp_handler);
336 if (IS_ERR(entry->ksym_hbp))
337 ret = PTR_ERR(entry->ksym_hbp);
338 else
339 goto out;
340 }
341 /* Error or "symbol:---" case: drop it */
342 ksym_filter_entry_count--;
343 hlist_del_rcu(&(entry->ksym_hlist));
344 synchronize_rcu();
345 kfree(entry);
346 goto out;
347 } else {
348 /* Check for malformed request: (4) */
349 if (op == 0)
350 goto out;
351 ret = process_new_ksym_entry(ksymname, op, ksym_addr);
352 }
353out:
354 mutex_unlock(&ksym_tracer_mutex);
355
356 kfree(input_string);
357
358 if (!ret)
359 ret = count;
360 return ret;
361}
362
363static const struct file_operations ksym_tracing_fops = {
364 .open = tracing_open_generic,
365 .read = ksym_trace_filter_read,
366 .write = ksym_trace_filter_write,
367};
368
369static void ksym_trace_reset(struct trace_array *tr)
370{
371 ksym_tracing_enabled = 0;
372 __ksym_trace_reset();
373}
374
375static int ksym_trace_init(struct trace_array *tr)
376{
377 int cpu, ret = 0;
378
379 for_each_online_cpu(cpu)
380 tracing_reset(tr, cpu);
381 ksym_tracing_enabled = 1;
382 ksym_trace_array = tr;
383
384 return ret;
385}
386
387static void ksym_trace_print_header(struct seq_file *m)
388{
389 seq_puts(m,
390 "# TASK-PID CPU# Symbol "
391 "Type Function\n");
392 seq_puts(m,
393 "# | | | "
394 " | |\n");
395}
396
397static enum print_line_t ksym_trace_output(struct trace_iterator *iter)
398{
399 struct trace_entry *entry = iter->ent;
400 struct trace_seq *s = &iter->seq;
401 struct ksym_trace_entry *field;
402 char str[KSYM_SYMBOL_LEN];
403 int ret;
404
405 if (entry->type != TRACE_KSYM)
406 return TRACE_TYPE_UNHANDLED;
407
408 trace_assign_type(field, entry);
409
410 ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd,
411 entry->pid, iter->cpu, (char *)field->addr);
412 if (!ret)
413 return TRACE_TYPE_PARTIAL_LINE;
414
415 switch (field->type) {
416 case HW_BREAKPOINT_R:
417 ret = trace_seq_printf(s, " R ");
418 break;
419 case HW_BREAKPOINT_W:
420 ret = trace_seq_printf(s, " W ");
421 break;
422 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
423 ret = trace_seq_printf(s, " RW ");
424 break;
425 default:
426 return TRACE_TYPE_PARTIAL_LINE;
427 }
428
429 if (!ret)
430 return TRACE_TYPE_PARTIAL_LINE;
431
432 sprint_symbol(str, field->ip);
433 ret = trace_seq_printf(s, "%s\n", str);
434 if (!ret)
435 return TRACE_TYPE_PARTIAL_LINE;
436
437 return TRACE_TYPE_HANDLED;
438}
439
440struct tracer ksym_tracer __read_mostly =
441{
442 .name = "ksym_tracer",
443 .init = ksym_trace_init,
444 .reset = ksym_trace_reset,
445#ifdef CONFIG_FTRACE_SELFTEST
446 .selftest = trace_selftest_startup_ksym,
447#endif
448 .print_header = ksym_trace_print_header,
449 .print_line = ksym_trace_output
450};
451
452__init static int init_ksym_trace(void)
453{
454 struct dentry *d_tracer;
455 struct dentry *entry;
456
457 d_tracer = tracing_init_dentry();
458 ksym_filter_entry_count = 0;
459
460 entry = debugfs_create_file("ksym_trace_filter", 0644, d_tracer,
461 NULL, &ksym_tracing_fops);
462 if (!entry)
463 pr_warning("Could not create debugfs "
464 "'ksym_trace_filter' file\n");
465
466 return register_tracer(&ksym_tracer);
467}
468device_initcall(init_ksym_trace);
469
470
471#ifdef CONFIG_PROFILE_KSYM_TRACER
472static int ksym_tracer_stat_headers(struct seq_file *m)
473{
474 seq_puts(m, " Access Type ");
475 seq_puts(m, " Symbol Counter\n");
476 seq_puts(m, " ----------- ");
477 seq_puts(m, " ------ -------\n");
478 return 0;
479}
480
481static int ksym_tracer_stat_show(struct seq_file *m, void *v)
482{
483 struct hlist_node *stat = v;
484 struct trace_ksym *entry;
485 int access_type = 0;
486 char fn_name[KSYM_NAME_LEN];
487
488 entry = hlist_entry(stat, struct trace_ksym, ksym_hlist);
489
490 access_type = entry->attr.bp_type;
491
492 switch (access_type) {
493 case HW_BREAKPOINT_R:
494 seq_puts(m, " R ");
495 break;
496 case HW_BREAKPOINT_W:
497 seq_puts(m, " W ");
498 break;
499 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
500 seq_puts(m, " RW ");
501 break;
502 default:
503 seq_puts(m, " NA ");
504 }
505
506 if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0)
507 seq_printf(m, " %-36s", fn_name);
508 else
509 seq_printf(m, " %-36s", "<NA>");
510 seq_printf(m, " %15lu\n", entry->counter);
511
512 return 0;
513}
514
515static void *ksym_tracer_stat_start(struct tracer_stat *trace)
516{
517 return ksym_filter_head.first;
518}
519
520static void *
521ksym_tracer_stat_next(void *v, int idx)
522{
523 struct hlist_node *stat = v;
524
525 return stat->next;
526}
527
528static struct tracer_stat ksym_tracer_stats = {
529 .name = "ksym_tracer",
530 .stat_start = ksym_tracer_stat_start,
531 .stat_next = ksym_tracer_stat_next,
532 .stat_headers = ksym_tracer_stat_headers,
533 .stat_show = ksym_tracer_stat_show
534};
535
536__init static int ksym_tracer_stat_init(void)
537{
538 int ret;
539
540 ret = register_stat_tracer(&ksym_tracer_stats);
541 if (ret) {
542 printk(KERN_WARNING "Warning: could not register "
543 "ksym tracer stats\n");
544 return 1;
545 }
546
547 return 0;
548}
549fs_initcall(ksym_tracer_stat_init);
550#endif /* CONFIG_PROFILE_KSYM_TRACER */
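trace_ksym.c above arms hardware breakpoints on kernel data symbols through the ksym_trace_filter debugfs file, one "SYMBOL:<access>" request per line as parsed by parse_ksym_trace_str() and ksym_trace_get_access_type(). A minimal user-space sketch follows; the symbol pid_max and the /sys/kernel/debug mount point are illustrative assumptions, and hits are only recorded while ksym_tracer is the current tracer (ksym_trace_init() sets ksym_tracing_enabled).

/* Sketch only: request a read/write hardware breakpoint on a data symbol. */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    /*
     * "pid_max" is just an example; any data symbol resolvable via
     * kallsyms works, subject to the KSYM_TRACER_MAX limit above.
     */
    const char *req = "pid_max:rw-\n";
    int fd = open("/sys/kernel/debug/tracing/ksym_trace_filter", O_WRONLY);

    if (fd < 0)
        return 1;
    write(fd, req, strlen(req));
    close(fd);

    /*
     * An empty write, "0", or "*:---" clears every breakpoint again,
     * see ksym_trace_filter_write().
     */
    return 0;
}

When CONFIG_PROFILE_KSYM_TRACER is set, per-symbol hit counts are also exported through the stat tracer registered by ksym_tracer_stat_init().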
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index d2cdbabb4ead..dc98309e839a 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -17,6 +17,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
17 case TRACE_GRAPH_ENT: 17 case TRACE_GRAPH_ENT:
18 case TRACE_GRAPH_RET: 18 case TRACE_GRAPH_RET:
19 case TRACE_HW_BRANCHES: 19 case TRACE_HW_BRANCHES:
20 case TRACE_KSYM:
20 return 1; 21 return 1;
21 } 22 }
22 return 0; 23 return 0;
@@ -808,3 +809,57 @@ trace_selftest_startup_hw_branches(struct tracer *trace,
808 return ret; 809 return ret;
809} 810}
810#endif /* CONFIG_HW_BRANCH_TRACER */ 811#endif /* CONFIG_HW_BRANCH_TRACER */
812
813#ifdef CONFIG_KSYM_TRACER
814static int ksym_selftest_dummy;
815
816int
817trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr)
818{
819 unsigned long count;
820 int ret;
821
822 /* start the tracing */
823 ret = tracer_init(trace, tr);
824 if (ret) {
825 warn_failed_init_tracer(trace, ret);
826 return ret;
827 }
828
829 ksym_selftest_dummy = 0;
830 /* Register the read-write tracing request */
831
832 ret = process_new_ksym_entry("ksym_selftest_dummy",
833 HW_BREAKPOINT_R | HW_BREAKPOINT_W,
834 (unsigned long)(&ksym_selftest_dummy));
835
836 if (ret < 0) {
837 printk(KERN_CONT "ksym_trace read-write startup test failed\n");
838 goto ret_path;
839 }
840 /* Perform a read and a write operation over the dummy variable to
841 * trigger the tracer
842 */
843 if (ksym_selftest_dummy == 0)
844 ksym_selftest_dummy++;
845
846 /* stop the tracing. */
847 tracing_stop();
848 /* check the trace buffer */
849 ret = trace_test_buffer(tr, &count);
850 trace->reset(tr);
851 tracing_start();
852
853 /* read & write operations - one each is performed on the dummy variable
854 * triggering two entries in the trace buffer
855 */
856 if (!ret && count != 2) {
857 printk(KERN_CONT "Ksym tracer startup test failed");
858 ret = -1;
859 }
860
861ret_path:
862 return ret;
863}
864#endif /* CONFIG_KSYM_TRACER */
865
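
The selftest added above follows the usual pattern of the startup tests in this file: initialize the tracer, register a read-write breakpoint on a dummy variable, touch the variable once for a read and once for a write, stop tracing, and expect exactly two entries in the buffer. The stand-alone sketch below only models that accounting; the counter and trigger function are invented for the example and do not correspond to kernel symbols.

/* Stand-alone model of the ksym selftest logic: one read plus one
 * write on the watched variable should account for two trace entries.
 * Everything here is illustrative; no kernel API is involved. */
#include <stdio.h>

static int selftest_dummy;
static unsigned long trace_entries;	/* stands in for the trace buffer count */

static void breakpoint_hit(void)
{
	trace_entries++;		/* the real tracer logs one entry per access */
}

int main(void)
{
	/* read access */
	if (selftest_dummy == 0)
		breakpoint_hit();
	/* write access */
	selftest_dummy++;
	breakpoint_hit();

	if (trace_entries != 2) {
		printf("ksym selftest model failed: %lu entries\n", trace_entries);
		return 1;
	}
	printf("ksym selftest model passed: %lu entries\n", trace_entries);
	return 0;
}
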
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index ddee9c593732..57501d90096a 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -51,32 +51,6 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr)
51 return syscalls_metadata[nr]; 51 return syscalls_metadata[nr];
52} 52}
53 53
54int syscall_name_to_nr(char *name)
55{
56 int i;
57
58 if (!syscalls_metadata)
59 return -1;
60
61 for (i = 0; i < NR_syscalls; i++) {
62 if (syscalls_metadata[i]) {
63 if (!strcmp(syscalls_metadata[i]->name, name))
64 return i;
65 }
66 }
67 return -1;
68}
69
70void set_syscall_enter_id(int num, int id)
71{
72 syscalls_metadata[num]->enter_id = id;
73}
74
75void set_syscall_exit_id(int num, int id)
76{
77 syscalls_metadata[num]->exit_id = id;
78}
79
80enum print_line_t 54enum print_line_t
81print_syscall_enter(struct trace_iterator *iter, int flags) 55print_syscall_enter(struct trace_iterator *iter, int flags)
82{ 56{
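
The hunk above removes syscall_name_to_nr() and the set_syscall_enter_id()/set_syscall_exit_id() helpers: instead of resolving a syscall by scanning the metadata table for a matching name and stashing separate enter/exit ids, the later hunks make each ftrace_event_call carry a pointer to its syscall_metadata in call->data, and the metadata itself records syscall_nr along with its enter_event/exit_event. A simplified user-space contrast of the two lookups follows; the struct layouts and names are trimmed for the example and are not the kernel definitions.

/* Simplified contrast between the removed name-based lookup and the
 * direct metadata reference used after this series. */
#include <stdio.h>
#include <string.h>

#define NR_DEMO_SYSCALLS 3

struct demo_metadata {
	const char *name;
	int syscall_nr;		/* filled in once at init time */
};

static struct demo_metadata *demo_table[NR_DEMO_SYSCALLS];
static struct demo_metadata meta_open  = { "sys_open",  -1 };
static struct demo_metadata meta_close = { "sys_close", -1 };

/* Old style: linear scan by name, as syscall_name_to_nr() did */
static int name_to_nr(const char *name)
{
	int i;

	for (i = 0; i < NR_DEMO_SYSCALLS; i++)
		if (demo_table[i] && !strcmp(demo_table[i]->name, name))
			return i;
	return -1;
}

int main(void)
{
	int i;

	demo_table[0] = &meta_open;
	demo_table[2] = &meta_close;

	/* init_ftrace_syscalls()-style pass: record the number in the metadata */
	for (i = 0; i < NR_DEMO_SYSCALLS; i++)
		if (demo_table[i])
			demo_table[i]->syscall_nr = i;

	/* old: scan by name on every registration */
	printf("by name:     sys_close -> %d\n", name_to_nr("sys_close"));
	/* new: the metadata already knows its own number */
	printf("from struct: sys_close -> %d\n", meta_close.syscall_nr);
	return 0;
}
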
@@ -93,7 +67,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags)
93 if (!entry) 67 if (!entry)
94 goto end; 68 goto end;
95 69
96 if (entry->enter_id != ent->type) { 70 if (entry->enter_event->id != ent->type) {
97 WARN_ON_ONCE(1); 71 WARN_ON_ONCE(1);
98 goto end; 72 goto end;
99 } 73 }
@@ -148,7 +122,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
148 return TRACE_TYPE_HANDLED; 122 return TRACE_TYPE_HANDLED;
149 } 123 }
150 124
151 if (entry->exit_id != ent->type) { 125 if (entry->exit_event->id != ent->type) {
152 WARN_ON_ONCE(1); 126 WARN_ON_ONCE(1);
153 return TRACE_TYPE_UNHANDLED; 127 return TRACE_TYPE_UNHANDLED;
154 } 128 }
@@ -166,24 +140,19 @@ extern char *__bad_type_size(void);
166#define SYSCALL_FIELD(type, name) \ 140#define SYSCALL_FIELD(type, name) \
167 sizeof(type) != sizeof(trace.name) ? \ 141 sizeof(type) != sizeof(trace.name) ? \
168 __bad_type_size() : \ 142 __bad_type_size() : \
169 #type, #name, offsetof(typeof(trace), name), sizeof(trace.name) 143 #type, #name, offsetof(typeof(trace), name), \
144 sizeof(trace.name), is_signed_type(type)
170 145
171int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) 146int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
172{ 147{
173 int i; 148 int i;
174 int nr;
175 int ret; 149 int ret;
176 struct syscall_metadata *entry; 150 struct syscall_metadata *entry = call->data;
177 struct syscall_trace_enter trace; 151 struct syscall_trace_enter trace;
178 int offset = offsetof(struct syscall_trace_enter, args); 152 int offset = offsetof(struct syscall_trace_enter, args);
179 153
180 nr = syscall_name_to_nr(call->data); 154 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
181 entry = syscall_nr_to_meta(nr); 155 "\tsigned:%u;\n",
182
183 if (!entry)
184 return 0;
185
186 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n",
187 SYSCALL_FIELD(int, nr)); 156 SYSCALL_FIELD(int, nr));
188 if (!ret) 157 if (!ret)
189 return 0; 158 return 0;
@@ -193,8 +162,10 @@ int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
193 entry->args[i]); 162 entry->args[i]);
194 if (!ret) 163 if (!ret)
195 return 0; 164 return 0;
196 ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;\n", offset, 165 ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;"
197 sizeof(unsigned long)); 166 "\tsigned:%u;\n", offset,
167 sizeof(unsigned long),
168 is_signed_type(unsigned long));
198 if (!ret) 169 if (!ret)
199 return 0; 170 return 0;
200 offset += sizeof(unsigned long); 171 offset += sizeof(unsigned long);
@@ -226,8 +197,10 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
226 struct syscall_trace_exit trace; 197 struct syscall_trace_exit trace;
227 198
228 ret = trace_seq_printf(s, 199 ret = trace_seq_printf(s,
229 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 200 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
230 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n", 201 "\tsigned:%u;\n"
202 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
203 "\tsigned:%u;\n",
231 SYSCALL_FIELD(int, nr), 204 SYSCALL_FIELD(int, nr),
232 SYSCALL_FIELD(long, ret)); 205 SYSCALL_FIELD(long, ret));
233 if (!ret) 206 if (!ret)
@@ -239,22 +212,19 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
239int syscall_enter_define_fields(struct ftrace_event_call *call) 212int syscall_enter_define_fields(struct ftrace_event_call *call)
240{ 213{
241 struct syscall_trace_enter trace; 214 struct syscall_trace_enter trace;
242 struct syscall_metadata *meta; 215 struct syscall_metadata *meta = call->data;
243 int ret; 216 int ret;
244 int nr;
245 int i; 217 int i;
246 int offset = offsetof(typeof(trace), args); 218 int offset = offsetof(typeof(trace), args);
247 219
248 nr = syscall_name_to_nr(call->data);
249 meta = syscall_nr_to_meta(nr);
250
251 if (!meta)
252 return 0;
253
254 ret = trace_define_common_fields(call); 220 ret = trace_define_common_fields(call);
255 if (ret) 221 if (ret)
256 return ret; 222 return ret;
257 223
224 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
225 if (ret)
226 return ret;
227
258 for (i = 0; i < meta->nb_args; i++) { 228 for (i = 0; i < meta->nb_args; i++) {
259 ret = trace_define_field(call, meta->types[i], 229 ret = trace_define_field(call, meta->types[i],
260 meta->args[i], offset, 230 meta->args[i], offset,
@@ -275,7 +245,11 @@ int syscall_exit_define_fields(struct ftrace_event_call *call)
275 if (ret) 245 if (ret)
276 return ret; 246 return ret;
277 247
278 ret = trace_define_field(call, SYSCALL_FIELD(long, ret), 0, 248 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
249 if (ret)
250 return ret;
251
252 ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
279 FILTER_OTHER); 253 FILTER_OTHER);
280 254
281 return ret; 255 return ret;
@@ -302,8 +276,8 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
302 276
303 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 277 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
304 278
305 event = trace_current_buffer_lock_reserve(&buffer, sys_data->enter_id, 279 event = trace_current_buffer_lock_reserve(&buffer,
306 size, 0, 0); 280 sys_data->enter_event->id, size, 0, 0);
307 if (!event) 281 if (!event)
308 return; 282 return;
309 283
@@ -334,8 +308,8 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
334 if (!sys_data) 308 if (!sys_data)
335 return; 309 return;
336 310
337 event = trace_current_buffer_lock_reserve(&buffer, sys_data->exit_id, 311 event = trace_current_buffer_lock_reserve(&buffer,
338 sizeof(*entry), 0, 0); 312 sys_data->exit_event->id, sizeof(*entry), 0, 0);
339 if (!event) 313 if (!event)
340 return; 314 return;
341 315
@@ -348,14 +322,12 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
348 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 322 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
349} 323}
350 324
351int reg_event_syscall_enter(void *ptr) 325int reg_event_syscall_enter(struct ftrace_event_call *call)
352{ 326{
353 int ret = 0; 327 int ret = 0;
354 int num; 328 int num;
355 char *name;
356 329
357 name = (char *)ptr; 330 num = ((struct syscall_metadata *)call->data)->syscall_nr;
358 num = syscall_name_to_nr(name);
359 if (num < 0 || num >= NR_syscalls) 331 if (num < 0 || num >= NR_syscalls)
360 return -ENOSYS; 332 return -ENOSYS;
361 mutex_lock(&syscall_trace_lock); 333 mutex_lock(&syscall_trace_lock);
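
reg_event_syscall_enter() and its three siblings below now take the ftrace_event_call itself and read the syscall number straight out of the metadata attached to call->data, rather than converting a name back into a number. A small user-space model of that opaque-pointer hand-off, with invented type names, is sketched here.

/* Model of the call->data hand-off: the event carries an opaque pointer
 * to its metadata and the registration path casts it back.  Types and
 * names are invented for the example. */
#include <stdio.h>

struct demo_metadata {
	int syscall_nr;
};

struct demo_event_call {
	const char *name;
	void *data;		/* points at the event's demo_metadata */
};

static int demo_reg_enter(struct demo_event_call *call)
{
	int num = ((struct demo_metadata *)call->data)->syscall_nr;

	if (num < 0)
		return -1;	/* the kernel code returns -ENOSYS here */

	printf("registering enter tracing for syscall %d (%s)\n", num, call->name);
	return 0;
}

int main(void)
{
	struct demo_metadata meta = { .syscall_nr = 2 };
	struct demo_event_call call = { .name = "sys_open", .data = &meta };

	return demo_reg_enter(&call);
}
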
@@ -372,13 +344,11 @@ int reg_event_syscall_enter(void *ptr)
372 return ret; 344 return ret;
373} 345}
374 346
375void unreg_event_syscall_enter(void *ptr) 347void unreg_event_syscall_enter(struct ftrace_event_call *call)
376{ 348{
377 int num; 349 int num;
378 char *name;
379 350
380 name = (char *)ptr; 351 num = ((struct syscall_metadata *)call->data)->syscall_nr;
381 num = syscall_name_to_nr(name);
382 if (num < 0 || num >= NR_syscalls) 352 if (num < 0 || num >= NR_syscalls)
383 return; 353 return;
384 mutex_lock(&syscall_trace_lock); 354 mutex_lock(&syscall_trace_lock);
@@ -389,14 +359,12 @@ void unreg_event_syscall_enter(void *ptr)
389 mutex_unlock(&syscall_trace_lock); 359 mutex_unlock(&syscall_trace_lock);
390} 360}
391 361
392int reg_event_syscall_exit(void *ptr) 362int reg_event_syscall_exit(struct ftrace_event_call *call)
393{ 363{
394 int ret = 0; 364 int ret = 0;
395 int num; 365 int num;
396 char *name;
397 366
398 name = (char *)ptr; 367 num = ((struct syscall_metadata *)call->data)->syscall_nr;
399 num = syscall_name_to_nr(name);
400 if (num < 0 || num >= NR_syscalls) 368 if (num < 0 || num >= NR_syscalls)
401 return -ENOSYS; 369 return -ENOSYS;
402 mutex_lock(&syscall_trace_lock); 370 mutex_lock(&syscall_trace_lock);
@@ -413,13 +381,11 @@ int reg_event_syscall_exit(void *ptr)
413 return ret; 381 return ret;
414} 382}
415 383
416void unreg_event_syscall_exit(void *ptr) 384void unreg_event_syscall_exit(struct ftrace_event_call *call)
417{ 385{
418 int num; 386 int num;
419 char *name;
420 387
421 name = (char *)ptr; 388 num = ((struct syscall_metadata *)call->data)->syscall_nr;
422 num = syscall_name_to_nr(name);
423 if (num < 0 || num >= NR_syscalls) 389 if (num < 0 || num >= NR_syscalls)
424 return; 390 return;
425 mutex_lock(&syscall_trace_lock); 391 mutex_lock(&syscall_trace_lock);
@@ -430,13 +396,17 @@ void unreg_event_syscall_exit(void *ptr)
430 mutex_unlock(&syscall_trace_lock); 396 mutex_unlock(&syscall_trace_lock);
431} 397}
432 398
433struct trace_event event_syscall_enter = { 399int init_syscall_trace(struct ftrace_event_call *call)
434 .trace = print_syscall_enter, 400{
435}; 401 int id;
436 402
437struct trace_event event_syscall_exit = { 403 id = register_ftrace_event(call->event);
438 .trace = print_syscall_exit, 404 if (!id)
439}; 405 return -ENODEV;
406 call->id = id;
407 INIT_LIST_HEAD(&call->fields);
408 return 0;
409}
440 410
441int __init init_ftrace_syscalls(void) 411int __init init_ftrace_syscalls(void)
442{ 412{
@@ -454,6 +424,10 @@ int __init init_ftrace_syscalls(void)
454 for (i = 0; i < NR_syscalls; i++) { 424 for (i = 0; i < NR_syscalls; i++) {
455 addr = arch_syscall_addr(i); 425 addr = arch_syscall_addr(i);
456 meta = find_syscall_meta(addr); 426 meta = find_syscall_meta(addr);
427 if (!meta)
428 continue;
429
430 meta->syscall_nr = i;
457 syscalls_metadata[i] = meta; 431 syscalls_metadata[i] = meta;
458 } 432 }
459 433
@@ -473,8 +447,10 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
473 struct syscall_metadata *sys_data; 447 struct syscall_metadata *sys_data;
474 struct syscall_trace_enter *rec; 448 struct syscall_trace_enter *rec;
475 unsigned long flags; 449 unsigned long flags;
450 char *trace_buf;
476 char *raw_data; 451 char *raw_data;
477 int syscall_nr; 452 int syscall_nr;
453 int rctx;
478 int size; 454 int size;
479 int cpu; 455 int cpu;
480 456
@@ -498,41 +474,42 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
498 /* Protect the per cpu buffer, begin the rcu read side */ 474 /* Protect the per cpu buffer, begin the rcu read side */
499 local_irq_save(flags); 475 local_irq_save(flags);
500 476
477 rctx = perf_swevent_get_recursion_context();
478 if (rctx < 0)
479 goto end_recursion;
480
501 cpu = smp_processor_id(); 481 cpu = smp_processor_id();
502 482
503 if (in_nmi()) 483 trace_buf = rcu_dereference(perf_trace_buf);
504 raw_data = rcu_dereference(trace_profile_buf_nmi);
505 else
506 raw_data = rcu_dereference(trace_profile_buf);
507 484
508 if (!raw_data) 485 if (!trace_buf)
509 goto end; 486 goto end;
510 487
511 raw_data = per_cpu_ptr(raw_data, cpu); 488 raw_data = per_cpu_ptr(trace_buf, cpu);
512 489
513 /* zero the dead bytes from align to not leak stack to user */ 490 /* zero the dead bytes from align to not leak stack to user */
514 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; 491 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
515 492
516 rec = (struct syscall_trace_enter *) raw_data; 493 rec = (struct syscall_trace_enter *) raw_data;
517 tracing_generic_entry_update(&rec->ent, 0, 0); 494 tracing_generic_entry_update(&rec->ent, 0, 0);
518 rec->ent.type = sys_data->enter_id; 495 rec->ent.type = sys_data->enter_event->id;
519 rec->nr = syscall_nr; 496 rec->nr = syscall_nr;
520 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 497 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
521 (unsigned long *)&rec->args); 498 (unsigned long *)&rec->args);
522 perf_tp_event(sys_data->enter_id, 0, 1, rec, size); 499 perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size);
523 500
524end: 501end:
502 perf_swevent_put_recursion_context(rctx);
503end_recursion:
525 local_irq_restore(flags); 504 local_irq_restore(flags);
526} 505}
527 506
528int reg_prof_syscall_enter(char *name) 507int prof_sysenter_enable(struct ftrace_event_call *call)
529{ 508{
530 int ret = 0; 509 int ret = 0;
531 int num; 510 int num;
532 511
533 num = syscall_name_to_nr(name); 512 num = ((struct syscall_metadata *)call->data)->syscall_nr;
534 if (num < 0 || num >= NR_syscalls)
535 return -ENOSYS;
536 513
537 mutex_lock(&syscall_trace_lock); 514 mutex_lock(&syscall_trace_lock);
538 if (!sys_prof_refcount_enter) 515 if (!sys_prof_refcount_enter)
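
In the hunk above, the in_nmi() choice between two per-cpu profile buffers is replaced by a single perf_trace_buf guarded by the new software-event recursion context: the handler takes a recursion slot with perf_swevent_get_recursion_context(), bails out if it is already nested, and releases the slot with perf_swevent_put_recursion_context() on every exit path. The user-space analogy below only models that bracket; the guard and buffer names are invented and the real kernel calls are the ones named above.

/* User-space analogy of the recursion guard pattern: take a recursion
 * slot, use the shared buffer only if the slot was free, and always
 * release the slot on the way out. */
#include <stdio.h>

static int in_handler;			/* stands in for the per-context recursion count */
static char scratch_buf[64];		/* stands in for the per-cpu profile buffer */

static int get_recursion_context(void)
{
	if (in_handler)
		return -1;		/* already nested: refuse, as the kernel does */
	in_handler = 1;
	return 0;
}

static void put_recursion_context(void)
{
	in_handler = 0;
}

static void demo_prof_hook(const char *event)
{
	int rctx = get_recursion_context();

	if (rctx < 0)
		return;			/* mirrors the end_recursion: path */

	snprintf(scratch_buf, sizeof(scratch_buf), "event: %s", event);
	printf("%s\n", scratch_buf);

	put_recursion_context();	/* always released, mirroring the end: label */
}

int main(void)
{
	demo_prof_hook("sys_enter_open");
	return 0;
}
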
@@ -548,13 +525,11 @@ int reg_prof_syscall_enter(char *name)
548 return ret; 525 return ret;
549} 526}
550 527
551void unreg_prof_syscall_enter(char *name) 528void prof_sysenter_disable(struct ftrace_event_call *call)
552{ 529{
553 int num; 530 int num;
554 531
555 num = syscall_name_to_nr(name); 532 num = ((struct syscall_metadata *)call->data)->syscall_nr;
556 if (num < 0 || num >= NR_syscalls)
557 return;
558 533
559 mutex_lock(&syscall_trace_lock); 534 mutex_lock(&syscall_trace_lock);
560 sys_prof_refcount_enter--; 535 sys_prof_refcount_enter--;
@@ -570,7 +545,9 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
570 struct syscall_trace_exit *rec; 545 struct syscall_trace_exit *rec;
571 unsigned long flags; 546 unsigned long flags;
572 int syscall_nr; 547 int syscall_nr;
548 char *trace_buf;
573 char *raw_data; 549 char *raw_data;
550 int rctx;
574 int size; 551 int size;
575 int cpu; 552 int cpu;
576 553
@@ -596,17 +573,19 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
596 573
597 /* Protect the per cpu buffer, begin the rcu read side */ 574 /* Protect the per cpu buffer, begin the rcu read side */
598 local_irq_save(flags); 575 local_irq_save(flags);
576
577 rctx = perf_swevent_get_recursion_context();
578 if (rctx < 0)
579 goto end_recursion;
580
599 cpu = smp_processor_id(); 581 cpu = smp_processor_id();
600 582
601 if (in_nmi()) 583 trace_buf = rcu_dereference(perf_trace_buf);
602 raw_data = rcu_dereference(trace_profile_buf_nmi);
603 else
604 raw_data = rcu_dereference(trace_profile_buf);
605 584
606 if (!raw_data) 585 if (!trace_buf)
607 goto end; 586 goto end;
608 587
609 raw_data = per_cpu_ptr(raw_data, cpu); 588 raw_data = per_cpu_ptr(trace_buf, cpu);
610 589
611 /* zero the dead bytes from align to not leak stack to user */ 590 /* zero the dead bytes from align to not leak stack to user */
612 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; 591 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -614,24 +593,24 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
614 rec = (struct syscall_trace_exit *)raw_data; 593 rec = (struct syscall_trace_exit *)raw_data;
615 594
616 tracing_generic_entry_update(&rec->ent, 0, 0); 595 tracing_generic_entry_update(&rec->ent, 0, 0);
617 rec->ent.type = sys_data->exit_id; 596 rec->ent.type = sys_data->exit_event->id;
618 rec->nr = syscall_nr; 597 rec->nr = syscall_nr;
619 rec->ret = syscall_get_return_value(current, regs); 598 rec->ret = syscall_get_return_value(current, regs);
620 599
621 perf_tp_event(sys_data->exit_id, 0, 1, rec, size); 600 perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size);
622 601
623end: 602end:
603 perf_swevent_put_recursion_context(rctx);
604end_recursion:
624 local_irq_restore(flags); 605 local_irq_restore(flags);
625} 606}
626 607
627int reg_prof_syscall_exit(char *name) 608int prof_sysexit_enable(struct ftrace_event_call *call)
628{ 609{
629 int ret = 0; 610 int ret = 0;
630 int num; 611 int num;
631 612
632 num = syscall_name_to_nr(name); 613 num = ((struct syscall_metadata *)call->data)->syscall_nr;
633 if (num < 0 || num >= NR_syscalls)
634 return -ENOSYS;
635 614
636 mutex_lock(&syscall_trace_lock); 615 mutex_lock(&syscall_trace_lock);
637 if (!sys_prof_refcount_exit) 616 if (!sys_prof_refcount_exit)
@@ -647,13 +626,11 @@ int reg_prof_syscall_exit(char *name)
647 return ret; 626 return ret;
648} 627}
649 628
650void unreg_prof_syscall_exit(char *name) 629void prof_sysexit_disable(struct ftrace_event_call *call)
651{ 630{
652 int num; 631 int num;
653 632
654 num = syscall_name_to_nr(name); 633 num = ((struct syscall_metadata *)call->data)->syscall_nr;
655 if (num < 0 || num >= NR_syscalls)
656 return;
657 634
658 mutex_lock(&syscall_trace_lock); 635 mutex_lock(&syscall_trace_lock);
659 sys_prof_refcount_exit--; 636 sys_prof_refcount_exit--;