-rw-r--r--  Documentation/RCU/torture.txt | 10
-rw-r--r--  Documentation/kernel-parameters.txt | 6
-rw-r--r--  Documentation/scheduler/sched-design-CFS.txt | 54
-rw-r--r--  Documentation/scheduler/sched-rt-group.txt | 20
-rw-r--r--  Documentation/trace/events.txt | 3
-rw-r--r--  Documentation/trace/ftrace.txt | 50
-rw-r--r--  arch/s390/kernel/time.c | 1
-rw-r--r--  drivers/char/sysrq.c | 2
-rw-r--r--  drivers/cpufreq/cpufreq_ondemand.c | 75
-rw-r--r--  drivers/oprofile/cpu_buffer.c | 4
-rw-r--r--  drivers/xen/manage.c | 14
-rw-r--r--  fs/eventpoll.c | 3
-rw-r--r--  include/linux/cpuset.h | 16
-rw-r--r--  include/linux/ftrace.h | 49
-rw-r--r--  include/linux/ftrace_event.h | 81
-rw-r--r--  include/linux/kernel.h | 11
-rw-r--r--  include/linux/module.h | 6
-rw-r--r--  include/linux/rcutiny.h | 2
-rw-r--r--  include/linux/rcutree.h | 1
-rw-r--r--  include/linux/ring_buffer.h | 10
-rw-r--r--  include/linux/sched.h | 70
-rw-r--r--  include/linux/stop_machine.h | 122
-rw-r--r--  include/linux/syscalls.h | 57
-rw-r--r--  include/linux/tick.h | 5
-rw-r--r--  include/linux/tracepoint.h | 196
-rw-r--r--  include/linux/wait.h | 35
-rw-r--r--  include/trace/define_trace.h | 5
-rw-r--r--  include/trace/events/module.h | 18
-rw-r--r--  include/trace/events/napi.h | 10
-rw-r--r--  include/trace/events/sched.h | 32
-rw-r--r--  include/trace/events/signal.h | 52
-rw-r--r--  include/trace/ftrace.h | 274
-rw-r--r--  include/trace/syscall.h | 10
-rw-r--r--  init/Kconfig | 3
-rw-r--r--  kernel/Makefile | 2
-rw-r--r--  kernel/capability.c | 1
-rw-r--r--  kernel/cgroup.c | 2
-rw-r--r--  kernel/cpu.c | 26
-rw-r--r--  kernel/cpuset.c | 67
-rw-r--r--  kernel/cred-internals.h | 21
-rw-r--r--  kernel/cred.c | 3
-rw-r--r--  kernel/exit.c | 1
-rw-r--r--  kernel/module.c | 22
-rw-r--r--  kernel/rcutorture.c | 2
-rw-r--r--  kernel/sched.c | 726
-rw-r--r--  kernel/sched_debug.c | 108
-rw-r--r--  kernel/sched_fair.c | 350
-rw-r--r--  kernel/sched_features.h | 55
-rw-r--r--  kernel/sched_idletask.c | 8
-rw-r--r--  kernel/sched_rt.c | 15
-rw-r--r--  kernel/stop_machine.c | 534
-rw-r--r--  kernel/time/tick-sched.c | 84
-rw-r--r--  kernel/time/timer_list.c | 1
-rw-r--r--  kernel/trace/blktrace.c | 138
-rw-r--r--  kernel/trace/ftrace.c | 36
-rw-r--r--  kernel/trace/kmemtrace.c | 70
-rw-r--r--  kernel/trace/ring_buffer.c | 179
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 5
-rw-r--r--  kernel/trace/trace.c | 136
-rw-r--r--  kernel/trace/trace.h | 36
-rw-r--r--  kernel/trace/trace_branch.c | 8
-rw-r--r--  kernel/trace/trace_event_perf.c | 17
-rw-r--r--  kernel/trace/trace_events.c | 139
-rw-r--r--  kernel/trace/trace_events_filter.c | 28
-rw-r--r--  kernel/trace/trace_export.c | 16
-rw-r--r--  kernel/trace/trace_functions_graph.c | 176
-rw-r--r--  kernel/trace/trace_irqsoff.c | 271
-rw-r--r--  kernel/trace/trace_kprobe.c | 104
-rw-r--r--  kernel/trace/trace_output.c | 139
-rw-r--r--  kernel/trace/trace_output.h | 2
-rw-r--r--  kernel/trace/trace_sched_switch.c | 21
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 29
-rw-r--r--  kernel/trace/trace_selftest.c | 7
-rw-r--r--  kernel/trace/trace_syscalls.c | 137
-rw-r--r--  kernel/trace/trace_workqueue.c | 26
-rw-r--r--  kernel/tracepoint.c | 91
-rw-r--r--  kernel/user.c | 11
-rw-r--r--  net/core/drop_monitor.c | 12
-rw-r--r--  samples/tracepoints/tp-samples-trace.h | 4
-rw-r--r--  samples/tracepoints/tracepoint-probe-sample.c | 13
-rw-r--r--  samples/tracepoints/tracepoint-probe-sample2.c | 7
81 files changed, 3048 insertions, 2145 deletions
diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt
index 0e50bc2aa1e2..5d9016795fd8 100644
--- a/Documentation/RCU/torture.txt
+++ b/Documentation/RCU/torture.txt
@@ -182,16 +182,6 @@ Similarly, sched_expedited RCU provides the following:
182 sched_expedited-torture: Reader Pipe: 12660320201 95875 0 0 0 0 0 0 0 0 0 182 sched_expedited-torture: Reader Pipe: 12660320201 95875 0 0 0 0 0 0 0 0 0
183 sched_expedited-torture: Reader Batch: 12660424885 0 0 0 0 0 0 0 0 0 0 183 sched_expedited-torture: Reader Batch: 12660424885 0 0 0 0 0 0 0 0 0 0
184 sched_expedited-torture: Free-Block Circulation: 1090795 1090795 1090794 1090793 1090792 1090791 1090790 1090789 1090788 1090787 0 184 sched_expedited-torture: Free-Block Circulation: 1090795 1090795 1090794 1090793 1090792 1090791 1090790 1090789 1090788 1090787 0
185 state: -1 / 0:0 3:0 4:0
186
187As before, the first four lines are similar to those for RCU.
188The last line shows the task-migration state. The first number is
189-1 if synchronize_sched_expedited() is idle, -2 if in the process of
190posting wakeups to the migration kthreads, and N when waiting on CPU N.
191Each of the colon-separated fields following the "/" is a CPU:state pair.
192Valid states are "0" for idle, "1" for waiting for quiescent state,
193"2" for passed through quiescent state, and "3" when a race with a
194CPU-hotplug event forces use of the synchronize_sched() primitive.
195 185
196 186
197USAGE 187USAGE
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 839b21b0699a..907010cea9ad 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -784,8 +784,12 @@ and is between 256 and 4096 characters. It is defined in the file
784 as early as possible in order to facilitate early 784 as early as possible in order to facilitate early
785 boot debugging. 785 boot debugging.
786 786
787 ftrace_dump_on_oops 787 ftrace_dump_on_oops[=orig_cpu]
788 [FTRACE] will dump the trace buffers on oops. 788 [FTRACE] will dump the trace buffers on oops.
789 If no parameter is passed, ftrace will dump
790 buffers of all CPUs, but if you pass orig_cpu, it will
791 dump only the buffer of the CPU that triggered the
792 oops.
789 793
790 ftrace_filter=[function-list] 794 ftrace_filter=[function-list]
791 [FTRACE] Limit the functions traced by the function 795 [FTRACE] Limit the functions traced by the function
diff --git a/Documentation/scheduler/sched-design-CFS.txt b/Documentation/scheduler/sched-design-CFS.txt
index 6f33593e59e2..8239ebbcddce 100644
--- a/Documentation/scheduler/sched-design-CFS.txt
+++ b/Documentation/scheduler/sched-design-CFS.txt
@@ -211,7 +211,7 @@ provide fair CPU time to each such task group. For example, it may be
211desirable to first provide fair CPU time to each user on the system and then to 211desirable to first provide fair CPU time to each user on the system and then to
212each task belonging to a user. 212each task belonging to a user.
213 213
214CONFIG_GROUP_SCHED strives to achieve exactly that. It lets tasks to be 214CONFIG_CGROUP_SCHED strives to achieve exactly that. It lets tasks to be
215grouped and divides CPU time fairly among such groups. 215grouped and divides CPU time fairly among such groups.
216 216
217CONFIG_RT_GROUP_SCHED permits to group real-time (i.e., SCHED_FIFO and 217CONFIG_RT_GROUP_SCHED permits to group real-time (i.e., SCHED_FIFO and
@@ -220,38 +220,11 @@ SCHED_RR) tasks.
220CONFIG_FAIR_GROUP_SCHED permits to group CFS (i.e., SCHED_NORMAL and 220CONFIG_FAIR_GROUP_SCHED permits to group CFS (i.e., SCHED_NORMAL and
221SCHED_BATCH) tasks. 221SCHED_BATCH) tasks.
222 222
223At present, there are two (mutually exclusive) mechanisms to group tasks for 223 These options need CONFIG_CGROUPS to be defined, and let the administrator
224CPU bandwidth control purposes:
225
226 - Based on user id (CONFIG_USER_SCHED)
227
228 With this option, tasks are grouped according to their user id.
229
230 - Based on "cgroup" pseudo filesystem (CONFIG_CGROUP_SCHED)
231
232 This options needs CONFIG_CGROUPS to be defined, and lets the administrator
233 create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See 224 create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See
234 Documentation/cgroups/cgroups.txt for more information about this filesystem. 225 Documentation/cgroups/cgroups.txt for more information about this filesystem.
235 226
236Only one of these options to group tasks can be chosen and not both. 227When CONFIG_FAIR_GROUP_SCHED is defined, a "cpu.shares" file is created for each
237
238When CONFIG_USER_SCHED is defined, a directory is created in sysfs for each new
239user and a "cpu_share" file is added in that directory.
240
241 # cd /sys/kernel/uids
242 # cat 512/cpu_share # Display user 512's CPU share
243 1024
244 # echo 2048 > 512/cpu_share # Modify user 512's CPU share
245 # cat 512/cpu_share # Display user 512's CPU share
246 2048
247 #
248
249CPU bandwidth between two users is divided in the ratio of their CPU shares.
250For example: if you would like user "root" to get twice the bandwidth of user
251"guest," then set the cpu_share for both the users such that "root"'s cpu_share
252is twice "guest"'s cpu_share.
253
254When CONFIG_CGROUP_SCHED is defined, a "cpu.shares" file is created for each
255group created using the pseudo filesystem. See example steps below to create 228group created using the pseudo filesystem. See example steps below to create
256task groups and modify their CPU share using the "cgroups" pseudo filesystem. 229task groups and modify their CPU share using the "cgroups" pseudo filesystem.
257 230
@@ -273,24 +246,3 @@ task groups and modify their CPU share using the "cgroups" pseudo filesystem.
273 246
274 # #Launch gmplayer (or your favourite movie player) 247 # #Launch gmplayer (or your favourite movie player)
275 # echo <movie_player_pid> > multimedia/tasks 248 # echo <movie_player_pid> > multimedia/tasks
276
2778. Implementation note: user namespaces
278
279User namespaces are intended to be hierarchical. But they are currently
280only partially implemented. Each of those has ramifications for CFS.
281
282First, since user namespaces are hierarchical, the /sys/kernel/uids
283presentation is inadequate. Eventually we will likely want to use sysfs
284tagging to provide private views of /sys/kernel/uids within each user
285namespace.
286
287Second, the hierarchical nature is intended to support completely
288unprivileged use of user namespaces. So if using user groups, then
289we want the users in a user namespace to be children of the user
290who created it.
291
292That is currently unimplemented. So instead, every user in a new
293user namespace will receive 1024 shares just like any user in the
294initial user namespace. Note that at the moment creation of a new
295user namespace requires each of CAP_SYS_ADMIN, CAP_SETUID, and
296CAP_SETGID.
diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt
index 86eabe6c3419..605b0d40329d 100644
--- a/Documentation/scheduler/sched-rt-group.txt
+++ b/Documentation/scheduler/sched-rt-group.txt
@@ -126,23 +126,12 @@ priority!
1262.3 Basis for grouping tasks 1262.3 Basis for grouping tasks
127---------------------------- 127----------------------------
128 128
129There are two compile-time settings for allocating CPU bandwidth. These are 129Enabling CONFIG_RT_GROUP_SCHED lets you explicitly allocate real
130configured using the "Basis for grouping tasks" multiple choice menu under 130CPU bandwidth to task groups.
131General setup > Group CPU Scheduler:
132
133a. CONFIG_USER_SCHED (aka "Basis for grouping tasks" = "user id")
134
135This lets you use the virtual files under
136"/sys/kernel/uids/<uid>/cpu_rt_runtime_us" to control he CPU time reserved for
137each user .
138
139The other option is:
140
141.o CONFIG_CGROUP_SCHED (aka "Basis for grouping tasks" = "Control groups")
142 131
143This uses the /cgroup virtual file system and 132This uses the /cgroup virtual file system and
144"/cgroup/<cgroup>/cpu.rt_runtime_us" to control the CPU time reserved for each 133"/cgroup/<cgroup>/cpu.rt_runtime_us" to control the CPU time reserved for each
145control group instead. 134control group.
146 135
147For more information on working with control groups, you should read 136For more information on working with control groups, you should read
148Documentation/cgroups/cgroups.txt as well. 137Documentation/cgroups/cgroups.txt as well.
@@ -161,8 +150,7 @@ For now, this can be simplified to just the following (but see Future plans):
161=============== 150===============
162 151
163There is work in progress to make the scheduling period for each group 152There is work in progress to make the scheduling period for each group
164("/sys/kernel/uids/<uid>/cpu_rt_period_us" or 153("/cgroup/<cgroup>/cpu.rt_period_us") configurable as well.
165"/cgroup/<cgroup>/cpu.rt_period_us" respectively) configurable as well.
166 154
167The constraint on the period is that a subgroup must have a smaller or 155The constraint on the period is that a subgroup must have a smaller or
168equal period to its parent. But realistically its not very useful _yet_ 156equal period to its parent. But realistically its not very useful _yet_
diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt
index 02ac6ed38b2d..778ddf38b82c 100644
--- a/Documentation/trace/events.txt
+++ b/Documentation/trace/events.txt
@@ -90,7 +90,8 @@ In order to facilitate early boot debugging, use boot option:
90 90
91 trace_event=[event-list] 91 trace_event=[event-list]
92 92
93The format of this boot option is the same as described in section 2.1. 93event-list is a comma separated list of events. See section 2.1 for event
94format.
94 95
953. Defining an event-enabled tracepoint 963. Defining an event-enabled tracepoint
96======================================= 97=======================================
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index 03485bfbd797..557c1edeccaf 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -155,6 +155,9 @@ of ftrace. Here is a list of some of the key files:
155 to be traced. Echoing names of functions into this file 155 to be traced. Echoing names of functions into this file
156 will limit the trace to only those functions. 156 will limit the trace to only those functions.
157 157
158 This interface also allows for commands to be used. See the
159 "Filter commands" section for more details.
160
158 set_ftrace_notrace: 161 set_ftrace_notrace:
159 162
160 This has an effect opposite to that of 163 This has an effect opposite to that of
@@ -1337,12 +1340,14 @@ ftrace_dump_on_oops must be set. To set ftrace_dump_on_oops, one
1337can either use the sysctl function or set it via the proc system 1340can either use the sysctl function or set it via the proc system
1338interface. 1341interface.
1339 1342
1340 sysctl kernel.ftrace_dump_on_oops=1 1343 sysctl kernel.ftrace_dump_on_oops=n
1341 1344
1342or 1345or
1343 1346
1344 echo 1 > /proc/sys/kernel/ftrace_dump_on_oops 1347 echo n > /proc/sys/kernel/ftrace_dump_on_oops
1345 1348
1349If n = 1, ftrace will dump buffers of all CPUs, if n = 2 ftrace will
1350only dump the buffer of the CPU that triggered the oops.
1346 1351
1347Here's an example of such a dump after a null pointer 1352Here's an example of such a dump after a null pointer
1348dereference in a kernel module: 1353dereference in a kernel module:
@@ -1822,6 +1827,47 @@ this special filter via:
1822 echo > set_graph_function 1827 echo > set_graph_function
1823 1828
1824 1829
1830Filter commands
1831---------------
1832
1833A few commands are supported by the set_ftrace_filter interface.
1834Trace commands have the following format:
1835
1836<function>:<command>:<parameter>
1837
1838The following commands are supported:
1839
1840- mod
1841 This command enables function filtering per module. The
1842 parameter defines the module. For example, if only the write*
1843 functions in the ext3 module are desired, run:
1844
1845 echo 'write*:mod:ext3' > set_ftrace_filter
1846
1847 This command interacts with the filter in the same way as
1848 filtering based on function names. Thus, adding more functions
1849 in a different module is accomplished by appending (>>) to the
1850 filter file. Remove specific module functions by prepending
1851 '!':
1852
1853 echo '!writeback*:mod:ext3' >> set_ftrace_filter
1854
1855- traceon/traceoff
1856 These commands turn tracing on and off when the specified
1857 functions are hit. The parameter determines how many times the
1858 tracing system is turned on and off. If unspecified, there is
1859 no limit. For example, to disable tracing when a schedule bug
1860 is hit the first 5 times, run:
1861
1862 echo '__schedule_bug:traceoff:5' > set_ftrace_filter
1863
1864 These commands are cumulative whether or not they are appended
1865 to set_ftrace_filter. To remove a command, prepend it by '!'
1866 and drop the parameter:
1867
1868 echo '!__schedule_bug:traceoff' > set_ftrace_filter
1869
1870
1825trace_pipe 1871trace_pipe
1826---------- 1872----------
1827 1873
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index d906bf19c14a..a2163c95eb98 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -391,7 +391,6 @@ static void __init time_init_wq(void)
391 if (time_sync_wq) 391 if (time_sync_wq)
392 return; 392 return;
393 time_sync_wq = create_singlethread_workqueue("timesync"); 393 time_sync_wq = create_singlethread_workqueue("timesync");
394 stop_machine_create();
395} 394}
396 395
397/* 396/*
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index 59de2525d303..d4e8b213a462 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -289,7 +289,7 @@ static struct sysrq_key_op sysrq_showstate_blocked_op = {
289 289
290static void sysrq_ftrace_dump(int key, struct tty_struct *tty) 290static void sysrq_ftrace_dump(int key, struct tty_struct *tty)
291{ 291{
292 ftrace_dump(); 292 ftrace_dump(DUMP_ALL);
293} 293}
294static struct sysrq_key_op sysrq_ftrace_dump_op = { 294static struct sysrq_key_op sysrq_ftrace_dump_op = {
295 .handler = sysrq_ftrace_dump, 295 .handler = sysrq_ftrace_dump,
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index bd444dc93cf2..8e9dbdc6c700 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -73,6 +73,7 @@ enum {DBS_NORMAL_SAMPLE, DBS_SUB_SAMPLE};
73 73
74struct cpu_dbs_info_s { 74struct cpu_dbs_info_s {
75 cputime64_t prev_cpu_idle; 75 cputime64_t prev_cpu_idle;
76 cputime64_t prev_cpu_iowait;
76 cputime64_t prev_cpu_wall; 77 cputime64_t prev_cpu_wall;
77 cputime64_t prev_cpu_nice; 78 cputime64_t prev_cpu_nice;
78 struct cpufreq_policy *cur_policy; 79 struct cpufreq_policy *cur_policy;
@@ -108,6 +109,7 @@ static struct dbs_tuners {
108 unsigned int down_differential; 109 unsigned int down_differential;
109 unsigned int ignore_nice; 110 unsigned int ignore_nice;
110 unsigned int powersave_bias; 111 unsigned int powersave_bias;
112 unsigned int io_is_busy;
111} dbs_tuners_ins = { 113} dbs_tuners_ins = {
112 .up_threshold = DEF_FREQUENCY_UP_THRESHOLD, 114 .up_threshold = DEF_FREQUENCY_UP_THRESHOLD,
113 .down_differential = DEF_FREQUENCY_DOWN_DIFFERENTIAL, 115 .down_differential = DEF_FREQUENCY_DOWN_DIFFERENTIAL,
@@ -148,6 +150,16 @@ static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall)
148 return idle_time; 150 return idle_time;
149} 151}
150 152
153static inline cputime64_t get_cpu_iowait_time(unsigned int cpu, cputime64_t *wall)
154{
155 u64 iowait_time = get_cpu_iowait_time_us(cpu, wall);
156
157 if (iowait_time == -1ULL)
158 return 0;
159
160 return iowait_time;
161}
162
151/* 163/*
152 * Find right freq to be set now with powersave_bias on. 164 * Find right freq to be set now with powersave_bias on.
153 * Returns the freq_hi to be used right now and will set freq_hi_jiffies, 165 * Returns the freq_hi to be used right now and will set freq_hi_jiffies,
@@ -249,6 +261,7 @@ static ssize_t show_##file_name \
249 return sprintf(buf, "%u\n", dbs_tuners_ins.object); \ 261 return sprintf(buf, "%u\n", dbs_tuners_ins.object); \
250} 262}
251show_one(sampling_rate, sampling_rate); 263show_one(sampling_rate, sampling_rate);
264show_one(io_is_busy, io_is_busy);
252show_one(up_threshold, up_threshold); 265show_one(up_threshold, up_threshold);
253show_one(ignore_nice_load, ignore_nice); 266show_one(ignore_nice_load, ignore_nice);
254show_one(powersave_bias, powersave_bias); 267show_one(powersave_bias, powersave_bias);
@@ -299,6 +312,23 @@ static ssize_t store_sampling_rate(struct kobject *a, struct attribute *b,
299 return count; 312 return count;
300} 313}
301 314
315static ssize_t store_io_is_busy(struct kobject *a, struct attribute *b,
316 const char *buf, size_t count)
317{
318 unsigned int input;
319 int ret;
320
321 ret = sscanf(buf, "%u", &input);
322 if (ret != 1)
323 return -EINVAL;
324
325 mutex_lock(&dbs_mutex);
326 dbs_tuners_ins.io_is_busy = !!input;
327 mutex_unlock(&dbs_mutex);
328
329 return count;
330}
331
302static ssize_t store_up_threshold(struct kobject *a, struct attribute *b, 332static ssize_t store_up_threshold(struct kobject *a, struct attribute *b,
303 const char *buf, size_t count) 333 const char *buf, size_t count)
304{ 334{
@@ -381,6 +411,7 @@ static struct global_attr _name = \
381__ATTR(_name, 0644, show_##_name, store_##_name) 411__ATTR(_name, 0644, show_##_name, store_##_name)
382 412
383define_one_rw(sampling_rate); 413define_one_rw(sampling_rate);
414define_one_rw(io_is_busy);
384define_one_rw(up_threshold); 415define_one_rw(up_threshold);
385define_one_rw(ignore_nice_load); 416define_one_rw(ignore_nice_load);
386define_one_rw(powersave_bias); 417define_one_rw(powersave_bias);
@@ -392,6 +423,7 @@ static struct attribute *dbs_attributes[] = {
392 &up_threshold.attr, 423 &up_threshold.attr,
393 &ignore_nice_load.attr, 424 &ignore_nice_load.attr,
394 &powersave_bias.attr, 425 &powersave_bias.attr,
426 &io_is_busy.attr,
395 NULL 427 NULL
396}; 428};
397 429
@@ -470,14 +502,15 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
470 502
471 for_each_cpu(j, policy->cpus) { 503 for_each_cpu(j, policy->cpus) {
472 struct cpu_dbs_info_s *j_dbs_info; 504 struct cpu_dbs_info_s *j_dbs_info;
473 cputime64_t cur_wall_time, cur_idle_time; 505 cputime64_t cur_wall_time, cur_idle_time, cur_iowait_time;
474 unsigned int idle_time, wall_time; 506 unsigned int idle_time, wall_time, iowait_time;
475 unsigned int load, load_freq; 507 unsigned int load, load_freq;
476 int freq_avg; 508 int freq_avg;
477 509
478 j_dbs_info = &per_cpu(od_cpu_dbs_info, j); 510 j_dbs_info = &per_cpu(od_cpu_dbs_info, j);
479 511
480 cur_idle_time = get_cpu_idle_time(j, &cur_wall_time); 512 cur_idle_time = get_cpu_idle_time(j, &cur_wall_time);
513 cur_iowait_time = get_cpu_iowait_time(j, &cur_wall_time);
481 514
482 wall_time = (unsigned int) cputime64_sub(cur_wall_time, 515 wall_time = (unsigned int) cputime64_sub(cur_wall_time,
483 j_dbs_info->prev_cpu_wall); 516 j_dbs_info->prev_cpu_wall);
@@ -487,6 +520,10 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
487 j_dbs_info->prev_cpu_idle); 520 j_dbs_info->prev_cpu_idle);
488 j_dbs_info->prev_cpu_idle = cur_idle_time; 521 j_dbs_info->prev_cpu_idle = cur_idle_time;
489 522
523 iowait_time = (unsigned int) cputime64_sub(cur_iowait_time,
524 j_dbs_info->prev_cpu_iowait);
525 j_dbs_info->prev_cpu_iowait = cur_iowait_time;
526
490 if (dbs_tuners_ins.ignore_nice) { 527 if (dbs_tuners_ins.ignore_nice) {
491 cputime64_t cur_nice; 528 cputime64_t cur_nice;
492 unsigned long cur_nice_jiffies; 529 unsigned long cur_nice_jiffies;
@@ -504,6 +541,16 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
504 idle_time += jiffies_to_usecs(cur_nice_jiffies); 541 idle_time += jiffies_to_usecs(cur_nice_jiffies);
505 } 542 }
506 543
544 /*
545 * For the purpose of ondemand, waiting for disk IO is an
546 * indication that you're performance critical, and not that
547 * the system is actually idle. So subtract the iowait time
548 * from the cpu idle time.
549 */
550
551 if (dbs_tuners_ins.io_is_busy && idle_time >= iowait_time)
552 idle_time -= iowait_time;
553
507 if (unlikely(!wall_time || wall_time < idle_time)) 554 if (unlikely(!wall_time || wall_time < idle_time))
508 continue; 555 continue;
509 556
@@ -617,6 +664,29 @@ static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info)
617 cancel_delayed_work_sync(&dbs_info->work); 664 cancel_delayed_work_sync(&dbs_info->work);
618} 665}
619 666
667/*
668 * Not all CPUs want IO time to be accounted as busy; this depends on how
669 * efficient idling at a higher frequency/voltage is.
670 * Pavel Machek says this is not so for various generations of AMD and old
671 * Intel systems.
672 * Mike Chan (androidlcom) claims this is also not true for ARM.
673 * Because of this, whitelist specific known (series) of CPUs by default, and
674 * leave all others up to the user.
675 */
676static int should_io_be_busy(void)
677{
678#if defined(CONFIG_X86)
679 /*
680 * For Intel, Core 2 (model 15) and later have an efficient idle.
681 */
682 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
683 boot_cpu_data.x86 == 6 &&
684 boot_cpu_data.x86_model >= 15)
685 return 1;
686#endif
687 return 0;
688}
689
620static int cpufreq_governor_dbs(struct cpufreq_policy *policy, 690static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
621 unsigned int event) 691 unsigned int event)
622{ 692{
@@ -679,6 +749,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
679 dbs_tuners_ins.sampling_rate = 749 dbs_tuners_ins.sampling_rate =
680 max(min_sampling_rate, 750 max(min_sampling_rate,
681 latency * LATENCY_MULTIPLIER); 751 latency * LATENCY_MULTIPLIER);
752 dbs_tuners_ins.io_is_busy = should_io_be_busy();
682 } 753 }
683 mutex_unlock(&dbs_mutex); 754 mutex_unlock(&dbs_mutex);
684 755
diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c
index 166b67ea622f..7581dbe456da 100644
--- a/drivers/oprofile/cpu_buffer.c
+++ b/drivers/oprofile/cpu_buffer.c
@@ -186,14 +186,14 @@ int op_cpu_buffer_write_commit(struct op_entry *entry)
186struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu) 186struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu)
187{ 187{
188 struct ring_buffer_event *e; 188 struct ring_buffer_event *e;
189 e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL); 189 e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL, NULL);
190 if (e) 190 if (e)
191 goto event; 191 goto event;
192 if (ring_buffer_swap_cpu(op_ring_buffer_read, 192 if (ring_buffer_swap_cpu(op_ring_buffer_read,
193 op_ring_buffer_write, 193 op_ring_buffer_write,
194 cpu)) 194 cpu))
195 return NULL; 195 return NULL;
196 e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL); 196 e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL, NULL);
197 if (e) 197 if (e)
198 goto event; 198 goto event;
199 return NULL; 199 return NULL;
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 2ac4440e7b08..8943b8ccee1a 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -80,12 +80,6 @@ static void do_suspend(void)
80 80
81 shutting_down = SHUTDOWN_SUSPEND; 81 shutting_down = SHUTDOWN_SUSPEND;
82 82
83 err = stop_machine_create();
84 if (err) {
85 printk(KERN_ERR "xen suspend: failed to setup stop_machine %d\n", err);
86 goto out;
87 }
88
89#ifdef CONFIG_PREEMPT 83#ifdef CONFIG_PREEMPT
90 /* If the kernel is preemptible, we need to freeze all the processes 84 /* If the kernel is preemptible, we need to freeze all the processes
91 to prevent them from being in the middle of a pagetable update 85 to prevent them from being in the middle of a pagetable update
@@ -93,7 +87,7 @@ static void do_suspend(void)
93 err = freeze_processes(); 87 err = freeze_processes();
94 if (err) { 88 if (err) {
95 printk(KERN_ERR "xen suspend: freeze failed %d\n", err); 89 printk(KERN_ERR "xen suspend: freeze failed %d\n", err);
96 goto out_destroy_sm; 90 goto out;
97 } 91 }
98#endif 92#endif
99 93
@@ -136,12 +130,8 @@ out_resume:
136out_thaw: 130out_thaw:
137#ifdef CONFIG_PREEMPT 131#ifdef CONFIG_PREEMPT
138 thaw_processes(); 132 thaw_processes();
139
140out_destroy_sm:
141#endif
142 stop_machine_destroy();
143
144out: 133out:
134#endif
145 shutting_down = SHUTDOWN_INVALID; 135 shutting_down = SHUTDOWN_INVALID;
146} 136}
147#endif /* CONFIG_PM_SLEEP */ 137#endif /* CONFIG_PM_SLEEP */
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index bd056a5b4efc..3817149919cb 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1140,8 +1140,7 @@ retry:
1140 * ep_poll_callback() when events will become available. 1140 * ep_poll_callback() when events will become available.
1141 */ 1141 */
1142 init_waitqueue_entry(&wait, current); 1142 init_waitqueue_entry(&wait, current);
1143 wait.flags |= WQ_FLAG_EXCLUSIVE; 1143 __add_wait_queue_exclusive(&ep->wq, &wait);
1144 __add_wait_queue(&ep->wq, &wait);
1145 1144
1146 for (;;) { 1145 for (;;) {
1147 /* 1146 /*
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index a5740fc4d04b..a73454aec333 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -21,8 +21,7 @@ extern int number_of_cpusets; /* How many cpusets are defined in system? */
21extern int cpuset_init(void); 21extern int cpuset_init(void);
22extern void cpuset_init_smp(void); 22extern void cpuset_init_smp(void);
23extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); 23extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
24extern void cpuset_cpus_allowed_locked(struct task_struct *p, 24extern int cpuset_cpus_allowed_fallback(struct task_struct *p);
25 struct cpumask *mask);
26extern nodemask_t cpuset_mems_allowed(struct task_struct *p); 25extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
27#define cpuset_current_mems_allowed (current->mems_allowed) 26#define cpuset_current_mems_allowed (current->mems_allowed)
28void cpuset_init_current_mems_allowed(void); 27void cpuset_init_current_mems_allowed(void);
@@ -69,9 +68,6 @@ struct seq_file;
69extern void cpuset_task_status_allowed(struct seq_file *m, 68extern void cpuset_task_status_allowed(struct seq_file *m,
70 struct task_struct *task); 69 struct task_struct *task);
71 70
72extern void cpuset_lock(void);
73extern void cpuset_unlock(void);
74
75extern int cpuset_mem_spread_node(void); 71extern int cpuset_mem_spread_node(void);
76 72
77static inline int cpuset_do_page_mem_spread(void) 73static inline int cpuset_do_page_mem_spread(void)
@@ -105,10 +101,11 @@ static inline void cpuset_cpus_allowed(struct task_struct *p,
105{ 101{
106 cpumask_copy(mask, cpu_possible_mask); 102 cpumask_copy(mask, cpu_possible_mask);
107} 103}
108static inline void cpuset_cpus_allowed_locked(struct task_struct *p, 104
109 struct cpumask *mask) 105static inline int cpuset_cpus_allowed_fallback(struct task_struct *p)
110{ 106{
111 cpumask_copy(mask, cpu_possible_mask); 107 cpumask_copy(&p->cpus_allowed, cpu_possible_mask);
108 return cpumask_any(cpu_active_mask);
112} 109}
113 110
114static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) 111static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
@@ -157,9 +154,6 @@ static inline void cpuset_task_status_allowed(struct seq_file *m,
157{ 154{
158} 155}
159 156
160static inline void cpuset_lock(void) {}
161static inline void cpuset_unlock(void) {}
162
163static inline int cpuset_mem_spread_node(void) 157static inline int cpuset_mem_spread_node(void)
164{ 158{
165 return 0; 159 return 0;
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index cc12b3c556b3..41e46330d9be 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -82,9 +82,13 @@ void clear_ftrace_function(void);
82extern void ftrace_stub(unsigned long a0, unsigned long a1); 82extern void ftrace_stub(unsigned long a0, unsigned long a1);
83 83
84#else /* !CONFIG_FUNCTION_TRACER */ 84#else /* !CONFIG_FUNCTION_TRACER */
85# define register_ftrace_function(ops) do { } while (0) 85/*
86# define unregister_ftrace_function(ops) do { } while (0) 86 * (un)register_ftrace_function must be a macro since the ops parameter
87# define clear_ftrace_function(ops) do { } while (0) 87 * must not be evaluated.
88 */
89#define register_ftrace_function(ops) ({ 0; })
90#define unregister_ftrace_function(ops) ({ 0; })
91static inline void clear_ftrace_function(void) { }
88static inline void ftrace_kill(void) { } 92static inline void ftrace_kill(void) { }
89static inline void ftrace_stop(void) { } 93static inline void ftrace_stop(void) { }
90static inline void ftrace_start(void) { } 94static inline void ftrace_start(void) { }
@@ -237,11 +241,13 @@ extern int skip_trace(unsigned long ip);
237extern void ftrace_disable_daemon(void); 241extern void ftrace_disable_daemon(void);
238extern void ftrace_enable_daemon(void); 242extern void ftrace_enable_daemon(void);
239#else 243#else
240# define skip_trace(ip) ({ 0; }) 244static inline int skip_trace(unsigned long ip) { return 0; }
241# define ftrace_force_update() ({ 0; }) 245static inline int ftrace_force_update(void) { return 0; }
242# define ftrace_set_filter(buf, len, reset) do { } while (0) 246static inline void ftrace_set_filter(unsigned char *buf, int len, int reset)
243# define ftrace_disable_daemon() do { } while (0) 247{
244# define ftrace_enable_daemon() do { } while (0) 248}
249static inline void ftrace_disable_daemon(void) { }
250static inline void ftrace_enable_daemon(void) { }
245static inline void ftrace_release_mod(struct module *mod) {} 251static inline void ftrace_release_mod(struct module *mod) {}
246static inline int register_ftrace_command(struct ftrace_func_command *cmd) 252static inline int register_ftrace_command(struct ftrace_func_command *cmd)
247{ 253{
@@ -314,16 +320,16 @@ static inline void __ftrace_enabled_restore(int enabled)
314 extern void time_hardirqs_on(unsigned long a0, unsigned long a1); 320 extern void time_hardirqs_on(unsigned long a0, unsigned long a1);
315 extern void time_hardirqs_off(unsigned long a0, unsigned long a1); 321 extern void time_hardirqs_off(unsigned long a0, unsigned long a1);
316#else 322#else
317# define time_hardirqs_on(a0, a1) do { } while (0) 323 static inline void time_hardirqs_on(unsigned long a0, unsigned long a1) { }
318# define time_hardirqs_off(a0, a1) do { } while (0) 324 static inline void time_hardirqs_off(unsigned long a0, unsigned long a1) { }
319#endif 325#endif
320 326
321#ifdef CONFIG_PREEMPT_TRACER 327#ifdef CONFIG_PREEMPT_TRACER
322 extern void trace_preempt_on(unsigned long a0, unsigned long a1); 328 extern void trace_preempt_on(unsigned long a0, unsigned long a1);
323 extern void trace_preempt_off(unsigned long a0, unsigned long a1); 329 extern void trace_preempt_off(unsigned long a0, unsigned long a1);
324#else 330#else
325# define trace_preempt_on(a0, a1) do { } while (0) 331 static inline void trace_preempt_on(unsigned long a0, unsigned long a1) { }
326# define trace_preempt_off(a0, a1) do { } while (0) 332 static inline void trace_preempt_off(unsigned long a0, unsigned long a1) { }
327#endif 333#endif
328 334
329#ifdef CONFIG_FTRACE_MCOUNT_RECORD 335#ifdef CONFIG_FTRACE_MCOUNT_RECORD
@@ -352,6 +358,10 @@ struct ftrace_graph_ret {
352 int depth; 358 int depth;
353}; 359};
354 360
361/* Type of the callback handlers for tracing function graph*/
362typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *); /* return */
363typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */
364
355#ifdef CONFIG_FUNCTION_GRAPH_TRACER 365#ifdef CONFIG_FUNCTION_GRAPH_TRACER
356 366
357/* for init task */ 367/* for init task */
@@ -400,10 +410,6 @@ extern char __irqentry_text_end[];
400 410
401#define FTRACE_RETFUNC_DEPTH 50 411#define FTRACE_RETFUNC_DEPTH 50
402#define FTRACE_RETSTACK_ALLOC_SIZE 32 412#define FTRACE_RETSTACK_ALLOC_SIZE 32
403/* Type of the callback handlers for tracing function graph*/
404typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *); /* return */
405typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */
406
407extern int register_ftrace_graph(trace_func_graph_ret_t retfunc, 413extern int register_ftrace_graph(trace_func_graph_ret_t retfunc,
408 trace_func_graph_ent_t entryfunc); 414 trace_func_graph_ent_t entryfunc);
409 415
@@ -441,6 +447,13 @@ static inline void unpause_graph_tracing(void)
441static inline void ftrace_graph_init_task(struct task_struct *t) { } 447static inline void ftrace_graph_init_task(struct task_struct *t) { }
442static inline void ftrace_graph_exit_task(struct task_struct *t) { } 448static inline void ftrace_graph_exit_task(struct task_struct *t) { }
443 449
450static inline int register_ftrace_graph(trace_func_graph_ret_t retfunc,
451 trace_func_graph_ent_t entryfunc)
452{
453 return -1;
454}
455static inline void unregister_ftrace_graph(void) { }
456
444static inline int task_curr_ret_stack(struct task_struct *tsk) 457static inline int task_curr_ret_stack(struct task_struct *tsk)
445{ 458{
446 return -1; 459 return -1;
@@ -492,7 +505,9 @@ static inline int test_tsk_trace_graph(struct task_struct *tsk)
492 return tsk->trace & TSK_TRACE_FL_GRAPH; 505 return tsk->trace & TSK_TRACE_FL_GRAPH;
493} 506}
494 507
495extern int ftrace_dump_on_oops; 508enum ftrace_dump_mode;
509
510extern enum ftrace_dump_mode ftrace_dump_on_oops;
496 511
497#ifdef CONFIG_PREEMPT 512#ifdef CONFIG_PREEMPT
498#define INIT_TRACE_RECURSION .trace_recursion = 0, 513#define INIT_TRACE_RECURSION .trace_recursion = 0,
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index c0f4b364c711..dc7fc646fa2e 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -58,6 +58,7 @@ struct trace_iterator {
58 /* The below is zeroed out in pipe_read */ 58 /* The below is zeroed out in pipe_read */
59 struct trace_seq seq; 59 struct trace_seq seq;
60 struct trace_entry *ent; 60 struct trace_entry *ent;
61 unsigned long lost_events;
61 int leftover; 62 int leftover;
62 int cpu; 63 int cpu;
63 u64 ts; 64 u64 ts;
@@ -69,18 +70,25 @@ struct trace_iterator {
69}; 70};
70 71
71 72
73struct trace_event;
74
72typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, 75typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter,
73 int flags); 76 int flags, struct trace_event *event);
74struct trace_event { 77
75 struct hlist_node node; 78struct trace_event_functions {
76 struct list_head list;
77 int type;
78 trace_print_func trace; 79 trace_print_func trace;
79 trace_print_func raw; 80 trace_print_func raw;
80 trace_print_func hex; 81 trace_print_func hex;
81 trace_print_func binary; 82 trace_print_func binary;
82}; 83};
83 84
85struct trace_event {
86 struct hlist_node node;
87 struct list_head list;
88 int type;
89 struct trace_event_functions *funcs;
90};
91
84extern int register_ftrace_event(struct trace_event *event); 92extern int register_ftrace_event(struct trace_event *event);
85extern int unregister_ftrace_event(struct trace_event *event); 93extern int unregister_ftrace_event(struct trace_event *event);
86 94
@@ -112,28 +120,67 @@ void tracing_record_cmdline(struct task_struct *tsk);
112 120
113struct event_filter; 121struct event_filter;
114 122
123enum trace_reg {
124 TRACE_REG_REGISTER,
125 TRACE_REG_UNREGISTER,
126 TRACE_REG_PERF_REGISTER,
127 TRACE_REG_PERF_UNREGISTER,
128};
129
130struct ftrace_event_call;
131
132struct ftrace_event_class {
133 char *system;
134 void *probe;
135#ifdef CONFIG_PERF_EVENTS
136 void *perf_probe;
137#endif
138 int (*reg)(struct ftrace_event_call *event,
139 enum trace_reg type);
140 int (*define_fields)(struct ftrace_event_call *);
141 struct list_head *(*get_fields)(struct ftrace_event_call *);
142 struct list_head fields;
143 int (*raw_init)(struct ftrace_event_call *);
144};
145
146enum {
147 TRACE_EVENT_FL_ENABLED_BIT,
148 TRACE_EVENT_FL_FILTERED_BIT,
149};
150
151enum {
152 TRACE_EVENT_FL_ENABLED = (1 << TRACE_EVENT_FL_ENABLED_BIT),
153 TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT),
154};
155
115struct ftrace_event_call { 156struct ftrace_event_call {
116 struct list_head list; 157 struct list_head list;
158 struct ftrace_event_class *class;
117 char *name; 159 char *name;
118 char *system;
119 struct dentry *dir; 160 struct dentry *dir;
120 struct trace_event *event; 161 struct trace_event event;
121 int enabled;
122 int (*regfunc)(struct ftrace_event_call *);
123 void (*unregfunc)(struct ftrace_event_call *);
124 int id;
125 const char *print_fmt; 162 const char *print_fmt;
126 int (*raw_init)(struct ftrace_event_call *);
127 int (*define_fields)(struct ftrace_event_call *);
128 struct list_head fields;
129 int filter_active;
130 struct event_filter *filter; 163 struct event_filter *filter;
131 void *mod; 164 void *mod;
132 void *data; 165 void *data;
133 166
167 /*
168 * 32 bit flags:
169 * bit 1: enabled
170 * bit 2: filter_active
171 *
172 * Changes to flags must hold the event_mutex.
173 *
174 * Note: Reads of flags do not hold the event_mutex since
175 * they occur in critical sections. But the way flags
176 * is currently used, these changes do not affect the code
177 * except that when a change is made, it may have a slight
178 * delay in propagating the changes to other CPUs due to
179 * caching and such.
180 */
181 unsigned int flags;
182
134 int perf_refcount; 183 int perf_refcount;
135 int (*perf_event_enable)(struct ftrace_event_call *);
136 void (*perf_event_disable)(struct ftrace_event_call *);
137}; 184};
138 185
139#define PERF_MAX_TRACE_SIZE 2048 186#define PERF_MAX_TRACE_SIZE 2048
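
The reworked layout splits what used to live in one large ftrace_event_call: shared callbacks move into an ftrace_event_class, the print helpers into trace_event_functions, and the per-event struct keeps only a class pointer plus an embedded trace_event (the syscalls.h hunk further below is wired exactly this way). A minimal sketch of a hand-rolled event under the new layout follows; every name prefixed foo_ is illustrative, not something this patch defines:

	static struct trace_event_functions foo_print_funcs = {
		.trace		= foo_trace_output,	/* assumed output callback */
	};

	static struct ftrace_event_class foo_class = {
		.system		= "foo",
		.reg		= foo_reg,		/* services enum trace_reg requests */
		.define_fields	= foo_define_fields,
	};

	static struct ftrace_event_call foo_call = {
		.name		= "foo_bar",
		.class		= &foo_class,
		.event.funcs	= &foo_print_funcs,
	};
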
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 9365227dbaf6..9fb1c1299032 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -490,6 +490,13 @@ static inline void tracing_off(void) { }
490static inline void tracing_off_permanent(void) { } 490static inline void tracing_off_permanent(void) { }
491static inline int tracing_is_on(void) { return 0; } 491static inline int tracing_is_on(void) { return 0; }
492#endif 492#endif
493
494enum ftrace_dump_mode {
495 DUMP_NONE,
496 DUMP_ALL,
497 DUMP_ORIG,
498};
499
493#ifdef CONFIG_TRACING 500#ifdef CONFIG_TRACING
494extern void tracing_start(void); 501extern void tracing_start(void);
495extern void tracing_stop(void); 502extern void tracing_stop(void);
@@ -571,7 +578,7 @@ __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap);
571extern int 578extern int
572__ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); 579__ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap);
573 580
574extern void ftrace_dump(void); 581extern void ftrace_dump(enum ftrace_dump_mode oops_dump_mode);
575#else 582#else
576static inline void 583static inline void
577ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } 584ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { }
@@ -592,7 +599,7 @@ ftrace_vprintk(const char *fmt, va_list ap)
592{ 599{
593 return 0; 600 return 0;
594} 601}
595static inline void ftrace_dump(void) { } 602static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
596#endif /* CONFIG_TRACING */ 603#endif /* CONFIG_TRACING */
597 604
598/* 605/*
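
With ftrace_dump() now taking an ftrace_dump_mode, a caller decides whether to dump every CPU's buffer or only the buffer of the CPU that hit the problem (the sysrq handler above passes DUMP_ALL). A minimal sketch, assuming CONFIG_TRACING and a debug path of your own; the helper name is hypothetical:

	#include <linux/kernel.h>

	/* Hypothetical helper, not part of this patch. */
	static void my_oops_dump(bool all_cpus)
	{
		if (all_cpus)
			ftrace_dump(DUMP_ALL);	/* dump the ring buffer of every CPU */
		else
			ftrace_dump(DUMP_ORIG);	/* only the CPU that triggered the dump */
	}
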
diff --git a/include/linux/module.h b/include/linux/module.h
index 515d53ae6a79..6914fcad4673 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -465,8 +465,7 @@ static inline void __module_get(struct module *module)
465 if (module) { 465 if (module) {
466 preempt_disable(); 466 preempt_disable();
467 __this_cpu_inc(module->refptr->incs); 467 __this_cpu_inc(module->refptr->incs);
468 trace_module_get(module, _THIS_IP_, 468 trace_module_get(module, _THIS_IP_);
469 __this_cpu_read(module->refptr->incs));
470 preempt_enable(); 469 preempt_enable();
471 } 470 }
472} 471}
@@ -480,8 +479,7 @@ static inline int try_module_get(struct module *module)
480 479
481 if (likely(module_is_live(module))) { 480 if (likely(module_is_live(module))) {
482 __this_cpu_inc(module->refptr->incs); 481 __this_cpu_inc(module->refptr->incs);
483 trace_module_get(module, _THIS_IP_, 482 trace_module_get(module, _THIS_IP_);
484 __this_cpu_read(module->refptr->incs));
485 } else 483 } else
486 ret = 0; 484 ret = 0;
487 485
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index a5195875480a..0006b2df00e1 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -60,8 +60,6 @@ static inline long rcu_batches_completed_bh(void)
60 return 0; 60 return 0;
61} 61}
62 62
63extern int rcu_expedited_torture_stats(char *page);
64
65static inline void rcu_force_quiescent_state(void) 63static inline void rcu_force_quiescent_state(void)
66{ 64{
67} 65}
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 42cc3a04779e..24e467e526b8 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -35,7 +35,6 @@ struct notifier_block;
35extern void rcu_sched_qs(int cpu); 35extern void rcu_sched_qs(int cpu);
36extern void rcu_bh_qs(int cpu); 36extern void rcu_bh_qs(int cpu);
37extern int rcu_needs_cpu(int cpu); 37extern int rcu_needs_cpu(int cpu);
38extern int rcu_expedited_torture_stats(char *page);
39 38
40#ifdef CONFIG_TREE_PREEMPT_RCU 39#ifdef CONFIG_TREE_PREEMPT_RCU
41 40
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 5fcc31ed5771..25b4f686d918 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -120,12 +120,16 @@ int ring_buffer_write(struct ring_buffer *buffer,
120 unsigned long length, void *data); 120 unsigned long length, void *data);
121 121
122struct ring_buffer_event * 122struct ring_buffer_event *
123ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts); 123ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
124 unsigned long *lost_events);
124struct ring_buffer_event * 125struct ring_buffer_event *
125ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts); 126ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
127 unsigned long *lost_events);
126 128
127struct ring_buffer_iter * 129struct ring_buffer_iter *
128ring_buffer_read_start(struct ring_buffer *buffer, int cpu); 130ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu);
131void ring_buffer_read_prepare_sync(void);
132void ring_buffer_read_start(struct ring_buffer_iter *iter);
129void ring_buffer_read_finish(struct ring_buffer_iter *iter); 133void ring_buffer_read_finish(struct ring_buffer_iter *iter);
130 134
131struct ring_buffer_event * 135struct ring_buffer_event *
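
The reader-side API picks up two changes here: consuming reads can report how many events were lost to overwrites since the previous read, and iterator setup is split into prepare / sync / start so that several per-CPU iterators can share one synchronization. A rough consumer sketch, assuming a buffer and cpu obtained elsewhere:

	struct ring_buffer_event *event;
	struct ring_buffer_iter *iter;
	unsigned long lost = 0;
	u64 ts;

	/* Consuming read; pass NULL as the last argument if lost-event
	 * accounting is not needed. */
	event = ring_buffer_consume(buffer, cpu, &ts, &lost);
	if (event && lost)
		pr_info("cpu %d lost %lu events\n", cpu, lost);

	/* Non-consuming iteration with the new three-step setup. */
	iter = ring_buffer_read_prepare(buffer, cpu);
	ring_buffer_read_prepare_sync();	/* one sync covers all prepared iterators */
	ring_buffer_read_start(iter);
	/* ... walk the buffer, e.g. with ring_buffer_read(iter, &ts) ... */
	ring_buffer_read_finish(iter);
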
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e0447c64af6a..2a5b146fbaf9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -274,11 +274,17 @@ extern cpumask_var_t nohz_cpu_mask;
274#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) 274#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
275extern int select_nohz_load_balancer(int cpu); 275extern int select_nohz_load_balancer(int cpu);
276extern int get_nohz_load_balancer(void); 276extern int get_nohz_load_balancer(void);
277extern int nohz_ratelimit(int cpu);
277#else 278#else
278static inline int select_nohz_load_balancer(int cpu) 279static inline int select_nohz_load_balancer(int cpu)
279{ 280{
280 return 0; 281 return 0;
281} 282}
283
284static inline int nohz_ratelimit(int cpu)
285{
286 return 0;
287}
282#endif 288#endif
283 289
284/* 290/*
@@ -953,6 +959,7 @@ struct sched_domain {
953 char *name; 959 char *name;
954#endif 960#endif
955 961
962 unsigned int span_weight;
956 /* 963 /*
957 * Span of all CPUs in this domain. 964 * Span of all CPUs in this domain.
958 * 965 *
@@ -1025,12 +1032,17 @@ struct sched_domain;
1025#define WF_SYNC 0x01 /* waker goes to sleep after wakup */ 1032#define WF_SYNC 0x01 /* waker goes to sleep after wakup */
1026#define WF_FORK 0x02 /* child wakeup after fork */ 1033#define WF_FORK 0x02 /* child wakeup after fork */
1027 1034
1035#define ENQUEUE_WAKEUP 1
1036#define ENQUEUE_WAKING 2
1037#define ENQUEUE_HEAD 4
1038
1039#define DEQUEUE_SLEEP 1
1040
1028struct sched_class { 1041struct sched_class {
1029 const struct sched_class *next; 1042 const struct sched_class *next;
1030 1043
1031 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup, 1044 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
1032 bool head); 1045 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
1033 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
1034 void (*yield_task) (struct rq *rq); 1046 void (*yield_task) (struct rq *rq);
1035 1047
1036 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); 1048 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
@@ -1039,7 +1051,8 @@ struct sched_class {
1039 void (*put_prev_task) (struct rq *rq, struct task_struct *p); 1051 void (*put_prev_task) (struct rq *rq, struct task_struct *p);
1040 1052
1041#ifdef CONFIG_SMP 1053#ifdef CONFIG_SMP
1042 int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); 1054 int (*select_task_rq)(struct rq *rq, struct task_struct *p,
1055 int sd_flag, int flags);
1043 1056
1044 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); 1057 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
1045 void (*post_schedule) (struct rq *this_rq); 1058 void (*post_schedule) (struct rq *this_rq);
@@ -1076,36 +1089,8 @@ struct load_weight {
1076 unsigned long weight, inv_weight; 1089 unsigned long weight, inv_weight;
1077}; 1090};
1078 1091
1079/*
1080 * CFS stats for a schedulable entity (task, task-group etc)
1081 *
1082 * Current field usage histogram:
1083 *
1084 * 4 se->block_start
1085 * 4 se->run_node
1086 * 4 se->sleep_start
1087 * 6 se->load.weight
1088 */
1089struct sched_entity {
1090 struct load_weight load; /* for load-balancing */
1091 struct rb_node run_node;
1092 struct list_head group_node;
1093 unsigned int on_rq;
1094
1095 u64 exec_start;
1096 u64 sum_exec_runtime;
1097 u64 vruntime;
1098 u64 prev_sum_exec_runtime;
1099
1100 u64 last_wakeup;
1101 u64 avg_overlap;
1102
1103 u64 nr_migrations;
1104
1105 u64 start_runtime;
1106 u64 avg_wakeup;
1107
1108#ifdef CONFIG_SCHEDSTATS 1092#ifdef CONFIG_SCHEDSTATS
1093struct sched_statistics {
1109 u64 wait_start; 1094 u64 wait_start;
1110 u64 wait_max; 1095 u64 wait_max;
1111 u64 wait_count; 1096 u64 wait_count;
@@ -1137,6 +1122,24 @@ struct sched_entity {
1137 u64 nr_wakeups_affine_attempts; 1122 u64 nr_wakeups_affine_attempts;
1138 u64 nr_wakeups_passive; 1123 u64 nr_wakeups_passive;
1139 u64 nr_wakeups_idle; 1124 u64 nr_wakeups_idle;
1125};
1126#endif
1127
1128struct sched_entity {
1129 struct load_weight load; /* for load-balancing */
1130 struct rb_node run_node;
1131 struct list_head group_node;
1132 unsigned int on_rq;
1133
1134 u64 exec_start;
1135 u64 sum_exec_runtime;
1136 u64 vruntime;
1137 u64 prev_sum_exec_runtime;
1138
1139 u64 nr_migrations;
1140
1141#ifdef CONFIG_SCHEDSTATS
1142 struct sched_statistics statistics;
1140#endif 1143#endif
1141 1144
1142#ifdef CONFIG_FAIR_GROUP_SCHED 1145#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1840,6 +1843,7 @@ extern void sched_clock_idle_sleep_event(void);
1840extern void sched_clock_idle_wakeup_event(u64 delta_ns); 1843extern void sched_clock_idle_wakeup_event(u64 delta_ns);
1841 1844
1842#ifdef CONFIG_HOTPLUG_CPU 1845#ifdef CONFIG_HOTPLUG_CPU
1846extern void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p);
1843extern void idle_task_exit(void); 1847extern void idle_task_exit(void);
1844#else 1848#else
1845static inline void idle_task_exit(void) {} 1849static inline void idle_task_exit(void) {}
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index baba3a23a814..6b524a0d02e4 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -1,13 +1,101 @@
1#ifndef _LINUX_STOP_MACHINE 1#ifndef _LINUX_STOP_MACHINE
2#define _LINUX_STOP_MACHINE 2#define _LINUX_STOP_MACHINE
3/* "Bogolock": stop the entire machine, disable interrupts. This is a 3
4 very heavy lock, which is equivalent to grabbing every spinlock
5 (and more). So the "read" side to such a lock is anything which
6 disables preeempt. */
7#include <linux/cpu.h> 4#include <linux/cpu.h>
8#include <linux/cpumask.h> 5#include <linux/cpumask.h>
6#include <linux/list.h>
9#include <asm/system.h> 7#include <asm/system.h>
10 8
9/*
10 * stop_cpu[s]() is simplistic per-cpu maximum priority cpu
11 * monopolization mechanism. The caller can specify a non-sleeping
12 * function to be executed on a single or multiple cpus preempting all
13 * other processes and monopolizing those cpus until it finishes.
14 *
15 * Resources for this mechanism are preallocated when a cpu is brought
16 * up and requests are guaranteed to be served as long as the target
17 * cpus are online.
18 */
19typedef int (*cpu_stop_fn_t)(void *arg);
20
21#ifdef CONFIG_SMP
22
23struct cpu_stop_work {
24 struct list_head list; /* cpu_stopper->works */
25 cpu_stop_fn_t fn;
26 void *arg;
27 struct cpu_stop_done *done;
28};
29
30int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg);
31void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
32 struct cpu_stop_work *work_buf);
33int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
34int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
35
36#else /* CONFIG_SMP */
37
38#include <linux/workqueue.h>
39
40struct cpu_stop_work {
41 struct work_struct work;
42 cpu_stop_fn_t fn;
43 void *arg;
44};
45
46static inline int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
47{
48 int ret = -ENOENT;
49 preempt_disable();
50 if (cpu == smp_processor_id())
51 ret = fn(arg);
52 preempt_enable();
53 return ret;
54}
55
56static void stop_one_cpu_nowait_workfn(struct work_struct *work)
57{
58 struct cpu_stop_work *stwork =
59 container_of(work, struct cpu_stop_work, work);
60 preempt_disable();
61 stwork->fn(stwork->arg);
62 preempt_enable();
63}
64
65static inline void stop_one_cpu_nowait(unsigned int cpu,
66 cpu_stop_fn_t fn, void *arg,
67 struct cpu_stop_work *work_buf)
68{
69 if (cpu == smp_processor_id()) {
70 INIT_WORK(&work_buf->work, stop_one_cpu_nowait_workfn);
71 work_buf->fn = fn;
72 work_buf->arg = arg;
73 schedule_work(&work_buf->work);
74 }
75}
76
77static inline int stop_cpus(const struct cpumask *cpumask,
78 cpu_stop_fn_t fn, void *arg)
79{
80 if (cpumask_test_cpu(raw_smp_processor_id(), cpumask))
81 return stop_one_cpu(raw_smp_processor_id(), fn, arg);
82 return -ENOENT;
83}
84
85static inline int try_stop_cpus(const struct cpumask *cpumask,
86 cpu_stop_fn_t fn, void *arg)
87{
88 return stop_cpus(cpumask, fn, arg);
89}
90
91#endif /* CONFIG_SMP */
92
93/*
94 * stop_machine "Bogolock": stop the entire machine, disable
95 * interrupts. This is a very heavy lock, which is equivalent to
96 * grabbing every spinlock (and more). So the "read" side to such a
97 * lock is anything which disables preempt.
98 */
11#if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP) 99#if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP)
12 100
13/** 101/**
@@ -36,24 +124,7 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus);
36 */ 124 */
37int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus); 125int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus);
38 126
39/** 127#else /* CONFIG_STOP_MACHINE && CONFIG_SMP */
40 * stop_machine_create: create all stop_machine threads
41 *
42 * Description: This causes all stop_machine threads to be created before
43 * stop_machine actually gets called. This can be used by subsystems that
44 * need a non failing stop_machine infrastructure.
45 */
46int stop_machine_create(void);
47
48/**
49 * stop_machine_destroy: destroy all stop_machine threads
50 *
51 * Description: This causes all stop_machine threads which were created with
52 * stop_machine_create to be destroyed again.
53 */
54void stop_machine_destroy(void);
55
56#else
57 128
58static inline int stop_machine(int (*fn)(void *), void *data, 129static inline int stop_machine(int (*fn)(void *), void *data,
59 const struct cpumask *cpus) 130 const struct cpumask *cpus)
@@ -65,8 +136,5 @@ static inline int stop_machine(int (*fn)(void *), void *data,
65 return ret; 136 return ret;
66} 137}
67 138
68static inline int stop_machine_create(void) { return 0; } 139#endif /* CONFIG_STOP_MACHINE && CONFIG_SMP */
69static inline void stop_machine_destroy(void) { } 140#endif /* _LINUX_STOP_MACHINE */
70
71#endif /* CONFIG_SMP */
72#endif /* _LINUX_STOP_MACHINE */
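
The header comment above describes the new cpu_stop facility; in practice a caller hands a non-sleeping callback to stop_one_cpu() and gets the callback's return value back once it has run with exclusive use of the target CPU. A minimal sketch with a made-up callback and data:

	/* Hypothetical cpu_stop_fn_t: must not sleep, runs while the
	 * target CPU is monopolized. */
	static int bump_counter(void *arg)
	{
		int *counter = arg;

		(*counter)++;
		return 0;	/* propagated as stop_one_cpu()'s return value */
	}

	static int run_on_cpu1(void)
	{
		int count = 0;

		/* Preempts everything else on CPU 1 until bump_counter() returns;
		 * fails if CPU 1 is not online. */
		return stop_one_cpu(1, bump_counter, &count);
	}
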
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 057929b0a651..a1a86a53bc73 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -103,22 +103,6 @@ struct perf_event_attr;
103#define __SC_TEST5(t5, a5, ...) __SC_TEST(t5); __SC_TEST4(__VA_ARGS__) 103#define __SC_TEST5(t5, a5, ...) __SC_TEST(t5); __SC_TEST4(__VA_ARGS__)
104#define __SC_TEST6(t6, a6, ...) __SC_TEST(t6); __SC_TEST5(__VA_ARGS__) 104#define __SC_TEST6(t6, a6, ...) __SC_TEST(t6); __SC_TEST5(__VA_ARGS__)
105 105
106#ifdef CONFIG_PERF_EVENTS
107
108#define TRACE_SYS_ENTER_PERF_INIT(sname) \
109 .perf_event_enable = perf_sysenter_enable, \
110 .perf_event_disable = perf_sysenter_disable,
111
112#define TRACE_SYS_EXIT_PERF_INIT(sname) \
113 .perf_event_enable = perf_sysexit_enable, \
114 .perf_event_disable = perf_sysexit_disable,
115#else
116#define TRACE_SYS_ENTER_PERF(sname)
117#define TRACE_SYS_ENTER_PERF_INIT(sname)
118#define TRACE_SYS_EXIT_PERF(sname)
119#define TRACE_SYS_EXIT_PERF_INIT(sname)
120#endif /* CONFIG_PERF_EVENTS */
121
122#ifdef CONFIG_FTRACE_SYSCALLS 106#ifdef CONFIG_FTRACE_SYSCALLS
123#define __SC_STR_ADECL1(t, a) #a 107#define __SC_STR_ADECL1(t, a) #a
124#define __SC_STR_ADECL2(t, a, ...) #a, __SC_STR_ADECL1(__VA_ARGS__) 108#define __SC_STR_ADECL2(t, a, ...) #a, __SC_STR_ADECL1(__VA_ARGS__)
@@ -134,54 +118,43 @@ struct perf_event_attr;
134#define __SC_STR_TDECL5(t, a, ...) #t, __SC_STR_TDECL4(__VA_ARGS__) 118#define __SC_STR_TDECL5(t, a, ...) #t, __SC_STR_TDECL4(__VA_ARGS__)
135#define __SC_STR_TDECL6(t, a, ...) #t, __SC_STR_TDECL5(__VA_ARGS__) 119#define __SC_STR_TDECL6(t, a, ...) #t, __SC_STR_TDECL5(__VA_ARGS__)
136 120
121extern struct ftrace_event_class event_class_syscall_enter;
122extern struct ftrace_event_class event_class_syscall_exit;
123extern struct trace_event_functions enter_syscall_print_funcs;
124extern struct trace_event_functions exit_syscall_print_funcs;
125
137#define SYSCALL_TRACE_ENTER_EVENT(sname) \ 126#define SYSCALL_TRACE_ENTER_EVENT(sname) \
138 static const struct syscall_metadata __syscall_meta_##sname; \ 127 static struct syscall_metadata __syscall_meta_##sname; \
139 static struct ftrace_event_call \ 128 static struct ftrace_event_call \
140 __attribute__((__aligned__(4))) event_enter_##sname; \ 129 __attribute__((__aligned__(4))) event_enter_##sname; \
141 static struct trace_event enter_syscall_print_##sname = { \
142 .trace = print_syscall_enter, \
143 }; \
144 static struct ftrace_event_call __used \ 130 static struct ftrace_event_call __used \
145 __attribute__((__aligned__(4))) \ 131 __attribute__((__aligned__(4))) \
146 __attribute__((section("_ftrace_events"))) \ 132 __attribute__((section("_ftrace_events"))) \
147 event_enter_##sname = { \ 133 event_enter_##sname = { \
148 .name = "sys_enter"#sname, \ 134 .name = "sys_enter"#sname, \
149 .system = "syscalls", \ 135 .class = &event_class_syscall_enter, \
150 .event = &enter_syscall_print_##sname, \ 136 .event.funcs = &enter_syscall_print_funcs, \
151 .raw_init = init_syscall_trace, \
152 .define_fields = syscall_enter_define_fields, \
153 .regfunc = reg_event_syscall_enter, \
154 .unregfunc = unreg_event_syscall_enter, \
155 .data = (void *)&__syscall_meta_##sname,\ 137 .data = (void *)&__syscall_meta_##sname,\
156 TRACE_SYS_ENTER_PERF_INIT(sname) \
157 } 138 }
158 139
159#define SYSCALL_TRACE_EXIT_EVENT(sname) \ 140#define SYSCALL_TRACE_EXIT_EVENT(sname) \
160 static const struct syscall_metadata __syscall_meta_##sname; \ 141 static struct syscall_metadata __syscall_meta_##sname; \
161 static struct ftrace_event_call \ 142 static struct ftrace_event_call \
162 __attribute__((__aligned__(4))) event_exit_##sname; \ 143 __attribute__((__aligned__(4))) event_exit_##sname; \
163 static struct trace_event exit_syscall_print_##sname = { \
164 .trace = print_syscall_exit, \
165 }; \
166 static struct ftrace_event_call __used \ 144 static struct ftrace_event_call __used \
167 __attribute__((__aligned__(4))) \ 145 __attribute__((__aligned__(4))) \
168 __attribute__((section("_ftrace_events"))) \ 146 __attribute__((section("_ftrace_events"))) \
169 event_exit_##sname = { \ 147 event_exit_##sname = { \
170 .name = "sys_exit"#sname, \ 148 .name = "sys_exit"#sname, \
171 .system = "syscalls", \ 149 .class = &event_class_syscall_exit, \
172 .event = &exit_syscall_print_##sname, \ 150 .event.funcs = &exit_syscall_print_funcs, \
173 .raw_init = init_syscall_trace, \
174 .define_fields = syscall_exit_define_fields, \
175 .regfunc = reg_event_syscall_exit, \
176 .unregfunc = unreg_event_syscall_exit, \
177 .data = (void *)&__syscall_meta_##sname,\ 151 .data = (void *)&__syscall_meta_##sname,\
178 TRACE_SYS_EXIT_PERF_INIT(sname) \
179 } 152 }
180 153
181#define SYSCALL_METADATA(sname, nb) \ 154#define SYSCALL_METADATA(sname, nb) \
182 SYSCALL_TRACE_ENTER_EVENT(sname); \ 155 SYSCALL_TRACE_ENTER_EVENT(sname); \
183 SYSCALL_TRACE_EXIT_EVENT(sname); \ 156 SYSCALL_TRACE_EXIT_EVENT(sname); \
184 static const struct syscall_metadata __used \ 157 static struct syscall_metadata __used \
185 __attribute__((__aligned__(4))) \ 158 __attribute__((__aligned__(4))) \
186 __attribute__((section("__syscalls_metadata"))) \ 159 __attribute__((section("__syscalls_metadata"))) \
187 __syscall_meta_##sname = { \ 160 __syscall_meta_##sname = { \
@@ -191,12 +164,14 @@ struct perf_event_attr;
191 .args = args_##sname, \ 164 .args = args_##sname, \
192 .enter_event = &event_enter_##sname, \ 165 .enter_event = &event_enter_##sname, \
193 .exit_event = &event_exit_##sname, \ 166 .exit_event = &event_exit_##sname, \
167 .enter_fields = LIST_HEAD_INIT(__syscall_meta_##sname.enter_fields), \
168 .exit_fields = LIST_HEAD_INIT(__syscall_meta_##sname.exit_fields), \
194 }; 169 };
195 170
196#define SYSCALL_DEFINE0(sname) \ 171#define SYSCALL_DEFINE0(sname) \
197 SYSCALL_TRACE_ENTER_EVENT(_##sname); \ 172 SYSCALL_TRACE_ENTER_EVENT(_##sname); \
198 SYSCALL_TRACE_EXIT_EVENT(_##sname); \ 173 SYSCALL_TRACE_EXIT_EVENT(_##sname); \
199 static const struct syscall_metadata __used \ 174 static struct syscall_metadata __used \
200 __attribute__((__aligned__(4))) \ 175 __attribute__((__aligned__(4))) \
201 __attribute__((section("__syscalls_metadata"))) \ 176 __attribute__((section("__syscalls_metadata"))) \
202 __syscall_meta__##sname = { \ 177 __syscall_meta__##sname = { \
@@ -204,6 +179,8 @@ struct perf_event_attr;
204 .nb_args = 0, \ 179 .nb_args = 0, \
205 .enter_event = &event_enter__##sname, \ 180 .enter_event = &event_enter__##sname, \
206 .exit_event = &event_exit__##sname, \ 181 .exit_event = &event_exit__##sname, \
182 .enter_fields = LIST_HEAD_INIT(__syscall_meta__##sname.enter_fields), \
183 .exit_fields = LIST_HEAD_INIT(__syscall_meta__##sname.exit_fields), \
207 }; \ 184 }; \
208 asmlinkage long sys_##sname(void) 185 asmlinkage long sys_##sname(void)
209#else 186#else
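For reference, a hypothetical syscall definition showing what the slimmed-down macros now attach (assuming CONFIG_FTRACE_SYSCALLS; the syscall itself is made up for the example):

#include <linux/syscalls.h>

/* Hypothetical syscall, used only to show the macro expansion. */
SYSCALL_DEFINE2(example_op, unsigned int, fd, unsigned long, flags)
{
        /* Besides sys_example_op(), the macro now emits the per-syscall
         * metadata record (its enter_fields/exit_fields list heads
         * initialized via LIST_HEAD_INIT) plus enter/exit ftrace_event_call
         * entries bound to the shared event_class_syscall_enter/exit
         * classes, instead of per-event register/unregister callbacks. */
        return 0;
}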
diff --git a/include/linux/tick.h b/include/linux/tick.h
index d2ae79e21be3..b232ccc0ee29 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -42,6 +42,7 @@ enum tick_nohz_mode {
42 * @idle_waketime: Time when the idle was interrupted 42 * @idle_waketime: Time when the idle was interrupted
43 * @idle_exittime: Time when the idle state was left 43 * @idle_exittime: Time when the idle state was left
44 * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped 44 * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped
45 * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding
45 * @sleep_length: Duration of the current idle sleep 46 * @sleep_length: Duration of the current idle sleep
46 * @do_timer_lst: CPU was the last one doing do_timer before going idle 47 * @do_timer_lst: CPU was the last one doing do_timer before going idle
47 */ 48 */
@@ -60,7 +61,7 @@ struct tick_sched {
60 ktime_t idle_waketime; 61 ktime_t idle_waketime;
61 ktime_t idle_exittime; 62 ktime_t idle_exittime;
62 ktime_t idle_sleeptime; 63 ktime_t idle_sleeptime;
63 ktime_t idle_lastupdate; 64 ktime_t iowait_sleeptime;
64 ktime_t sleep_length; 65 ktime_t sleep_length;
65 unsigned long last_jiffies; 66 unsigned long last_jiffies;
66 unsigned long next_jiffies; 67 unsigned long next_jiffies;
@@ -124,6 +125,7 @@ extern void tick_nohz_stop_sched_tick(int inidle);
124extern void tick_nohz_restart_sched_tick(void); 125extern void tick_nohz_restart_sched_tick(void);
125extern ktime_t tick_nohz_get_sleep_length(void); 126extern ktime_t tick_nohz_get_sleep_length(void);
126extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); 127extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
128extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
127# else 129# else
128static inline void tick_nohz_stop_sched_tick(int inidle) { } 130static inline void tick_nohz_stop_sched_tick(int inidle) { }
129static inline void tick_nohz_restart_sched_tick(void) { } 131static inline void tick_nohz_restart_sched_tick(void) { }
@@ -134,6 +136,7 @@ static inline ktime_t tick_nohz_get_sleep_length(void)
134 return len; 136 return len;
135} 137}
136static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; } 138static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; }
139static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
137# endif /* !NO_HZ */ 140# endif /* !NO_HZ */
138 141
139#endif 142#endif
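A rough usage sketch for the new helper; the reporting function is invented, and the -1 check mirrors the !NO_HZ stub above.

#include <linux/kernel.h>
#include <linux/tick.h>

static void example_report_iowait(int cpu)
{
        u64 wall_us;
        u64 iowait_us = get_cpu_iowait_time_us(cpu, &wall_us);

        if (iowait_us == (u64)-1)
                return;         /* NO_HZ not configured: stub returns -1 */

        pr_info("cpu%d: %llu us idle with IO outstanding, last update %llu us\n",
                cpu, (unsigned long long)iowait_us,
                (unsigned long long)wall_us);
}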
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 78b4bd3be496..9a59d1f98cd4 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -20,12 +20,17 @@
20struct module; 20struct module;
21struct tracepoint; 21struct tracepoint;
22 22
23struct tracepoint_func {
24 void *func;
25 void *data;
26};
27
23struct tracepoint { 28struct tracepoint {
24 const char *name; /* Tracepoint name */ 29 const char *name; /* Tracepoint name */
25 int state; /* State. */ 30 int state; /* State. */
26 void (*regfunc)(void); 31 void (*regfunc)(void);
27 void (*unregfunc)(void); 32 void (*unregfunc)(void);
28 void **funcs; 33 struct tracepoint_func *funcs;
29} __attribute__((aligned(32))); /* 34} __attribute__((aligned(32))); /*
30 * Aligned on 32 bytes because it is 35 * Aligned on 32 bytes because it is
31 * globally visible and gcc happily 36 * globally visible and gcc happily
@@ -33,6 +38,68 @@ struct tracepoint {
33 * Keep in sync with vmlinux.lds.h. 38 * Keep in sync with vmlinux.lds.h.
34 */ 39 */
35 40
41/*
42 * Connect a probe to a tracepoint.
43 * Internal API, should not be used directly.
44 */
45extern int tracepoint_probe_register(const char *name, void *probe, void *data);
46
47/*
48 * Disconnect a probe from a tracepoint.
49 * Internal API, should not be used directly.
50 */
51extern int
52tracepoint_probe_unregister(const char *name, void *probe, void *data);
53
54extern int tracepoint_probe_register_noupdate(const char *name, void *probe,
55 void *data);
56extern int tracepoint_probe_unregister_noupdate(const char *name, void *probe,
57 void *data);
58extern void tracepoint_probe_update_all(void);
59
60struct tracepoint_iter {
61 struct module *module;
62 struct tracepoint *tracepoint;
63};
64
65extern void tracepoint_iter_start(struct tracepoint_iter *iter);
66extern void tracepoint_iter_next(struct tracepoint_iter *iter);
67extern void tracepoint_iter_stop(struct tracepoint_iter *iter);
68extern void tracepoint_iter_reset(struct tracepoint_iter *iter);
69extern int tracepoint_get_iter_range(struct tracepoint **tracepoint,
70 struct tracepoint *begin, struct tracepoint *end);
71
72/*
73 * tracepoint_synchronize_unregister must be called between the last tracepoint
74 * probe unregistration and the end of module exit to make sure there is no
75 * caller executing a probe when it is freed.
76 */
77static inline void tracepoint_synchronize_unregister(void)
78{
79 synchronize_sched();
80}
81
82#define PARAMS(args...) args
83
84#ifdef CONFIG_TRACEPOINTS
85extern void tracepoint_update_probe_range(struct tracepoint *begin,
86 struct tracepoint *end);
87#else
88static inline void tracepoint_update_probe_range(struct tracepoint *begin,
89 struct tracepoint *end)
90{ }
91#endif /* CONFIG_TRACEPOINTS */
92
93#endif /* _LINUX_TRACEPOINT_H */
94
95/*
96 * Note: we keep the TRACE_EVENT and DECLARE_TRACE outside the include
97 * file ifdef protection.
98 * This is due to the way trace events work. If a file includes two
99 * trace event headers under one "CREATE_TRACE_POINTS" the first include
100 * will override the TRACE_EVENT and break the second include.
101 */
102
36#ifndef DECLARE_TRACE 103#ifndef DECLARE_TRACE
37 104
38#define TP_PROTO(args...) args 105#define TP_PROTO(args...) args
@@ -43,17 +110,27 @@ struct tracepoint {
43/* 110/*
44 * it_func[0] is never NULL because there is at least one element in the array 111 * it_func[0] is never NULL because there is at least one element in the array
45 * when the array itself is non NULL. 112 * when the array itself is non NULL.
113 *
114 * Note, the proto and args passed in include "__data" as the first parameter.
115 * The reason for this is to handle the "void" prototype. If a tracepoint
116 * has a "void" prototype, then it is invalid to declare a function
117 * as "(void *, void)". The DECLARE_TRACE_NOARGS() will pass in just
118 * "void *data", whereas the DECLARE_TRACE() will pass in "void *data, proto".
46 */ 119 */
47#define __DO_TRACE(tp, proto, args) \ 120#define __DO_TRACE(tp, proto, args) \
48 do { \ 121 do { \
49 void **it_func; \ 122 struct tracepoint_func *it_func_ptr; \
123 void *it_func; \
124 void *__data; \
50 \ 125 \
51 rcu_read_lock_sched_notrace(); \ 126 rcu_read_lock_sched_notrace(); \
52 it_func = rcu_dereference_sched((tp)->funcs); \ 127 it_func_ptr = rcu_dereference_sched((tp)->funcs); \
53 if (it_func) { \ 128 if (it_func_ptr) { \
54 do { \ 129 do { \
55 ((void(*)(proto))(*it_func))(args); \ 130 it_func = (it_func_ptr)->func; \
56 } while (*(++it_func)); \ 131 __data = (it_func_ptr)->data; \
132 ((void(*)(proto))(it_func))(args); \
133 } while ((++it_func_ptr)->func); \
57 } \ 134 } \
58 rcu_read_unlock_sched_notrace(); \ 135 rcu_read_unlock_sched_notrace(); \
59 } while (0) 136 } while (0)
@@ -63,24 +140,32 @@ struct tracepoint {
63 * not add unwanted padding between the beginning of the section and the 140 * not add unwanted padding between the beginning of the section and the
64 * structure. Force alignment to the same alignment as the section start. 141 * structure. Force alignment to the same alignment as the section start.
65 */ 142 */
66#define DECLARE_TRACE(name, proto, args) \ 143#define __DECLARE_TRACE(name, proto, args, data_proto, data_args) \
67 extern struct tracepoint __tracepoint_##name; \ 144 extern struct tracepoint __tracepoint_##name; \
68 static inline void trace_##name(proto) \ 145 static inline void trace_##name(proto) \
69 { \ 146 { \
70 if (unlikely(__tracepoint_##name.state)) \ 147 if (unlikely(__tracepoint_##name.state)) \
71 __DO_TRACE(&__tracepoint_##name, \ 148 __DO_TRACE(&__tracepoint_##name, \
72 TP_PROTO(proto), TP_ARGS(args)); \ 149 TP_PROTO(data_proto), \
150 TP_ARGS(data_args)); \
73 } \ 151 } \
74 static inline int register_trace_##name(void (*probe)(proto)) \ 152 static inline int \
153 register_trace_##name(void (*probe)(data_proto), void *data) \
75 { \ 154 { \
76 return tracepoint_probe_register(#name, (void *)probe); \ 155 return tracepoint_probe_register(#name, (void *)probe, \
156 data); \
77 } \ 157 } \
78 static inline int unregister_trace_##name(void (*probe)(proto)) \ 158 static inline int \
159 unregister_trace_##name(void (*probe)(data_proto), void *data) \
160 { \
161 return tracepoint_probe_unregister(#name, (void *)probe, \
162 data); \
163 } \
164 static inline void \
165 check_trace_callback_type_##name(void (*cb)(data_proto)) \
79 { \ 166 { \
80 return tracepoint_probe_unregister(#name, (void *)probe);\
81 } 167 }
82 168
83
84#define DEFINE_TRACE_FN(name, reg, unreg) \ 169#define DEFINE_TRACE_FN(name, reg, unreg) \
85 static const char __tpstrtab_##name[] \ 170 static const char __tpstrtab_##name[] \
86 __attribute__((section("__tracepoints_strings"))) = #name; \ 171 __attribute__((section("__tracepoints_strings"))) = #name; \
@@ -96,22 +181,24 @@ struct tracepoint {
96#define EXPORT_TRACEPOINT_SYMBOL(name) \ 181#define EXPORT_TRACEPOINT_SYMBOL(name) \
97 EXPORT_SYMBOL(__tracepoint_##name) 182 EXPORT_SYMBOL(__tracepoint_##name)
98 183
99extern void tracepoint_update_probe_range(struct tracepoint *begin,
100 struct tracepoint *end);
101
102#else /* !CONFIG_TRACEPOINTS */ 184#else /* !CONFIG_TRACEPOINTS */
103#define DECLARE_TRACE(name, proto, args) \ 185#define __DECLARE_TRACE(name, proto, args, data_proto, data_args) \
104 static inline void _do_trace_##name(struct tracepoint *tp, proto) \
105 { } \
106 static inline void trace_##name(proto) \ 186 static inline void trace_##name(proto) \
107 { } \ 187 { } \
108 static inline int register_trace_##name(void (*probe)(proto)) \ 188 static inline int \
189 register_trace_##name(void (*probe)(data_proto), \
190 void *data) \
109 { \ 191 { \
110 return -ENOSYS; \ 192 return -ENOSYS; \
111 } \ 193 } \
112 static inline int unregister_trace_##name(void (*probe)(proto)) \ 194 static inline int \
195 unregister_trace_##name(void (*probe)(data_proto), \
196 void *data) \
113 { \ 197 { \
114 return -ENOSYS; \ 198 return -ENOSYS; \
199 } \
200 static inline void check_trace_callback_type_##name(void (*cb)(data_proto)) \
201 { \
115 } 202 }
116 203
117#define DEFINE_TRACE_FN(name, reg, unreg) 204#define DEFINE_TRACE_FN(name, reg, unreg)
@@ -119,60 +206,31 @@ extern void tracepoint_update_probe_range(struct tracepoint *begin,
119#define EXPORT_TRACEPOINT_SYMBOL_GPL(name) 206#define EXPORT_TRACEPOINT_SYMBOL_GPL(name)
120#define EXPORT_TRACEPOINT_SYMBOL(name) 207#define EXPORT_TRACEPOINT_SYMBOL(name)
121 208
122static inline void tracepoint_update_probe_range(struct tracepoint *begin,
123 struct tracepoint *end)
124{ }
125#endif /* CONFIG_TRACEPOINTS */ 209#endif /* CONFIG_TRACEPOINTS */
126#endif /* DECLARE_TRACE */
127
128/*
129 * Connect a probe to a tracepoint.
130 * Internal API, should not be used directly.
131 */
132extern int tracepoint_probe_register(const char *name, void *probe);
133
134/*
135 * Disconnect a probe from a tracepoint.
136 * Internal API, should not be used directly.
137 */
138extern int tracepoint_probe_unregister(const char *name, void *probe);
139
140extern int tracepoint_probe_register_noupdate(const char *name, void *probe);
141extern int tracepoint_probe_unregister_noupdate(const char *name, void *probe);
142extern void tracepoint_probe_update_all(void);
143
144struct tracepoint_iter {
145 struct module *module;
146 struct tracepoint *tracepoint;
147};
148
149extern void tracepoint_iter_start(struct tracepoint_iter *iter);
150extern void tracepoint_iter_next(struct tracepoint_iter *iter);
151extern void tracepoint_iter_stop(struct tracepoint_iter *iter);
152extern void tracepoint_iter_reset(struct tracepoint_iter *iter);
153extern int tracepoint_get_iter_range(struct tracepoint **tracepoint,
154 struct tracepoint *begin, struct tracepoint *end);
155 210
156/* 211/*
157 * tracepoint_synchronize_unregister must be called between the last tracepoint 212 * The need for the DECLARE_TRACE_NOARGS() is to handle the prototype
158 * probe unregistration and the end of module exit to make sure there is no 213 * (void). "void" is a special value in a function prototype and can
159 * caller executing a probe when it is freed. 214 * not be combined with other arguments. Since the DECLARE_TRACE()
215 * macro adds a data element at the beginning of the prototype,
216 * we need a way to differentiate "(void *data, proto)" from
217 * "(void *data, void)". The second prototype is invalid.
218 *
219 * DECLARE_TRACE_NOARGS() passes "void" as the tracepoint prototype
220 * and "void *__data" as the callback prototype.
221 *
222 * DECLARE_TRACE() passes "proto" as the tracepoint prototype and
223 * "void *__data, proto" as the callback prototype.
160 */ 224 */
161static inline void tracepoint_synchronize_unregister(void) 225#define DECLARE_TRACE_NOARGS(name) \
162{ 226 __DECLARE_TRACE(name, void, , void *__data, __data)
163 synchronize_sched();
164}
165 227
166#define PARAMS(args...) args 228#define DECLARE_TRACE(name, proto, args) \
167 229 __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \
168#endif /* _LINUX_TRACEPOINT_H */ 230 PARAMS(void *__data, proto), \
231 PARAMS(__data, args))
169 232
170/* 233#endif /* DECLARE_TRACE */
171 * Note: we keep the TRACE_EVENT outside the include file ifdef protection.
172 * This is due to the way trace events work. If a file includes two
173 * trace event headers under one "CREATE_TRACE_POINTS" the first include
174 * will override the TRACE_EVENT and break the second include.
175 */
176 234
177#ifndef TRACE_EVENT 235#ifndef TRACE_EVENT
178/* 236/*
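A sketch of the new probe calling convention against the reworked sched_switch tracepoint from this series: the probe now receives the registration cookie as its leading argument, and register/unregister take the extra void *data. The counter and module wrapper are invented for illustration.

#include <linux/init.h>
#include <linux/module.h>
#include <asm/atomic.h>
#include <trace/events/sched.h>

static atomic_t example_switch_count = ATOMIC_INIT(0);

/* The leading void * is the data pointer supplied at registration time. */
static void example_probe(void *data, struct task_struct *prev,
                          struct task_struct *next)
{
        atomic_t *count = data;

        atomic_inc(count);
}

static int __init example_init(void)
{
        /* Second argument is the new per-probe cookie. */
        return register_trace_sched_switch(example_probe,
                                           &example_switch_count);
}
module_init(example_init);

static void __exit example_exit(void)
{
        unregister_trace_sched_switch(example_probe, &example_switch_count);
        /* Make sure no probe is still running before the module goes away. */
        tracepoint_synchronize_unregister();
}
module_exit(example_exit);

MODULE_LICENSE("GPL");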
diff --git a/include/linux/wait.h b/include/linux/wait.h
index a48e16b77d5e..76d96d035ea0 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -127,12 +127,26 @@ static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new)
127/* 127/*
128 * Used for wake-one threads: 128 * Used for wake-one threads:
129 */ 129 */
130static inline void __add_wait_queue_exclusive(wait_queue_head_t *q,
131 wait_queue_t *wait)
132{
133 wait->flags |= WQ_FLAG_EXCLUSIVE;
134 __add_wait_queue(q, wait);
135}
136
130static inline void __add_wait_queue_tail(wait_queue_head_t *head, 137static inline void __add_wait_queue_tail(wait_queue_head_t *head,
131 wait_queue_t *new) 138 wait_queue_t *new)
132{ 139{
133 list_add_tail(&new->task_list, &head->task_list); 140 list_add_tail(&new->task_list, &head->task_list);
134} 141}
135 142
143static inline void __add_wait_queue_tail_exclusive(wait_queue_head_t *q,
144 wait_queue_t *wait)
145{
146 wait->flags |= WQ_FLAG_EXCLUSIVE;
147 __add_wait_queue_tail(q, wait);
148}
149
136static inline void __remove_wait_queue(wait_queue_head_t *head, 150static inline void __remove_wait_queue(wait_queue_head_t *head,
137 wait_queue_t *old) 151 wait_queue_t *old)
138{ 152{
@@ -404,25 +418,6 @@ do { \
404}) 418})
405 419
406/* 420/*
407 * Must be called with the spinlock in the wait_queue_head_t held.
408 */
409static inline void add_wait_queue_exclusive_locked(wait_queue_head_t *q,
410 wait_queue_t * wait)
411{
412 wait->flags |= WQ_FLAG_EXCLUSIVE;
413 __add_wait_queue_tail(q, wait);
414}
415
416/*
417 * Must be called with the spinlock in the wait_queue_head_t held.
418 */
419static inline void remove_wait_queue_locked(wait_queue_head_t *q,
420 wait_queue_t * wait)
421{
422 __remove_wait_queue(q, wait);
423}
424
425/*
426 * These are the old interfaces to sleep waiting for an event. 421 * These are the old interfaces to sleep waiting for an event.
427 * They are racy. DO NOT use them, use the wait_event* interfaces above. 422 * They are racy. DO NOT use them, use the wait_event* interfaces above.
428 * We plan to remove these interfaces. 423 * We plan to remove these interfaces.
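For context, a rough sketch of the wake-one pattern the new helpers support, still called with the waitqueue spinlock held; the waitqueue, condition and function names are invented, and this covers the job the removed add_wait_queue_exclusive_locked() used to do.

#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

static DECLARE_WAIT_QUEUE_HEAD(example_wq);
static int example_cond;

static void example_wait_exclusive(void)
{
        DECLARE_WAITQUEUE(wait, current);

        spin_lock_irq(&example_wq.lock);
        /* Queue at the tail as a wake-one waiter. */
        __add_wait_queue_tail_exclusive(&example_wq, &wait);
        while (!example_cond) {
                set_current_state(TASK_UNINTERRUPTIBLE);
                spin_unlock_irq(&example_wq.lock);
                schedule();
                spin_lock_irq(&example_wq.lock);
        }
        set_current_state(TASK_RUNNING);
        __remove_wait_queue(&example_wq, &wait);
        spin_unlock_irq(&example_wq.lock);
}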
diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
index 5acfb1eb4df9..1dfab5401511 100644
--- a/include/trace/define_trace.h
+++ b/include/trace/define_trace.h
@@ -65,6 +65,10 @@
65 65
66#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) 66#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
67 67
68/* Make all open coded DECLARE_TRACE nops */
69#undef DECLARE_TRACE
70#define DECLARE_TRACE(name, proto, args)
71
68#ifdef CONFIG_EVENT_TRACING 72#ifdef CONFIG_EVENT_TRACING
69#include <trace/ftrace.h> 73#include <trace/ftrace.h>
70#endif 74#endif
@@ -75,6 +79,7 @@
75#undef DEFINE_EVENT 79#undef DEFINE_EVENT
76#undef DEFINE_EVENT_PRINT 80#undef DEFINE_EVENT_PRINT
77#undef TRACE_HEADER_MULTI_READ 81#undef TRACE_HEADER_MULTI_READ
82#undef DECLARE_TRACE
78 83
79/* Only undef what we defined in this file */ 84/* Only undef what we defined in this file */
80#ifdef UNDEF_TRACE_INCLUDE_FILE 85#ifdef UNDEF_TRACE_INCLUDE_FILE
diff --git a/include/trace/events/module.h b/include/trace/events/module.h
index 4b0f48ba16a6..c7bb2f0482fe 100644
--- a/include/trace/events/module.h
+++ b/include/trace/events/module.h
@@ -51,11 +51,14 @@ TRACE_EVENT(module_free,
51 TP_printk("%s", __get_str(name)) 51 TP_printk("%s", __get_str(name))
52); 52);
53 53
54#ifdef CONFIG_MODULE_UNLOAD
55/* trace_module_get/put are only used if CONFIG_MODULE_UNLOAD is defined */
56
54DECLARE_EVENT_CLASS(module_refcnt, 57DECLARE_EVENT_CLASS(module_refcnt,
55 58
56 TP_PROTO(struct module *mod, unsigned long ip, int refcnt), 59 TP_PROTO(struct module *mod, unsigned long ip),
57 60
58 TP_ARGS(mod, ip, refcnt), 61 TP_ARGS(mod, ip),
59 62
60 TP_STRUCT__entry( 63 TP_STRUCT__entry(
61 __field( unsigned long, ip ) 64 __field( unsigned long, ip )
@@ -65,7 +68,7 @@ DECLARE_EVENT_CLASS(module_refcnt,
65 68
66 TP_fast_assign( 69 TP_fast_assign(
67 __entry->ip = ip; 70 __entry->ip = ip;
68 __entry->refcnt = refcnt; 71 __entry->refcnt = __this_cpu_read(mod->refptr->incs) + __this_cpu_read(mod->refptr->decs);
69 __assign_str(name, mod->name); 72 __assign_str(name, mod->name);
70 ), 73 ),
71 74
@@ -75,17 +78,18 @@ DECLARE_EVENT_CLASS(module_refcnt,
75 78
76DEFINE_EVENT(module_refcnt, module_get, 79DEFINE_EVENT(module_refcnt, module_get,
77 80
78 TP_PROTO(struct module *mod, unsigned long ip, int refcnt), 81 TP_PROTO(struct module *mod, unsigned long ip),
79 82
80 TP_ARGS(mod, ip, refcnt) 83 TP_ARGS(mod, ip)
81); 84);
82 85
83DEFINE_EVENT(module_refcnt, module_put, 86DEFINE_EVENT(module_refcnt, module_put,
84 87
85 TP_PROTO(struct module *mod, unsigned long ip, int refcnt), 88 TP_PROTO(struct module *mod, unsigned long ip),
86 89
87 TP_ARGS(mod, ip, refcnt) 90 TP_ARGS(mod, ip)
88); 91);
92#endif /* CONFIG_MODULE_UNLOAD */
89 93
90TRACE_EVENT(module_request, 94TRACE_EVENT(module_request,
91 95
diff --git a/include/trace/events/napi.h b/include/trace/events/napi.h
index a8989c4547e7..188deca2f3c7 100644
--- a/include/trace/events/napi.h
+++ b/include/trace/events/napi.h
@@ -1,4 +1,7 @@
1#ifndef _TRACE_NAPI_H_ 1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM napi
3
4#if !defined(_TRACE_NAPI_H) || defined(TRACE_HEADER_MULTI_READ)
2#define _TRACE_NAPI_H_ 5#define _TRACE_NAPI_H_
3 6
4#include <linux/netdevice.h> 7#include <linux/netdevice.h>
@@ -8,4 +11,7 @@ DECLARE_TRACE(napi_poll,
8 TP_PROTO(struct napi_struct *napi), 11 TP_PROTO(struct napi_struct *napi),
9 TP_ARGS(napi)); 12 TP_ARGS(napi));
10 13
11#endif 14#endif /* _TRACE_NAPI_H_ */
15
16/* This part must be outside protection */
17#include <trace/define_trace.h>
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index cfceb0b73e20..4f733ecea46e 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -51,15 +51,12 @@ TRACE_EVENT(sched_kthread_stop_ret,
51 51
52/* 52/*
53 * Tracepoint for waiting on task to unschedule: 53 * Tracepoint for waiting on task to unschedule:
54 *
55 * (NOTE: the 'rq' argument is not used by generic trace events,
56 * but used by the latency tracer plugin. )
57 */ 54 */
58TRACE_EVENT(sched_wait_task, 55TRACE_EVENT(sched_wait_task,
59 56
60 TP_PROTO(struct rq *rq, struct task_struct *p), 57 TP_PROTO(struct task_struct *p),
61 58
62 TP_ARGS(rq, p), 59 TP_ARGS(p),
63 60
64 TP_STRUCT__entry( 61 TP_STRUCT__entry(
65 __array( char, comm, TASK_COMM_LEN ) 62 __array( char, comm, TASK_COMM_LEN )
@@ -79,15 +76,12 @@ TRACE_EVENT(sched_wait_task,
79 76
80/* 77/*
81 * Tracepoint for waking up a task: 78 * Tracepoint for waking up a task:
82 *
83 * (NOTE: the 'rq' argument is not used by generic trace events,
84 * but used by the latency tracer plugin. )
85 */ 79 */
86DECLARE_EVENT_CLASS(sched_wakeup_template, 80DECLARE_EVENT_CLASS(sched_wakeup_template,
87 81
88 TP_PROTO(struct rq *rq, struct task_struct *p, int success), 82 TP_PROTO(struct task_struct *p, int success),
89 83
90 TP_ARGS(rq, p, success), 84 TP_ARGS(p, success),
91 85
92 TP_STRUCT__entry( 86 TP_STRUCT__entry(
93 __array( char, comm, TASK_COMM_LEN ) 87 __array( char, comm, TASK_COMM_LEN )
@@ -111,31 +105,25 @@ DECLARE_EVENT_CLASS(sched_wakeup_template,
111); 105);
112 106
113DEFINE_EVENT(sched_wakeup_template, sched_wakeup, 107DEFINE_EVENT(sched_wakeup_template, sched_wakeup,
114 TP_PROTO(struct rq *rq, struct task_struct *p, int success), 108 TP_PROTO(struct task_struct *p, int success),
115 TP_ARGS(rq, p, success)); 109 TP_ARGS(p, success));
116 110
117/* 111/*
118 * Tracepoint for waking up a new task: 112 * Tracepoint for waking up a new task:
119 *
120 * (NOTE: the 'rq' argument is not used by generic trace events,
121 * but used by the latency tracer plugin. )
122 */ 113 */
123DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new, 114DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
124 TP_PROTO(struct rq *rq, struct task_struct *p, int success), 115 TP_PROTO(struct task_struct *p, int success),
125 TP_ARGS(rq, p, success)); 116 TP_ARGS(p, success));
126 117
127/* 118/*
128 * Tracepoint for task switches, performed by the scheduler: 119 * Tracepoint for task switches, performed by the scheduler:
129 *
130 * (NOTE: the 'rq' argument is not used by generic trace events,
131 * but used by the latency tracer plugin. )
132 */ 120 */
133TRACE_EVENT(sched_switch, 121TRACE_EVENT(sched_switch,
134 122
135 TP_PROTO(struct rq *rq, struct task_struct *prev, 123 TP_PROTO(struct task_struct *prev,
136 struct task_struct *next), 124 struct task_struct *next),
137 125
138 TP_ARGS(rq, prev, next), 126 TP_ARGS(prev, next),
139 127
140 TP_STRUCT__entry( 128 TP_STRUCT__entry(
141 __array( char, prev_comm, TASK_COMM_LEN ) 129 __array( char, prev_comm, TASK_COMM_LEN )
diff --git a/include/trace/events/signal.h b/include/trace/events/signal.h
index a510b75ac304..814566c99d29 100644
--- a/include/trace/events/signal.h
+++ b/include/trace/events/signal.h
@@ -100,18 +100,7 @@ TRACE_EVENT(signal_deliver,
100 __entry->sa_handler, __entry->sa_flags) 100 __entry->sa_handler, __entry->sa_flags)
101); 101);
102 102
103/** 103DECLARE_EVENT_CLASS(signal_queue_overflow,
104 * signal_overflow_fail - called when signal queue is overflow
105 * @sig: signal number
106 * @group: signal to process group or not (bool)
107 * @info: pointer to struct siginfo
108 *
109 * Kernel fails to generate 'sig' signal with 'info' siginfo, because
110 * siginfo queue is overflow, and the signal is dropped.
111 * 'group' is not 0 if the signal will be sent to a process group.
112 * 'sig' is always one of RT signals.
113 */
114TRACE_EVENT(signal_overflow_fail,
115 104
116 TP_PROTO(int sig, int group, struct siginfo *info), 105 TP_PROTO(int sig, int group, struct siginfo *info),
117 106
@@ -135,6 +124,24 @@ TRACE_EVENT(signal_overflow_fail,
135); 124);
136 125
137/** 126/**
127 * signal_overflow_fail - called when the signal queue overflows
128 * @sig: signal number
129 * @group: signal to process group or not (bool)
130 * @info: pointer to struct siginfo
131 *
132 * The kernel fails to generate the 'sig' signal with 'info' siginfo because
133 * the siginfo queue has overflowed, and the signal is dropped.
134 * 'group' is not 0 if the signal will be sent to a process group.
135 * 'sig' is always one of RT signals.
136 */
137DEFINE_EVENT(signal_queue_overflow, signal_overflow_fail,
138
139 TP_PROTO(int sig, int group, struct siginfo *info),
140
141 TP_ARGS(sig, group, info)
142);
143
144/**
138 * signal_lose_info - called when siginfo is lost 145 * signal_lose_info - called when siginfo is lost
139 * @sig: signal number 146 * @sig: signal number
140 * @group: signal to process group or not (bool) 147 * @group: signal to process group or not (bool)
@@ -145,28 +152,13 @@ TRACE_EVENT(signal_overflow_fail,
145 * 'group' is not 0 if the signal will be sent to a process group. 152 * 'group' is not 0 if the signal will be sent to a process group.
146 * 'sig' is always one of non-RT signals. 153 * 'sig' is always one of non-RT signals.
147 */ 154 */
148TRACE_EVENT(signal_lose_info, 155DEFINE_EVENT(signal_queue_overflow, signal_lose_info,
149 156
150 TP_PROTO(int sig, int group, struct siginfo *info), 157 TP_PROTO(int sig, int group, struct siginfo *info),
151 158
152 TP_ARGS(sig, group, info), 159 TP_ARGS(sig, group, info)
153
154 TP_STRUCT__entry(
155 __field( int, sig )
156 __field( int, group )
157 __field( int, errno )
158 __field( int, code )
159 ),
160
161 TP_fast_assign(
162 __entry->sig = sig;
163 __entry->group = group;
164 TP_STORE_SIGINFO(__entry, info);
165 ),
166
167 TP_printk("sig=%d group=%d errno=%d code=%d",
168 __entry->sig, __entry->group, __entry->errno, __entry->code)
169); 160);
161
170#endif /* _TRACE_SIGNAL_H */ 162#endif /* _TRACE_SIGNAL_H */
171 163
172/* This part must be outside protection */ 164/* This part must be outside protection */
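The signal.h conversion above is the pattern used throughout this series: two near-identical TRACE_EVENTs collapse into one DECLARE_EVENT_CLASS plus cheap DEFINE_EVENTs. A generic sketch with invented event names and fields, following the same header layout shown for napi.h above:

#undef TRACE_SYSTEM
#define TRACE_SYSTEM example

#if !defined(_TRACE_EXAMPLE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_EXAMPLE_H

#include <linux/tracepoint.h>

DECLARE_EVENT_CLASS(example_template,

        TP_PROTO(int id, int state),

        TP_ARGS(id, state),

        TP_STRUCT__entry(
                __field(        int,    id      )
                __field(        int,    state   )
        ),

        TP_fast_assign(
                __entry->id     = id;
                __entry->state  = state;
        ),

        TP_printk("id=%d state=%d", __entry->id, __entry->state)
);

/* Each DEFINE_EVENT only adds an ftrace_event_call that points back at the
 * shared class instead of duplicating the assign/output machinery. */
DEFINE_EVENT(example_template, example_start,
        TP_PROTO(int id, int state),
        TP_ARGS(id, state));

DEFINE_EVENT(example_template, example_stop,
        TP_PROTO(int id, int state),
        TP_ARGS(id, state));

#endif /* _TRACE_EXAMPLE_H */

/* This part must be outside protection */
#include <trace/define_trace.h>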
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 882c64832ffe..e0e8daa6767e 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -62,7 +62,10 @@
62 struct trace_entry ent; \ 62 struct trace_entry ent; \
63 tstruct \ 63 tstruct \
64 char __data[0]; \ 64 char __data[0]; \
65 }; 65 }; \
66 \
67 static struct ftrace_event_class event_class_##name;
68
66#undef DEFINE_EVENT 69#undef DEFINE_EVENT
67#define DEFINE_EVENT(template, name, proto, args) \ 70#define DEFINE_EVENT(template, name, proto, args) \
68 static struct ftrace_event_call \ 71 static struct ftrace_event_call \
@@ -147,16 +150,18 @@
147 * 150 *
148 * entry = iter->ent; 151 * entry = iter->ent;
149 * 152 *
150 * if (entry->type != event_<call>.id) { 153 * if (entry->type != event_<call>->event.type) {
151 * WARN_ON_ONCE(1); 154 * WARN_ON_ONCE(1);
152 * return TRACE_TYPE_UNHANDLED; 155 * return TRACE_TYPE_UNHANDLED;
153 * } 156 * }
154 * 157 *
155 * field = (typeof(field))entry; 158 * field = (typeof(field))entry;
156 * 159 *
157 * p = get_cpu_var(ftrace_event_seq); 160 * p = &get_cpu_var(ftrace_event_seq);
158 * trace_seq_init(p); 161 * trace_seq_init(p);
159 * ret = trace_seq_printf(s, <TP_printk> "\n"); 162 * ret = trace_seq_printf(s, "%s: ", <call>);
163 * if (ret)
164 * ret = trace_seq_printf(s, <TP_printk> "\n");
160 * put_cpu(); 165 * put_cpu();
161 * if (!ret) 166 * if (!ret)
162 * return TRACE_TYPE_PARTIAL_LINE; 167 * return TRACE_TYPE_PARTIAL_LINE;
@@ -201,18 +206,22 @@
201#undef DECLARE_EVENT_CLASS 206#undef DECLARE_EVENT_CLASS
202#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ 207#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
203static notrace enum print_line_t \ 208static notrace enum print_line_t \
204ftrace_raw_output_id_##call(int event_id, const char *name, \ 209ftrace_raw_output_##call(struct trace_iterator *iter, int flags, \
205 struct trace_iterator *iter, int flags) \ 210 struct trace_event *trace_event) \
206{ \ 211{ \
212 struct ftrace_event_call *event; \
207 struct trace_seq *s = &iter->seq; \ 213 struct trace_seq *s = &iter->seq; \
208 struct ftrace_raw_##call *field; \ 214 struct ftrace_raw_##call *field; \
209 struct trace_entry *entry; \ 215 struct trace_entry *entry; \
210 struct trace_seq *p; \ 216 struct trace_seq *p; \
211 int ret; \ 217 int ret; \
212 \ 218 \
219 event = container_of(trace_event, struct ftrace_event_call, \
220 event); \
221 \
213 entry = iter->ent; \ 222 entry = iter->ent; \
214 \ 223 \
215 if (entry->type != event_id) { \ 224 if (entry->type != event->event.type) { \
216 WARN_ON_ONCE(1); \ 225 WARN_ON_ONCE(1); \
217 return TRACE_TYPE_UNHANDLED; \ 226 return TRACE_TYPE_UNHANDLED; \
218 } \ 227 } \
@@ -221,7 +230,7 @@ ftrace_raw_output_id_##call(int event_id, const char *name, \
221 \ 230 \
222 p = &get_cpu_var(ftrace_event_seq); \ 231 p = &get_cpu_var(ftrace_event_seq); \
223 trace_seq_init(p); \ 232 trace_seq_init(p); \
224 ret = trace_seq_printf(s, "%s: ", name); \ 233 ret = trace_seq_printf(s, "%s: ", event->name); \
225 if (ret) \ 234 if (ret) \
226 ret = trace_seq_printf(s, print); \ 235 ret = trace_seq_printf(s, print); \
227 put_cpu(); \ 236 put_cpu(); \
@@ -229,21 +238,16 @@ ftrace_raw_output_id_##call(int event_id, const char *name, \
229 return TRACE_TYPE_PARTIAL_LINE; \ 238 return TRACE_TYPE_PARTIAL_LINE; \
230 \ 239 \
231 return TRACE_TYPE_HANDLED; \ 240 return TRACE_TYPE_HANDLED; \
232} 241} \
233 242static struct trace_event_functions ftrace_event_type_funcs_##call = { \
234#undef DEFINE_EVENT 243 .trace = ftrace_raw_output_##call, \
235#define DEFINE_EVENT(template, name, proto, args) \ 244};
236static notrace enum print_line_t \
237ftrace_raw_output_##name(struct trace_iterator *iter, int flags) \
238{ \
239 return ftrace_raw_output_id_##template(event_##name.id, \
240 #name, iter, flags); \
241}
242 245
243#undef DEFINE_EVENT_PRINT 246#undef DEFINE_EVENT_PRINT
244#define DEFINE_EVENT_PRINT(template, call, proto, args, print) \ 247#define DEFINE_EVENT_PRINT(template, call, proto, args, print) \
245static notrace enum print_line_t \ 248static notrace enum print_line_t \
246ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ 249ftrace_raw_output_##call(struct trace_iterator *iter, int flags, \
250 struct trace_event *event) \
247{ \ 251{ \
248 struct trace_seq *s = &iter->seq; \ 252 struct trace_seq *s = &iter->seq; \
249 struct ftrace_raw_##template *field; \ 253 struct ftrace_raw_##template *field; \
@@ -253,7 +257,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \
253 \ 257 \
254 entry = iter->ent; \ 258 entry = iter->ent; \
255 \ 259 \
256 if (entry->type != event_##call.id) { \ 260 if (entry->type != event_##call.event.type) { \
257 WARN_ON_ONCE(1); \ 261 WARN_ON_ONCE(1); \
258 return TRACE_TYPE_UNHANDLED; \ 262 return TRACE_TYPE_UNHANDLED; \
259 } \ 263 } \
@@ -270,7 +274,10 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \
270 return TRACE_TYPE_PARTIAL_LINE; \ 274 return TRACE_TYPE_PARTIAL_LINE; \
271 \ 275 \
272 return TRACE_TYPE_HANDLED; \ 276 return TRACE_TYPE_HANDLED; \
273} 277} \
278static struct trace_event_functions ftrace_event_type_funcs_##call = { \
279 .trace = ftrace_raw_output_##call, \
280};
274 281
275#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) 282#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
276 283
@@ -376,142 +383,83 @@ static inline notrace int ftrace_get_offsets_##call( \
376 383
377#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) 384#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
378 385
379#ifdef CONFIG_PERF_EVENTS
380
381/*
382 * Generate the functions needed for tracepoint perf_event support.
383 *
384 * NOTE: The insertion profile callback (ftrace_profile_<call>) is defined later
385 *
386 * static int ftrace_profile_enable_<call>(void)
387 * {
388 * return register_trace_<call>(ftrace_profile_<call>);
389 * }
390 *
391 * static void ftrace_profile_disable_<call>(void)
392 * {
393 * unregister_trace_<call>(ftrace_profile_<call>);
394 * }
395 *
396 */
397
398#undef DECLARE_EVENT_CLASS
399#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)
400
401#undef DEFINE_EVENT
402#define DEFINE_EVENT(template, name, proto, args) \
403 \
404static void perf_trace_##name(proto); \
405 \
406static notrace int \
407perf_trace_enable_##name(struct ftrace_event_call *unused) \
408{ \
409 return register_trace_##name(perf_trace_##name); \
410} \
411 \
412static notrace void \
413perf_trace_disable_##name(struct ftrace_event_call *unused) \
414{ \
415 unregister_trace_##name(perf_trace_##name); \
416}
417
418#undef DEFINE_EVENT_PRINT
419#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
420 DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
421
422#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
423
424#endif /* CONFIG_PERF_EVENTS */
425
426/* 386/*
427 * Stage 4 of the trace events. 387 * Stage 4 of the trace events.
428 * 388 *
429 * Override the macros in <trace/trace_events.h> to include the following: 389 * Override the macros in <trace/trace_events.h> to include the following:
430 * 390 *
431 * static void ftrace_event_<call>(proto)
432 * {
433 * event_trace_printk(_RET_IP_, "<call>: " <fmt>);
434 * }
435 *
436 * static int ftrace_reg_event_<call>(struct ftrace_event_call *unused)
437 * {
438 * return register_trace_<call>(ftrace_event_<call>);
439 * }
440 *
441 * static void ftrace_unreg_event_<call>(struct ftrace_event_call *unused)
442 * {
443 * unregister_trace_<call>(ftrace_event_<call>);
444 * }
445 *
446 *
447 * For those macros defined with TRACE_EVENT: 391 * For those macros defined with TRACE_EVENT:
448 * 392 *
449 * static struct ftrace_event_call event_<call>; 393 * static struct ftrace_event_call event_<call>;
450 * 394 *
451 * static void ftrace_raw_event_<call>(proto) 395 * static void ftrace_raw_event_<call>(void *__data, proto)
452 * { 396 * {
397 * struct ftrace_event_call *event_call = __data;
398 * struct ftrace_data_offsets_<call> __maybe_unused __data_offsets;
453 * struct ring_buffer_event *event; 399 * struct ring_buffer_event *event;
454 * struct ftrace_raw_<call> *entry; <-- defined in stage 1 400 * struct ftrace_raw_<call> *entry; <-- defined in stage 1
455 * struct ring_buffer *buffer; 401 * struct ring_buffer *buffer;
456 * unsigned long irq_flags; 402 * unsigned long irq_flags;
403 * int __data_size;
457 * int pc; 404 * int pc;
458 * 405 *
459 * local_save_flags(irq_flags); 406 * local_save_flags(irq_flags);
460 * pc = preempt_count(); 407 * pc = preempt_count();
461 * 408 *
409 * __data_size = ftrace_get_offsets_<call>(&__data_offsets, args);
410 *
462 * event = trace_current_buffer_lock_reserve(&buffer, 411 * event = trace_current_buffer_lock_reserve(&buffer,
463 * event_<call>.id, 412 * event_<call>->event.type,
464 * sizeof(struct ftrace_raw_<call>), 413 * sizeof(*entry) + __data_size,
465 * irq_flags, pc); 414 * irq_flags, pc);
466 * if (!event) 415 * if (!event)
467 * return; 416 * return;
468 * entry = ring_buffer_event_data(event); 417 * entry = ring_buffer_event_data(event);
469 * 418 *
470 * <assign>; <-- Here we assign the entries by the __field and 419 * { <assign>; } <-- Here we assign the entries by the __field and
471 * __array macros. 420 * __array macros.
472 *
473 * trace_current_buffer_unlock_commit(buffer, event, irq_flags, pc);
474 * }
475 *
476 * static int ftrace_raw_reg_event_<call>(struct ftrace_event_call *unused)
477 * {
478 * int ret;
479 *
480 * ret = register_trace_<call>(ftrace_raw_event_<call>);
481 * if (!ret)
482 * pr_info("event trace: Could not activate trace point "
483 * "probe to <call>");
484 * return ret;
485 * }
486 * 421 *
487 * static void ftrace_unreg_event_<call>(struct ftrace_event_call *unused) 422 * if (!filter_current_check_discard(buffer, event_call, entry, event))
488 * { 423 * trace_current_buffer_unlock_commit(buffer,
489 * unregister_trace_<call>(ftrace_raw_event_<call>); 424 * event, irq_flags, pc);
490 * } 425 * }
491 * 426 *
492 * static struct trace_event ftrace_event_type_<call> = { 427 * static struct trace_event ftrace_event_type_<call> = {
493 * .trace = ftrace_raw_output_<call>, <-- stage 2 428 * .trace = ftrace_raw_output_<call>, <-- stage 2
494 * }; 429 * };
495 * 430 *
431 * static const char print_fmt_<call>[] = <TP_printk>;
432 *
433 * static struct ftrace_event_class __used event_class_<template> = {
434 * .system = "<system>",
435 * .define_fields = ftrace_define_fields_<call>,
436 * .fields = LIST_HEAD_INIT(event_class_##call.fields),
437 * .raw_init = trace_event_raw_init,
438 * .probe = ftrace_raw_event_##call,
439 * };
440 *
496 * static struct ftrace_event_call __used 441 * static struct ftrace_event_call __used
497 * __attribute__((__aligned__(4))) 442 * __attribute__((__aligned__(4)))
498 * __attribute__((section("_ftrace_events"))) event_<call> = { 443 * __attribute__((section("_ftrace_events"))) event_<call> = {
499 * .name = "<call>", 444 * .name = "<call>",
500 * .system = "<system>", 445 * .class = event_class_<template>,
501 * .raw_init = trace_event_raw_init, 446 * .event = &ftrace_event_type_<call>,
502 * .regfunc = ftrace_reg_event_<call>, 447 * .print_fmt = print_fmt_<call>,
503 * .unregfunc = ftrace_unreg_event_<call>, 448 * };
504 * }
505 * 449 *
506 */ 450 */
507 451
508#ifdef CONFIG_PERF_EVENTS 452#ifdef CONFIG_PERF_EVENTS
509 453
454#define _TRACE_PERF_PROTO(call, proto) \
455 static notrace void \
456 perf_trace_##call(void *__data, proto);
457
510#define _TRACE_PERF_INIT(call) \ 458#define _TRACE_PERF_INIT(call) \
511 .perf_event_enable = perf_trace_enable_##call, \ 459 .perf_probe = perf_trace_##call,
512 .perf_event_disable = perf_trace_disable_##call,
513 460
514#else 461#else
462#define _TRACE_PERF_PROTO(call, proto)
515#define _TRACE_PERF_INIT(call) 463#define _TRACE_PERF_INIT(call)
516#endif /* CONFIG_PERF_EVENTS */ 464#endif /* CONFIG_PERF_EVENTS */
517 465
@@ -545,9 +493,9 @@ perf_trace_disable_##name(struct ftrace_event_call *unused) \
545#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ 493#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
546 \ 494 \
547static notrace void \ 495static notrace void \
548ftrace_raw_event_id_##call(struct ftrace_event_call *event_call, \ 496ftrace_raw_event_##call(void *__data, proto) \
549 proto) \
550{ \ 497{ \
498 struct ftrace_event_call *event_call = __data; \
551 struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\ 499 struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
552 struct ring_buffer_event *event; \ 500 struct ring_buffer_event *event; \
553 struct ftrace_raw_##call *entry; \ 501 struct ftrace_raw_##call *entry; \
@@ -562,14 +510,13 @@ ftrace_raw_event_id_##call(struct ftrace_event_call *event_call, \
562 __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ 510 __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
563 \ 511 \
564 event = trace_current_buffer_lock_reserve(&buffer, \ 512 event = trace_current_buffer_lock_reserve(&buffer, \
565 event_call->id, \ 513 event_call->event.type, \
566 sizeof(*entry) + __data_size, \ 514 sizeof(*entry) + __data_size, \
567 irq_flags, pc); \ 515 irq_flags, pc); \
568 if (!event) \ 516 if (!event) \
569 return; \ 517 return; \
570 entry = ring_buffer_event_data(event); \ 518 entry = ring_buffer_event_data(event); \
571 \ 519 \
572 \
573 tstruct \ 520 tstruct \
574 \ 521 \
575 { assign; } \ 522 { assign; } \
@@ -578,34 +525,21 @@ ftrace_raw_event_id_##call(struct ftrace_event_call *event_call, \
578 trace_nowake_buffer_unlock_commit(buffer, \ 525 trace_nowake_buffer_unlock_commit(buffer, \
579 event, irq_flags, pc); \ 526 event, irq_flags, pc); \
580} 527}
528/*
529 * The ftrace_test_probe is compiled out; it is only here as a build-time check
530 * to make sure that if the tracepoint handling changes, the ftrace probe will
531 * fail to compile unless it too is updated.
532 */
581 533
582#undef DEFINE_EVENT 534#undef DEFINE_EVENT
583#define DEFINE_EVENT(template, call, proto, args) \ 535#define DEFINE_EVENT(template, call, proto, args) \
584 \ 536static inline void ftrace_test_probe_##call(void) \
585static notrace void ftrace_raw_event_##call(proto) \
586{ \
587 ftrace_raw_event_id_##template(&event_##call, args); \
588} \
589 \
590static notrace int \
591ftrace_raw_reg_event_##call(struct ftrace_event_call *unused) \
592{ \
593 return register_trace_##call(ftrace_raw_event_##call); \
594} \
595 \
596static notrace void \
597ftrace_raw_unreg_event_##call(struct ftrace_event_call *unused) \
598{ \ 537{ \
599 unregister_trace_##call(ftrace_raw_event_##call); \ 538 check_trace_callback_type_##call(ftrace_raw_event_##template); \
600} \ 539}
601 \
602static struct trace_event ftrace_event_type_##call = { \
603 .trace = ftrace_raw_output_##call, \
604};
605 540
606#undef DEFINE_EVENT_PRINT 541#undef DEFINE_EVENT_PRINT
607#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \ 542#define DEFINE_EVENT_PRINT(template, name, proto, args, print)
608 DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
609 543
610#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) 544#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
611 545
@@ -622,7 +556,16 @@ static struct trace_event ftrace_event_type_##call = { \
622 556
623#undef DECLARE_EVENT_CLASS 557#undef DECLARE_EVENT_CLASS
624#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ 558#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
625static const char print_fmt_##call[] = print; 559_TRACE_PERF_PROTO(call, PARAMS(proto)); \
560static const char print_fmt_##call[] = print; \
561static struct ftrace_event_class __used event_class_##call = { \
562 .system = __stringify(TRACE_SYSTEM), \
563 .define_fields = ftrace_define_fields_##call, \
564 .fields = LIST_HEAD_INIT(event_class_##call.fields),\
565 .raw_init = trace_event_raw_init, \
566 .probe = ftrace_raw_event_##call, \
567 _TRACE_PERF_INIT(call) \
568};
626 569
627#undef DEFINE_EVENT 570#undef DEFINE_EVENT
628#define DEFINE_EVENT(template, call, proto, args) \ 571#define DEFINE_EVENT(template, call, proto, args) \
@@ -631,15 +574,10 @@ static struct ftrace_event_call __used \
631__attribute__((__aligned__(4))) \ 574__attribute__((__aligned__(4))) \
632__attribute__((section("_ftrace_events"))) event_##call = { \ 575__attribute__((section("_ftrace_events"))) event_##call = { \
633 .name = #call, \ 576 .name = #call, \
634 .system = __stringify(TRACE_SYSTEM), \ 577 .class = &event_class_##template, \
635 .event = &ftrace_event_type_##call, \ 578 .event.funcs = &ftrace_event_type_funcs_##template, \
636 .raw_init = trace_event_raw_init, \
637 .regfunc = ftrace_raw_reg_event_##call, \
638 .unregfunc = ftrace_raw_unreg_event_##call, \
639 .print_fmt = print_fmt_##template, \ 579 .print_fmt = print_fmt_##template, \
640 .define_fields = ftrace_define_fields_##template, \ 580};
641 _TRACE_PERF_INIT(call) \
642}
643 581
644#undef DEFINE_EVENT_PRINT 582#undef DEFINE_EVENT_PRINT
645#define DEFINE_EVENT_PRINT(template, call, proto, args, print) \ 583#define DEFINE_EVENT_PRINT(template, call, proto, args, print) \
@@ -650,14 +588,9 @@ static struct ftrace_event_call __used \
650__attribute__((__aligned__(4))) \ 588__attribute__((__aligned__(4))) \
651__attribute__((section("_ftrace_events"))) event_##call = { \ 589__attribute__((section("_ftrace_events"))) event_##call = { \
652 .name = #call, \ 590 .name = #call, \
653 .system = __stringify(TRACE_SYSTEM), \ 591 .class = &event_class_##template, \
654 .event = &ftrace_event_type_##call, \ 592 .event.funcs = &ftrace_event_type_funcs_##call, \
655 .raw_init = trace_event_raw_init, \
656 .regfunc = ftrace_raw_reg_event_##call, \
657 .unregfunc = ftrace_raw_unreg_event_##call, \
658 .print_fmt = print_fmt_##call, \ 593 .print_fmt = print_fmt_##call, \
659 .define_fields = ftrace_define_fields_##template, \
660 _TRACE_PERF_INIT(call) \
661} 594}
662 595
663#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) 596#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
@@ -757,17 +690,20 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
757#undef DECLARE_EVENT_CLASS 690#undef DECLARE_EVENT_CLASS
758#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ 691#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
759static notrace void \ 692static notrace void \
760perf_trace_templ_##call(struct ftrace_event_call *event_call, \ 693perf_trace_##call(void *__data, proto) \
761 struct pt_regs *__regs, proto) \
762{ \ 694{ \
695 struct ftrace_event_call *event_call = __data; \
763 struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\ 696 struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
764 struct ftrace_raw_##call *entry; \ 697 struct ftrace_raw_##call *entry; \
698 struct pt_regs *__regs = &get_cpu_var(perf_trace_regs); \
765 u64 __addr = 0, __count = 1; \ 699 u64 __addr = 0, __count = 1; \
766 unsigned long irq_flags; \ 700 unsigned long irq_flags; \
767 int __entry_size; \ 701 int __entry_size; \
768 int __data_size; \ 702 int __data_size; \
769 int rctx; \ 703 int rctx; \
770 \ 704 \
705 perf_fetch_caller_regs(__regs, 1); \
706 \
771 __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ 707 __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
772 __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\ 708 __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\
773 sizeof(u64)); \ 709 sizeof(u64)); \
@@ -775,33 +711,35 @@ perf_trace_templ_##call(struct ftrace_event_call *event_call, \
775 \ 711 \
776 if (WARN_ONCE(__entry_size > PERF_MAX_TRACE_SIZE, \ 712 if (WARN_ONCE(__entry_size > PERF_MAX_TRACE_SIZE, \
777 "profile buffer not large enough")) \ 713 "profile buffer not large enough")) \
778 return; \ 714 goto out; \
779 entry = (struct ftrace_raw_##call *)perf_trace_buf_prepare( \ 715 entry = (struct ftrace_raw_##call *)perf_trace_buf_prepare( \
780 __entry_size, event_call->id, &rctx, &irq_flags); \ 716 __entry_size, event_call->event.type, &rctx, &irq_flags); \
781 if (!entry) \ 717 if (!entry) \
782 return; \ 718 goto out; \
783 tstruct \ 719 tstruct \
784 \ 720 \
785 { assign; } \ 721 { assign; } \
786 \ 722 \
787 perf_trace_buf_submit(entry, __entry_size, rctx, __addr, \ 723 perf_trace_buf_submit(entry, __entry_size, rctx, __addr, \
788 __count, irq_flags, __regs); \ 724 __count, irq_flags, __regs); \
725 out: \
726 put_cpu_var(perf_trace_regs); \
789} 727}
790 728
729/*
730 * This part is compiled out; it is only here as a build-time check
731 * to make sure that if the tracepoint handling changes, the
732 * perf probe will fail to compile unless it too is updated.
733 */
791#undef DEFINE_EVENT 734#undef DEFINE_EVENT
792#define DEFINE_EVENT(template, call, proto, args) \ 735#define DEFINE_EVENT(template, call, proto, args) \
793static notrace void perf_trace_##call(proto) \ 736static inline void perf_test_probe_##call(void) \
794{ \ 737{ \
795 struct ftrace_event_call *event_call = &event_##call; \ 738 check_trace_callback_type_##call(perf_trace_##template); \
796 struct pt_regs *__regs = &get_cpu_var(perf_trace_regs); \
797 \
798 perf_fetch_caller_regs(__regs, 1); \
799 \ 739 \
800 perf_trace_templ_##template(event_call, __regs, args); \
801 \
802 put_cpu_var(perf_trace_regs); \
803} 740}
804 741
742
805#undef DEFINE_EVENT_PRINT 743#undef DEFINE_EVENT_PRINT
806#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \ 744#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
807 DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args)) 745 DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index e5e5f48dbfb3..257e08960d7b 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -25,6 +25,8 @@ struct syscall_metadata {
25 int nb_args; 25 int nb_args;
26 const char **types; 26 const char **types;
27 const char **args; 27 const char **args;
28 struct list_head enter_fields;
29 struct list_head exit_fields;
28 30
29 struct ftrace_event_call *enter_event; 31 struct ftrace_event_call *enter_event;
30 struct ftrace_event_call *exit_event; 32 struct ftrace_event_call *exit_event;
@@ -34,16 +36,16 @@ struct syscall_metadata {
34extern unsigned long arch_syscall_addr(int nr); 36extern unsigned long arch_syscall_addr(int nr);
35extern int init_syscall_trace(struct ftrace_event_call *call); 37extern int init_syscall_trace(struct ftrace_event_call *call);
36 38
37extern int syscall_enter_define_fields(struct ftrace_event_call *call);
38extern int syscall_exit_define_fields(struct ftrace_event_call *call);
39extern int reg_event_syscall_enter(struct ftrace_event_call *call); 39extern int reg_event_syscall_enter(struct ftrace_event_call *call);
40extern void unreg_event_syscall_enter(struct ftrace_event_call *call); 40extern void unreg_event_syscall_enter(struct ftrace_event_call *call);
41extern int reg_event_syscall_exit(struct ftrace_event_call *call); 41extern int reg_event_syscall_exit(struct ftrace_event_call *call);
42extern void unreg_event_syscall_exit(struct ftrace_event_call *call); 42extern void unreg_event_syscall_exit(struct ftrace_event_call *call);
43extern int 43extern int
44ftrace_format_syscall(struct ftrace_event_call *call, struct trace_seq *s); 44ftrace_format_syscall(struct ftrace_event_call *call, struct trace_seq *s);
45enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags); 45enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags,
46enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags); 46 struct trace_event *event);
47enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags,
48 struct trace_event *event);
47#endif 49#endif
48 50
49#ifdef CONFIG_PERF_EVENTS 51#ifdef CONFIG_PERF_EVENTS
diff --git a/init/Kconfig b/init/Kconfig
index eb77e8ccde1c..5fe94b82e4c0 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -604,8 +604,7 @@ config RT_GROUP_SCHED
604 default n 604 default n
605 help 605 help
606 This feature lets you explicitly allocate real CPU bandwidth 606 This feature lets you explicitly allocate real CPU bandwidth
607 to users or control groups (depending on the "Basis for grouping tasks" 607 to task groups. If enabled, it will also make it impossible to
608 setting below. If enabled, it will also make it impossible to
609 schedule realtime tasks for non-root users until you allocate 608 schedule realtime tasks for non-root users until you allocate
610 realtime bandwidth for them. 609 realtime bandwidth for them.
611 See Documentation/scheduler/sched-rt-group.txt for more information. 610 See Documentation/scheduler/sched-rt-group.txt for more information.
diff --git a/kernel/Makefile b/kernel/Makefile
index a987aa1676b5..149e18ef1ab1 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -68,7 +68,7 @@ obj-$(CONFIG_USER_NS) += user_namespace.o
68obj-$(CONFIG_PID_NS) += pid_namespace.o 68obj-$(CONFIG_PID_NS) += pid_namespace.o
69obj-$(CONFIG_IKCONFIG) += configs.o 69obj-$(CONFIG_IKCONFIG) += configs.o
70obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o 70obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
71obj-$(CONFIG_STOP_MACHINE) += stop_machine.o 71obj-$(CONFIG_SMP) += stop_machine.o
72obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o 72obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
73obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o 73obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o
74obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 74obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
diff --git a/kernel/capability.c b/kernel/capability.c
index 9e4697e9b276..2f05303715a5 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -15,7 +15,6 @@
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/pid_namespace.h> 16#include <linux/pid_namespace.h>
17#include <asm/uaccess.h> 17#include <asm/uaccess.h>
18#include "cred-internals.h"
19 18
20/* 19/*
21 * Leveraged for setting/resetting capabilities 20 * Leveraged for setting/resetting capabilities
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e2769e13980c..4a07d057a265 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3010,7 +3010,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3010 unsigned long flags = (unsigned long)key; 3010 unsigned long flags = (unsigned long)key;
3011 3011
3012 if (flags & POLLHUP) { 3012 if (flags & POLLHUP) {
3013 remove_wait_queue_locked(event->wqh, &event->wait); 3013 __remove_wait_queue(event->wqh, &event->wait);
3014 spin_lock(&cgrp->event_list_lock); 3014 spin_lock(&cgrp->event_list_lock);
3015 list_del(&event->list); 3015 list_del(&event->list);
3016 spin_unlock(&cgrp->event_list_lock); 3016 spin_unlock(&cgrp->event_list_lock);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 25bba73b1be3..545777574779 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -164,6 +164,7 @@ static inline void check_for_tasks(int cpu)
164} 164}
165 165
166struct take_cpu_down_param { 166struct take_cpu_down_param {
167 struct task_struct *caller;
167 unsigned long mod; 168 unsigned long mod;
168 void *hcpu; 169 void *hcpu;
169}; 170};
@@ -172,6 +173,7 @@ struct take_cpu_down_param {
172static int __ref take_cpu_down(void *_param) 173static int __ref take_cpu_down(void *_param)
173{ 174{
174 struct take_cpu_down_param *param = _param; 175 struct take_cpu_down_param *param = _param;
176 unsigned int cpu = (unsigned long)param->hcpu;
175 int err; 177 int err;
176 178
177 /* Ensure this CPU doesn't handle any more interrupts. */ 179 /* Ensure this CPU doesn't handle any more interrupts. */
@@ -182,6 +184,8 @@ static int __ref take_cpu_down(void *_param)
182 raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, 184 raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
183 param->hcpu); 185 param->hcpu);
184 186
187 if (task_cpu(param->caller) == cpu)
188 move_task_off_dead_cpu(cpu, param->caller);
185 /* Force idle task to run as soon as we yield: it should 189 /* Force idle task to run as soon as we yield: it should
186 immediately notice cpu is offline and die quickly. */ 190 immediately notice cpu is offline and die quickly. */
187 sched_idle_next(); 191 sched_idle_next();
@@ -192,10 +196,10 @@ static int __ref take_cpu_down(void *_param)
192static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) 196static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
193{ 197{
194 int err, nr_calls = 0; 198 int err, nr_calls = 0;
195 cpumask_var_t old_allowed;
196 void *hcpu = (void *)(long)cpu; 199 void *hcpu = (void *)(long)cpu;
197 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 200 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
198 struct take_cpu_down_param tcd_param = { 201 struct take_cpu_down_param tcd_param = {
202 .caller = current,
199 .mod = mod, 203 .mod = mod,
200 .hcpu = hcpu, 204 .hcpu = hcpu,
201 }; 205 };
@@ -206,9 +210,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
206 if (!cpu_online(cpu)) 210 if (!cpu_online(cpu))
207 return -EINVAL; 211 return -EINVAL;
208 212
209 if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL))
210 return -ENOMEM;
211
212 cpu_hotplug_begin(); 213 cpu_hotplug_begin();
213 set_cpu_active(cpu, false); 214 set_cpu_active(cpu, false);
214 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, 215 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
@@ -225,10 +226,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
225 goto out_release; 226 goto out_release;
226 } 227 }
227 228
228 /* Ensure that we are not runnable on dying cpu */
229 cpumask_copy(old_allowed, &current->cpus_allowed);
230 set_cpus_allowed_ptr(current, cpu_active_mask);
231
232 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 229 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
233 if (err) { 230 if (err) {
234 set_cpu_active(cpu, true); 231 set_cpu_active(cpu, true);
@@ -237,7 +234,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
237 hcpu) == NOTIFY_BAD) 234 hcpu) == NOTIFY_BAD)
238 BUG(); 235 BUG();
239 236
240 goto out_allowed; 237 goto out_release;
241 } 238 }
242 BUG_ON(cpu_online(cpu)); 239 BUG_ON(cpu_online(cpu));
243 240
@@ -255,8 +252,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
255 252
256 check_for_tasks(cpu); 253 check_for_tasks(cpu);
257 254
258out_allowed:
259 set_cpus_allowed_ptr(current, old_allowed);
260out_release: 255out_release:
261 cpu_hotplug_done(); 256 cpu_hotplug_done();
262 if (!err) { 257 if (!err) {
@@ -264,7 +259,6 @@ out_release:
264 hcpu) == NOTIFY_BAD) 259 hcpu) == NOTIFY_BAD)
265 BUG(); 260 BUG();
266 } 261 }
267 free_cpumask_var(old_allowed);
268 return err; 262 return err;
269} 263}
270 264
@@ -272,9 +266,6 @@ int __ref cpu_down(unsigned int cpu)
272{ 266{
273 int err; 267 int err;
274 268
275 err = stop_machine_create();
276 if (err)
277 return err;
278 cpu_maps_update_begin(); 269 cpu_maps_update_begin();
279 270
280 if (cpu_hotplug_disabled) { 271 if (cpu_hotplug_disabled) {
@@ -286,7 +277,6 @@ int __ref cpu_down(unsigned int cpu)
286 277
287out: 278out:
288 cpu_maps_update_done(); 279 cpu_maps_update_done();
289 stop_machine_destroy();
290 return err; 280 return err;
291} 281}
292EXPORT_SYMBOL(cpu_down); 282EXPORT_SYMBOL(cpu_down);
@@ -367,9 +357,6 @@ int disable_nonboot_cpus(void)
367{ 357{
368 int cpu, first_cpu, error; 358 int cpu, first_cpu, error;
369 359
370 error = stop_machine_create();
371 if (error)
372 return error;
373 cpu_maps_update_begin(); 360 cpu_maps_update_begin();
374 first_cpu = cpumask_first(cpu_online_mask); 361 first_cpu = cpumask_first(cpu_online_mask);
375 /* 362 /*
@@ -400,7 +387,6 @@ int disable_nonboot_cpus(void)
400 printk(KERN_ERR "Non-boot CPUs are not disabled\n"); 387 printk(KERN_ERR "Non-boot CPUs are not disabled\n");
401 } 388 }
402 cpu_maps_update_done(); 389 cpu_maps_update_done();
403 stop_machine_destroy();
404 return error; 390 return error;
405} 391}
406 392
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d10946748ec2..9a50c5f6e727 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2182,19 +2182,52 @@ void __init cpuset_init_smp(void)
2182void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) 2182void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2183{ 2183{
2184 mutex_lock(&callback_mutex); 2184 mutex_lock(&callback_mutex);
2185 cpuset_cpus_allowed_locked(tsk, pmask); 2185 task_lock(tsk);
2186 guarantee_online_cpus(task_cs(tsk), pmask);
2187 task_unlock(tsk);
2186 mutex_unlock(&callback_mutex); 2188 mutex_unlock(&callback_mutex);
2187} 2189}
2188 2190
2189/** 2191int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2190 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
2191 * Must be called with callback_mutex held.
2192 **/
2193void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
2194{ 2192{
2195 task_lock(tsk); 2193 const struct cpuset *cs;
2196 guarantee_online_cpus(task_cs(tsk), pmask); 2194 int cpu;
2197 task_unlock(tsk); 2195
2196 rcu_read_lock();
2197 cs = task_cs(tsk);
2198 if (cs)
2199 cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
2200 rcu_read_unlock();
2201
2202 /*
2203 * We own tsk->cpus_allowed, nobody can change it under us.
2204 *
2205 * But we used cs && cs->cpus_allowed lockless and thus can
2206 * race with cgroup_attach_task() or update_cpumask() and get
2207 * the wrong tsk->cpus_allowed. However, both cases imply the
2208 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
2209 * which takes task_rq_lock().
2210 *
2211 * If we are called after it dropped the lock we must see all
2212 * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
2213 * set any mask even if it is not right from task_cs() pov,
2214 * the pending set_cpus_allowed_ptr() will fix things.
2215 */
2216
2217 cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
2218 if (cpu >= nr_cpu_ids) {
2219 /*
2220 * Either tsk->cpus_allowed is wrong (see above) or it
2221 * is actually empty. The latter case is only possible
2222 * if we are racing with remove_tasks_in_empty_cpuset().
2223 * Like above we can temporary set any mask and rely on
2224 * set_cpus_allowed_ptr() as synchronization point.
2225 */
2226 cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
2227 cpu = cpumask_any(cpu_active_mask);
2228 }
2229
2230 return cpu;
2198} 2231}
2199 2232
2200void cpuset_init_current_mems_allowed(void) 2233void cpuset_init_current_mems_allowed(void)
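[Editor's note] The new cpuset_cpus_allowed_fallback() above boils down to: take the task's (possibly stale) allowed mask, pick any CPU from it that is still active, and if that intersection is empty widen the mask to cpu_possible_mask and pick any active CPU, relying on the pending set_cpus_allowed_ptr() to fix things up later. A minimal userspace sketch of that selection, using 64-bit words in place of struct cpumask; mask_any_and() and fallback_cpu() are illustrative helpers, not kernel APIs, and __builtin_ctzll() is a GCC/Clang builtin:

#include <stdint.h>
#include <stdio.h>

#define NR_CPUS	64	/* toy limit so a "cpumask" fits in one word */

/* Stand-in for cpumask_any_and(): index of any set bit in (a & b), or NR_CPUS. */
static int mask_any_and(uint64_t a, uint64_t b)
{
	uint64_t hit = a & b;

	return hit ? __builtin_ctzll(hit) : NR_CPUS;
}

/* Mirrors the fallback logic above: prefer the task's own mask, widen if empty. */
static int fallback_cpu(uint64_t *allowed, uint64_t active, uint64_t possible)
{
	int cpu = mask_any_and(*allowed, active);

	if (cpu >= NR_CPUS) {
		*allowed = possible;	/* like cpumask_copy(..., cpu_possible_mask) */
		cpu = mask_any_and(*allowed, active);
	}
	return cpu;
}

int main(void)
{
	uint64_t allowed = 1ULL << 3;	/* task believed it could run on CPU 3 */
	uint64_t active  = 0x07;	/* but only CPUs 0-2 are still active  */

	printf("fallback cpu = %d\n", fallback_cpu(&allowed, active, 0xff));
	return 0;
}

As the comment in the patch notes, a temporarily wrong mask is tolerated because the subsequent set_cpus_allowed_ptr() acts as the synchronization point.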
@@ -2383,22 +2416,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
2383} 2416}
2384 2417
2385/** 2418/**
2386 * cpuset_lock - lock out any changes to cpuset structures
2387 *
2388 * The out of memory (oom) code needs to mutex_lock cpusets
2389 * from being changed while it scans the tasklist looking for a
2390 * task in an overlapping cpuset. Expose callback_mutex via this
2391 * cpuset_lock() routine, so the oom code can lock it, before
2392 * locking the task list. The tasklist_lock is a spinlock, so
2393 * must be taken inside callback_mutex.
2394 */
2395
2396void cpuset_lock(void)
2397{
2398 mutex_lock(&callback_mutex);
2399}
2400
2401/**
2402 * cpuset_unlock - release lock on cpuset changes 2419 * cpuset_unlock - release lock on cpuset changes
2403 * 2420 *
2404 * Undo the lock taken in a previous cpuset_lock() call. 2421 * Undo the lock taken in a previous cpuset_lock() call.
diff --git a/kernel/cred-internals.h b/kernel/cred-internals.h
deleted file mode 100644
index 2dc4fc2d0bf1..000000000000
--- a/kernel/cred-internals.h
+++ /dev/null
@@ -1,21 +0,0 @@
1/* Internal credentials stuff
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12/*
13 * user.c
14 */
15static inline void sched_switch_user(struct task_struct *p)
16{
17#ifdef CONFIG_USER_SCHED
18 sched_move_task(p);
19#endif /* CONFIG_USER_SCHED */
20}
21
diff --git a/kernel/cred.c b/kernel/cred.c
index 62af1816c235..8f3672a58a1e 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -17,7 +17,6 @@
17#include <linux/init_task.h> 17#include <linux/init_task.h>
18#include <linux/security.h> 18#include <linux/security.h>
19#include <linux/cn_proc.h> 19#include <linux/cn_proc.h>
20#include "cred-internals.h"
21 20
22#if 0 21#if 0
23#define kdebug(FMT, ...) \ 22#define kdebug(FMT, ...) \
@@ -560,8 +559,6 @@ int commit_creds(struct cred *new)
560 atomic_dec(&old->user->processes); 559 atomic_dec(&old->user->processes);
561 alter_cred_subscribers(old, -2); 560 alter_cred_subscribers(old, -2);
562 561
563 sched_switch_user(task);
564
565 /* send notifications */ 562 /* send notifications */
566 if (new->uid != old->uid || 563 if (new->uid != old->uid ||
567 new->euid != old->euid || 564 new->euid != old->euid ||
diff --git a/kernel/exit.c b/kernel/exit.c
index 7f2683a10ac4..eabca5a73a85 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -55,7 +55,6 @@
55#include <asm/unistd.h> 55#include <asm/unistd.h>
56#include <asm/pgtable.h> 56#include <asm/pgtable.h>
57#include <asm/mmu_context.h> 57#include <asm/mmu_context.h>
58#include "cred-internals.h"
59 58
60static void exit_mm(struct task_struct * tsk); 59static void exit_mm(struct task_struct * tsk);
61 60
diff --git a/kernel/module.c b/kernel/module.c
index 1016b75b026a..e2564580f3f1 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -59,8 +59,6 @@
59#define CREATE_TRACE_POINTS 59#define CREATE_TRACE_POINTS
60#include <trace/events/module.h> 60#include <trace/events/module.h>
61 61
62EXPORT_TRACEPOINT_SYMBOL(module_get);
63
64#if 0 62#if 0
65#define DEBUGP printk 63#define DEBUGP printk
66#else 64#else
@@ -515,6 +513,9 @@ MODINFO_ATTR(srcversion);
515static char last_unloaded_module[MODULE_NAME_LEN+1]; 513static char last_unloaded_module[MODULE_NAME_LEN+1];
516 514
517#ifdef CONFIG_MODULE_UNLOAD 515#ifdef CONFIG_MODULE_UNLOAD
516
517EXPORT_TRACEPOINT_SYMBOL(module_get);
518
518/* Init the unload section of the module. */ 519/* Init the unload section of the module. */
519static void module_unload_init(struct module *mod) 520static void module_unload_init(struct module *mod)
520{ 521{
@@ -723,16 +724,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
723 return -EFAULT; 724 return -EFAULT;
724 name[MODULE_NAME_LEN-1] = '\0'; 725 name[MODULE_NAME_LEN-1] = '\0';
725 726
726 /* Create stop_machine threads since free_module relies on 727 if (mutex_lock_interruptible(&module_mutex) != 0)
727 * a non-failing stop_machine call. */ 728 return -EINTR;
728 ret = stop_machine_create();
729 if (ret)
730 return ret;
731
732 if (mutex_lock_interruptible(&module_mutex) != 0) {
733 ret = -EINTR;
734 goto out_stop;
735 }
736 729
737 mod = find_module(name); 730 mod = find_module(name);
738 if (!mod) { 731 if (!mod) {
@@ -792,8 +785,6 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
792 785
793 out: 786 out:
794 mutex_unlock(&module_mutex); 787 mutex_unlock(&module_mutex);
795out_stop:
796 stop_machine_destroy();
797 return ret; 788 return ret;
798} 789}
799 790
@@ -867,8 +858,7 @@ void module_put(struct module *module)
867 smp_wmb(); /* see comment in module_refcount */ 858 smp_wmb(); /* see comment in module_refcount */
868 __this_cpu_inc(module->refptr->decs); 859 __this_cpu_inc(module->refptr->decs);
869 860
870 trace_module_put(module, _RET_IP_, 861 trace_module_put(module, _RET_IP_);
871 __this_cpu_read(module->refptr->decs));
872 /* Maybe they're waiting for us to drop reference? */ 862 /* Maybe they're waiting for us to drop reference? */
873 if (unlikely(!module_is_live(module))) 863 if (unlikely(!module_is_live(module)))
874 wake_up_process(module->waiter); 864 wake_up_process(module->waiter);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 58df55bf83ed..2b676f3a0f26 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -669,7 +669,7 @@ static struct rcu_torture_ops sched_expedited_ops = {
669 .sync = synchronize_sched_expedited, 669 .sync = synchronize_sched_expedited,
670 .cb_barrier = NULL, 670 .cb_barrier = NULL,
671 .fqs = rcu_sched_force_quiescent_state, 671 .fqs = rcu_sched_force_quiescent_state,
672 .stats = rcu_expedited_torture_stats, 672 .stats = NULL,
673 .irq_capable = 1, 673 .irq_capable = 1,
674 .name = "sched_expedited" 674 .name = "sched_expedited"
675}; 675};
diff --git a/kernel/sched.c b/kernel/sched.c
index b11b80a3eed3..78554dd0d1a4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -55,9 +55,9 @@
55#include <linux/cpu.h> 55#include <linux/cpu.h>
56#include <linux/cpuset.h> 56#include <linux/cpuset.h>
57#include <linux/percpu.h> 57#include <linux/percpu.h>
58#include <linux/kthread.h>
59#include <linux/proc_fs.h> 58#include <linux/proc_fs.h>
60#include <linux/seq_file.h> 59#include <linux/seq_file.h>
60#include <linux/stop_machine.h>
61#include <linux/sysctl.h> 61#include <linux/sysctl.h>
62#include <linux/syscalls.h> 62#include <linux/syscalls.h>
63#include <linux/times.h> 63#include <linux/times.h>
@@ -503,8 +503,11 @@ struct rq {
503 #define CPU_LOAD_IDX_MAX 5 503 #define CPU_LOAD_IDX_MAX 5
504 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 504 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
505#ifdef CONFIG_NO_HZ 505#ifdef CONFIG_NO_HZ
506 u64 nohz_stamp;
506 unsigned char in_nohz_recently; 507 unsigned char in_nohz_recently;
507#endif 508#endif
509 unsigned int skip_clock_update;
510
508 /* capture load from *all* tasks on this cpu: */ 511 /* capture load from *all* tasks on this cpu: */
509 struct load_weight load; 512 struct load_weight load;
510 unsigned long nr_load_updates; 513 unsigned long nr_load_updates;
@@ -546,15 +549,13 @@ struct rq {
546 int post_schedule; 549 int post_schedule;
547 int active_balance; 550 int active_balance;
548 int push_cpu; 551 int push_cpu;
552 struct cpu_stop_work active_balance_work;
549 /* cpu of this runqueue: */ 553 /* cpu of this runqueue: */
550 int cpu; 554 int cpu;
551 int online; 555 int online;
552 556
553 unsigned long avg_load_per_task; 557 unsigned long avg_load_per_task;
554 558
555 struct task_struct *migration_thread;
556 struct list_head migration_queue;
557
558 u64 rt_avg; 559 u64 rt_avg;
559 u64 age_stamp; 560 u64 age_stamp;
560 u64 idle_stamp; 561 u64 idle_stamp;
@@ -602,6 +603,13 @@ static inline
602void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 603void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
603{ 604{
604 rq->curr->sched_class->check_preempt_curr(rq, p, flags); 605 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
606
607 /*
608 * A queue event has occurred, and we're going to schedule. In
609 * this case, we can save a useless back to back clock update.
610 */
611 if (test_tsk_need_resched(p))
612 rq->skip_clock_update = 1;
605} 613}
606 614
607static inline int cpu_of(struct rq *rq) 615static inline int cpu_of(struct rq *rq)
@@ -636,7 +644,8 @@ static inline int cpu_of(struct rq *rq)
636 644
637inline void update_rq_clock(struct rq *rq) 645inline void update_rq_clock(struct rq *rq)
638{ 646{
639 rq->clock = sched_clock_cpu(cpu_of(rq)); 647 if (!rq->skip_clock_update)
648 rq->clock = sched_clock_cpu(cpu_of(rq));
640} 649}
641 650
642/* 651/*
@@ -914,16 +923,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
914#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 923#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
915 924
916/* 925/*
917 * Check whether the task is waking, we use this to synchronize against 926 * Check whether the task is waking, we use this to synchronize ->cpus_allowed
918 * ttwu() so that task_cpu() reports a stable number. 927 * against ttwu().
919 *
920 * We need to make an exception for PF_STARTING tasks because the fork
921 * path might require task_rq_lock() to work, eg. it can call
922 * set_cpus_allowed_ptr() from the cpuset clone_ns code.
923 */ 928 */
924static inline int task_is_waking(struct task_struct *p) 929static inline int task_is_waking(struct task_struct *p)
925{ 930{
926 return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING)); 931 return unlikely(p->state == TASK_WAKING);
927} 932}
928 933
929/* 934/*
@@ -936,11 +941,9 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
936 struct rq *rq; 941 struct rq *rq;
937 942
938 for (;;) { 943 for (;;) {
939 while (task_is_waking(p))
940 cpu_relax();
941 rq = task_rq(p); 944 rq = task_rq(p);
942 raw_spin_lock(&rq->lock); 945 raw_spin_lock(&rq->lock);
943 if (likely(rq == task_rq(p) && !task_is_waking(p))) 946 if (likely(rq == task_rq(p)))
944 return rq; 947 return rq;
945 raw_spin_unlock(&rq->lock); 948 raw_spin_unlock(&rq->lock);
946 } 949 }
@@ -957,12 +960,10 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
957 struct rq *rq; 960 struct rq *rq;
958 961
959 for (;;) { 962 for (;;) {
960 while (task_is_waking(p))
961 cpu_relax();
962 local_irq_save(*flags); 963 local_irq_save(*flags);
963 rq = task_rq(p); 964 rq = task_rq(p);
964 raw_spin_lock(&rq->lock); 965 raw_spin_lock(&rq->lock);
965 if (likely(rq == task_rq(p) && !task_is_waking(p))) 966 if (likely(rq == task_rq(p)))
966 return rq; 967 return rq;
967 raw_spin_unlock_irqrestore(&rq->lock, *flags); 968 raw_spin_unlock_irqrestore(&rq->lock, *flags);
968 } 969 }
@@ -1239,6 +1240,17 @@ void wake_up_idle_cpu(int cpu)
1239 if (!tsk_is_polling(rq->idle)) 1240 if (!tsk_is_polling(rq->idle))
1240 smp_send_reschedule(cpu); 1241 smp_send_reschedule(cpu);
1241} 1242}
1243
1244int nohz_ratelimit(int cpu)
1245{
1246 struct rq *rq = cpu_rq(cpu);
1247 u64 diff = rq->clock - rq->nohz_stamp;
1248
1249 rq->nohz_stamp = rq->clock;
1250
1251 return diff < (NSEC_PER_SEC / HZ) >> 1;
1252}
1253
1242#endif /* CONFIG_NO_HZ */ 1254#endif /* CONFIG_NO_HZ */
1243 1255
1244static u64 sched_avg_period(void) 1256static u64 sched_avg_period(void)
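[Editor's note] nohz_ratelimit() above answers "was this runqueue's nohz check less than half a tick ago?", stamping the current clock on every call. The diffstat shows kernel/time/tick-sched.c changing as well, so presumably the tick code uses this to avoid stopping and restarting the tick back to back, but that hunk is not shown here. A self-contained model of the arithmetic; the HZ value and the struct-pointer parameter are simplifications (the kernel function takes a CPU number and looks up its runqueue):

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC	1000000000ULL
#define HZ		1000		/* illustrative config: tick = 1 ms */

struct rq {
	uint64_t clock;
	uint64_t nohz_stamp;
};

/* "Recently checked" means less than half a tick ago: 500,000 ns with HZ=1000. */
static int nohz_ratelimit(struct rq *rq)
{
	uint64_t diff = rq->clock - rq->nohz_stamp;

	rq->nohz_stamp = rq->clock;
	return diff < (NSEC_PER_SEC / HZ) >> 1;
}

int main(void)
{
	struct rq rq = { .clock = 0, .nohz_stamp = 0 };

	rq.clock = 100000;			/* 0.1 ms later: rate-limited */
	printf("%d\n", nohz_ratelimit(&rq));	/* prints 1 */
	rq.clock += 2000000;			/* 2 ms later: allowed        */
	printf("%d\n", nohz_ratelimit(&rq));	/* prints 0 */
	return 0;
}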
@@ -1781,8 +1793,6 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1781 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); 1793 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1782 } 1794 }
1783 } 1795 }
1784 update_rq_clock(rq1);
1785 update_rq_clock(rq2);
1786} 1796}
1787 1797
1788/* 1798/*
@@ -1813,7 +1823,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1813} 1823}
1814#endif 1824#endif
1815 1825
1816static void calc_load_account_active(struct rq *this_rq); 1826static void calc_load_account_idle(struct rq *this_rq);
1817static void update_sysctl(void); 1827static void update_sysctl(void);
1818static int get_update_sysctl_factor(void); 1828static int get_update_sysctl_factor(void);
1819 1829
@@ -1870,62 +1880,43 @@ static void set_load_weight(struct task_struct *p)
1870 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1880 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1871} 1881}
1872 1882
1873static void update_avg(u64 *avg, u64 sample) 1883static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1874{ 1884{
1875 s64 diff = sample - *avg; 1885 update_rq_clock(rq);
1876 *avg += diff >> 3;
1877}
1878
1879static void
1880enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1881{
1882 if (wakeup)
1883 p->se.start_runtime = p->se.sum_exec_runtime;
1884
1885 sched_info_queued(p); 1886 sched_info_queued(p);
1886 p->sched_class->enqueue_task(rq, p, wakeup, head); 1887 p->sched_class->enqueue_task(rq, p, flags);
1887 p->se.on_rq = 1; 1888 p->se.on_rq = 1;
1888} 1889}
1889 1890
1890static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) 1891static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1891{ 1892{
1892 if (sleep) { 1893 update_rq_clock(rq);
1893 if (p->se.last_wakeup) {
1894 update_avg(&p->se.avg_overlap,
1895 p->se.sum_exec_runtime - p->se.last_wakeup);
1896 p->se.last_wakeup = 0;
1897 } else {
1898 update_avg(&p->se.avg_wakeup,
1899 sysctl_sched_wakeup_granularity);
1900 }
1901 }
1902
1903 sched_info_dequeued(p); 1894 sched_info_dequeued(p);
1904 p->sched_class->dequeue_task(rq, p, sleep); 1895 p->sched_class->dequeue_task(rq, p, flags);
1905 p->se.on_rq = 0; 1896 p->se.on_rq = 0;
1906} 1897}
1907 1898
1908/* 1899/*
1909 * activate_task - move a task to the runqueue. 1900 * activate_task - move a task to the runqueue.
1910 */ 1901 */
1911static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) 1902static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1912{ 1903{
1913 if (task_contributes_to_load(p)) 1904 if (task_contributes_to_load(p))
1914 rq->nr_uninterruptible--; 1905 rq->nr_uninterruptible--;
1915 1906
1916 enqueue_task(rq, p, wakeup, false); 1907 enqueue_task(rq, p, flags);
1917 inc_nr_running(rq); 1908 inc_nr_running(rq);
1918} 1909}
1919 1910
1920/* 1911/*
1921 * deactivate_task - remove a task from the runqueue. 1912 * deactivate_task - remove a task from the runqueue.
1922 */ 1913 */
1923static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) 1914static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1924{ 1915{
1925 if (task_contributes_to_load(p)) 1916 if (task_contributes_to_load(p))
1926 rq->nr_uninterruptible++; 1917 rq->nr_uninterruptible++;
1927 1918
1928 dequeue_task(rq, p, sleep); 1919 dequeue_task(rq, p, flags);
1929 dec_nr_running(rq); 1920 dec_nr_running(rq);
1930} 1921}
1931 1922
@@ -2054,21 +2045,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2054 __set_task_cpu(p, new_cpu); 2045 __set_task_cpu(p, new_cpu);
2055} 2046}
2056 2047
2057struct migration_req { 2048struct migration_arg {
2058 struct list_head list;
2059
2060 struct task_struct *task; 2049 struct task_struct *task;
2061 int dest_cpu; 2050 int dest_cpu;
2062
2063 struct completion done;
2064}; 2051};
2065 2052
2053static int migration_cpu_stop(void *data);
2054
2066/* 2055/*
2067 * The task's runqueue lock must be held. 2056 * The task's runqueue lock must be held.
2068 * Returns true if you have to wait for migration thread. 2057 * Returns true if you have to wait for migration thread.
2069 */ 2058 */
2070static int 2059static bool migrate_task(struct task_struct *p, int dest_cpu)
2071migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2072{ 2060{
2073 struct rq *rq = task_rq(p); 2061 struct rq *rq = task_rq(p);
2074 2062
@@ -2076,15 +2064,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2076 * If the task is not on a runqueue (and not running), then 2064 * If the task is not on a runqueue (and not running), then
2077 * the next wake-up will properly place the task. 2065 * the next wake-up will properly place the task.
2078 */ 2066 */
2079 if (!p->se.on_rq && !task_running(rq, p)) 2067 return p->se.on_rq || task_running(rq, p);
2080 return 0;
2081
2082 init_completion(&req->done);
2083 req->task = p;
2084 req->dest_cpu = dest_cpu;
2085 list_add(&req->list, &rq->migration_queue);
2086
2087 return 1;
2088} 2068}
2089 2069
2090/* 2070/*
@@ -2142,7 +2122,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2142 * just go back and repeat. 2122 * just go back and repeat.
2143 */ 2123 */
2144 rq = task_rq_lock(p, &flags); 2124 rq = task_rq_lock(p, &flags);
2145 trace_sched_wait_task(rq, p); 2125 trace_sched_wait_task(p);
2146 running = task_running(rq, p); 2126 running = task_running(rq, p);
2147 on_rq = p->se.on_rq; 2127 on_rq = p->se.on_rq;
2148 ncsw = 0; 2128 ncsw = 0;
@@ -2240,6 +2220,9 @@ void task_oncpu_function_call(struct task_struct *p,
2240} 2220}
2241 2221
2242#ifdef CONFIG_SMP 2222#ifdef CONFIG_SMP
2223/*
2224 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
2225 */
2243static int select_fallback_rq(int cpu, struct task_struct *p) 2226static int select_fallback_rq(int cpu, struct task_struct *p)
2244{ 2227{
2245 int dest_cpu; 2228 int dest_cpu;
@@ -2256,12 +2239,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2256 return dest_cpu; 2239 return dest_cpu;
2257 2240
2258 /* No more Mr. Nice Guy. */ 2241 /* No more Mr. Nice Guy. */
2259 if (dest_cpu >= nr_cpu_ids) { 2242 if (unlikely(dest_cpu >= nr_cpu_ids)) {
2260 rcu_read_lock(); 2243 dest_cpu = cpuset_cpus_allowed_fallback(p);
2261 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
2262 rcu_read_unlock();
2263 dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
2264
2265 /* 2244 /*
2266 * Don't tell them about moving exiting tasks or 2245 * Don't tell them about moving exiting tasks or
2267 * kernel threads (both mm NULL), since they never 2246 * kernel threads (both mm NULL), since they never
@@ -2278,17 +2257,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2278} 2257}
2279 2258
2280/* 2259/*
2281 * Gets called from 3 sites (exec, fork, wakeup), since it is called without 2260 * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
2282 * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done
2283 * by:
2284 *
2285 * exec: is unstable, retry loop
2286 * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
2287 */ 2261 */
2288static inline 2262static inline
2289int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 2263int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
2290{ 2264{
2291 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); 2265 int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
2292 2266
2293 /* 2267 /*
2294 * In order not to call set_task_cpu() on a blocking task we need 2268 * In order not to call set_task_cpu() on a blocking task we need
@@ -2306,6 +2280,12 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2306 2280
2307 return cpu; 2281 return cpu;
2308} 2282}
2283
2284static void update_avg(u64 *avg, u64 sample)
2285{
2286 s64 diff = sample - *avg;
2287 *avg += diff >> 3;
2288}
2309#endif 2289#endif
2310 2290
2311/*** 2291/***
@@ -2327,16 +2307,13 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2327{ 2307{
2328 int cpu, orig_cpu, this_cpu, success = 0; 2308 int cpu, orig_cpu, this_cpu, success = 0;
2329 unsigned long flags; 2309 unsigned long flags;
2310 unsigned long en_flags = ENQUEUE_WAKEUP;
2330 struct rq *rq; 2311 struct rq *rq;
2331 2312
2332 if (!sched_feat(SYNC_WAKEUPS))
2333 wake_flags &= ~WF_SYNC;
2334
2335 this_cpu = get_cpu(); 2313 this_cpu = get_cpu();
2336 2314
2337 smp_wmb(); 2315 smp_wmb();
2338 rq = task_rq_lock(p, &flags); 2316 rq = task_rq_lock(p, &flags);
2339 update_rq_clock(rq);
2340 if (!(p->state & state)) 2317 if (!(p->state & state))
2341 goto out; 2318 goto out;
2342 2319
@@ -2356,28 +2333,26 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2356 * 2333 *
2357 * First fix up the nr_uninterruptible count: 2334 * First fix up the nr_uninterruptible count:
2358 */ 2335 */
2359 if (task_contributes_to_load(p)) 2336 if (task_contributes_to_load(p)) {
2360 rq->nr_uninterruptible--; 2337 if (likely(cpu_online(orig_cpu)))
2338 rq->nr_uninterruptible--;
2339 else
2340 this_rq()->nr_uninterruptible--;
2341 }
2361 p->state = TASK_WAKING; 2342 p->state = TASK_WAKING;
2362 2343
2363 if (p->sched_class->task_waking) 2344 if (p->sched_class->task_waking) {
2364 p->sched_class->task_waking(rq, p); 2345 p->sched_class->task_waking(rq, p);
2346 en_flags |= ENQUEUE_WAKING;
2347 }
2365 2348
2366 __task_rq_unlock(rq); 2349 cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
2367 2350 if (cpu != orig_cpu)
2368 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2369 if (cpu != orig_cpu) {
2370 /*
2371 * Since we migrate the task without holding any rq->lock,
2372 * we need to be careful with task_rq_lock(), since that
2373 * might end up locking an invalid rq.
2374 */
2375 set_task_cpu(p, cpu); 2351 set_task_cpu(p, cpu);
2376 } 2352 __task_rq_unlock(rq);
2377 2353
2378 rq = cpu_rq(cpu); 2354 rq = cpu_rq(cpu);
2379 raw_spin_lock(&rq->lock); 2355 raw_spin_lock(&rq->lock);
2380 update_rq_clock(rq);
2381 2356
2382 /* 2357 /*
2383 * We migrated the task without holding either rq->lock, however 2358 * We migrated the task without holding either rq->lock, however
@@ -2405,36 +2380,20 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2405 2380
2406out_activate: 2381out_activate:
2407#endif /* CONFIG_SMP */ 2382#endif /* CONFIG_SMP */
2408 schedstat_inc(p, se.nr_wakeups); 2383 schedstat_inc(p, se.statistics.nr_wakeups);
2409 if (wake_flags & WF_SYNC) 2384 if (wake_flags & WF_SYNC)
2410 schedstat_inc(p, se.nr_wakeups_sync); 2385 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2411 if (orig_cpu != cpu) 2386 if (orig_cpu != cpu)
2412 schedstat_inc(p, se.nr_wakeups_migrate); 2387 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2413 if (cpu == this_cpu) 2388 if (cpu == this_cpu)
2414 schedstat_inc(p, se.nr_wakeups_local); 2389 schedstat_inc(p, se.statistics.nr_wakeups_local);
2415 else 2390 else
2416 schedstat_inc(p, se.nr_wakeups_remote); 2391 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2417 activate_task(rq, p, 1); 2392 activate_task(rq, p, en_flags);
2418 success = 1; 2393 success = 1;
2419 2394
2420 /*
2421 * Only attribute actual wakeups done by this task.
2422 */
2423 if (!in_interrupt()) {
2424 struct sched_entity *se = &current->se;
2425 u64 sample = se->sum_exec_runtime;
2426
2427 if (se->last_wakeup)
2428 sample -= se->last_wakeup;
2429 else
2430 sample -= se->start_runtime;
2431 update_avg(&se->avg_wakeup, sample);
2432
2433 se->last_wakeup = se->sum_exec_runtime;
2434 }
2435
2436out_running: 2395out_running:
2437 trace_sched_wakeup(rq, p, success); 2396 trace_sched_wakeup(p, success);
2438 check_preempt_curr(rq, p, wake_flags); 2397 check_preempt_curr(rq, p, wake_flags);
2439 2398
2440 p->state = TASK_RUNNING; 2399 p->state = TASK_RUNNING;
@@ -2494,42 +2453,9 @@ static void __sched_fork(struct task_struct *p)
2494 p->se.sum_exec_runtime = 0; 2453 p->se.sum_exec_runtime = 0;
2495 p->se.prev_sum_exec_runtime = 0; 2454 p->se.prev_sum_exec_runtime = 0;
2496 p->se.nr_migrations = 0; 2455 p->se.nr_migrations = 0;
2497 p->se.last_wakeup = 0;
2498 p->se.avg_overlap = 0;
2499 p->se.start_runtime = 0;
2500 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2501 2456
2502#ifdef CONFIG_SCHEDSTATS 2457#ifdef CONFIG_SCHEDSTATS
2503 p->se.wait_start = 0; 2458 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2504 p->se.wait_max = 0;
2505 p->se.wait_count = 0;
2506 p->se.wait_sum = 0;
2507
2508 p->se.sleep_start = 0;
2509 p->se.sleep_max = 0;
2510 p->se.sum_sleep_runtime = 0;
2511
2512 p->se.block_start = 0;
2513 p->se.block_max = 0;
2514 p->se.exec_max = 0;
2515 p->se.slice_max = 0;
2516
2517 p->se.nr_migrations_cold = 0;
2518 p->se.nr_failed_migrations_affine = 0;
2519 p->se.nr_failed_migrations_running = 0;
2520 p->se.nr_failed_migrations_hot = 0;
2521 p->se.nr_forced_migrations = 0;
2522
2523 p->se.nr_wakeups = 0;
2524 p->se.nr_wakeups_sync = 0;
2525 p->se.nr_wakeups_migrate = 0;
2526 p->se.nr_wakeups_local = 0;
2527 p->se.nr_wakeups_remote = 0;
2528 p->se.nr_wakeups_affine = 0;
2529 p->se.nr_wakeups_affine_attempts = 0;
2530 p->se.nr_wakeups_passive = 0;
2531 p->se.nr_wakeups_idle = 0;
2532
2533#endif 2459#endif
2534 2460
2535 INIT_LIST_HEAD(&p->rt.run_list); 2461 INIT_LIST_HEAD(&p->rt.run_list);
@@ -2550,11 +2476,11 @@ void sched_fork(struct task_struct *p, int clone_flags)
2550 2476
2551 __sched_fork(p); 2477 __sched_fork(p);
2552 /* 2478 /*
2553 * We mark the process as waking here. This guarantees that 2479 * We mark the process as running here. This guarantees that
2554 * nobody will actually run it, and a signal or other external 2480 * nobody will actually run it, and a signal or other external
2555 * event cannot wake it up and insert it on the runqueue either. 2481 * event cannot wake it up and insert it on the runqueue either.
2556 */ 2482 */
2557 p->state = TASK_WAKING; 2483 p->state = TASK_RUNNING;
2558 2484
2559 /* 2485 /*
2560 * Revert to default priority/policy on fork if requested. 2486 * Revert to default priority/policy on fork if requested.
@@ -2621,31 +2547,27 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2621 int cpu __maybe_unused = get_cpu(); 2547 int cpu __maybe_unused = get_cpu();
2622 2548
2623#ifdef CONFIG_SMP 2549#ifdef CONFIG_SMP
2550 rq = task_rq_lock(p, &flags);
2551 p->state = TASK_WAKING;
2552
2624 /* 2553 /*
2625 * Fork balancing, do it here and not earlier because: 2554 * Fork balancing, do it here and not earlier because:
2626 * - cpus_allowed can change in the fork path 2555 * - cpus_allowed can change in the fork path
2627 * - any previously selected cpu might disappear through hotplug 2556 * - any previously selected cpu might disappear through hotplug
2628 * 2557 *
2629 * We still have TASK_WAKING but PF_STARTING is gone now, meaning 2558 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
2630 * ->cpus_allowed is stable, we have preemption disabled, meaning 2559 * without people poking at ->cpus_allowed.
2631 * cpu_online_mask is stable.
2632 */ 2560 */
2633 cpu = select_task_rq(p, SD_BALANCE_FORK, 0); 2561 cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
2634 set_task_cpu(p, cpu); 2562 set_task_cpu(p, cpu);
2635#endif
2636 2563
2637 /*
2638 * Since the task is not on the rq and we still have TASK_WAKING set
2639 * nobody else will migrate this task.
2640 */
2641 rq = cpu_rq(cpu);
2642 raw_spin_lock_irqsave(&rq->lock, flags);
2643
2644 BUG_ON(p->state != TASK_WAKING);
2645 p->state = TASK_RUNNING; 2564 p->state = TASK_RUNNING;
2646 update_rq_clock(rq); 2565 task_rq_unlock(rq, &flags);
2566#endif
2567
2568 rq = task_rq_lock(p, &flags);
2647 activate_task(rq, p, 0); 2569 activate_task(rq, p, 0);
2648 trace_sched_wakeup_new(rq, p, 1); 2570 trace_sched_wakeup_new(p, 1);
2649 check_preempt_curr(rq, p, WF_FORK); 2571 check_preempt_curr(rq, p, WF_FORK);
2650#ifdef CONFIG_SMP 2572#ifdef CONFIG_SMP
2651 if (p->sched_class->task_woken) 2573 if (p->sched_class->task_woken)
@@ -2865,7 +2787,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2865 struct mm_struct *mm, *oldmm; 2787 struct mm_struct *mm, *oldmm;
2866 2788
2867 prepare_task_switch(rq, prev, next); 2789 prepare_task_switch(rq, prev, next);
2868 trace_sched_switch(rq, prev, next); 2790 trace_sched_switch(prev, next);
2869 mm = next->mm; 2791 mm = next->mm;
2870 oldmm = prev->active_mm; 2792 oldmm = prev->active_mm;
2871 /* 2793 /*
@@ -2982,6 +2904,61 @@ static unsigned long calc_load_update;
2982unsigned long avenrun[3]; 2904unsigned long avenrun[3];
2983EXPORT_SYMBOL(avenrun); 2905EXPORT_SYMBOL(avenrun);
2984 2906
2907static long calc_load_fold_active(struct rq *this_rq)
2908{
2909 long nr_active, delta = 0;
2910
2911 nr_active = this_rq->nr_running;
2912 nr_active += (long) this_rq->nr_uninterruptible;
2913
2914 if (nr_active != this_rq->calc_load_active) {
2915 delta = nr_active - this_rq->calc_load_active;
2916 this_rq->calc_load_active = nr_active;
2917 }
2918
2919 return delta;
2920}
2921
2922#ifdef CONFIG_NO_HZ
2923/*
2924 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
2925 *
2926 * When making the ILB scale, we should try to pull this in as well.
2927 */
2928static atomic_long_t calc_load_tasks_idle;
2929
2930static void calc_load_account_idle(struct rq *this_rq)
2931{
2932 long delta;
2933
2934 delta = calc_load_fold_active(this_rq);
2935 if (delta)
2936 atomic_long_add(delta, &calc_load_tasks_idle);
2937}
2938
2939static long calc_load_fold_idle(void)
2940{
2941 long delta = 0;
2942
2943 /*
2944 * Its got a race, we don't care...
2945 */
2946 if (atomic_long_read(&calc_load_tasks_idle))
2947 delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
2948
2949 return delta;
2950}
2951#else
2952static void calc_load_account_idle(struct rq *this_rq)
2953{
2954}
2955
2956static inline long calc_load_fold_idle(void)
2957{
2958 return 0;
2959}
2960#endif
2961
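[Editor's note] The helpers above split load accounting in two: calc_load_fold_active() computes how much this runqueue's active count changed since it was last accounted, and under NO_HZ an idling CPU parks that delta in calc_load_tasks_idle instead of touching the global counter, to be folded in at the next LOAD_FREQ update by calc_load_account_active() below. A compilable model of the arithmetic, with plain longs standing in for the kernel's atomic_long_t:

#include <stdio.h>

static long calc_load_tasks;		/* global count, atomic in the kernel */
static long calc_load_tasks_idle;	/* NO_HZ parking area                 */

struct rq {
	long nr_running;
	long nr_uninterruptible;
	long calc_load_active;		/* value accounted so far             */
};

static long calc_load_fold_active(struct rq *rq)
{
	long nr_active = rq->nr_running + rq->nr_uninterruptible;
	long delta = 0;

	if (nr_active != rq->calc_load_active) {
		delta = nr_active - rq->calc_load_active;
		rq->calc_load_active = nr_active;
	}
	return delta;
}

int main(void)
{
	struct rq rq = { .nr_running = 3, .nr_uninterruptible = 1 };

	/* An idling CPU parks its not-yet-accounted delta (+4 here). */
	calc_load_tasks_idle += calc_load_fold_active(&rq);

	/* The next LOAD_FREQ update folds both contributions into the global. */
	calc_load_tasks += calc_load_fold_active(&rq);	/* 0: nothing changed since */
	calc_load_tasks += calc_load_tasks_idle;	/* atomic_long_xchg(..., 0) in the kernel */
	calc_load_tasks_idle = 0;

	printf("calc_load_tasks = %ld\n", calc_load_tasks);	/* prints 4 */
	return 0;
}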
2985/** 2962/**
2986 * get_avenrun - get the load average array 2963 * get_avenrun - get the load average array
2987 * @loads: pointer to dest load array 2964 * @loads: pointer to dest load array
@@ -3028,20 +3005,22 @@ void calc_global_load(void)
3028} 3005}
3029 3006
3030/* 3007/*
3031 * Either called from update_cpu_load() or from a cpu going idle 3008 * Called from update_cpu_load() to periodically update this CPU's
3009 * active count.
3032 */ 3010 */
3033static void calc_load_account_active(struct rq *this_rq) 3011static void calc_load_account_active(struct rq *this_rq)
3034{ 3012{
3035 long nr_active, delta; 3013 long delta;
3036 3014
3037 nr_active = this_rq->nr_running; 3015 if (time_before(jiffies, this_rq->calc_load_update))
3038 nr_active += (long) this_rq->nr_uninterruptible; 3016 return;
3039 3017
3040 if (nr_active != this_rq->calc_load_active) { 3018 delta = calc_load_fold_active(this_rq);
3041 delta = nr_active - this_rq->calc_load_active; 3019 delta += calc_load_fold_idle();
3042 this_rq->calc_load_active = nr_active; 3020 if (delta)
3043 atomic_long_add(delta, &calc_load_tasks); 3021 atomic_long_add(delta, &calc_load_tasks);
3044 } 3022
3023 this_rq->calc_load_update += LOAD_FREQ;
3045} 3024}
3046 3025
3047/* 3026/*
@@ -3073,10 +3052,7 @@ static void update_cpu_load(struct rq *this_rq)
3073 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 3052 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
3074 } 3053 }
3075 3054
3076 if (time_after_eq(jiffies, this_rq->calc_load_update)) { 3055 calc_load_account_active(this_rq);
3077 this_rq->calc_load_update += LOAD_FREQ;
3078 calc_load_account_active(this_rq);
3079 }
3080} 3056}
3081 3057
3082#ifdef CONFIG_SMP 3058#ifdef CONFIG_SMP
@@ -3088,44 +3064,27 @@ static void update_cpu_load(struct rq *this_rq)
3088void sched_exec(void) 3064void sched_exec(void)
3089{ 3065{
3090 struct task_struct *p = current; 3066 struct task_struct *p = current;
3091 struct migration_req req;
3092 int dest_cpu, this_cpu;
3093 unsigned long flags; 3067 unsigned long flags;
3094 struct rq *rq; 3068 struct rq *rq;
3095 3069 int dest_cpu;
3096again:
3097 this_cpu = get_cpu();
3098 dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0);
3099 if (dest_cpu == this_cpu) {
3100 put_cpu();
3101 return;
3102 }
3103 3070
3104 rq = task_rq_lock(p, &flags); 3071 rq = task_rq_lock(p, &flags);
3105 put_cpu(); 3072 dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
3073 if (dest_cpu == smp_processor_id())
3074 goto unlock;
3106 3075
3107 /* 3076 /*
3108 * select_task_rq() can race against ->cpus_allowed 3077 * select_task_rq() can race against ->cpus_allowed
3109 */ 3078 */
3110 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) 3079 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3111 || unlikely(!cpu_active(dest_cpu))) { 3080 likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
3112 task_rq_unlock(rq, &flags); 3081 struct migration_arg arg = { p, dest_cpu };
3113 goto again;
3114 }
3115 3082
3116 /* force the process onto the specified CPU */
3117 if (migrate_task(p, dest_cpu, &req)) {
3118 /* Need to wait for migration thread (might exit: take ref). */
3119 struct task_struct *mt = rq->migration_thread;
3120
3121 get_task_struct(mt);
3122 task_rq_unlock(rq, &flags); 3083 task_rq_unlock(rq, &flags);
3123 wake_up_process(mt); 3084 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
3124 put_task_struct(mt);
3125 wait_for_completion(&req.done);
3126
3127 return; 3085 return;
3128 } 3086 }
3087unlock:
3129 task_rq_unlock(rq, &flags); 3088 task_rq_unlock(rq, &flags);
3130} 3089}
3131 3090
@@ -3597,23 +3556,9 @@ static inline void schedule_debug(struct task_struct *prev)
3597 3556
3598static void put_prev_task(struct rq *rq, struct task_struct *prev) 3557static void put_prev_task(struct rq *rq, struct task_struct *prev)
3599{ 3558{
3600 if (prev->state == TASK_RUNNING) { 3559 if (prev->se.on_rq)
3601 u64 runtime = prev->se.sum_exec_runtime; 3560 update_rq_clock(rq);
3602 3561 rq->skip_clock_update = 0;
3603 runtime -= prev->se.prev_sum_exec_runtime;
3604 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
3605
3606 /*
3607 * In order to avoid avg_overlap growing stale when we are
3608 * indeed overlapping and hence not getting put to sleep, grow
3609 * the avg_overlap on preemption.
3610 *
3611 * We use the average preemption runtime because that
3612 * correlates to the amount of cache footprint a task can
3613 * build up.
3614 */
3615 update_avg(&prev->se.avg_overlap, runtime);
3616 }
3617 prev->sched_class->put_prev_task(rq, prev); 3562 prev->sched_class->put_prev_task(rq, prev);
3618} 3563}
3619 3564
@@ -3676,14 +3621,13 @@ need_resched_nonpreemptible:
3676 hrtick_clear(rq); 3621 hrtick_clear(rq);
3677 3622
3678 raw_spin_lock_irq(&rq->lock); 3623 raw_spin_lock_irq(&rq->lock);
3679 update_rq_clock(rq);
3680 clear_tsk_need_resched(prev); 3624 clear_tsk_need_resched(prev);
3681 3625
3682 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3626 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3683 if (unlikely(signal_pending_state(prev->state, prev))) 3627 if (unlikely(signal_pending_state(prev->state, prev)))
3684 prev->state = TASK_RUNNING; 3628 prev->state = TASK_RUNNING;
3685 else 3629 else
3686 deactivate_task(rq, prev, 1); 3630 deactivate_task(rq, prev, DEQUEUE_SLEEP);
3687 switch_count = &prev->nvcsw; 3631 switch_count = &prev->nvcsw;
3688 } 3632 }
3689 3633
@@ -4006,8 +3950,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
4006 if (!x->done) { 3950 if (!x->done) {
4007 DECLARE_WAITQUEUE(wait, current); 3951 DECLARE_WAITQUEUE(wait, current);
4008 3952
4009 wait.flags |= WQ_FLAG_EXCLUSIVE; 3953 __add_wait_queue_tail_exclusive(&x->wait, &wait);
4010 __add_wait_queue_tail(&x->wait, &wait);
4011 do { 3954 do {
4012 if (signal_pending_state(state, current)) { 3955 if (signal_pending_state(state, current)) {
4013 timeout = -ERESTARTSYS; 3956 timeout = -ERESTARTSYS;
@@ -4233,7 +4176,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4233 BUG_ON(prio < 0 || prio > MAX_PRIO); 4176 BUG_ON(prio < 0 || prio > MAX_PRIO);
4234 4177
4235 rq = task_rq_lock(p, &flags); 4178 rq = task_rq_lock(p, &flags);
4236 update_rq_clock(rq);
4237 4179
4238 oldprio = p->prio; 4180 oldprio = p->prio;
4239 prev_class = p->sched_class; 4181 prev_class = p->sched_class;
@@ -4254,7 +4196,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4254 if (running) 4196 if (running)
4255 p->sched_class->set_curr_task(rq); 4197 p->sched_class->set_curr_task(rq);
4256 if (on_rq) { 4198 if (on_rq) {
4257 enqueue_task(rq, p, 0, oldprio < prio); 4199 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4258 4200
4259 check_class_changed(rq, p, prev_class, oldprio, running); 4201 check_class_changed(rq, p, prev_class, oldprio, running);
4260 } 4202 }
@@ -4276,7 +4218,6 @@ void set_user_nice(struct task_struct *p, long nice)
4276 * the task might be in the middle of scheduling on another CPU. 4218 * the task might be in the middle of scheduling on another CPU.
4277 */ 4219 */
4278 rq = task_rq_lock(p, &flags); 4220 rq = task_rq_lock(p, &flags);
4279 update_rq_clock(rq);
4280 /* 4221 /*
4281 * The RT priorities are set via sched_setscheduler(), but we still 4222 * The RT priorities are set via sched_setscheduler(), but we still
4282 * allow the 'normal' nice value to be set - but as expected 4223 * allow the 'normal' nice value to be set - but as expected
@@ -4298,7 +4239,7 @@ void set_user_nice(struct task_struct *p, long nice)
4298 delta = p->prio - old_prio; 4239 delta = p->prio - old_prio;
4299 4240
4300 if (on_rq) { 4241 if (on_rq) {
4301 enqueue_task(rq, p, 0, false); 4242 enqueue_task(rq, p, 0);
4302 /* 4243 /*
4303 * If the task increased its priority or is running and 4244 * If the task increased its priority or is running and
4304 * lowered its priority, then reschedule its CPU: 4245 * lowered its priority, then reschedule its CPU:
@@ -4559,7 +4500,6 @@ recheck:
4559 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4500 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4560 goto recheck; 4501 goto recheck;
4561 } 4502 }
4562 update_rq_clock(rq);
4563 on_rq = p->se.on_rq; 4503 on_rq = p->se.on_rq;
4564 running = task_current(rq, p); 4504 running = task_current(rq, p);
4565 if (on_rq) 4505 if (on_rq)
@@ -5296,17 +5236,15 @@ static inline void sched_init_granularity(void)
5296/* 5236/*
5297 * This is how migration works: 5237 * This is how migration works:
5298 * 5238 *
5299 * 1) we queue a struct migration_req structure in the source CPU's 5239 * 1) we invoke migration_cpu_stop() on the target CPU using
5300 * runqueue and wake up that CPU's migration thread. 5240 * stop_one_cpu().
5301 * 2) we down() the locked semaphore => thread blocks. 5241 * 2) stopper starts to run (implicitly forcing the migrated thread
5302 * 3) migration thread wakes up (implicitly it forces the migrated 5242 * off the CPU)
5303 * thread off the CPU) 5243 * 3) it checks whether the migrated task is still in the wrong runqueue.
5304 * 4) it gets the migration request and checks whether the migrated 5244 * 4) if it's in the wrong runqueue then the migration thread removes
5305 * task is still in the wrong runqueue.
5306 * 5) if it's in the wrong runqueue then the migration thread removes
5307 * it and puts it into the right queue. 5245 * it and puts it into the right queue.
5308 * 6) migration thread up()s the semaphore. 5246 * 5) stopper completes and stop_one_cpu() returns and the migration
5309 * 7) we wake up and the migration is done. 5247 * is done.
5310 */ 5248 */
5311 5249
5312/* 5250/*
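[Editor's note] The rewritten steps above describe a synchronous contract: the caller hands a callback to stop_one_cpu(), the target CPU's stopper thread runs it (displacing whatever was running there), and stop_one_cpu() only returns once the callback has completed. A compilable toy model of that flow, reusing the patch's names purely as illustration; the real stop_one_cpu() lives in kernel/stop_machine.c and runs the function on the named CPU, not inline:

#include <stdio.h>

struct migration_arg {
	int task_cpu;	/* where the "task" currently is     */
	int dest_cpu;	/* where the caller wants it to be   */
};

static int migration_cpu_stop(void *data)
{
	struct migration_arg *arg = data;

	if (arg->task_cpu != arg->dest_cpu) {	/* step 3: still on the wrong rq? */
		printf("moving task %d -> %d\n", arg->task_cpu, arg->dest_cpu);
		arg->task_cpu = arg->dest_cpu;	/* step 4: put it on the right one */
	}
	return 0;
}

/* Toy stand-in: run the callback to completion and return (steps 2 and 5). */
static int stop_one_cpu(int cpu, int (*fn)(void *), void *arg)
{
	(void)cpu;
	return fn(arg);
}

int main(void)
{
	struct migration_arg arg = { .task_cpu = 0, .dest_cpu = 2 };

	stop_one_cpu(arg.dest_cpu, migration_cpu_stop, &arg);	/* step 1 */
	printf("task now on cpu %d\n", arg.task_cpu);
	return 0;
}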
@@ -5320,12 +5258,23 @@ static inline void sched_init_granularity(void)
5320 */ 5258 */
5321int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 5259int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5322{ 5260{
5323 struct migration_req req;
5324 unsigned long flags; 5261 unsigned long flags;
5325 struct rq *rq; 5262 struct rq *rq;
5263 unsigned int dest_cpu;
5326 int ret = 0; 5264 int ret = 0;
5327 5265
5266 /*
5267 * Serialize against TASK_WAKING so that ttwu() and wunt() can
5268 * drop the rq->lock and still rely on ->cpus_allowed.
5269 */
5270again:
5271 while (task_is_waking(p))
5272 cpu_relax();
5328 rq = task_rq_lock(p, &flags); 5273 rq = task_rq_lock(p, &flags);
5274 if (task_is_waking(p)) {
5275 task_rq_unlock(rq, &flags);
5276 goto again;
5277 }
5329 5278
5330 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5279 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5331 ret = -EINVAL; 5280 ret = -EINVAL;
@@ -5349,15 +5298,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5349 if (cpumask_test_cpu(task_cpu(p), new_mask)) 5298 if (cpumask_test_cpu(task_cpu(p), new_mask))
5350 goto out; 5299 goto out;
5351 5300
5352 if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) { 5301 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5302 if (migrate_task(p, dest_cpu)) {
5303 struct migration_arg arg = { p, dest_cpu };
5353 /* Need help from migration thread: drop lock and wait. */ 5304 /* Need help from migration thread: drop lock and wait. */
5354 struct task_struct *mt = rq->migration_thread;
5355
5356 get_task_struct(mt);
5357 task_rq_unlock(rq, &flags); 5305 task_rq_unlock(rq, &flags);
5358 wake_up_process(mt); 5306 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5359 put_task_struct(mt);
5360 wait_for_completion(&req.done);
5361 tlb_migrate_finish(p->mm); 5307 tlb_migrate_finish(p->mm);
5362 return 0; 5308 return 0;
5363 } 5309 }
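[Editor's note] The "again:" loop above is the price of relying on ->cpus_allowed without the old busy-wait inside task_rq_lock(): spin while the task is marked TASK_WAKING, take the runqueue lock, then re-check and retry if a wakeup raced in between. A small userspace model of that wait-lock-recheck pattern (compile with -pthread; the atomic flag stands in for the TASK_WAKING state):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int waking;			/* stands in for p->state == TASK_WAKING */
static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;

/* Only return with the lock held while no wakeup is in flight. */
static void lock_rq_stable(void)
{
again:
	while (atomic_load(&waking))
		;				/* cpu_relax() stand-in */
	pthread_mutex_lock(&rq_lock);
	if (atomic_load(&waking)) {		/* a wakeup slipped in: retry */
		pthread_mutex_unlock(&rq_lock);
		goto again;
	}
}

int main(void)
{
	lock_rq_stable();
	printf("locked with no wakeup in flight; cpus_allowed is stable\n");
	pthread_mutex_unlock(&rq_lock);
	return 0;
}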
@@ -5415,98 +5361,49 @@ fail:
5415 return ret; 5361 return ret;
5416} 5362}
5417 5363
5418#define RCU_MIGRATION_IDLE 0
5419#define RCU_MIGRATION_NEED_QS 1
5420#define RCU_MIGRATION_GOT_QS 2
5421#define RCU_MIGRATION_MUST_SYNC 3
5422
5423/* 5364/*
5424 * migration_thread - this is a highprio system thread that performs 5365 * migration_cpu_stop - this will be executed by a highprio stopper thread
5425 * thread migration by bumping thread off CPU then 'pushing' onto 5366 * and performs thread migration by bumping thread off CPU then
5426 * another runqueue. 5367 * 'pushing' onto another runqueue.
5427 */ 5368 */
5428static int migration_thread(void *data) 5369static int migration_cpu_stop(void *data)
5429{ 5370{
5430 int badcpu; 5371 struct migration_arg *arg = data;
5431 int cpu = (long)data;
5432 struct rq *rq;
5433
5434 rq = cpu_rq(cpu);
5435 BUG_ON(rq->migration_thread != current);
5436
5437 set_current_state(TASK_INTERRUPTIBLE);
5438 while (!kthread_should_stop()) {
5439 struct migration_req *req;
5440 struct list_head *head;
5441
5442 raw_spin_lock_irq(&rq->lock);
5443
5444 if (cpu_is_offline(cpu)) {
5445 raw_spin_unlock_irq(&rq->lock);
5446 break;
5447 }
5448
5449 if (rq->active_balance) {
5450 active_load_balance(rq, cpu);
5451 rq->active_balance = 0;
5452 }
5453
5454 head = &rq->migration_queue;
5455
5456 if (list_empty(head)) {
5457 raw_spin_unlock_irq(&rq->lock);
5458 schedule();
5459 set_current_state(TASK_INTERRUPTIBLE);
5460 continue;
5461 }
5462 req = list_entry(head->next, struct migration_req, list);
5463 list_del_init(head->next);
5464
5465 if (req->task != NULL) {
5466 raw_spin_unlock(&rq->lock);
5467 __migrate_task(req->task, cpu, req->dest_cpu);
5468 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
5469 req->dest_cpu = RCU_MIGRATION_GOT_QS;
5470 raw_spin_unlock(&rq->lock);
5471 } else {
5472 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
5473 raw_spin_unlock(&rq->lock);
5474 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
5475 }
5476 local_irq_enable();
5477
5478 complete(&req->done);
5479 }
5480 __set_current_state(TASK_RUNNING);
5481
5482 return 0;
5483}
5484
5485#ifdef CONFIG_HOTPLUG_CPU
5486
5487static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
5488{
5489 int ret;
5490 5372
5373 /*
5374 * The original target cpu might have gone down and we might
5375 * be on another cpu but it doesn't matter.
5376 */
5491 local_irq_disable(); 5377 local_irq_disable();
5492 ret = __migrate_task(p, src_cpu, dest_cpu); 5378 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
5493 local_irq_enable(); 5379 local_irq_enable();
5494 return ret; 5380 return 0;
5495} 5381}
5496 5382
5383#ifdef CONFIG_HOTPLUG_CPU
5497/* 5384/*
5498 * Figure out where task on dead CPU should go, use force if necessary. 5385 * Figure out where task on dead CPU should go, use force if necessary.
5499 */ 5386 */
5500static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5387void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5501{ 5388{
5502 int dest_cpu; 5389 struct rq *rq = cpu_rq(dead_cpu);
5390 int needs_cpu, uninitialized_var(dest_cpu);
5391 unsigned long flags;
5503 5392
5504again: 5393 local_irq_save(flags);
5505 dest_cpu = select_fallback_rq(dead_cpu, p);
5506 5394
5507 /* It can have affinity changed while we were choosing. */ 5395 raw_spin_lock(&rq->lock);
5508 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) 5396 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
5509 goto again; 5397 if (needs_cpu)
5398 dest_cpu = select_fallback_rq(dead_cpu, p);
5399 raw_spin_unlock(&rq->lock);
5400 /*
5401 * It can only fail if we race with set_cpus_allowed(),
5402 * in the racer should migrate the task anyway.
5403 */
5404 if (needs_cpu)
5405 __migrate_task(p, dead_cpu, dest_cpu);
5406 local_irq_restore(flags);
5510} 5407}
5511 5408
5512/* 5409/*
@@ -5570,7 +5467,6 @@ void sched_idle_next(void)
5570 5467
5571 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 5468 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5572 5469
5573 update_rq_clock(rq);
5574 activate_task(rq, p, 0); 5470 activate_task(rq, p, 0);
5575 5471
5576 raw_spin_unlock_irqrestore(&rq->lock, flags); 5472 raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -5625,7 +5521,6 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
5625 for ( ; ; ) { 5521 for ( ; ; ) {
5626 if (!rq->nr_running) 5522 if (!rq->nr_running)
5627 break; 5523 break;
5628 update_rq_clock(rq);
5629 next = pick_next_task(rq); 5524 next = pick_next_task(rq);
5630 if (!next) 5525 if (!next)
5631 break; 5526 break;
@@ -5848,35 +5743,20 @@ static void set_rq_offline(struct rq *rq)
5848static int __cpuinit 5743static int __cpuinit
5849migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 5744migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5850{ 5745{
5851 struct task_struct *p;
5852 int cpu = (long)hcpu; 5746 int cpu = (long)hcpu;
5853 unsigned long flags; 5747 unsigned long flags;
5854 struct rq *rq; 5748 struct rq *rq = cpu_rq(cpu);
5855 5749
5856 switch (action) { 5750 switch (action) {
5857 5751
5858 case CPU_UP_PREPARE: 5752 case CPU_UP_PREPARE:
5859 case CPU_UP_PREPARE_FROZEN: 5753 case CPU_UP_PREPARE_FROZEN:
5860 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5861 if (IS_ERR(p))
5862 return NOTIFY_BAD;
5863 kthread_bind(p, cpu);
5864 /* Must be high prio: stop_machine expects to yield to it. */
5865 rq = task_rq_lock(p, &flags);
5866 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5867 task_rq_unlock(rq, &flags);
5868 get_task_struct(p);
5869 cpu_rq(cpu)->migration_thread = p;
5870 rq->calc_load_update = calc_load_update; 5754 rq->calc_load_update = calc_load_update;
5871 break; 5755 break;
5872 5756
5873 case CPU_ONLINE: 5757 case CPU_ONLINE:
5874 case CPU_ONLINE_FROZEN: 5758 case CPU_ONLINE_FROZEN:
5875 /* Strictly unnecessary, as first user will wake it. */
5876 wake_up_process(cpu_rq(cpu)->migration_thread);
5877
5878 /* Update our root-domain */ 5759 /* Update our root-domain */
5879 rq = cpu_rq(cpu);
5880 raw_spin_lock_irqsave(&rq->lock, flags); 5760 raw_spin_lock_irqsave(&rq->lock, flags);
5881 if (rq->rd) { 5761 if (rq->rd) {
5882 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5762 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -5887,61 +5767,24 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5887 break; 5767 break;
5888 5768
5889#ifdef CONFIG_HOTPLUG_CPU 5769#ifdef CONFIG_HOTPLUG_CPU
5890 case CPU_UP_CANCELED:
5891 case CPU_UP_CANCELED_FROZEN:
5892 if (!cpu_rq(cpu)->migration_thread)
5893 break;
5894 /* Unbind it from offline cpu so it can run. Fall thru. */
5895 kthread_bind(cpu_rq(cpu)->migration_thread,
5896 cpumask_any(cpu_online_mask));
5897 kthread_stop(cpu_rq(cpu)->migration_thread);
5898 put_task_struct(cpu_rq(cpu)->migration_thread);
5899 cpu_rq(cpu)->migration_thread = NULL;
5900 break;
5901
5902 case CPU_DEAD: 5770 case CPU_DEAD:
5903 case CPU_DEAD_FROZEN: 5771 case CPU_DEAD_FROZEN:
5904 cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
5905 migrate_live_tasks(cpu); 5772 migrate_live_tasks(cpu);
5906 rq = cpu_rq(cpu);
5907 kthread_stop(rq->migration_thread);
5908 put_task_struct(rq->migration_thread);
5909 rq->migration_thread = NULL;
5910 /* Idle task back to normal (off runqueue, low prio) */ 5773 /* Idle task back to normal (off runqueue, low prio) */
5911 raw_spin_lock_irq(&rq->lock); 5774 raw_spin_lock_irq(&rq->lock);
5912 update_rq_clock(rq);
5913 deactivate_task(rq, rq->idle, 0); 5775 deactivate_task(rq, rq->idle, 0);
5914 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 5776 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5915 rq->idle->sched_class = &idle_sched_class; 5777 rq->idle->sched_class = &idle_sched_class;
5916 migrate_dead_tasks(cpu); 5778 migrate_dead_tasks(cpu);
5917 raw_spin_unlock_irq(&rq->lock); 5779 raw_spin_unlock_irq(&rq->lock);
5918 cpuset_unlock();
5919 migrate_nr_uninterruptible(rq); 5780 migrate_nr_uninterruptible(rq);
5920 BUG_ON(rq->nr_running != 0); 5781 BUG_ON(rq->nr_running != 0);
5921 calc_global_load_remove(rq); 5782 calc_global_load_remove(rq);
5922 /*
5923 * No need to migrate the tasks: it was best-effort if
5924 * they didn't take sched_hotcpu_mutex. Just wake up
5925 * the requestors.
5926 */
5927 raw_spin_lock_irq(&rq->lock);
5928 while (!list_empty(&rq->migration_queue)) {
5929 struct migration_req *req;
5930
5931 req = list_entry(rq->migration_queue.next,
5932 struct migration_req, list);
5933 list_del_init(&req->list);
5934 raw_spin_unlock_irq(&rq->lock);
5935 complete(&req->done);
5936 raw_spin_lock_irq(&rq->lock);
5937 }
5938 raw_spin_unlock_irq(&rq->lock);
5939 break; 5783 break;
5940 5784
5941 case CPU_DYING: 5785 case CPU_DYING:
5942 case CPU_DYING_FROZEN: 5786 case CPU_DYING_FROZEN:
5943 /* Update our root-domain */ 5787 /* Update our root-domain */
5944 rq = cpu_rq(cpu);
5945 raw_spin_lock_irqsave(&rq->lock, flags); 5788 raw_spin_lock_irqsave(&rq->lock, flags);
5946 if (rq->rd) { 5789 if (rq->rd) {
5947 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5790 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -6272,6 +6115,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6272 struct rq *rq = cpu_rq(cpu); 6115 struct rq *rq = cpu_rq(cpu);
6273 struct sched_domain *tmp; 6116 struct sched_domain *tmp;
6274 6117
6118 for (tmp = sd; tmp; tmp = tmp->parent)
6119 tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
6120
6275 /* Remove the sched domains which do not contribute to scheduling. */ 6121 /* Remove the sched domains which do not contribute to scheduling. */
6276 for (tmp = sd; tmp; ) { 6122 for (tmp = sd; tmp; ) {
6277 struct sched_domain *parent = tmp->parent; 6123 struct sched_domain *parent = tmp->parent;
@@ -7755,10 +7601,8 @@ void __init sched_init(void)
7755 rq->push_cpu = 0; 7601 rq->push_cpu = 0;
7756 rq->cpu = i; 7602 rq->cpu = i;
7757 rq->online = 0; 7603 rq->online = 0;
7758 rq->migration_thread = NULL;
7759 rq->idle_stamp = 0; 7604 rq->idle_stamp = 0;
7760 rq->avg_idle = 2*sysctl_sched_migration_cost; 7605 rq->avg_idle = 2*sysctl_sched_migration_cost;
7761 INIT_LIST_HEAD(&rq->migration_queue);
7762 rq_attach_root(rq, &def_root_domain); 7606 rq_attach_root(rq, &def_root_domain);
7763#endif 7607#endif
7764 init_rq_hrtick(rq); 7608 init_rq_hrtick(rq);
@@ -7859,7 +7703,6 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
7859{ 7703{
7860 int on_rq; 7704 int on_rq;
7861 7705
7862 update_rq_clock(rq);
7863 on_rq = p->se.on_rq; 7706 on_rq = p->se.on_rq;
7864 if (on_rq) 7707 if (on_rq)
7865 deactivate_task(rq, p, 0); 7708 deactivate_task(rq, p, 0);
@@ -7886,9 +7729,9 @@ void normalize_rt_tasks(void)
7886 7729
7887 p->se.exec_start = 0; 7730 p->se.exec_start = 0;
7888#ifdef CONFIG_SCHEDSTATS 7731#ifdef CONFIG_SCHEDSTATS
7889 p->se.wait_start = 0; 7732 p->se.statistics.wait_start = 0;
7890 p->se.sleep_start = 0; 7733 p->se.statistics.sleep_start = 0;
7891 p->se.block_start = 0; 7734 p->se.statistics.block_start = 0;
7892#endif 7735#endif
7893 7736
7894 if (!rt_task(p)) { 7737 if (!rt_task(p)) {
@@ -8221,8 +8064,6 @@ void sched_move_task(struct task_struct *tsk)
8221 8064
8222 rq = task_rq_lock(tsk, &flags); 8065 rq = task_rq_lock(tsk, &flags);
8223 8066
8224 update_rq_clock(rq);
8225
8226 running = task_current(rq, tsk); 8067 running = task_current(rq, tsk);
8227 on_rq = tsk->se.on_rq; 8068 on_rq = tsk->se.on_rq;
8228 8069
@@ -8241,7 +8082,7 @@ void sched_move_task(struct task_struct *tsk)
8241 if (unlikely(running)) 8082 if (unlikely(running))
8242 tsk->sched_class->set_curr_task(rq); 8083 tsk->sched_class->set_curr_task(rq);
8243 if (on_rq) 8084 if (on_rq)
8244 enqueue_task(rq, tsk, 0, false); 8085 enqueue_task(rq, tsk, 0);
8245 8086
8246 task_rq_unlock(rq, &flags); 8087 task_rq_unlock(rq, &flags);
8247} 8088}
@@ -9055,43 +8896,32 @@ struct cgroup_subsys cpuacct_subsys = {
9055 8896
9056#ifndef CONFIG_SMP 8897#ifndef CONFIG_SMP
9057 8898
9058int rcu_expedited_torture_stats(char *page)
9059{
9060 return 0;
9061}
9062EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
9063
9064void synchronize_sched_expedited(void) 8899void synchronize_sched_expedited(void)
9065{ 8900{
8901 barrier();
9066} 8902}
9067EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 8903EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9068 8904
9069#else /* #ifndef CONFIG_SMP */ 8905#else /* #ifndef CONFIG_SMP */
9070 8906
9071static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); 8907static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
9072static DEFINE_MUTEX(rcu_sched_expedited_mutex);
9073
9074#define RCU_EXPEDITED_STATE_POST -2
9075#define RCU_EXPEDITED_STATE_IDLE -1
9076
9077static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
9078 8908
9079int rcu_expedited_torture_stats(char *page) 8909static int synchronize_sched_expedited_cpu_stop(void *data)
9080{ 8910{
9081 int cnt = 0; 8911 /*
9082 int cpu; 8912 * There must be a full memory barrier on each affected CPU
9083 8913 * between the time that try_stop_cpus() is called and the
9084 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); 8914 * time that it returns.
9085 for_each_online_cpu(cpu) { 8915 *
9086 cnt += sprintf(&page[cnt], " %d:%d", 8916 * In the current initial implementation of cpu_stop, the
9087 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); 8917 * above condition is already met when the control reaches
9088 } 8918 * this point and the following smp_mb() is not strictly
9089 cnt += sprintf(&page[cnt], "\n"); 8919 * necessary. Do smp_mb() anyway for documentation and
9090 return cnt; 8920 * robustness against future implementation changes.
8921 */
8922 smp_mb(); /* See above comment block. */
8923 return 0;
9091} 8924}
9092EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
9093
9094static long synchronize_sched_expedited_count;
9095 8925
9096/* 8926/*
9097 * Wait for an rcu-sched grace period to elapse, but use "big hammer" 8927 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
@@ -9105,18 +8935,14 @@ static long synchronize_sched_expedited_count;
9105 */ 8935 */
9106void synchronize_sched_expedited(void) 8936void synchronize_sched_expedited(void)
9107{ 8937{
9108 int cpu; 8938 int snap, trycount = 0;
9109 unsigned long flags;
9110 bool need_full_sync = 0;
9111 struct rq *rq;
9112 struct migration_req *req;
9113 long snap;
9114 int trycount = 0;
9115 8939
9116 smp_mb(); /* ensure prior mod happens before capturing snap. */ 8940 smp_mb(); /* ensure prior mod happens before capturing snap. */
9117 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; 8941 snap = atomic_read(&synchronize_sched_expedited_count) + 1;
9118 get_online_cpus(); 8942 get_online_cpus();
9119 while (!mutex_trylock(&rcu_sched_expedited_mutex)) { 8943 while (try_stop_cpus(cpu_online_mask,
8944 synchronize_sched_expedited_cpu_stop,
8945 NULL) == -EAGAIN) {
9120 put_online_cpus(); 8946 put_online_cpus();
9121 if (trycount++ < 10) 8947 if (trycount++ < 10)
9122 udelay(trycount * num_online_cpus()); 8948 udelay(trycount * num_online_cpus());
@@ -9124,41 +8950,15 @@ void synchronize_sched_expedited(void)
9124 synchronize_sched(); 8950 synchronize_sched();
9125 return; 8951 return;
9126 } 8952 }
9127 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { 8953 if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
9128 smp_mb(); /* ensure test happens before caller kfree */ 8954 smp_mb(); /* ensure test happens before caller kfree */
9129 return; 8955 return;
9130 } 8956 }
9131 get_online_cpus(); 8957 get_online_cpus();
9132 } 8958 }
9133 rcu_expedited_state = RCU_EXPEDITED_STATE_POST; 8959 atomic_inc(&synchronize_sched_expedited_count);
9134 for_each_online_cpu(cpu) { 8960 smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
9135 rq = cpu_rq(cpu);
9136 req = &per_cpu(rcu_migration_req, cpu);
9137 init_completion(&req->done);
9138 req->task = NULL;
9139 req->dest_cpu = RCU_MIGRATION_NEED_QS;
9140 raw_spin_lock_irqsave(&rq->lock, flags);
9141 list_add(&req->list, &rq->migration_queue);
9142 raw_spin_unlock_irqrestore(&rq->lock, flags);
9143 wake_up_process(rq->migration_thread);
9144 }
9145 for_each_online_cpu(cpu) {
9146 rcu_expedited_state = cpu;
9147 req = &per_cpu(rcu_migration_req, cpu);
9148 rq = cpu_rq(cpu);
9149 wait_for_completion(&req->done);
9150 raw_spin_lock_irqsave(&rq->lock, flags);
9151 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
9152 need_full_sync = 1;
9153 req->dest_cpu = RCU_MIGRATION_IDLE;
9154 raw_spin_unlock_irqrestore(&rq->lock, flags);
9155 }
9156 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
9157 synchronize_sched_expedited_count++;
9158 mutex_unlock(&rcu_sched_expedited_mutex);
9159 put_online_cpus(); 8961 put_online_cpus();
9160 if (need_full_sync)
9161 synchronize_sched();
9162} 8962}
9163EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 8963EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9164 8964
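
Read in one piece, the reworked expedited grace period is a retry loop around try_stop_cpus() guarded by an atomic generation counter. A consolidated sketch of the new-side code from the hunks above (the else branch around the synchronize_sched() fallback sits in elided context between the hunks, so its exact placement here is inferred):

void synchronize_sched_expedited(void)
{
	int snap, trycount = 0;

	smp_mb();	/* ensure prior mod happens before capturing snap. */
	snap = atomic_read(&synchronize_sched_expedited_count) + 1;
	get_online_cpus();
	while (try_stop_cpus(cpu_online_mask,
			     synchronize_sched_expedited_cpu_stop,
			     NULL) == -EAGAIN) {
		put_online_cpus();
		if (trycount++ < 10)
			udelay(trycount * num_online_cpus());
		else {
			synchronize_sched();	/* give up, use a normal GP */
			return;
		}
		if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
			smp_mb();	/* ensure test happens before caller kfree */
			return;		/* someone else already did our work */
		}
		get_online_cpus();
	}
	atomic_inc(&synchronize_sched_expedited_count);
	smp_mb__after_atomic_inc();	/* ensure post-GP actions seen after GP. */
	put_online_cpus();
}
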
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 9b49db144037..9cf1baf6616a 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -70,16 +70,16 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu,
70 PN(se->vruntime); 70 PN(se->vruntime);
71 PN(se->sum_exec_runtime); 71 PN(se->sum_exec_runtime);
72#ifdef CONFIG_SCHEDSTATS 72#ifdef CONFIG_SCHEDSTATS
73 PN(se->wait_start); 73 PN(se->statistics.wait_start);
74 PN(se->sleep_start); 74 PN(se->statistics.sleep_start);
75 PN(se->block_start); 75 PN(se->statistics.block_start);
76 PN(se->sleep_max); 76 PN(se->statistics.sleep_max);
77 PN(se->block_max); 77 PN(se->statistics.block_max);
78 PN(se->exec_max); 78 PN(se->statistics.exec_max);
79 PN(se->slice_max); 79 PN(se->statistics.slice_max);
80 PN(se->wait_max); 80 PN(se->statistics.wait_max);
81 PN(se->wait_sum); 81 PN(se->statistics.wait_sum);
82 P(se->wait_count); 82 P(se->statistics.wait_count);
83#endif 83#endif
84 P(se->load.weight); 84 P(se->load.weight);
85#undef PN 85#undef PN
@@ -104,7 +104,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
104 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", 104 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
105 SPLIT_NS(p->se.vruntime), 105 SPLIT_NS(p->se.vruntime),
106 SPLIT_NS(p->se.sum_exec_runtime), 106 SPLIT_NS(p->se.sum_exec_runtime),
107 SPLIT_NS(p->se.sum_sleep_runtime)); 107 SPLIT_NS(p->se.statistics.sum_sleep_runtime));
108#else 108#else
109 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 109 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
110 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 110 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
@@ -173,11 +173,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
173 task_group_path(tg, path, sizeof(path)); 173 task_group_path(tg, path, sizeof(path));
174 174
175 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); 175 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
176#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
177 {
178 uid_t uid = cfs_rq->tg->uid;
179 SEQ_printf(m, "\ncfs_rq[%d] for UID: %u\n", cpu, uid);
180 }
181#else 176#else
182 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); 177 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
183#endif 178#endif
@@ -407,40 +402,38 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
407 PN(se.exec_start); 402 PN(se.exec_start);
408 PN(se.vruntime); 403 PN(se.vruntime);
409 PN(se.sum_exec_runtime); 404 PN(se.sum_exec_runtime);
410 PN(se.avg_overlap);
411 PN(se.avg_wakeup);
412 405
413 nr_switches = p->nvcsw + p->nivcsw; 406 nr_switches = p->nvcsw + p->nivcsw;
414 407
415#ifdef CONFIG_SCHEDSTATS 408#ifdef CONFIG_SCHEDSTATS
416 PN(se.wait_start); 409 PN(se.statistics.wait_start);
417 PN(se.sleep_start); 410 PN(se.statistics.sleep_start);
418 PN(se.block_start); 411 PN(se.statistics.block_start);
419 PN(se.sleep_max); 412 PN(se.statistics.sleep_max);
420 PN(se.block_max); 413 PN(se.statistics.block_max);
421 PN(se.exec_max); 414 PN(se.statistics.exec_max);
422 PN(se.slice_max); 415 PN(se.statistics.slice_max);
423 PN(se.wait_max); 416 PN(se.statistics.wait_max);
424 PN(se.wait_sum); 417 PN(se.statistics.wait_sum);
425 P(se.wait_count); 418 P(se.statistics.wait_count);
426 PN(se.iowait_sum); 419 PN(se.statistics.iowait_sum);
427 P(se.iowait_count); 420 P(se.statistics.iowait_count);
428 P(sched_info.bkl_count); 421 P(sched_info.bkl_count);
429 P(se.nr_migrations); 422 P(se.nr_migrations);
430 P(se.nr_migrations_cold); 423 P(se.statistics.nr_migrations_cold);
431 P(se.nr_failed_migrations_affine); 424 P(se.statistics.nr_failed_migrations_affine);
432 P(se.nr_failed_migrations_running); 425 P(se.statistics.nr_failed_migrations_running);
433 P(se.nr_failed_migrations_hot); 426 P(se.statistics.nr_failed_migrations_hot);
434 P(se.nr_forced_migrations); 427 P(se.statistics.nr_forced_migrations);
435 P(se.nr_wakeups); 428 P(se.statistics.nr_wakeups);
436 P(se.nr_wakeups_sync); 429 P(se.statistics.nr_wakeups_sync);
437 P(se.nr_wakeups_migrate); 430 P(se.statistics.nr_wakeups_migrate);
438 P(se.nr_wakeups_local); 431 P(se.statistics.nr_wakeups_local);
439 P(se.nr_wakeups_remote); 432 P(se.statistics.nr_wakeups_remote);
440 P(se.nr_wakeups_affine); 433 P(se.statistics.nr_wakeups_affine);
441 P(se.nr_wakeups_affine_attempts); 434 P(se.statistics.nr_wakeups_affine_attempts);
442 P(se.nr_wakeups_passive); 435 P(se.statistics.nr_wakeups_passive);
443 P(se.nr_wakeups_idle); 436 P(se.statistics.nr_wakeups_idle);
444 437
445 { 438 {
446 u64 avg_atom, avg_per_cpu; 439 u64 avg_atom, avg_per_cpu;
@@ -491,31 +484,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
491void proc_sched_set_task(struct task_struct *p) 484void proc_sched_set_task(struct task_struct *p)
492{ 485{
493#ifdef CONFIG_SCHEDSTATS 486#ifdef CONFIG_SCHEDSTATS
494 p->se.wait_max = 0; 487 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
495 p->se.wait_sum = 0;
496 p->se.wait_count = 0;
497 p->se.iowait_sum = 0;
498 p->se.iowait_count = 0;
499 p->se.sleep_max = 0;
500 p->se.sum_sleep_runtime = 0;
501 p->se.block_max = 0;
502 p->se.exec_max = 0;
503 p->se.slice_max = 0;
504 p->se.nr_migrations = 0;
505 p->se.nr_migrations_cold = 0;
506 p->se.nr_failed_migrations_affine = 0;
507 p->se.nr_failed_migrations_running = 0;
508 p->se.nr_failed_migrations_hot = 0;
509 p->se.nr_forced_migrations = 0;
510 p->se.nr_wakeups = 0;
511 p->se.nr_wakeups_sync = 0;
512 p->se.nr_wakeups_migrate = 0;
513 p->se.nr_wakeups_local = 0;
514 p->se.nr_wakeups_remote = 0;
515 p->se.nr_wakeups_affine = 0;
516 p->se.nr_wakeups_affine_attempts = 0;
517 p->se.nr_wakeups_passive = 0;
518 p->se.nr_wakeups_idle = 0;
519 p->sched_info.bkl_count = 0;
520#endif 488#endif
521} 489}
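
The per-field resets above collapse into a single memset() because the schedstat counters now live in one embedded structure. A sketch of the layout this implies (the field list is abridged and the actual definition lives in include/linux/sched.h, outside these hunks; the reset helper is an illustrative wrapper):

struct sched_statistics {
	u64		wait_start, wait_max, wait_sum, wait_count;
	u64		sleep_start, sleep_max, sum_sleep_runtime;
	u64		block_start, block_max;
	u64		exec_max, slice_max;
	u64		iowait_sum, iowait_count;
	/* ... nr_wakeups_*, nr_failed_migrations_*, nr_forced_migrations ... */
};

struct sched_entity {
	/* ... */
#ifdef CONFIG_SCHEDSTATS
	struct sched_statistics	statistics;
#endif
	/* ... */
};

static inline void reset_schedstats(struct task_struct *p)
{
	/* One memset now clears every counter at once. */
	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
}
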
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5a5ea2cd924f..217e4a9393e4 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -35,8 +35,8 @@
35 * (to see the precise effective timeslice length of your workload, 35 * (to see the precise effective timeslice length of your workload,
36 * run vmstat and monitor the context-switches (cs) field) 36 * run vmstat and monitor the context-switches (cs) field)
37 */ 37 */
38unsigned int sysctl_sched_latency = 5000000ULL; 38unsigned int sysctl_sched_latency = 6000000ULL;
39unsigned int normalized_sysctl_sched_latency = 5000000ULL; 39unsigned int normalized_sysctl_sched_latency = 6000000ULL;
40 40
41/* 41/*
42 * The initial- and re-scaling of tunables is configurable 42 * The initial- and re-scaling of tunables is configurable
@@ -52,15 +52,15 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
52 52
53/* 53/*
54 * Minimal preemption granularity for CPU-bound tasks: 54 * Minimal preemption granularity for CPU-bound tasks:
55 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) 55 * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
56 */ 56 */
57unsigned int sysctl_sched_min_granularity = 1000000ULL; 57unsigned int sysctl_sched_min_granularity = 2000000ULL;
58unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL; 58unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL;
59 59
60/* 60/*
61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity 61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
62 */ 62 */
63static unsigned int sched_nr_latency = 5; 63static unsigned int sched_nr_latency = 3;
64 64
65/* 65/*
66 * After fork, child runs first. If set to 0 (default) then 66 * After fork, child runs first. If set to 0 (default) then
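
As the comment above notes, sched_nr_latency is kept at sysctl_sched_latency / sysctl_sched_min_granularity, which is why the defaults move together: 6 ms / 2 ms gives the new initializer of 3 (the old defaults gave 5 ms / 1 ms = 5). A small sanity check of that relationship (the macro names below are illustrative, not the kernel's):

/* New defaults, in nanoseconds. */
#define SCHED_LATENCY_NS	6000000U	/* was 5000000U */
#define SCHED_MIN_GRAN_NS	2000000U	/* was 1000000U */

/* sched_nr_latency tracks latency / min_granularity: 6 ms / 2 ms == 3. */
static const unsigned int sched_nr_latency_check =
	SCHED_LATENCY_NS / SCHED_MIN_GRAN_NS;
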
@@ -505,7 +505,8 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
505{ 505{
506 unsigned long delta_exec_weighted; 506 unsigned long delta_exec_weighted;
507 507
508 schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); 508 schedstat_set(curr->statistics.exec_max,
509 max((u64)delta_exec, curr->statistics.exec_max));
509 510
510 curr->sum_exec_runtime += delta_exec; 511 curr->sum_exec_runtime += delta_exec;
511 schedstat_add(cfs_rq, exec_clock, delta_exec); 512 schedstat_add(cfs_rq, exec_clock, delta_exec);
@@ -548,7 +549,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
548static inline void 549static inline void
549update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) 550update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
550{ 551{
551 schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); 552 schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
552} 553}
553 554
554/* 555/*
@@ -567,18 +568,18 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
567static void 568static void
568update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) 569update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
569{ 570{
570 schedstat_set(se->wait_max, max(se->wait_max, 571 schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
571 rq_of(cfs_rq)->clock - se->wait_start)); 572 rq_of(cfs_rq)->clock - se->statistics.wait_start));
572 schedstat_set(se->wait_count, se->wait_count + 1); 573 schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
573 schedstat_set(se->wait_sum, se->wait_sum + 574 schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
574 rq_of(cfs_rq)->clock - se->wait_start); 575 rq_of(cfs_rq)->clock - se->statistics.wait_start);
575#ifdef CONFIG_SCHEDSTATS 576#ifdef CONFIG_SCHEDSTATS
576 if (entity_is_task(se)) { 577 if (entity_is_task(se)) {
577 trace_sched_stat_wait(task_of(se), 578 trace_sched_stat_wait(task_of(se),
578 rq_of(cfs_rq)->clock - se->wait_start); 579 rq_of(cfs_rq)->clock - se->statistics.wait_start);
579 } 580 }
580#endif 581#endif
581 schedstat_set(se->wait_start, 0); 582 schedstat_set(se->statistics.wait_start, 0);
582} 583}
583 584
584static inline void 585static inline void
@@ -657,39 +658,39 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
657 if (entity_is_task(se)) 658 if (entity_is_task(se))
658 tsk = task_of(se); 659 tsk = task_of(se);
659 660
660 if (se->sleep_start) { 661 if (se->statistics.sleep_start) {
661 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; 662 u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
662 663
663 if ((s64)delta < 0) 664 if ((s64)delta < 0)
664 delta = 0; 665 delta = 0;
665 666
666 if (unlikely(delta > se->sleep_max)) 667 if (unlikely(delta > se->statistics.sleep_max))
667 se->sleep_max = delta; 668 se->statistics.sleep_max = delta;
668 669
669 se->sleep_start = 0; 670 se->statistics.sleep_start = 0;
670 se->sum_sleep_runtime += delta; 671 se->statistics.sum_sleep_runtime += delta;
671 672
672 if (tsk) { 673 if (tsk) {
673 account_scheduler_latency(tsk, delta >> 10, 1); 674 account_scheduler_latency(tsk, delta >> 10, 1);
674 trace_sched_stat_sleep(tsk, delta); 675 trace_sched_stat_sleep(tsk, delta);
675 } 676 }
676 } 677 }
677 if (se->block_start) { 678 if (se->statistics.block_start) {
678 u64 delta = rq_of(cfs_rq)->clock - se->block_start; 679 u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
679 680
680 if ((s64)delta < 0) 681 if ((s64)delta < 0)
681 delta = 0; 682 delta = 0;
682 683
683 if (unlikely(delta > se->block_max)) 684 if (unlikely(delta > se->statistics.block_max))
684 se->block_max = delta; 685 se->statistics.block_max = delta;
685 686
686 se->block_start = 0; 687 se->statistics.block_start = 0;
687 se->sum_sleep_runtime += delta; 688 se->statistics.sum_sleep_runtime += delta;
688 689
689 if (tsk) { 690 if (tsk) {
690 if (tsk->in_iowait) { 691 if (tsk->in_iowait) {
691 se->iowait_sum += delta; 692 se->statistics.iowait_sum += delta;
692 se->iowait_count++; 693 se->statistics.iowait_count++;
693 trace_sched_stat_iowait(tsk, delta); 694 trace_sched_stat_iowait(tsk, delta);
694 } 695 }
695 696
@@ -737,20 +738,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
737 vruntime += sched_vslice(cfs_rq, se); 738 vruntime += sched_vslice(cfs_rq, se);
738 739
739 /* sleeps up to a single latency don't count. */ 740 /* sleeps up to a single latency don't count. */
740 if (!initial && sched_feat(FAIR_SLEEPERS)) { 741 if (!initial) {
741 unsigned long thresh = sysctl_sched_latency; 742 unsigned long thresh = sysctl_sched_latency;
742 743
743 /* 744 /*
744 * Convert the sleeper threshold into virtual time.
745 * SCHED_IDLE is a special sub-class. We care about
746 * fairness only relative to other SCHED_IDLE tasks,
747 * all of which have the same weight.
748 */
749 if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) ||
750 task_of(se)->policy != SCHED_IDLE))
751 thresh = calc_delta_fair(thresh, se);
752
753 /*
754 * Halve their sleep time's effect, to allow 745 * Halve their sleep time's effect, to allow
755 * for a gentler effect of sleepers: 746 * for a gentler effect of sleepers:
756 */ 747 */
@@ -766,9 +757,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
766 se->vruntime = vruntime; 757 se->vruntime = vruntime;
767} 758}
768 759
769#define ENQUEUE_WAKEUP 1
770#define ENQUEUE_MIGRATE 2
771
772static void 760static void
773enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 761enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
774{ 762{
@@ -776,7 +764,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
776 * Update the normalized vruntime before updating min_vruntime 764 * Update the normalized vruntime before updating min_vruntime
 777	 * through calling update_curr().			 765	 * through calling update_curr().
778 */ 766 */
779 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE)) 767 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
780 se->vruntime += cfs_rq->min_vruntime; 768 se->vruntime += cfs_rq->min_vruntime;
781 769
782 /* 770 /*
@@ -812,7 +800,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
812} 800}
813 801
814static void 802static void
815dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) 803dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
816{ 804{
817 /* 805 /*
818 * Update run-time statistics of the 'current'. 806 * Update run-time statistics of the 'current'.
@@ -820,15 +808,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
820 update_curr(cfs_rq); 808 update_curr(cfs_rq);
821 809
822 update_stats_dequeue(cfs_rq, se); 810 update_stats_dequeue(cfs_rq, se);
823 if (sleep) { 811 if (flags & DEQUEUE_SLEEP) {
824#ifdef CONFIG_SCHEDSTATS 812#ifdef CONFIG_SCHEDSTATS
825 if (entity_is_task(se)) { 813 if (entity_is_task(se)) {
826 struct task_struct *tsk = task_of(se); 814 struct task_struct *tsk = task_of(se);
827 815
828 if (tsk->state & TASK_INTERRUPTIBLE) 816 if (tsk->state & TASK_INTERRUPTIBLE)
829 se->sleep_start = rq_of(cfs_rq)->clock; 817 se->statistics.sleep_start = rq_of(cfs_rq)->clock;
830 if (tsk->state & TASK_UNINTERRUPTIBLE) 818 if (tsk->state & TASK_UNINTERRUPTIBLE)
831 se->block_start = rq_of(cfs_rq)->clock; 819 se->statistics.block_start = rq_of(cfs_rq)->clock;
832 } 820 }
833#endif 821#endif
834 } 822 }
@@ -845,7 +833,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
845 * update can refer to the ->curr item and we need to reflect this 833 * update can refer to the ->curr item and we need to reflect this
846 * movement in our normalized position. 834 * movement in our normalized position.
847 */ 835 */
848 if (!sleep) 836 if (!(flags & DEQUEUE_SLEEP))
849 se->vruntime -= cfs_rq->min_vruntime; 837 se->vruntime -= cfs_rq->min_vruntime;
850} 838}
851 839
@@ -912,7 +900,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
912 * when there are only lesser-weight tasks around): 900 * when there are only lesser-weight tasks around):
913 */ 901 */
914 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { 902 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
915 se->slice_max = max(se->slice_max, 903 se->statistics.slice_max = max(se->statistics.slice_max,
916 se->sum_exec_runtime - se->prev_sum_exec_runtime); 904 se->sum_exec_runtime - se->prev_sum_exec_runtime);
917 } 905 }
918#endif 906#endif
@@ -1054,16 +1042,10 @@ static inline void hrtick_update(struct rq *rq)
1054 * then put the task into the rbtree: 1042 * then put the task into the rbtree:
1055 */ 1043 */
1056static void 1044static void
1057enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head) 1045enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1058{ 1046{
1059 struct cfs_rq *cfs_rq; 1047 struct cfs_rq *cfs_rq;
1060 struct sched_entity *se = &p->se; 1048 struct sched_entity *se = &p->se;
1061 int flags = 0;
1062
1063 if (wakeup)
1064 flags |= ENQUEUE_WAKEUP;
1065 if (p->state == TASK_WAKING)
1066 flags |= ENQUEUE_MIGRATE;
1067 1049
1068 for_each_sched_entity(se) { 1050 for_each_sched_entity(se) {
1069 if (se->on_rq) 1051 if (se->on_rq)
@@ -1081,18 +1063,18 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1081 * decreased. We remove the task from the rbtree and 1063 * decreased. We remove the task from the rbtree and
1082 * update the fair scheduling stats: 1064 * update the fair scheduling stats:
1083 */ 1065 */
1084static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) 1066static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1085{ 1067{
1086 struct cfs_rq *cfs_rq; 1068 struct cfs_rq *cfs_rq;
1087 struct sched_entity *se = &p->se; 1069 struct sched_entity *se = &p->se;
1088 1070
1089 for_each_sched_entity(se) { 1071 for_each_sched_entity(se) {
1090 cfs_rq = cfs_rq_of(se); 1072 cfs_rq = cfs_rq_of(se);
1091 dequeue_entity(cfs_rq, se, sleep); 1073 dequeue_entity(cfs_rq, se, flags);
1092 /* Don't dequeue parent if it has other entities besides us */ 1074 /* Don't dequeue parent if it has other entities besides us */
1093 if (cfs_rq->load.weight) 1075 if (cfs_rq->load.weight)
1094 break; 1076 break;
1095 sleep = 1; 1077 flags |= DEQUEUE_SLEEP;
1096 } 1078 }
1097 1079
1098 hrtick_update(rq); 1080 hrtick_update(rq);
@@ -1240,7 +1222,6 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
1240 1222
1241static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) 1223static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1242{ 1224{
1243 struct task_struct *curr = current;
1244 unsigned long this_load, load; 1225 unsigned long this_load, load;
1245 int idx, this_cpu, prev_cpu; 1226 int idx, this_cpu, prev_cpu;
1246 unsigned long tl_per_task; 1227 unsigned long tl_per_task;
@@ -1255,18 +1236,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1255 load = source_load(prev_cpu, idx); 1236 load = source_load(prev_cpu, idx);
1256 this_load = target_load(this_cpu, idx); 1237 this_load = target_load(this_cpu, idx);
1257 1238
1258 if (sync) {
1259 if (sched_feat(SYNC_LESS) &&
1260 (curr->se.avg_overlap > sysctl_sched_migration_cost ||
1261 p->se.avg_overlap > sysctl_sched_migration_cost))
1262 sync = 0;
1263 } else {
1264 if (sched_feat(SYNC_MORE) &&
1265 (curr->se.avg_overlap < sysctl_sched_migration_cost &&
1266 p->se.avg_overlap < sysctl_sched_migration_cost))
1267 sync = 1;
1268 }
1269
1270 /* 1239 /*
1271 * If sync wakeup then subtract the (maximum possible) 1240 * If sync wakeup then subtract the (maximum possible)
1272 * effect of the currently running task from the load 1241 * effect of the currently running task from the load
@@ -1306,7 +1275,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1306 if (sync && balanced) 1275 if (sync && balanced)
1307 return 1; 1276 return 1;
1308 1277
1309 schedstat_inc(p, se.nr_wakeups_affine_attempts); 1278 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
1310 tl_per_task = cpu_avg_load_per_task(this_cpu); 1279 tl_per_task = cpu_avg_load_per_task(this_cpu);
1311 1280
1312 if (balanced || 1281 if (balanced ||
@@ -1318,7 +1287,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1318 * there is no bad imbalance. 1287 * there is no bad imbalance.
1319 */ 1288 */
1320 schedstat_inc(sd, ttwu_move_affine); 1289 schedstat_inc(sd, ttwu_move_affine);
1321 schedstat_inc(p, se.nr_wakeups_affine); 1290 schedstat_inc(p, se.statistics.nr_wakeups_affine);
1322 1291
1323 return 1; 1292 return 1;
1324 } 1293 }
@@ -1406,29 +1375,48 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1406/* 1375/*
1407 * Try and locate an idle CPU in the sched_domain. 1376 * Try and locate an idle CPU in the sched_domain.
1408 */ 1377 */
1409static int 1378static int select_idle_sibling(struct task_struct *p, int target)
1410select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
1411{ 1379{
1412 int cpu = smp_processor_id(); 1380 int cpu = smp_processor_id();
1413 int prev_cpu = task_cpu(p); 1381 int prev_cpu = task_cpu(p);
1382 struct sched_domain *sd;
1414 int i; 1383 int i;
1415 1384
1416 /* 1385 /*
1417 * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE 1386 * If the task is going to be woken-up on this cpu and if it is
1418 * test in select_task_rq_fair) and the prev_cpu is idle then that's 1387 * already idle, then it is the right target.
1419 * always a better target than the current cpu.
1420 */ 1388 */
1421 if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running) 1389 if (target == cpu && idle_cpu(cpu))
1390 return cpu;
1391
1392 /*
1393 * If the task is going to be woken-up on the cpu where it previously
1394 * ran and if it is currently idle, then it the right target.
1395 */
1396 if (target == prev_cpu && idle_cpu(prev_cpu))
1422 return prev_cpu; 1397 return prev_cpu;
1423 1398
1424 /* 1399 /*
1425	 * Otherwise, iterate the domain and find an eligible idle cpu.	1400	 * Otherwise, iterate the domains and find an eligible idle cpu.
1426 */ 1401 */
1427 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { 1402 for_each_domain(target, sd) {
1428 if (!cpu_rq(i)->cfs.nr_running) { 1403 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
1429 target = i;
1430 break; 1404 break;
1405
1406 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
1407 if (idle_cpu(i)) {
1408 target = i;
1409 break;
1410 }
1431 } 1411 }
1412
1413 /*
1414 * Lets stop looking for an idle sibling when we reached
1415 * the domain that spans the current cpu and prev_cpu.
1416 */
1417 if (cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
1418 cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
1419 break;
1432 } 1420 }
1433 1421
1434 return target; 1422 return target;
@@ -1445,7 +1433,8 @@ select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
1445 * 1433 *
1446 * preempt must be disabled. 1434 * preempt must be disabled.
1447 */ 1435 */
1448static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) 1436static int
1437select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags)
1449{ 1438{
1450 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; 1439 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
1451 int cpu = smp_processor_id(); 1440 int cpu = smp_processor_id();
@@ -1456,8 +1445,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1456 int sync = wake_flags & WF_SYNC; 1445 int sync = wake_flags & WF_SYNC;
1457 1446
1458 if (sd_flag & SD_BALANCE_WAKE) { 1447 if (sd_flag & SD_BALANCE_WAKE) {
1459 if (sched_feat(AFFINE_WAKEUPS) && 1448 if (cpumask_test_cpu(cpu, &p->cpus_allowed))
1460 cpumask_test_cpu(cpu, &p->cpus_allowed))
1461 want_affine = 1; 1449 want_affine = 1;
1462 new_cpu = prev_cpu; 1450 new_cpu = prev_cpu;
1463 } 1451 }
@@ -1491,34 +1479,13 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1491 } 1479 }
1492 1480
1493 /* 1481 /*
1494 * While iterating the domains looking for a spanning 1482 * If both cpu and prev_cpu are part of this domain,
1495 * WAKE_AFFINE domain, adjust the affine target to any idle cpu 1483 * cpu is a valid SD_WAKE_AFFINE target.
1496 * in cache sharing domains along the way.
1497 */ 1484 */
1498 if (want_affine) { 1485 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
1499 int target = -1; 1486 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
1500 1487 affine_sd = tmp;
1501 /* 1488 want_affine = 0;
1502 * If both cpu and prev_cpu are part of this domain,
1503 * cpu is a valid SD_WAKE_AFFINE target.
1504 */
1505 if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
1506 target = cpu;
1507
1508 /*
1509 * If there's an idle sibling in this domain, make that
1510 * the wake_affine target instead of the current cpu.
1511 */
1512 if (tmp->flags & SD_SHARE_PKG_RESOURCES)
1513 target = select_idle_sibling(p, tmp, target);
1514
1515 if (target >= 0) {
1516 if (tmp->flags & SD_WAKE_AFFINE) {
1517 affine_sd = tmp;
1518 want_affine = 0;
1519 }
1520 cpu = target;
1521 }
1522 } 1489 }
1523 1490
1524 if (!want_sd && !want_affine) 1491 if (!want_sd && !want_affine)
@@ -1531,22 +1498,29 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1531 sd = tmp; 1498 sd = tmp;
1532 } 1499 }
1533 1500
1501#ifdef CONFIG_FAIR_GROUP_SCHED
1534 if (sched_feat(LB_SHARES_UPDATE)) { 1502 if (sched_feat(LB_SHARES_UPDATE)) {
1535 /* 1503 /*
1536 * Pick the largest domain to update shares over 1504 * Pick the largest domain to update shares over
1537 */ 1505 */
1538 tmp = sd; 1506 tmp = sd;
1539 if (affine_sd && (!tmp || 1507 if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
1540 cpumask_weight(sched_domain_span(affine_sd)) >
1541 cpumask_weight(sched_domain_span(sd))))
1542 tmp = affine_sd; 1508 tmp = affine_sd;
1543 1509
1544 if (tmp) 1510 if (tmp) {
1511 raw_spin_unlock(&rq->lock);
1545 update_shares(tmp); 1512 update_shares(tmp);
1513 raw_spin_lock(&rq->lock);
1514 }
1546 } 1515 }
1516#endif
1547 1517
1548 if (affine_sd && wake_affine(affine_sd, p, sync)) 1518 if (affine_sd) {
1549 return cpu; 1519 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
1520 return select_idle_sibling(p, cpu);
1521 else
1522 return select_idle_sibling(p, prev_cpu);
1523 }
1550 1524
1551 while (sd) { 1525 while (sd) {
1552 int load_idx = sd->forkexec_idx; 1526 int load_idx = sd->forkexec_idx;
@@ -1576,10 +1550,10 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1576 1550
1577 /* Now try balancing at a lower domain level of new_cpu */ 1551 /* Now try balancing at a lower domain level of new_cpu */
1578 cpu = new_cpu; 1552 cpu = new_cpu;
1579 weight = cpumask_weight(sched_domain_span(sd)); 1553 weight = sd->span_weight;
1580 sd = NULL; 1554 sd = NULL;
1581 for_each_domain(cpu, tmp) { 1555 for_each_domain(cpu, tmp) {
1582 if (weight <= cpumask_weight(sched_domain_span(tmp))) 1556 if (weight <= tmp->span_weight)
1583 break; 1557 break;
1584 if (tmp->flags & sd_flag) 1558 if (tmp->flags & sd_flag)
1585 sd = tmp; 1559 sd = tmp;
@@ -1591,63 +1565,26 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1591} 1565}
1592#endif /* CONFIG_SMP */ 1566#endif /* CONFIG_SMP */
1593 1567
1594/*
1595 * Adaptive granularity
1596 *
1597 * se->avg_wakeup gives the average time a task runs until it does a wakeup,
1598 * with the limit of wakeup_gran -- when it never does a wakeup.
1599 *
1600 * So the smaller avg_wakeup is the faster we want this task to preempt,
1601 * but we don't want to treat the preemptee unfairly and therefore allow it
1602 * to run for at least the amount of time we'd like to run.
1603 *
1604 * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one
1605 *
1606 * NOTE: we use *nr_running to scale with load, this nicely matches the
1607 * degrading latency on load.
1608 */
1609static unsigned long
1610adaptive_gran(struct sched_entity *curr, struct sched_entity *se)
1611{
1612 u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
1613 u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running;
1614 u64 gran = 0;
1615
1616 if (this_run < expected_wakeup)
1617 gran = expected_wakeup - this_run;
1618
1619 return min_t(s64, gran, sysctl_sched_wakeup_granularity);
1620}
1621
1622static unsigned long 1568static unsigned long
1623wakeup_gran(struct sched_entity *curr, struct sched_entity *se) 1569wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
1624{ 1570{
1625 unsigned long gran = sysctl_sched_wakeup_granularity; 1571 unsigned long gran = sysctl_sched_wakeup_granularity;
1626 1572
1627 if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN))
1628 gran = adaptive_gran(curr, se);
1629
1630 /* 1573 /*
1631	 * Since it's curr running now, convert the gran from real-time	1574	 * Since it's curr running now, convert the gran from real-time
1632	 * to virtual-time in its units.				1575	 * to virtual-time in its units.
1576 *
1577 * By using 'se' instead of 'curr' we penalize light tasks, so
1578 * they get preempted easier. That is, if 'se' < 'curr' then
1579 * the resulting gran will be larger, therefore penalizing the
1580 * lighter, if otoh 'se' > 'curr' then the resulting gran will
1581 * be smaller, again penalizing the lighter task.
1582 *
1583 * This is especially important for buddies when the leftmost
1584 * task is higher priority than the buddy.
1633 */ 1585 */
1634 if (sched_feat(ASYM_GRAN)) { 1586 if (unlikely(se->load.weight != NICE_0_LOAD))
1635 /* 1587 gran = calc_delta_fair(gran, se);
1636 * By using 'se' instead of 'curr' we penalize light tasks, so
1637 * they get preempted easier. That is, if 'se' < 'curr' then
1638 * the resulting gran will be larger, therefore penalizing the
1639 * lighter, if otoh 'se' > 'curr' then the resulting gran will
1640 * be smaller, again penalizing the lighter task.
1641 *
1642 * This is especially important for buddies when the leftmost
1643 * task is higher priority than the buddy.
1644 */
1645 if (unlikely(se->load.weight != NICE_0_LOAD))
1646 gran = calc_delta_fair(gran, se);
1647 } else {
1648 if (unlikely(curr->load.weight != NICE_0_LOAD))
1649 gran = calc_delta_fair(gran, curr);
1650 }
1651 1588
1652 return gran; 1589 return gran;
1653} 1590}
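
With ADAPTIVE_GRAN and ASYM_GRAN gone, the only adjustment left in wakeup_gran() is the weight scaling through calc_delta_fair(), which in effect multiplies the granularity by NICE_0_LOAD / se->load.weight. A rough standalone sketch of that effect (a NICE_0_LOAD of 1024 is assumed; the real calc_delta_fair() also handles overflow and inverse-weight rounding):

/*
 * Effective behaviour of the simplified wakeup_gran(): a lighter wakee
 * (weight < 1024) sees a larger granularity and so preempts less eagerly,
 * while a heavier wakee sees a smaller one.
 */
static unsigned long wakeup_gran_sketch(unsigned long gran_ns,
					unsigned long weight)
{
	const unsigned long nice_0_load = 1024;	/* assumed NICE_0_LOAD */

	if (weight != nice_0_load)
		gran_ns = gran_ns * nice_0_load / weight;

	return gran_ns;	/* weight 512 -> 2x gran, weight 2048 -> gran/2 */
}
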
@@ -1705,7 +1642,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1705 struct task_struct *curr = rq->curr; 1642 struct task_struct *curr = rq->curr;
1706 struct sched_entity *se = &curr->se, *pse = &p->se; 1643 struct sched_entity *se = &curr->se, *pse = &p->se;
1707 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1644 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1708 int sync = wake_flags & WF_SYNC;
1709 int scale = cfs_rq->nr_running >= sched_nr_latency; 1645 int scale = cfs_rq->nr_running >= sched_nr_latency;
1710 1646
1711 if (unlikely(rt_prio(p->prio))) 1647 if (unlikely(rt_prio(p->prio)))
@@ -1738,14 +1674,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1738 if (unlikely(curr->policy == SCHED_IDLE)) 1674 if (unlikely(curr->policy == SCHED_IDLE))
1739 goto preempt; 1675 goto preempt;
1740 1676
1741 if (sched_feat(WAKEUP_SYNC) && sync)
1742 goto preempt;
1743
1744 if (sched_feat(WAKEUP_OVERLAP) &&
1745 se->avg_overlap < sysctl_sched_migration_cost &&
1746 pse->avg_overlap < sysctl_sched_migration_cost)
1747 goto preempt;
1748
1749 if (!sched_feat(WAKEUP_PREEMPT)) 1677 if (!sched_feat(WAKEUP_PREEMPT))
1750 return; 1678 return;
1751 1679
@@ -1844,13 +1772,13 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1844 * 3) are cache-hot on their current CPU. 1772 * 3) are cache-hot on their current CPU.
1845 */ 1773 */
1846 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { 1774 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
1847 schedstat_inc(p, se.nr_failed_migrations_affine); 1775 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
1848 return 0; 1776 return 0;
1849 } 1777 }
1850 *all_pinned = 0; 1778 *all_pinned = 0;
1851 1779
1852 if (task_running(rq, p)) { 1780 if (task_running(rq, p)) {
1853 schedstat_inc(p, se.nr_failed_migrations_running); 1781 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
1854 return 0; 1782 return 0;
1855 } 1783 }
1856 1784
@@ -1866,14 +1794,14 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1866#ifdef CONFIG_SCHEDSTATS 1794#ifdef CONFIG_SCHEDSTATS
1867 if (tsk_cache_hot) { 1795 if (tsk_cache_hot) {
1868 schedstat_inc(sd, lb_hot_gained[idle]); 1796 schedstat_inc(sd, lb_hot_gained[idle]);
1869 schedstat_inc(p, se.nr_forced_migrations); 1797 schedstat_inc(p, se.statistics.nr_forced_migrations);
1870 } 1798 }
1871#endif 1799#endif
1872 return 1; 1800 return 1;
1873 } 1801 }
1874 1802
1875 if (tsk_cache_hot) { 1803 if (tsk_cache_hot) {
1876 schedstat_inc(p, se.nr_failed_migrations_hot); 1804 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
1877 return 0; 1805 return 0;
1878 } 1806 }
1879 return 1; 1807 return 1;
@@ -2311,7 +2239,7 @@ unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
2311 2239
2312unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) 2240unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
2313{ 2241{
2314 unsigned long weight = cpumask_weight(sched_domain_span(sd)); 2242 unsigned long weight = sd->span_weight;
2315 unsigned long smt_gain = sd->smt_gain; 2243 unsigned long smt_gain = sd->smt_gain;
2316 2244
2317 smt_gain /= weight; 2245 smt_gain /= weight;
@@ -2344,7 +2272,7 @@ unsigned long scale_rt_power(int cpu)
2344 2272
2345static void update_cpu_power(struct sched_domain *sd, int cpu) 2273static void update_cpu_power(struct sched_domain *sd, int cpu)
2346{ 2274{
2347 unsigned long weight = cpumask_weight(sched_domain_span(sd)); 2275 unsigned long weight = sd->span_weight;
2348 unsigned long power = SCHED_LOAD_SCALE; 2276 unsigned long power = SCHED_LOAD_SCALE;
2349 struct sched_group *sdg = sd->groups; 2277 struct sched_group *sdg = sd->groups;
2350 2278
@@ -2870,6 +2798,8 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
2870 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); 2798 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
2871} 2799}
2872 2800
2801static int active_load_balance_cpu_stop(void *data);
2802
2873/* 2803/*
2874 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2804 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2875 * tasks if there is an imbalance. 2805 * tasks if there is an imbalance.
@@ -2959,8 +2889,9 @@ redo:
2959 if (need_active_balance(sd, sd_idle, idle)) { 2889 if (need_active_balance(sd, sd_idle, idle)) {
2960 raw_spin_lock_irqsave(&busiest->lock, flags); 2890 raw_spin_lock_irqsave(&busiest->lock, flags);
2961 2891
2962 /* don't kick the migration_thread, if the curr 2892 /* don't kick the active_load_balance_cpu_stop,
2963 * task on busiest cpu can't be moved to this_cpu 2893 * if the curr task on busiest cpu can't be
2894 * moved to this_cpu
2964 */ 2895 */
2965 if (!cpumask_test_cpu(this_cpu, 2896 if (!cpumask_test_cpu(this_cpu,
2966 &busiest->curr->cpus_allowed)) { 2897 &busiest->curr->cpus_allowed)) {
@@ -2970,14 +2901,22 @@ redo:
2970 goto out_one_pinned; 2901 goto out_one_pinned;
2971 } 2902 }
2972 2903
2904 /*
2905 * ->active_balance synchronizes accesses to
2906 * ->active_balance_work. Once set, it's cleared
2907 * only after active load balance is finished.
2908 */
2973 if (!busiest->active_balance) { 2909 if (!busiest->active_balance) {
2974 busiest->active_balance = 1; 2910 busiest->active_balance = 1;
2975 busiest->push_cpu = this_cpu; 2911 busiest->push_cpu = this_cpu;
2976 active_balance = 1; 2912 active_balance = 1;
2977 } 2913 }
2978 raw_spin_unlock_irqrestore(&busiest->lock, flags); 2914 raw_spin_unlock_irqrestore(&busiest->lock, flags);
2915
2979 if (active_balance) 2916 if (active_balance)
2980 wake_up_process(busiest->migration_thread); 2917 stop_one_cpu_nowait(cpu_of(busiest),
2918 active_load_balance_cpu_stop, busiest,
2919 &busiest->active_balance_work);
2981 2920
2982 /* 2921 /*
2983 * We've kicked active balancing, reset the failure 2922 * We've kicked active balancing, reset the failure
@@ -3084,24 +3023,29 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3084} 3023}
3085 3024
3086/* 3025/*
3087 * active_load_balance is run by migration threads. It pushes running tasks 3026 * active_load_balance_cpu_stop is run by cpu stopper. It pushes
3088 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be 3027 * running tasks off the busiest CPU onto idle CPUs. It requires at
3089 * running on each physical CPU where possible, and avoids physical / 3028 * least 1 task to be running on each physical CPU where possible, and
3090 * logical imbalances. 3029 * avoids physical / logical imbalances.
3091 *
3092 * Called with busiest_rq locked.
3093 */ 3030 */
3094static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) 3031static int active_load_balance_cpu_stop(void *data)
3095{ 3032{
3033 struct rq *busiest_rq = data;
3034 int busiest_cpu = cpu_of(busiest_rq);
3096 int target_cpu = busiest_rq->push_cpu; 3035 int target_cpu = busiest_rq->push_cpu;
3036 struct rq *target_rq = cpu_rq(target_cpu);
3097 struct sched_domain *sd; 3037 struct sched_domain *sd;
3098 struct rq *target_rq; 3038
3039 raw_spin_lock_irq(&busiest_rq->lock);
3040
3041 /* make sure the requested cpu hasn't gone down in the meantime */
3042 if (unlikely(busiest_cpu != smp_processor_id() ||
3043 !busiest_rq->active_balance))
3044 goto out_unlock;
3099 3045
3100 /* Is there any task to move? */ 3046 /* Is there any task to move? */
3101 if (busiest_rq->nr_running <= 1) 3047 if (busiest_rq->nr_running <= 1)
3102 return; 3048 goto out_unlock;
3103
3104 target_rq = cpu_rq(target_cpu);
3105 3049
3106 /* 3050 /*
3107 * This condition is "impossible", if it occurs 3051 * This condition is "impossible", if it occurs
@@ -3112,8 +3056,6 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3112 3056
3113 /* move a task from busiest_rq to target_rq */ 3057 /* move a task from busiest_rq to target_rq */
3114 double_lock_balance(busiest_rq, target_rq); 3058 double_lock_balance(busiest_rq, target_rq);
3115 update_rq_clock(busiest_rq);
3116 update_rq_clock(target_rq);
3117 3059
3118 /* Search for an sd spanning us and the target CPU. */ 3060 /* Search for an sd spanning us and the target CPU. */
3119 for_each_domain(target_cpu, sd) { 3061 for_each_domain(target_cpu, sd) {
@@ -3132,6 +3074,10 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3132 schedstat_inc(sd, alb_failed); 3074 schedstat_inc(sd, alb_failed);
3133 } 3075 }
3134 double_unlock_balance(busiest_rq, target_rq); 3076 double_unlock_balance(busiest_rq, target_rq);
3077out_unlock:
3078 busiest_rq->active_balance = 0;
3079 raw_spin_unlock_irq(&busiest_rq->lock);
3080 return 0;
3135} 3081}
3136 3082
3137#ifdef CONFIG_NO_HZ 3083#ifdef CONFIG_NO_HZ
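
Active load balancing now runs as a cpu-stop callback on the busiest CPU instead of waking a per-CPU migration thread. The general fire-and-forget pattern, using only the cpu_stop interface introduced in the stop_machine.c hunks below (the helper and callback names here are illustrative):

static struct cpu_stop_work balance_work;	/* storage owned by the caller */

static int push_task_cpu_stop(void *data)
{
	struct rq *busiest = data;

	/* Runs in stopper context on cpu_of(busiest); must not sleep. */
	/* ... move one task towards busiest->push_cpu, clear active_balance ... */
	return 0;
}

static void kick_active_balance(struct rq *busiest)
{
	/* Returns immediately; the callback runs later on the target CPU. */
	stop_one_cpu_nowait(cpu_of(busiest), push_task_cpu_stop,
			    busiest, &balance_work);
}
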
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index d5059fd761d9..83c66e8ad3ee 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,11 +1,4 @@
1/* 1/*
2 * Disregards a certain amount of sleep time (sched_latency_ns) and
3 * considers the task to be running during that period. This gives it
4 * a service deficit on wakeup, allowing it to run sooner.
5 */
6SCHED_FEAT(FAIR_SLEEPERS, 1)
7
8/*
9 * Only give sleepers 50% of their service deficit. This allows 2 * Only give sleepers 50% of their service deficit. This allows
10 * them to run sooner, but does not allow tons of sleepers to 3 * them to run sooner, but does not allow tons of sleepers to
11 * rip the spread apart. 4 * rip the spread apart.
@@ -13,13 +6,6 @@ SCHED_FEAT(FAIR_SLEEPERS, 1)
13SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) 6SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
14 7
15/* 8/*
16 * By not normalizing the sleep time, heavy tasks get an effective
17 * longer period, and lighter task an effective shorter period they
18 * are considered running.
19 */
20SCHED_FEAT(NORMALIZED_SLEEPER, 0)
21
22/*
23 * Place new tasks ahead so that they do not starve already running 9 * Place new tasks ahead so that they do not starve already running
24 * tasks 10 * tasks
25 */ 11 */
@@ -31,37 +17,6 @@ SCHED_FEAT(START_DEBIT, 1)
31SCHED_FEAT(WAKEUP_PREEMPT, 1) 17SCHED_FEAT(WAKEUP_PREEMPT, 1)
32 18
33/* 19/*
34 * Compute wakeup_gran based on task behaviour, clipped to
35 * [0, sched_wakeup_gran_ns]
36 */
37SCHED_FEAT(ADAPTIVE_GRAN, 1)
38
39/*
40 * When converting the wakeup granularity to virtual time, do it such
41 * that heavier tasks preempting a lighter task have an edge.
42 */
43SCHED_FEAT(ASYM_GRAN, 1)
44
45/*
46 * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS.
47 */
48SCHED_FEAT(WAKEUP_SYNC, 0)
49
50/*
51 * Wakeup preempt based on task behaviour. Tasks that do not overlap
52 * don't get preempted.
53 */
54SCHED_FEAT(WAKEUP_OVERLAP, 0)
55
56/*
57 * Use the SYNC wakeup hint, pipes and the likes use this to indicate
58 * the remote end is likely to consume the data we just wrote, and
59 * therefore has cache benefit from being placed on the same cpu, see
60 * also AFFINE_WAKEUPS.
61 */
62SCHED_FEAT(SYNC_WAKEUPS, 1)
63
64/*
65 * Based on load and program behaviour, see if it makes sense to place 20 * Based on load and program behaviour, see if it makes sense to place
66 * a newly woken task on the same cpu as the task that woke it -- 21 * a newly woken task on the same cpu as the task that woke it --
67 * improve cache locality. Typically used with SYNC wakeups as 22 * improve cache locality. Typically used with SYNC wakeups as
@@ -70,16 +25,6 @@ SCHED_FEAT(SYNC_WAKEUPS, 1)
70SCHED_FEAT(AFFINE_WAKEUPS, 1) 25SCHED_FEAT(AFFINE_WAKEUPS, 1)
71 26
72/* 27/*
73 * Weaken SYNC hint based on overlap
74 */
75SCHED_FEAT(SYNC_LESS, 1)
76
77/*
78 * Add SYNC hint based on overlap
79 */
80SCHED_FEAT(SYNC_MORE, 0)
81
82/*
83 * Prefer to schedule the task we woke last (assuming it failed 28 * Prefer to schedule the task we woke last (assuming it failed
 84 * wakeup-preemption), since it's likely going to consume data we	 29 * wakeup-preemption), since it's likely going to consume data we
85 * touched, increases cache locality. 30 * touched, increases cache locality.
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index a8a6d8a50947..9fa0f402c87c 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -6,7 +6,8 @@
6 */ 6 */
7 7
8#ifdef CONFIG_SMP 8#ifdef CONFIG_SMP
9static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) 9static int
10select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
10{ 11{
 11	return task_cpu(p); /* IDLE tasks are never migrated */	 12	return task_cpu(p); /* IDLE tasks are never migrated */
12} 13}
@@ -22,8 +23,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
22static struct task_struct *pick_next_task_idle(struct rq *rq) 23static struct task_struct *pick_next_task_idle(struct rq *rq)
23{ 24{
24 schedstat_inc(rq, sched_goidle); 25 schedstat_inc(rq, sched_goidle);
25 /* adjust the active tasks as we might go into a long sleep */ 26 calc_load_account_idle(rq);
26 calc_load_account_active(rq);
27 return rq->idle; 27 return rq->idle;
28} 28}
29 29
@@ -32,7 +32,7 @@ static struct task_struct *pick_next_task_idle(struct rq *rq)
32 * message if some code attempts to do it: 32 * message if some code attempts to do it:
33 */ 33 */
34static void 34static void
35dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep) 35dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
36{ 36{
37 raw_spin_unlock_irq(&rq->lock); 37 raw_spin_unlock_irq(&rq->lock);
38 printk(KERN_ERR "bad: scheduling from the idle thread!\n"); 38 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index b5b920ae2ea7..8afb953e31c6 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -613,7 +613,7 @@ static void update_curr_rt(struct rq *rq)
613 if (unlikely((s64)delta_exec < 0)) 613 if (unlikely((s64)delta_exec < 0))
614 delta_exec = 0; 614 delta_exec = 0;
615 615
616 schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); 616 schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec));
617 617
618 curr->se.sum_exec_runtime += delta_exec; 618 curr->se.sum_exec_runtime += delta_exec;
619 account_group_exec_runtime(curr, delta_exec); 619 account_group_exec_runtime(curr, delta_exec);
@@ -888,20 +888,20 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
888 * Adding/removing a task to/from a priority array: 888 * Adding/removing a task to/from a priority array:
889 */ 889 */
890static void 890static void
891enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head) 891enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
892{ 892{
893 struct sched_rt_entity *rt_se = &p->rt; 893 struct sched_rt_entity *rt_se = &p->rt;
894 894
895 if (wakeup) 895 if (flags & ENQUEUE_WAKEUP)
896 rt_se->timeout = 0; 896 rt_se->timeout = 0;
897 897
898 enqueue_rt_entity(rt_se, head); 898 enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
899 899
900 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 900 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
901 enqueue_pushable_task(rq, p); 901 enqueue_pushable_task(rq, p);
902} 902}
903 903
904static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) 904static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
905{ 905{
906 struct sched_rt_entity *rt_se = &p->rt; 906 struct sched_rt_entity *rt_se = &p->rt;
907 907
@@ -948,10 +948,9 @@ static void yield_task_rt(struct rq *rq)
948#ifdef CONFIG_SMP 948#ifdef CONFIG_SMP
949static int find_lowest_rq(struct task_struct *task); 949static int find_lowest_rq(struct task_struct *task);
950 950
951static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) 951static int
952select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
952{ 953{
953 struct rq *rq = task_rq(p);
954
955 if (sd_flag != SD_BALANCE_WAKE) 954 if (sd_flag != SD_BALANCE_WAKE)
956 return smp_processor_id(); 955 return smp_processor_id();
957 956
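
The RT enqueue path above folds the old wakeup/head parameters into a single flags word, and select_task_rq_rt() now receives the runqueue directly instead of re-deriving it from the task. As a rough illustration of the new calling convention (the caller below is hypothetical; ENQUEUE_WAKEUP and ENQUEUE_HEAD are the flag bits this series introduces in the scheduler core):

/* Hypothetical caller: queue a freshly woken RT task at the head of its list. */
static void requeue_woken_rt(struct rq *rq, struct task_struct *p)
{
	enqueue_task_rt(rq, p, ENQUEUE_WAKEUP | ENQUEUE_HEAD);
}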
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 9bb9fb1bd79c..ef51d1fcf5e6 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,17 +1,381 @@
1/* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. 1/*
2 * GPL v2 and any later version. 2 * kernel/stop_machine.c
3 *
4 * Copyright (C) 2008, 2005 IBM Corporation.
5 * Copyright (C) 2008, 2005 Rusty Russell rusty@rustcorp.com.au
6 * Copyright (C) 2010 SUSE Linux Products GmbH
7 * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
8 *
9 * This file is released under the GPLv2 and any later version.
3 */ 10 */
11#include <linux/completion.h>
4#include <linux/cpu.h> 12#include <linux/cpu.h>
5#include <linux/err.h> 13#include <linux/init.h>
6#include <linux/kthread.h> 14#include <linux/kthread.h>
7#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/percpu.h>
8#include <linux/sched.h> 17#include <linux/sched.h>
9#include <linux/stop_machine.h> 18#include <linux/stop_machine.h>
10#include <linux/syscalls.h>
11#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/kallsyms.h>
12 21
13#include <asm/atomic.h> 22#include <asm/atomic.h>
14#include <asm/uaccess.h> 23
24/*
25 * Structure to determine completion condition and record errors. May
26 * be shared by works on different cpus.
27 */
28struct cpu_stop_done {
29 atomic_t nr_todo; /* nr left to execute */
30 bool executed; /* actually executed? */
31 int ret; /* collected return value */
32 struct completion completion; /* fired if nr_todo reaches 0 */
33};
34
35/* the actual stopper, one per every possible cpu, enabled on online cpus */
36struct cpu_stopper {
37 spinlock_t lock;
38 struct list_head works; /* list of pending works */
39 struct task_struct *thread; /* stopper thread */
40 bool enabled; /* is this stopper enabled? */
41};
42
43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
44
45static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
46{
47 memset(done, 0, sizeof(*done));
48 atomic_set(&done->nr_todo, nr_todo);
49 init_completion(&done->completion);
50}
51
52/* signal completion unless @done is NULL */
53static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
54{
55 if (done) {
56 if (executed)
57 done->executed = true;
58 if (atomic_dec_and_test(&done->nr_todo))
59 complete(&done->completion);
60 }
61}
62
63/* queue @work to @stopper. if offline, @work is completed immediately */
64static void cpu_stop_queue_work(struct cpu_stopper *stopper,
65 struct cpu_stop_work *work)
66{
67 unsigned long flags;
68
69 spin_lock_irqsave(&stopper->lock, flags);
70
71 if (stopper->enabled) {
72 list_add_tail(&work->list, &stopper->works);
73 wake_up_process(stopper->thread);
74 } else
75 cpu_stop_signal_done(work->done, false);
76
77 spin_unlock_irqrestore(&stopper->lock, flags);
78}
79
80/**
81 * stop_one_cpu - stop a cpu
82 * @cpu: cpu to stop
83 * @fn: function to execute
84 * @arg: argument to @fn
85 *
86 * Execute @fn(@arg) on @cpu. @fn is run in a process context with
87 * the highest priority preempting any task on the cpu and
88 * monopolizing it. This function returns after the execution is
89 * complete.
90 *
91 * This function doesn't guarantee @cpu stays online till @fn
92 * completes. If @cpu goes down in the middle, execution may happen
93 * partially or fully on different cpus. @fn should either be ready
94 * for that or the caller should ensure that @cpu stays online until
95 * this function completes.
96 *
97 * CONTEXT:
98 * Might sleep.
99 *
100 * RETURNS:
101 * -ENOENT if @fn(@arg) was not executed because @cpu was offline;
102 * otherwise, the return value of @fn.
103 */
104int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
105{
106 struct cpu_stop_done done;
107 struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
108
109 cpu_stop_init_done(&done, 1);
110 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work);
111 wait_for_completion(&done.completion);
112 return done.executed ? done.ret : -ENOENT;
113}
114
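
A minimal sketch of how the stop_one_cpu() interface documented above might be used; the callback and wrapper are hypothetical, only the stop_one_cpu() signature and return convention come from this file:

#include <linux/smp.h>
#include <linux/stop_machine.h>

/* Hypothetical callback: runs on the target CPU, preempting everything else.
 * cpu_stop callbacks must not sleep. */
static int record_cpu(void *arg)
{
	*(int *)arg = smp_processor_id();
	return 0;
}

static int probe_cpu(unsigned int cpu)
{
	int where = -1;

	/* -ENOENT if @cpu was offline, otherwise record_cpu()'s return value. */
	return stop_one_cpu(cpu, record_cpu, &where);
}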
115/**
116 * stop_one_cpu_nowait - stop a cpu but don't wait for completion
117 * @cpu: cpu to stop
118 * @fn: function to execute
119 * @arg: argument to @fn
120 *
121 * Similar to stop_one_cpu() but doesn't wait for completion. The
122 * caller is responsible for ensuring @work_buf is currently unused
123 * and will remain untouched until stopper starts executing @fn.
124 *
125 * CONTEXT:
126 * Don't care.
127 */
128void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
129 struct cpu_stop_work *work_buf)
130{
131 *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
132 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf);
133}
134
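
The _nowait variant hands @work_buf to the stopper until @fn begins executing, so the buffer must outlive the call; a static or otherwise long-lived buffer is the simplest way to satisfy that. A sketch under that assumption, reusing the hypothetical record_cpu() callback from the previous example:

static struct cpu_stop_work probe_work;	/* must stay valid until record_cpu() runs */
static int probe_result = -1;

static void probe_cpu_async(unsigned int cpu)
{
	stop_one_cpu_nowait(cpu, record_cpu, &probe_result, &probe_work);
}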
135/* static data for stop_cpus */
136static DEFINE_MUTEX(stop_cpus_mutex);
137static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
138
139int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
140{
141 struct cpu_stop_work *work;
142 struct cpu_stop_done done;
143 unsigned int cpu;
144
145 /* initialize works and done */
146 for_each_cpu(cpu, cpumask) {
147 work = &per_cpu(stop_cpus_work, cpu);
148 work->fn = fn;
149 work->arg = arg;
150 work->done = &done;
151 }
152 cpu_stop_init_done(&done, cpumask_weight(cpumask));
153
154 /*
155 * Disable preemption while queueing to avoid getting
156 * preempted by a stopper which might wait for other stoppers
157 * to enter @fn which can lead to deadlock.
158 */
159 preempt_disable();
160 for_each_cpu(cpu, cpumask)
161 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu),
162 &per_cpu(stop_cpus_work, cpu));
163 preempt_enable();
164
165 wait_for_completion(&done.completion);
166 return done.executed ? done.ret : -ENOENT;
167}
168
169/**
170 * stop_cpus - stop multiple cpus
171 * @cpumask: cpus to stop
172 * @fn: function to execute
173 * @arg: argument to @fn
174 *
175 * Execute @fn(@arg) on online cpus in @cpumask. On each target cpu,
176 * @fn is run in a process context with the highest priority
177 * preempting any task on the cpu and monopolizing it. This function
178 * returns after all executions are complete.
179 *
180 * This function doesn't guarantee the cpus in @cpumask stay online
181 * till @fn completes. If some cpus go down in the middle, execution
182 * on the cpu may happen partially or fully on different cpus. @fn
183 * should either be ready for that or the caller should ensure that
184 * the cpus stay online until this function completes.
185 *
186 * All stop_cpus() calls are serialized making it safe for @fn to wait
187 * for all cpus to start executing it.
188 *
189 * CONTEXT:
190 * Might sleep.
191 *
192 * RETURNS:
193 * -ENOENT if @fn(@arg) was not executed at all because all cpus in
194 * @cpumask were offline; otherwise, 0 if all executions of @fn
195 * returned 0, any non zero return value if any returned non zero.
196 */
197int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
198{
199 int ret;
200
201 /* static works are used, process one request at a time */
202 mutex_lock(&stop_cpus_mutex);
203 ret = __stop_cpus(cpumask, fn, arg);
204 mutex_unlock(&stop_cpus_mutex);
205 return ret;
206}
207
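
Because all stop_cpus() calls are serialized on stop_cpus_mutex, a callback may safely spin waiting for its peers on the other CPUs to start. A minimal sketch of a caller (the callback is hypothetical):

#include <linux/atomic.h>
#include <linux/stop_machine.h>

/* Hypothetical callback: runs concurrently on every online CPU in the mask. */
static int count_stoppers(void *arg)
{
	atomic_inc((atomic_t *)arg);
	return 0;
}

static int stop_everyone(void)
{
	atomic_t seen = ATOMIC_INIT(0);

	/* 0 if every invocation returned 0, -ENOENT if no CPU was online. */
	return stop_cpus(cpu_online_mask, count_stoppers, &seen);
}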
208/**
209 * try_stop_cpus - try to stop multiple cpus
210 * @cpumask: cpus to stop
211 * @fn: function to execute
212 * @arg: argument to @fn
213 *
214 * Identical to stop_cpus() except that it fails with -EAGAIN if
215 * someone else is already using the facility.
216 *
217 * CONTEXT:
218 * Might sleep.
219 *
220 * RETURNS:
221 * -EAGAIN if someone else is already stopping cpus, -ENOENT if
222 * @fn(@arg) was not executed at all because all cpus in @cpumask were
223 * offline; otherwise, 0 if all executions of @fn returned 0, any non
224 * zero return value if any returned non zero.
225 */
226int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
227{
228 int ret;
229
230 /* static works are used, process one request at a time */
231 if (!mutex_trylock(&stop_cpus_mutex))
232 return -EAGAIN;
233 ret = __stop_cpus(cpumask, fn, arg);
234 mutex_unlock(&stop_cpus_mutex);
235 return ret;
236}
237
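
try_stop_cpus() only adds a trylock on the same mutex, so -EAGAIN simply means another stop_cpus() user won the race; a caller that can afford to wait might retry, as sketched below (count_stoppers() is the hypothetical callback from the previous example):

static int stop_everyone_polite(atomic_t *seen)
{
	int ret;

	do {
		ret = try_stop_cpus(cpu_online_mask, count_stoppers, seen);
		if (ret == -EAGAIN)
			cpu_relax();
	} while (ret == -EAGAIN);

	return ret;
}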
238static int cpu_stopper_thread(void *data)
239{
240 struct cpu_stopper *stopper = data;
241 struct cpu_stop_work *work;
242 int ret;
243
244repeat:
245 set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
246
247 if (kthread_should_stop()) {
248 __set_current_state(TASK_RUNNING);
249 return 0;
250 }
251
252 work = NULL;
253 spin_lock_irq(&stopper->lock);
254 if (!list_empty(&stopper->works)) {
255 work = list_first_entry(&stopper->works,
256 struct cpu_stop_work, list);
257 list_del_init(&work->list);
258 }
259 spin_unlock_irq(&stopper->lock);
260
261 if (work) {
262 cpu_stop_fn_t fn = work->fn;
263 void *arg = work->arg;
264 struct cpu_stop_done *done = work->done;
265 char ksym_buf[KSYM_NAME_LEN];
266
267 __set_current_state(TASK_RUNNING);
268
269 /* cpu stop callbacks are not allowed to sleep */
270 preempt_disable();
271
272 ret = fn(arg);
273 if (ret)
274 done->ret = ret;
275
276 /* restore preemption and check it's still balanced */
277 preempt_enable();
278 WARN_ONCE(preempt_count(),
279 "cpu_stop: %s(%p) leaked preempt count\n",
280 kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL,
281 ksym_buf), arg);
282
283 cpu_stop_signal_done(done, true);
284 } else
285 schedule();
286
287 goto repeat;
288}
289
290/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
291static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
292 unsigned long action, void *hcpu)
293{
294 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
295 unsigned int cpu = (unsigned long)hcpu;
296 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
297 struct cpu_stop_work *work;
298 struct task_struct *p;
299
300 switch (action & ~CPU_TASKS_FROZEN) {
301 case CPU_UP_PREPARE:
302 BUG_ON(stopper->thread || stopper->enabled ||
303 !list_empty(&stopper->works));
304 p = kthread_create(cpu_stopper_thread, stopper, "migration/%d",
305 cpu);
306 if (IS_ERR(p))
307 return NOTIFY_BAD;
308 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
309 get_task_struct(p);
310 stopper->thread = p;
311 break;
312
313 case CPU_ONLINE:
314 kthread_bind(stopper->thread, cpu);
315 /* strictly unnecessary, as first user will wake it */
316 wake_up_process(stopper->thread);
317 /* mark enabled */
318 spin_lock_irq(&stopper->lock);
319 stopper->enabled = true;
320 spin_unlock_irq(&stopper->lock);
321 break;
322
323#ifdef CONFIG_HOTPLUG_CPU
324 case CPU_UP_CANCELED:
325 case CPU_DEAD:
326 /* kill the stopper */
327 kthread_stop(stopper->thread);
328 /* drain remaining works */
329 spin_lock_irq(&stopper->lock);
330 list_for_each_entry(work, &stopper->works, list)
331 cpu_stop_signal_done(work->done, false);
332 stopper->enabled = false;
333 spin_unlock_irq(&stopper->lock);
334 /* release the stopper */
335 put_task_struct(stopper->thread);
336 stopper->thread = NULL;
337 break;
338#endif
339 }
340
341 return NOTIFY_OK;
342}
343
344/*
345 * Give it a higher priority so that cpu stopper is available to other
346 * cpu notifiers. It currently shares the same priority as sched
347 * migration_notifier.
348 */
349static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = {
350 .notifier_call = cpu_stop_cpu_callback,
351 .priority = 10,
352};
353
354static int __init cpu_stop_init(void)
355{
356 void *bcpu = (void *)(long)smp_processor_id();
357 unsigned int cpu;
358 int err;
359
360 for_each_possible_cpu(cpu) {
361 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
362
363 spin_lock_init(&stopper->lock);
364 INIT_LIST_HEAD(&stopper->works);
365 }
366
367 /* start one for the boot cpu */
368 err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
369 bcpu);
370 BUG_ON(err == NOTIFY_BAD);
371 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
372 register_cpu_notifier(&cpu_stop_cpu_notifier);
373
374 return 0;
375}
376early_initcall(cpu_stop_init);
377
378#ifdef CONFIG_STOP_MACHINE
15 379
16/* This controls the threads on each CPU. */ 380/* This controls the threads on each CPU. */
17enum stopmachine_state { 381enum stopmachine_state {
@@ -26,174 +390,94 @@ enum stopmachine_state {
26 /* Exit */ 390 /* Exit */
27 STOPMACHINE_EXIT, 391 STOPMACHINE_EXIT,
28}; 392};
29static enum stopmachine_state state;
30 393
31struct stop_machine_data { 394struct stop_machine_data {
32 int (*fn)(void *); 395 int (*fn)(void *);
33 void *data; 396 void *data;
34 int fnret; 397 /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
398 unsigned int num_threads;
399 const struct cpumask *active_cpus;
400
401 enum stopmachine_state state;
402 atomic_t thread_ack;
35}; 403};
36 404
37/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ 405static void set_state(struct stop_machine_data *smdata,
38static unsigned int num_threads; 406 enum stopmachine_state newstate)
39static atomic_t thread_ack;
40static DEFINE_MUTEX(lock);
41/* setup_lock protects refcount, stop_machine_wq and stop_machine_work. */
42static DEFINE_MUTEX(setup_lock);
43/* Users of stop_machine. */
44static int refcount;
45static struct workqueue_struct *stop_machine_wq;
46static struct stop_machine_data active, idle;
47static const struct cpumask *active_cpus;
48static void __percpu *stop_machine_work;
49
50static void set_state(enum stopmachine_state newstate)
51{ 407{
52 /* Reset ack counter. */ 408 /* Reset ack counter. */
53 atomic_set(&thread_ack, num_threads); 409 atomic_set(&smdata->thread_ack, smdata->num_threads);
54 smp_wmb(); 410 smp_wmb();
55 state = newstate; 411 smdata->state = newstate;
56} 412}
57 413
58/* Last one to ack a state moves to the next state. */ 414/* Last one to ack a state moves to the next state. */
59static void ack_state(void) 415static void ack_state(struct stop_machine_data *smdata)
60{ 416{
61 if (atomic_dec_and_test(&thread_ack)) 417 if (atomic_dec_and_test(&smdata->thread_ack))
62 set_state(state + 1); 418 set_state(smdata, smdata->state + 1);
63} 419}
64 420
65/* This is the actual function which stops the CPU. It runs 421/* This is the cpu_stop function which stops the CPU. */
66 * in the context of a dedicated stopmachine workqueue. */ 422static int stop_machine_cpu_stop(void *data)
67static void stop_cpu(struct work_struct *unused)
68{ 423{
424 struct stop_machine_data *smdata = data;
69 enum stopmachine_state curstate = STOPMACHINE_NONE; 425 enum stopmachine_state curstate = STOPMACHINE_NONE;
70 struct stop_machine_data *smdata = &idle; 426 int cpu = smp_processor_id(), err = 0;
71 int cpu = smp_processor_id(); 427 bool is_active;
72 int err; 428
429 if (!smdata->active_cpus)
430 is_active = cpu == cpumask_first(cpu_online_mask);
431 else
432 is_active = cpumask_test_cpu(cpu, smdata->active_cpus);
73 433
74 if (!active_cpus) {
75 if (cpu == cpumask_first(cpu_online_mask))
76 smdata = &active;
77 } else {
78 if (cpumask_test_cpu(cpu, active_cpus))
79 smdata = &active;
80 }
81 /* Simple state machine */ 434 /* Simple state machine */
82 do { 435 do {
83 /* Chill out and ensure we re-read stopmachine_state. */ 436 /* Chill out and ensure we re-read stopmachine_state. */
84 cpu_relax(); 437 cpu_relax();
85 if (state != curstate) { 438 if (smdata->state != curstate) {
86 curstate = state; 439 curstate = smdata->state;
87 switch (curstate) { 440 switch (curstate) {
88 case STOPMACHINE_DISABLE_IRQ: 441 case STOPMACHINE_DISABLE_IRQ:
89 local_irq_disable(); 442 local_irq_disable();
90 hard_irq_disable(); 443 hard_irq_disable();
91 break; 444 break;
92 case STOPMACHINE_RUN: 445 case STOPMACHINE_RUN:
93 /* On multiple CPUs only a single error code 446 if (is_active)
94 * is needed to tell that something failed. */ 447 err = smdata->fn(smdata->data);
95 err = smdata->fn(smdata->data);
96 if (err)
97 smdata->fnret = err;
98 break; 448 break;
99 default: 449 default:
100 break; 450 break;
101 } 451 }
102 ack_state(); 452 ack_state(smdata);
103 } 453 }
104 } while (curstate != STOPMACHINE_EXIT); 454 } while (curstate != STOPMACHINE_EXIT);
105 455
106 local_irq_enable(); 456 local_irq_enable();
457 return err;
107} 458}
108 459
109/* Callback for CPUs which aren't supposed to do anything. */
110static int chill(void *unused)
111{
112 return 0;
113}
114
115int stop_machine_create(void)
116{
117 mutex_lock(&setup_lock);
118 if (refcount)
119 goto done;
120 stop_machine_wq = create_rt_workqueue("kstop");
121 if (!stop_machine_wq)
122 goto err_out;
123 stop_machine_work = alloc_percpu(struct work_struct);
124 if (!stop_machine_work)
125 goto err_out;
126done:
127 refcount++;
128 mutex_unlock(&setup_lock);
129 return 0;
130
131err_out:
132 if (stop_machine_wq)
133 destroy_workqueue(stop_machine_wq);
134 mutex_unlock(&setup_lock);
135 return -ENOMEM;
136}
137EXPORT_SYMBOL_GPL(stop_machine_create);
138
139void stop_machine_destroy(void)
140{
141 mutex_lock(&setup_lock);
142 refcount--;
143 if (refcount)
144 goto done;
145 destroy_workqueue(stop_machine_wq);
146 free_percpu(stop_machine_work);
147done:
148 mutex_unlock(&setup_lock);
149}
150EXPORT_SYMBOL_GPL(stop_machine_destroy);
151
152int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 460int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
153{ 461{
154 struct work_struct *sm_work; 462 struct stop_machine_data smdata = { .fn = fn, .data = data,
155 int i, ret; 463 .num_threads = num_online_cpus(),
156 464 .active_cpus = cpus };
157 /* Set up initial state. */ 465
158 mutex_lock(&lock); 466 /* Set the initial state and stop all online cpus. */
159 num_threads = num_online_cpus(); 467 set_state(&smdata, STOPMACHINE_PREPARE);
160 active_cpus = cpus; 468 return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata);
161 active.fn = fn;
162 active.data = data;
163 active.fnret = 0;
164 idle.fn = chill;
165 idle.data = NULL;
166
167 set_state(STOPMACHINE_PREPARE);
168
169 /* Schedule the stop_cpu work on all cpus: hold this CPU so one
170 * doesn't hit this CPU until we're ready. */
171 get_cpu();
172 for_each_online_cpu(i) {
173 sm_work = per_cpu_ptr(stop_machine_work, i);
174 INIT_WORK(sm_work, stop_cpu);
175 queue_work_on(i, stop_machine_wq, sm_work);
176 }
177 /* This will release the thread on our CPU. */
178 put_cpu();
179 flush_workqueue(stop_machine_wq);
180 ret = active.fnret;
181 mutex_unlock(&lock);
182 return ret;
183} 469}
184 470
185int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 471int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
186{ 472{
187 int ret; 473 int ret;
188 474
189 ret = stop_machine_create();
190 if (ret)
191 return ret;
192 /* No CPUs can come up or down during this. */ 475 /* No CPUs can come up or down during this. */
193 get_online_cpus(); 476 get_online_cpus();
194 ret = __stop_machine(fn, data, cpus); 477 ret = __stop_machine(fn, data, cpus);
195 put_online_cpus(); 478 put_online_cpus();
196 stop_machine_destroy();
197 return ret; 479 return ret;
198} 480}
199EXPORT_SYMBOL_GPL(stop_machine); 481EXPORT_SYMBOL_GPL(stop_machine);
482
483#endif /* CONFIG_STOP_MACHINE */
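
After this rewrite stop_machine() is a thin wrapper: it pins CPU hotplug and pushes the state machine above through stop_cpus(), so @fn runs with every online CPU captured and interrupts disabled. A minimal sketch of the classic use, updating data that must never be seen half-written (the patching helpers are hypothetical; passing NULL for @cpus makes the first online CPU run @fn, as the is_active logic above shows):

#include <linux/stop_machine.h>

struct word_patch {
	unsigned long *addr;
	unsigned long val;
};

/* Runs with all online CPUs spinning in stop_machine_cpu_stop(), IRQs off.
 * Must not sleep or take sleeping locks. */
static int apply_word_patch(void *arg)
{
	struct word_patch *wp = arg;

	*wp->addr = wp->val;
	return 0;
}

static int patch_word(unsigned long *addr, unsigned long val)
{
	struct word_patch wp = { .addr = addr, .val = val };

	return stop_machine(apply_word_patch, &wp, NULL);
}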
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f992762d7f51..1d7b9bc1c034 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -150,14 +150,32 @@ static void tick_nohz_update_jiffies(ktime_t now)
150 touch_softlockup_watchdog(); 150 touch_softlockup_watchdog();
151} 151}
152 152
153/*
154 * Updates the per cpu time idle statistics counters
155 */
156static void
157update_ts_time_stats(struct tick_sched *ts, ktime_t now, u64 *last_update_time)
158{
159 ktime_t delta;
160
161 if (ts->idle_active) {
162 delta = ktime_sub(now, ts->idle_entrytime);
163 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
164 if (nr_iowait_cpu() > 0)
165 ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
166 ts->idle_entrytime = now;
167 }
168
169 if (last_update_time)
170 *last_update_time = ktime_to_us(now);
171
172}
173
153static void tick_nohz_stop_idle(int cpu, ktime_t now) 174static void tick_nohz_stop_idle(int cpu, ktime_t now)
154{ 175{
155 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 176 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
156 ktime_t delta;
157 177
158 delta = ktime_sub(now, ts->idle_entrytime); 178 update_ts_time_stats(ts, now, NULL);
159 ts->idle_lastupdate = now;
160 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
161 ts->idle_active = 0; 179 ts->idle_active = 0;
162 180
163 sched_clock_idle_wakeup_event(0); 181 sched_clock_idle_wakeup_event(0);
@@ -165,20 +183,32 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now)
165 183
166static ktime_t tick_nohz_start_idle(struct tick_sched *ts) 184static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
167{ 185{
168 ktime_t now, delta; 186 ktime_t now;
169 187
170 now = ktime_get(); 188 now = ktime_get();
171 if (ts->idle_active) { 189
172 delta = ktime_sub(now, ts->idle_entrytime); 190 update_ts_time_stats(ts, now, NULL);
173 ts->idle_lastupdate = now; 191
174 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
175 }
176 ts->idle_entrytime = now; 192 ts->idle_entrytime = now;
177 ts->idle_active = 1; 193 ts->idle_active = 1;
178 sched_clock_idle_sleep_event(); 194 sched_clock_idle_sleep_event();
179 return now; 195 return now;
180} 196}
181 197
198/**
199 * get_cpu_idle_time_us - get the total idle time of a cpu
200 * @cpu: CPU number to query
201 * @last_update_time: variable to store update time in
202 *
203 * Return the cumulative idle time (since boot) for a given
204 * CPU, in microseconds. The idle time returned includes
205 * the iowait time (unlike what "top" and co report).
206 *
207 * This time is measured via accounting rather than sampling,
208 * and is as accurate as ktime_get() is.
209 *
210 * This function returns -1 if NOHZ is not enabled.
211 */
182u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) 212u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
183{ 213{
184 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 214 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
@@ -186,15 +216,38 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
186 if (!tick_nohz_enabled) 216 if (!tick_nohz_enabled)
187 return -1; 217 return -1;
188 218
189 if (ts->idle_active) 219 update_ts_time_stats(ts, ktime_get(), last_update_time);
190 *last_update_time = ktime_to_us(ts->idle_lastupdate);
191 else
192 *last_update_time = ktime_to_us(ktime_get());
193 220
194 return ktime_to_us(ts->idle_sleeptime); 221 return ktime_to_us(ts->idle_sleeptime);
195} 222}
196EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); 223EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
197 224
225/*
226 * get_cpu_iowait_time_us - get the total iowait time of a cpu
227 * @cpu: CPU number to query
228 * @last_update_time: variable to store update time in
229 *
230 * Return the cumulative iowait time (since boot) for a given
231 * CPU, in microseconds.
232 *
233 * This time is measured via accounting rather than sampling,
234 * and is as accurate as ktime_get() is.
235 *
236 * This function returns -1 if NOHZ is not enabled.
237 */
238u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
239{
240 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
241
242 if (!tick_nohz_enabled)
243 return -1;
244
245 update_ts_time_stats(ts, ktime_get(), last_update_time);
246
247 return ktime_to_us(ts->iowait_sleeptime);
248}
249EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
250
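
Both helpers return running totals in microseconds (or -1 when NOHZ is off), so a consumer such as a cpufreq governor would normally keep the previous sample and work with deltas; note that the idle total already includes iowait. A sketch under those assumptions (the sampling structure and helper are hypothetical):

struct cpu_idle_sample {
	u64 wall_us;
	u64 idle_us;
};

/* Busy microseconds on @cpu since @prev; updates @prev. 0 if NOHZ is off. */
static u64 busy_time_since(int cpu, struct cpu_idle_sample *prev)
{
	u64 wall, idle, busy;

	idle = get_cpu_idle_time_us(cpu, &wall);
	if (idle == (u64)-1)
		return 0;

	busy = (wall - prev->wall_us) - (idle - prev->idle_us);
	prev->wall_us = wall;
	prev->idle_us = idle;
	return busy;
}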
198/** 251/**
199 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task 252 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
200 * 253 *
@@ -262,6 +315,9 @@ void tick_nohz_stop_sched_tick(int inidle)
262 goto end; 315 goto end;
263 } 316 }
264 317
318 if (nohz_ratelimit(cpu))
319 goto end;
320
265 ts->idle_calls++; 321 ts->idle_calls++;
266 /* Read jiffies and the time when jiffies were updated last */ 322 /* Read jiffies and the time when jiffies were updated last */
267 do { 323 do {
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 1a4a7dd78777..ab8f5e33fa92 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -176,6 +176,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
176 P_ns(idle_waketime); 176 P_ns(idle_waketime);
177 P_ns(idle_exittime); 177 P_ns(idle_exittime);
178 P_ns(idle_sleeptime); 178 P_ns(idle_sleeptime);
179 P_ns(iowait_sleeptime);
179 P(last_jiffies); 180 P(last_jiffies);
180 P(next_jiffies); 181 P(next_jiffies);
181 P_ns(idle_expires); 182 P_ns(idle_expires);
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b3bc91a3f510..36ea2b65dcdc 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -675,28 +675,33 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
675 } 675 }
676} 676}
677 677
678static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq) 678static void blk_add_trace_rq_abort(void *ignore,
679 struct request_queue *q, struct request *rq)
679{ 680{
680 blk_add_trace_rq(q, rq, BLK_TA_ABORT); 681 blk_add_trace_rq(q, rq, BLK_TA_ABORT);
681} 682}
682 683
683static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq) 684static void blk_add_trace_rq_insert(void *ignore,
685 struct request_queue *q, struct request *rq)
684{ 686{
685 blk_add_trace_rq(q, rq, BLK_TA_INSERT); 687 blk_add_trace_rq(q, rq, BLK_TA_INSERT);
686} 688}
687 689
688static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq) 690static void blk_add_trace_rq_issue(void *ignore,
691 struct request_queue *q, struct request *rq)
689{ 692{
690 blk_add_trace_rq(q, rq, BLK_TA_ISSUE); 693 blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
691} 694}
692 695
693static void blk_add_trace_rq_requeue(struct request_queue *q, 696static void blk_add_trace_rq_requeue(void *ignore,
697 struct request_queue *q,
694 struct request *rq) 698 struct request *rq)
695{ 699{
696 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); 700 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
697} 701}
698 702
699static void blk_add_trace_rq_complete(struct request_queue *q, 703static void blk_add_trace_rq_complete(void *ignore,
704 struct request_queue *q,
700 struct request *rq) 705 struct request *rq)
701{ 706{
702 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); 707 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
@@ -724,34 +729,40 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
724 !bio_flagged(bio, BIO_UPTODATE), 0, NULL); 729 !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
725} 730}
726 731
727static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio) 732static void blk_add_trace_bio_bounce(void *ignore,
733 struct request_queue *q, struct bio *bio)
728{ 734{
729 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); 735 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE);
730} 736}
731 737
732static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio) 738static void blk_add_trace_bio_complete(void *ignore,
739 struct request_queue *q, struct bio *bio)
733{ 740{
734 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); 741 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
735} 742}
736 743
737static void blk_add_trace_bio_backmerge(struct request_queue *q, 744static void blk_add_trace_bio_backmerge(void *ignore,
745 struct request_queue *q,
738 struct bio *bio) 746 struct bio *bio)
739{ 747{
740 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); 748 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
741} 749}
742 750
743static void blk_add_trace_bio_frontmerge(struct request_queue *q, 751static void blk_add_trace_bio_frontmerge(void *ignore,
752 struct request_queue *q,
744 struct bio *bio) 753 struct bio *bio)
745{ 754{
746 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); 755 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
747} 756}
748 757
749static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio) 758static void blk_add_trace_bio_queue(void *ignore,
759 struct request_queue *q, struct bio *bio)
750{ 760{
751 blk_add_trace_bio(q, bio, BLK_TA_QUEUE); 761 blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
752} 762}
753 763
754static void blk_add_trace_getrq(struct request_queue *q, 764static void blk_add_trace_getrq(void *ignore,
765 struct request_queue *q,
755 struct bio *bio, int rw) 766 struct bio *bio, int rw)
756{ 767{
757 if (bio) 768 if (bio)
@@ -765,7 +776,8 @@ static void blk_add_trace_getrq(struct request_queue *q,
765} 776}
766 777
767 778
768static void blk_add_trace_sleeprq(struct request_queue *q, 779static void blk_add_trace_sleeprq(void *ignore,
780 struct request_queue *q,
769 struct bio *bio, int rw) 781 struct bio *bio, int rw)
770{ 782{
771 if (bio) 783 if (bio)
@@ -779,7 +791,7 @@ static void blk_add_trace_sleeprq(struct request_queue *q,
779 } 791 }
780} 792}
781 793
782static void blk_add_trace_plug(struct request_queue *q) 794static void blk_add_trace_plug(void *ignore, struct request_queue *q)
783{ 795{
784 struct blk_trace *bt = q->blk_trace; 796 struct blk_trace *bt = q->blk_trace;
785 797
@@ -787,7 +799,7 @@ static void blk_add_trace_plug(struct request_queue *q)
787 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); 799 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
788} 800}
789 801
790static void blk_add_trace_unplug_io(struct request_queue *q) 802static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q)
791{ 803{
792 struct blk_trace *bt = q->blk_trace; 804 struct blk_trace *bt = q->blk_trace;
793 805
@@ -800,7 +812,7 @@ static void blk_add_trace_unplug_io(struct request_queue *q)
800 } 812 }
801} 813}
802 814
803static void blk_add_trace_unplug_timer(struct request_queue *q) 815static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q)
804{ 816{
805 struct blk_trace *bt = q->blk_trace; 817 struct blk_trace *bt = q->blk_trace;
806 818
@@ -813,7 +825,8 @@ static void blk_add_trace_unplug_timer(struct request_queue *q)
813 } 825 }
814} 826}
815 827
816static void blk_add_trace_split(struct request_queue *q, struct bio *bio, 828static void blk_add_trace_split(void *ignore,
829 struct request_queue *q, struct bio *bio,
817 unsigned int pdu) 830 unsigned int pdu)
818{ 831{
819 struct blk_trace *bt = q->blk_trace; 832 struct blk_trace *bt = q->blk_trace;
@@ -839,8 +852,9 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
839 * it spans a stripe (or similar). Add a trace for that action. 852 * it spans a stripe (or similar). Add a trace for that action.
840 * 853 *
841 **/ 854 **/
842static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, 855static void blk_add_trace_remap(void *ignore,
843 dev_t dev, sector_t from) 856 struct request_queue *q, struct bio *bio,
857 dev_t dev, sector_t from)
844{ 858{
845 struct blk_trace *bt = q->blk_trace; 859 struct blk_trace *bt = q->blk_trace;
846 struct blk_io_trace_remap r; 860 struct blk_io_trace_remap r;
@@ -869,7 +883,8 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
869 * Add a trace for that action. 883 * Add a trace for that action.
870 * 884 *
871 **/ 885 **/
872static void blk_add_trace_rq_remap(struct request_queue *q, 886static void blk_add_trace_rq_remap(void *ignore,
887 struct request_queue *q,
873 struct request *rq, dev_t dev, 888 struct request *rq, dev_t dev,
874 sector_t from) 889 sector_t from)
875{ 890{
@@ -921,64 +936,64 @@ static void blk_register_tracepoints(void)
921{ 936{
922 int ret; 937 int ret;
923 938
924 ret = register_trace_block_rq_abort(blk_add_trace_rq_abort); 939 ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
925 WARN_ON(ret); 940 WARN_ON(ret);
926 ret = register_trace_block_rq_insert(blk_add_trace_rq_insert); 941 ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
927 WARN_ON(ret); 942 WARN_ON(ret);
928 ret = register_trace_block_rq_issue(blk_add_trace_rq_issue); 943 ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
929 WARN_ON(ret); 944 WARN_ON(ret);
930 ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue); 945 ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
931 WARN_ON(ret); 946 WARN_ON(ret);
932 ret = register_trace_block_rq_complete(blk_add_trace_rq_complete); 947 ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
933 WARN_ON(ret); 948 WARN_ON(ret);
934 ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce); 949 ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
935 WARN_ON(ret); 950 WARN_ON(ret);
936 ret = register_trace_block_bio_complete(blk_add_trace_bio_complete); 951 ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
937 WARN_ON(ret); 952 WARN_ON(ret);
938 ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); 953 ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
939 WARN_ON(ret); 954 WARN_ON(ret);
940 ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); 955 ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
941 WARN_ON(ret); 956 WARN_ON(ret);
942 ret = register_trace_block_bio_queue(blk_add_trace_bio_queue); 957 ret = register_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
943 WARN_ON(ret); 958 WARN_ON(ret);
944 ret = register_trace_block_getrq(blk_add_trace_getrq); 959 ret = register_trace_block_getrq(blk_add_trace_getrq, NULL);
945 WARN_ON(ret); 960 WARN_ON(ret);
946 ret = register_trace_block_sleeprq(blk_add_trace_sleeprq); 961 ret = register_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
947 WARN_ON(ret); 962 WARN_ON(ret);
948 ret = register_trace_block_plug(blk_add_trace_plug); 963 ret = register_trace_block_plug(blk_add_trace_plug, NULL);
949 WARN_ON(ret); 964 WARN_ON(ret);
950 ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer); 965 ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
951 WARN_ON(ret); 966 WARN_ON(ret);
952 ret = register_trace_block_unplug_io(blk_add_trace_unplug_io); 967 ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
953 WARN_ON(ret); 968 WARN_ON(ret);
954 ret = register_trace_block_split(blk_add_trace_split); 969 ret = register_trace_block_split(blk_add_trace_split, NULL);
955 WARN_ON(ret); 970 WARN_ON(ret);
956 ret = register_trace_block_remap(blk_add_trace_remap); 971 ret = register_trace_block_remap(blk_add_trace_remap, NULL);
957 WARN_ON(ret); 972 WARN_ON(ret);
958 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap); 973 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
959 WARN_ON(ret); 974 WARN_ON(ret);
960} 975}
961 976
962static void blk_unregister_tracepoints(void) 977static void blk_unregister_tracepoints(void)
963{ 978{
964 unregister_trace_block_rq_remap(blk_add_trace_rq_remap); 979 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
965 unregister_trace_block_remap(blk_add_trace_remap); 980 unregister_trace_block_remap(blk_add_trace_remap, NULL);
966 unregister_trace_block_split(blk_add_trace_split); 981 unregister_trace_block_split(blk_add_trace_split, NULL);
967 unregister_trace_block_unplug_io(blk_add_trace_unplug_io); 982 unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
968 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer); 983 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
969 unregister_trace_block_plug(blk_add_trace_plug); 984 unregister_trace_block_plug(blk_add_trace_plug, NULL);
970 unregister_trace_block_sleeprq(blk_add_trace_sleeprq); 985 unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
971 unregister_trace_block_getrq(blk_add_trace_getrq); 986 unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
972 unregister_trace_block_bio_queue(blk_add_trace_bio_queue); 987 unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
973 unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); 988 unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
974 unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); 989 unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
975 unregister_trace_block_bio_complete(blk_add_trace_bio_complete); 990 unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
976 unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce); 991 unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
977 unregister_trace_block_rq_complete(blk_add_trace_rq_complete); 992 unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
978 unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue); 993 unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
979 unregister_trace_block_rq_issue(blk_add_trace_rq_issue); 994 unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
980 unregister_trace_block_rq_insert(blk_add_trace_rq_insert); 995 unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
981 unregister_trace_block_rq_abort(blk_add_trace_rq_abort); 996 unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
982 997
983 tracepoint_synchronize_unregister(); 998 tracepoint_synchronize_unregister();
984} 999}
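
The churn above follows the tracepoint API change in this series: probes now take a private void * as their first argument, and register/unregister take the (probe, data) pair. A hedged sketch of what an out-of-tree probe might look like under the new convention (the probe body and the stats structure are hypothetical; block_rq_insert is one of the tracepoints touched above):

#include <trace/events/block.h>	/* block tracepoint declarations */

struct insert_stats {
	atomic_t inserts;
};

/* New-style probe: the pointer passed at registration time arrives first. */
static void count_rq_insert(void *data, struct request_queue *q,
			    struct request *rq)
{
	struct insert_stats *stats = data;

	atomic_inc(&stats->inserts);
}

static struct insert_stats insert_stats;

static int insert_stats_init(void)
{
	return register_trace_block_rq_insert(count_rq_insert, &insert_stats);
}

static void insert_stats_exit(void)
{
	unregister_trace_block_rq_insert(count_rq_insert, &insert_stats);
	tracepoint_synchronize_unregister();
}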
@@ -1321,7 +1336,7 @@ out:
1321} 1336}
1322 1337
1323static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, 1338static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
1324 int flags) 1339 int flags, struct trace_event *event)
1325{ 1340{
1326 return print_one_line(iter, false); 1341 return print_one_line(iter, false);
1327} 1342}
@@ -1343,7 +1358,8 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
1343} 1358}
1344 1359
1345static enum print_line_t 1360static enum print_line_t
1346blk_trace_event_print_binary(struct trace_iterator *iter, int flags) 1361blk_trace_event_print_binary(struct trace_iterator *iter, int flags,
1362 struct trace_event *event)
1347{ 1363{
1348 return blk_trace_synthesize_old_trace(iter) ? 1364 return blk_trace_synthesize_old_trace(iter) ?
1349 TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 1365 TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
@@ -1381,12 +1397,16 @@ static struct tracer blk_tracer __read_mostly = {
1381 .set_flag = blk_tracer_set_flag, 1397 .set_flag = blk_tracer_set_flag,
1382}; 1398};
1383 1399
1384static struct trace_event trace_blk_event = { 1400static struct trace_event_functions trace_blk_event_funcs = {
1385 .type = TRACE_BLK,
1386 .trace = blk_trace_event_print, 1401 .trace = blk_trace_event_print,
1387 .binary = blk_trace_event_print_binary, 1402 .binary = blk_trace_event_print_binary,
1388}; 1403};
1389 1404
1405static struct trace_event trace_blk_event = {
1406 .type = TRACE_BLK,
1407 .funcs = &trace_blk_event_funcs,
1408};
1409
1390static int __init init_blk_tracer(void) 1410static int __init init_blk_tracer(void)
1391{ 1411{
1392 if (!register_ftrace_event(&trace_blk_event)) { 1412 if (!register_ftrace_event(&trace_blk_event)) {
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2404b59b3097..6d2cb14f9449 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -264,6 +264,7 @@ struct ftrace_profile {
264 unsigned long counter; 264 unsigned long counter;
265#ifdef CONFIG_FUNCTION_GRAPH_TRACER 265#ifdef CONFIG_FUNCTION_GRAPH_TRACER
266 unsigned long long time; 266 unsigned long long time;
267 unsigned long long time_squared;
267#endif 268#endif
268}; 269};
269 270
@@ -366,9 +367,9 @@ static int function_stat_headers(struct seq_file *m)
366{ 367{
367#ifdef CONFIG_FUNCTION_GRAPH_TRACER 368#ifdef CONFIG_FUNCTION_GRAPH_TRACER
368 seq_printf(m, " Function " 369 seq_printf(m, " Function "
369 "Hit Time Avg\n" 370 "Hit Time Avg s^2\n"
370 " -------- " 371 " -------- "
371 "--- ---- ---\n"); 372 "--- ---- --- ---\n");
372#else 373#else
373 seq_printf(m, " Function Hit\n" 374 seq_printf(m, " Function Hit\n"
374 " -------- ---\n"); 375 " -------- ---\n");
@@ -384,6 +385,7 @@ static int function_stat_show(struct seq_file *m, void *v)
384 static DEFINE_MUTEX(mutex); 385 static DEFINE_MUTEX(mutex);
385 static struct trace_seq s; 386 static struct trace_seq s;
386 unsigned long long avg; 387 unsigned long long avg;
388 unsigned long long stddev;
387#endif 389#endif
388 390
389 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 391 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
@@ -394,11 +396,25 @@ static int function_stat_show(struct seq_file *m, void *v)
394 avg = rec->time; 396 avg = rec->time;
395 do_div(avg, rec->counter); 397 do_div(avg, rec->counter);
396 398
399 /* Sample standard deviation (s^2) */
400 if (rec->counter <= 1)
401 stddev = 0;
402 else {
403 stddev = rec->time_squared - rec->counter * avg * avg;
404 /*
 405 * Divide only by 1000 for ns^2 -> us^2 conversion.
 406 * trace_print_graph_duration will divide by 1000 again.
407 */
408 do_div(stddev, (rec->counter - 1) * 1000);
409 }
410
397 mutex_lock(&mutex); 411 mutex_lock(&mutex);
398 trace_seq_init(&s); 412 trace_seq_init(&s);
399 trace_print_graph_duration(rec->time, &s); 413 trace_print_graph_duration(rec->time, &s);
400 trace_seq_puts(&s, " "); 414 trace_seq_puts(&s, " ");
401 trace_print_graph_duration(avg, &s); 415 trace_print_graph_duration(avg, &s);
416 trace_seq_puts(&s, " ");
417 trace_print_graph_duration(stddev, &s);
402 trace_print_seq(m, &s); 418 trace_print_seq(m, &s);
403 mutex_unlock(&mutex); 419 mutex_unlock(&mutex);
404#endif 420#endif
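
The new time_squared field feeds the extra column added above: with n = rec->counter, \bar{t} the mean duration (avg) and rec->time_squared = \sum_i t_i^2, the hunk computes the sample variance

    s^2 = \frac{\sum_i t_i^2 - n\,\bar{t}^2}{n - 1}

and divides it by only 1000 because trace_print_graph_duration() divides by 1000 once more, so the ns^2 sums come out as the us^2 value shown in the s^2 column.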
@@ -650,6 +666,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
650 if (!stat->hash || !ftrace_profile_enabled) 666 if (!stat->hash || !ftrace_profile_enabled)
651 goto out; 667 goto out;
652 668
669 /* If the calltime was zero'd ignore it */
670 if (!trace->calltime)
671 goto out;
672
653 calltime = trace->rettime - trace->calltime; 673 calltime = trace->rettime - trace->calltime;
654 674
655 if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) { 675 if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) {
@@ -668,8 +688,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
668 } 688 }
669 689
670 rec = ftrace_find_profiled_func(stat, trace->func); 690 rec = ftrace_find_profiled_func(stat, trace->func);
671 if (rec) 691 if (rec) {
672 rec->time += calltime; 692 rec->time += calltime;
693 rec->time_squared += calltime * calltime;
694 }
673 695
674 out: 696 out:
675 local_irq_restore(flags); 697 local_irq_restore(flags);
@@ -3212,8 +3234,8 @@ free:
3212} 3234}
3213 3235
3214static void 3236static void
3215ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev, 3237ftrace_graph_probe_sched_switch(void *ignore,
3216 struct task_struct *next) 3238 struct task_struct *prev, struct task_struct *next)
3217{ 3239{
3218 unsigned long long timestamp; 3240 unsigned long long timestamp;
3219 int index; 3241 int index;
@@ -3267,7 +3289,7 @@ static int start_graph_tracing(void)
3267 } while (ret == -EAGAIN); 3289 } while (ret == -EAGAIN);
3268 3290
3269 if (!ret) { 3291 if (!ret) {
3270 ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch); 3292 ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
3271 if (ret) 3293 if (ret)
3272 pr_info("ftrace_graph: Couldn't activate tracepoint" 3294 pr_info("ftrace_graph: Couldn't activate tracepoint"
3273 " probe to kernel_sched_switch\n"); 3295 " probe to kernel_sched_switch\n");
@@ -3339,11 +3361,11 @@ void unregister_ftrace_graph(void)
3339 goto out; 3361 goto out;
3340 3362
3341 ftrace_graph_active--; 3363 ftrace_graph_active--;
3342 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
3343 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 3364 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
3344 ftrace_graph_entry = ftrace_graph_entry_stub; 3365 ftrace_graph_entry = ftrace_graph_entry_stub;
3345 ftrace_shutdown(FTRACE_STOP_FUNC_RET); 3366 ftrace_shutdown(FTRACE_STOP_FUNC_RET);
3346 unregister_pm_notifier(&ftrace_suspend_notifier); 3367 unregister_pm_notifier(&ftrace_suspend_notifier);
3368 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
3347 3369
3348 out: 3370 out:
3349 mutex_unlock(&ftrace_lock); 3371 mutex_unlock(&ftrace_lock);
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index a91da69f153a..bbfc1bb1660b 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -95,7 +95,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
95 trace_wake_up(); 95 trace_wake_up();
96} 96}
97 97
98static void kmemtrace_kmalloc(unsigned long call_site, 98static void kmemtrace_kmalloc(void *ignore,
99 unsigned long call_site,
99 const void *ptr, 100 const void *ptr,
100 size_t bytes_req, 101 size_t bytes_req,
101 size_t bytes_alloc, 102 size_t bytes_alloc,
@@ -105,7 +106,8 @@ static void kmemtrace_kmalloc(unsigned long call_site,
105 bytes_req, bytes_alloc, gfp_flags, -1); 106 bytes_req, bytes_alloc, gfp_flags, -1);
106} 107}
107 108
108static void kmemtrace_kmem_cache_alloc(unsigned long call_site, 109static void kmemtrace_kmem_cache_alloc(void *ignore,
110 unsigned long call_site,
109 const void *ptr, 111 const void *ptr,
110 size_t bytes_req, 112 size_t bytes_req,
111 size_t bytes_alloc, 113 size_t bytes_alloc,
@@ -115,7 +117,8 @@ static void kmemtrace_kmem_cache_alloc(unsigned long call_site,
115 bytes_req, bytes_alloc, gfp_flags, -1); 117 bytes_req, bytes_alloc, gfp_flags, -1);
116} 118}
117 119
118static void kmemtrace_kmalloc_node(unsigned long call_site, 120static void kmemtrace_kmalloc_node(void *ignore,
121 unsigned long call_site,
119 const void *ptr, 122 const void *ptr,
120 size_t bytes_req, 123 size_t bytes_req,
121 size_t bytes_alloc, 124 size_t bytes_alloc,
@@ -126,7 +129,8 @@ static void kmemtrace_kmalloc_node(unsigned long call_site,
126 bytes_req, bytes_alloc, gfp_flags, node); 129 bytes_req, bytes_alloc, gfp_flags, node);
127} 130}
128 131
129static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site, 132static void kmemtrace_kmem_cache_alloc_node(void *ignore,
133 unsigned long call_site,
130 const void *ptr, 134 const void *ptr,
131 size_t bytes_req, 135 size_t bytes_req,
132 size_t bytes_alloc, 136 size_t bytes_alloc,
@@ -137,12 +141,14 @@ static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site,
137 bytes_req, bytes_alloc, gfp_flags, node); 141 bytes_req, bytes_alloc, gfp_flags, node);
138} 142}
139 143
140static void kmemtrace_kfree(unsigned long call_site, const void *ptr) 144static void
145kmemtrace_kfree(void *ignore, unsigned long call_site, const void *ptr)
141{ 146{
142 kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr); 147 kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr);
143} 148}
144 149
145static void kmemtrace_kmem_cache_free(unsigned long call_site, const void *ptr) 150static void kmemtrace_kmem_cache_free(void *ignore,
151 unsigned long call_site, const void *ptr)
146{ 152{
147 kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr); 153 kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr);
148} 154}
@@ -151,34 +157,34 @@ static int kmemtrace_start_probes(void)
151{ 157{
152 int err; 158 int err;
153 159
154 err = register_trace_kmalloc(kmemtrace_kmalloc); 160 err = register_trace_kmalloc(kmemtrace_kmalloc, NULL);
155 if (err) 161 if (err)
156 return err; 162 return err;
157 err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc); 163 err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
158 if (err) 164 if (err)
159 return err; 165 return err;
160 err = register_trace_kmalloc_node(kmemtrace_kmalloc_node); 166 err = register_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
161 if (err) 167 if (err)
162 return err; 168 return err;
163 err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node); 169 err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
164 if (err) 170 if (err)
165 return err; 171 return err;
166 err = register_trace_kfree(kmemtrace_kfree); 172 err = register_trace_kfree(kmemtrace_kfree, NULL);
167 if (err) 173 if (err)
168 return err; 174 return err;
169 err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free); 175 err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
170 176
171 return err; 177 return err;
172} 178}
173 179
174static void kmemtrace_stop_probes(void) 180static void kmemtrace_stop_probes(void)
175{ 181{
176 unregister_trace_kmalloc(kmemtrace_kmalloc); 182 unregister_trace_kmalloc(kmemtrace_kmalloc, NULL);
177 unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc); 183 unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
178 unregister_trace_kmalloc_node(kmemtrace_kmalloc_node); 184 unregister_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
179 unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node); 185 unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
180 unregister_trace_kfree(kmemtrace_kfree); 186 unregister_trace_kfree(kmemtrace_kfree, NULL);
181 unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free); 187 unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
182} 188}
183 189
184static int kmem_trace_init(struct trace_array *tr) 190static int kmem_trace_init(struct trace_array *tr)
@@ -237,7 +243,8 @@ struct kmemtrace_user_event_alloc {
237}; 243};
238 244
239static enum print_line_t 245static enum print_line_t
240kmemtrace_print_alloc(struct trace_iterator *iter, int flags) 246kmemtrace_print_alloc(struct trace_iterator *iter, int flags,
247 struct trace_event *event)
241{ 248{
242 struct trace_seq *s = &iter->seq; 249 struct trace_seq *s = &iter->seq;
243 struct kmemtrace_alloc_entry *entry; 250 struct kmemtrace_alloc_entry *entry;
@@ -257,7 +264,8 @@ kmemtrace_print_alloc(struct trace_iterator *iter, int flags)
257} 264}
258 265
259static enum print_line_t 266static enum print_line_t
260kmemtrace_print_free(struct trace_iterator *iter, int flags) 267kmemtrace_print_free(struct trace_iterator *iter, int flags,
268 struct trace_event *event)
261{ 269{
262 struct trace_seq *s = &iter->seq; 270 struct trace_seq *s = &iter->seq;
263 struct kmemtrace_free_entry *entry; 271 struct kmemtrace_free_entry *entry;
@@ -275,7 +283,8 @@ kmemtrace_print_free(struct trace_iterator *iter, int flags)
275} 283}
276 284
277static enum print_line_t 285static enum print_line_t
278kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags) 286kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags,
287 struct trace_event *event)
279{ 288{
280 struct trace_seq *s = &iter->seq; 289 struct trace_seq *s = &iter->seq;
281 struct kmemtrace_alloc_entry *entry; 290 struct kmemtrace_alloc_entry *entry;
@@ -309,7 +318,8 @@ kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags)
309} 318}
310 319
311static enum print_line_t 320static enum print_line_t
312kmemtrace_print_free_user(struct trace_iterator *iter, int flags) 321kmemtrace_print_free_user(struct trace_iterator *iter, int flags,
322 struct trace_event *event)
313{ 323{
314 struct trace_seq *s = &iter->seq; 324 struct trace_seq *s = &iter->seq;
315 struct kmemtrace_free_entry *entry; 325 struct kmemtrace_free_entry *entry;
@@ -463,18 +473,26 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
463 } 473 }
464} 474}
465 475
466static struct trace_event kmem_trace_alloc = { 476static struct trace_event_functions kmem_trace_alloc_funcs = {
467 .type = TRACE_KMEM_ALLOC,
468 .trace = kmemtrace_print_alloc, 477 .trace = kmemtrace_print_alloc,
469 .binary = kmemtrace_print_alloc_user, 478 .binary = kmemtrace_print_alloc_user,
470}; 479};
471 480
472static struct trace_event kmem_trace_free = { 481static struct trace_event kmem_trace_alloc = {
473 .type = TRACE_KMEM_FREE, 482 .type = TRACE_KMEM_ALLOC,
483 .funcs = &kmem_trace_alloc_funcs,
484};
485
486static struct trace_event_functions kmem_trace_free_funcs = {
474 .trace = kmemtrace_print_free, 487 .trace = kmemtrace_print_free,
475 .binary = kmemtrace_print_free_user, 488 .binary = kmemtrace_print_free_user,
476}; 489};
477 490
491static struct trace_event kmem_trace_free = {
492 .type = TRACE_KMEM_FREE,
493 .funcs = &kmem_trace_free_funcs,
494};
495
478static struct tracer kmem_tracer __read_mostly = { 496static struct tracer kmem_tracer __read_mostly = {
479 .name = "kmemtrace", 497 .name = "kmemtrace",
480 .init = kmem_trace_init, 498 .init = kmem_trace_init,
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 41ca394feb22..7f6059c5aa94 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -319,6 +319,11 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
319#define TS_MASK ((1ULL << TS_SHIFT) - 1) 319#define TS_MASK ((1ULL << TS_SHIFT) - 1)
320#define TS_DELTA_TEST (~TS_MASK) 320#define TS_DELTA_TEST (~TS_MASK)
321 321
322/* Flag when events were overwritten */
323#define RB_MISSED_EVENTS (1 << 31)
324/* Missed count stored at end */
325#define RB_MISSED_STORED (1 << 30)
326
322struct buffer_data_page { 327struct buffer_data_page {
323 u64 time_stamp; /* page time stamp */ 328 u64 time_stamp; /* page time stamp */
324 local_t commit; /* write committed index */ 329 local_t commit; /* write committed index */
@@ -338,6 +343,7 @@ struct buffer_page {
338 local_t write; /* index for next write */ 343 local_t write; /* index for next write */
339 unsigned read; /* index for next read */ 344 unsigned read; /* index for next read */
340 local_t entries; /* entries on this page */ 345 local_t entries; /* entries on this page */
346 unsigned long real_end; /* real end of data */
341 struct buffer_data_page *page; /* Actual data page */ 347 struct buffer_data_page *page; /* Actual data page */
342}; 348};
343 349
@@ -417,6 +423,12 @@ int ring_buffer_print_page_header(struct trace_seq *s)
417 (unsigned int)sizeof(field.commit), 423 (unsigned int)sizeof(field.commit),
418 (unsigned int)is_signed_type(long)); 424 (unsigned int)is_signed_type(long));
419 425
426 ret = trace_seq_printf(s, "\tfield: int overwrite;\t"
427 "offset:%u;\tsize:%u;\tsigned:%u;\n",
428 (unsigned int)offsetof(typeof(field), commit),
429 1,
430 (unsigned int)is_signed_type(long));
431
420 ret = trace_seq_printf(s, "\tfield: char data;\t" 432 ret = trace_seq_printf(s, "\tfield: char data;\t"
421 "offset:%u;\tsize:%u;\tsigned:%u;\n", 433 "offset:%u;\tsize:%u;\tsigned:%u;\n",
422 (unsigned int)offsetof(typeof(field), data), 434 (unsigned int)offsetof(typeof(field), data),
@@ -440,6 +452,8 @@ struct ring_buffer_per_cpu {
440 struct buffer_page *tail_page; /* write to tail */ 452 struct buffer_page *tail_page; /* write to tail */
441 struct buffer_page *commit_page; /* committed pages */ 453 struct buffer_page *commit_page; /* committed pages */
442 struct buffer_page *reader_page; 454 struct buffer_page *reader_page;
455 unsigned long lost_events;
456 unsigned long last_overrun;
443 local_t commit_overrun; 457 local_t commit_overrun;
444 local_t overrun; 458 local_t overrun;
445 local_t entries; 459 local_t entries;
@@ -1762,6 +1776,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1762 kmemcheck_annotate_bitfield(event, bitfield); 1776 kmemcheck_annotate_bitfield(event, bitfield);
1763 1777
1764 /* 1778 /*
1779 * Save the original length to the meta data.
 1780 * This will be used by the reader to add the lost event
 1781 * counter.
1782 */
1783 tail_page->real_end = tail;
1784
1785 /*
1765 * If this event is bigger than the minimum size, then 1786 * If this event is bigger than the minimum size, then
1766 * we need to be careful that we don't subtract the 1787 * we need to be careful that we don't subtract the
1767 * write counter enough to allow another writer to slip 1788 * write counter enough to allow another writer to slip
@@ -1979,17 +2000,13 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1979 u64 *ts, u64 *delta) 2000 u64 *ts, u64 *delta)
1980{ 2001{
1981 struct ring_buffer_event *event; 2002 struct ring_buffer_event *event;
1982 static int once;
1983 int ret; 2003 int ret;
1984 2004
1985 if (unlikely(*delta > (1ULL << 59) && !once++)) { 2005 WARN_ONCE(*delta > (1ULL << 59),
1986 printk(KERN_WARNING "Delta way too big! %llu" 2006 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
1987 " ts=%llu write stamp = %llu\n", 2007 (unsigned long long)*delta,
1988 (unsigned long long)*delta, 2008 (unsigned long long)*ts,
1989 (unsigned long long)*ts, 2009 (unsigned long long)cpu_buffer->write_stamp);
1990 (unsigned long long)cpu_buffer->write_stamp);
1991 WARN_ON(1);
1992 }
1993 2010
1994 /* 2011 /*
 1995 * The delta is too big, we need to add a 2012
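The hunk above collapses the open-coded "warn only once" pattern (a static counter, a printk() and a separate WARN_ON(1)) into a single WARN_ONCE(), which prints the message, emits the backtrace, and latches itself after the first hit. A reduced before/after sketch, with the threshold check written out as an illustrative condition:

/* Illustrative only: the same one-shot warning written both ways. */
static void warn_big_delta_old(u64 delta)
{
	static int once;

	if (unlikely(delta > (1ULL << 59) && !once++)) {
		printk(KERN_WARNING "Delta way too big! %llu\n",
		       (unsigned long long)delta);
		WARN_ON(1);
	}
}

static void warn_big_delta_new(u64 delta)
{
	WARN_ONCE(delta > (1ULL << 59),
		  "Delta way too big! %llu\n", (unsigned long long)delta);
}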
@@ -2838,6 +2855,7 @@ static struct buffer_page *
2838rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) 2855rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2839{ 2856{
2840 struct buffer_page *reader = NULL; 2857 struct buffer_page *reader = NULL;
2858 unsigned long overwrite;
2841 unsigned long flags; 2859 unsigned long flags;
2842 int nr_loops = 0; 2860 int nr_loops = 0;
2843 int ret; 2861 int ret;
@@ -2879,6 +2897,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2879 local_set(&cpu_buffer->reader_page->write, 0); 2897 local_set(&cpu_buffer->reader_page->write, 0);
2880 local_set(&cpu_buffer->reader_page->entries, 0); 2898 local_set(&cpu_buffer->reader_page->entries, 0);
2881 local_set(&cpu_buffer->reader_page->page->commit, 0); 2899 local_set(&cpu_buffer->reader_page->page->commit, 0);
2900 cpu_buffer->reader_page->real_end = 0;
2882 2901
2883 spin: 2902 spin:
2884 /* 2903 /*
@@ -2899,6 +2918,18 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2899 rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list); 2918 rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
2900 2919
2901 /* 2920 /*
2921 * We want to make sure we read the overruns after we set up our
2922 * pointers to the next object. The writer side does a
2923 * cmpxchg to cross pages which acts as the mb on the writer
2924 * side. Note, the reader will constantly fail the swap
2925 * while the writer is updating the pointers, so this
2926 * guarantees that the overwrite recorded here is the one we
2927 * want to compare with the last_overrun.
2928 */
2929 smp_mb();
2930 overwrite = local_read(&(cpu_buffer->overrun));
2931
2932 /*
2902 * Here's the tricky part. 2933 * Here's the tricky part.
2903 * 2934 *
2904 * We need to move the pointer past the header page. 2935 * We need to move the pointer past the header page.
@@ -2929,6 +2960,11 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2929 cpu_buffer->reader_page = reader; 2960 cpu_buffer->reader_page = reader;
2930 rb_reset_reader_page(cpu_buffer); 2961 rb_reset_reader_page(cpu_buffer);
2931 2962
2963 if (overwrite != cpu_buffer->last_overrun) {
2964 cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun;
2965 cpu_buffer->last_overrun = overwrite;
2966 }
2967
2932 goto again; 2968 goto again;
2933 2969
2934 out: 2970 out:
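The overrun counter only ever grows on the writer side, so once the reader has safely swapped in a new reader page (and the smp_mb() above has ordered the reads), the difference from the value saved at the previous swap is exactly the number of events overwritten in the meantime. A worked example with made-up counter values:

/* Made-up numbers, purely to illustrate the bookkeeping above. */
static unsigned long example_lost_events(void)
{
	unsigned long last_overrun = 120;	/* saved at the previous page swap */
	unsigned long overwrite = 157;		/* overrun counter read just now   */

	/* 157 - 120 = 37 events were dropped while this page filled up */
	return overwrite - last_overrun;
}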
@@ -3005,8 +3041,14 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
3005 rb_advance_iter(iter); 3041 rb_advance_iter(iter);
3006} 3042}
3007 3043
3044static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
3045{
3046 return cpu_buffer->lost_events;
3047}
3048
3008static struct ring_buffer_event * 3049static struct ring_buffer_event *
3009rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts) 3050rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
3051 unsigned long *lost_events)
3010{ 3052{
3011 struct ring_buffer_event *event; 3053 struct ring_buffer_event *event;
3012 struct buffer_page *reader; 3054 struct buffer_page *reader;
@@ -3058,6 +3100,8 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
3058 ring_buffer_normalize_time_stamp(cpu_buffer->buffer, 3100 ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
3059 cpu_buffer->cpu, ts); 3101 cpu_buffer->cpu, ts);
3060 } 3102 }
3103 if (lost_events)
3104 *lost_events = rb_lost_events(cpu_buffer);
3061 return event; 3105 return event;
3062 3106
3063 default: 3107 default:
@@ -3168,12 +3212,14 @@ static inline int rb_ok_to_lock(void)
3168 * @buffer: The ring buffer to read 3212 * @buffer: The ring buffer to read
 3169 * @cpu: The cpu to peek at 3213 * @cpu: The cpu to peek at
3170 * @ts: The timestamp counter of this event. 3214 * @ts: The timestamp counter of this event.
 3215 * @lost_events: a variable to store the number of lost events (may be NULL)
3171 * 3216 *
3172 * This will return the event that will be read next, but does 3217 * This will return the event that will be read next, but does
3173 * not consume the data. 3218 * not consume the data.
3174 */ 3219 */
3175struct ring_buffer_event * 3220struct ring_buffer_event *
3176ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) 3221ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
3222 unsigned long *lost_events)
3177{ 3223{
3178 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; 3224 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
3179 struct ring_buffer_event *event; 3225 struct ring_buffer_event *event;
@@ -3188,7 +3234,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
3188 local_irq_save(flags); 3234 local_irq_save(flags);
3189 if (dolock) 3235 if (dolock)
3190 spin_lock(&cpu_buffer->reader_lock); 3236 spin_lock(&cpu_buffer->reader_lock);
3191 event = rb_buffer_peek(cpu_buffer, ts); 3237 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3192 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3238 if (event && event->type_len == RINGBUF_TYPE_PADDING)
3193 rb_advance_reader(cpu_buffer); 3239 rb_advance_reader(cpu_buffer);
3194 if (dolock) 3240 if (dolock)
@@ -3230,13 +3276,17 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3230/** 3276/**
3231 * ring_buffer_consume - return an event and consume it 3277 * ring_buffer_consume - return an event and consume it
3232 * @buffer: The ring buffer to get the next event from 3278 * @buffer: The ring buffer to get the next event from
3279 * @cpu: the cpu to read the buffer from
3280 * @ts: a variable to store the timestamp (may be NULL)
 3281 * @lost_events: a variable to store the number of lost events (may be NULL)
3233 * 3282 *
3234 * Returns the next event in the ring buffer, and that event is consumed. 3283 * Returns the next event in the ring buffer, and that event is consumed.
3235 * Meaning, that sequential reads will keep returning a different event, 3284 * Meaning, that sequential reads will keep returning a different event,
3236 * and eventually empty the ring buffer if the producer is slower. 3285 * and eventually empty the ring buffer if the producer is slower.
3237 */ 3286 */
3238struct ring_buffer_event * 3287struct ring_buffer_event *
3239ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) 3288ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
3289 unsigned long *lost_events)
3240{ 3290{
3241 struct ring_buffer_per_cpu *cpu_buffer; 3291 struct ring_buffer_per_cpu *cpu_buffer;
3242 struct ring_buffer_event *event = NULL; 3292 struct ring_buffer_event *event = NULL;
@@ -3257,9 +3307,11 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
3257 if (dolock) 3307 if (dolock)
3258 spin_lock(&cpu_buffer->reader_lock); 3308 spin_lock(&cpu_buffer->reader_lock);
3259 3309
3260 event = rb_buffer_peek(cpu_buffer, ts); 3310 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3261 if (event) 3311 if (event) {
3312 cpu_buffer->lost_events = 0;
3262 rb_advance_reader(cpu_buffer); 3313 rb_advance_reader(cpu_buffer);
3314 }
3263 3315
3264 if (dolock) 3316 if (dolock)
3265 spin_unlock(&cpu_buffer->reader_lock); 3317 spin_unlock(&cpu_buffer->reader_lock);
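Both read paths now report drops through the new out-parameter, and the consuming path clears the per-cpu count once an event has actually been handed out. A hedged sketch of how a consumer might use it, loosely modelled on the trace_consume()/print_trace_line() changes later in this patch; the loop and the pr_info() reporting are illustrative, not part of the patch:

/* Illustrative consumer: drain one CPU and report any drops. */
static void drain_cpu(struct ring_buffer *buffer, int cpu)
{
	struct ring_buffer_event *event;
	unsigned long lost = 0;
	u64 ts;

	while ((event = ring_buffer_consume(buffer, cpu, &ts, &lost))) {
		if (lost)
			pr_info("CPU:%d lost %lu events\n", cpu, lost);
		/* process the returned event here */
	}
}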
@@ -3276,23 +3328,30 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
3276EXPORT_SYMBOL_GPL(ring_buffer_consume); 3328EXPORT_SYMBOL_GPL(ring_buffer_consume);
3277 3329
3278/** 3330/**
3279 * ring_buffer_read_start - start a non consuming read of the buffer 3331 * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
3280 * @buffer: The ring buffer to read from 3332 * @buffer: The ring buffer to read from
3281 * @cpu: The cpu buffer to iterate over 3333 * @cpu: The cpu buffer to iterate over
3282 * 3334 *
3283 * This starts up an iteration through the buffer. It also disables 3335 * This performs the initial preparations necessary to iterate
3284 * the recording to the buffer until the reading is finished. 3336 * through the buffer. Memory is allocated, buffer recording
3285 * This prevents the reading from being corrupted. This is not 3337 * is disabled, and the iterator pointer is returned to the caller.
3286 * a consuming read, so a producer is not expected.
3287 * 3338 *
 3288 * Must be paired with ring_buffer_finish. 3339 * Disabling buffer recording prevents the reading from being
3340 * corrupted. This is not a consuming read, so a producer is not
3341 * expected.
3342 *
3343 * After a sequence of ring_buffer_read_prepare calls, the user is
3344 * expected to make at least one call to ring_buffer_prepare_sync.
3345 * Afterwards, ring_buffer_read_start is invoked to get things going
3346 * for real.
3347 *
3348 * This overall must be paired with ring_buffer_finish.
3289 */ 3349 */
3290struct ring_buffer_iter * 3350struct ring_buffer_iter *
3291ring_buffer_read_start(struct ring_buffer *buffer, int cpu) 3351ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
3292{ 3352{
3293 struct ring_buffer_per_cpu *cpu_buffer; 3353 struct ring_buffer_per_cpu *cpu_buffer;
3294 struct ring_buffer_iter *iter; 3354 struct ring_buffer_iter *iter;
3295 unsigned long flags;
3296 3355
3297 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 3356 if (!cpumask_test_cpu(cpu, buffer->cpumask))
3298 return NULL; 3357 return NULL;
@@ -3306,15 +3365,52 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
3306 iter->cpu_buffer = cpu_buffer; 3365 iter->cpu_buffer = cpu_buffer;
3307 3366
3308 atomic_inc(&cpu_buffer->record_disabled); 3367 atomic_inc(&cpu_buffer->record_disabled);
3368
3369 return iter;
3370}
3371EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
3372
3373/**
3374 * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
3375 *
3376 * All previously invoked ring_buffer_read_prepare calls to prepare
 3377 * iterators will be synchronized. Afterwards, ring_buffer_read_start
3378 * calls on those iterators are allowed.
3379 */
3380void
3381ring_buffer_read_prepare_sync(void)
3382{
3309 synchronize_sched(); 3383 synchronize_sched();
3384}
3385EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
3386
3387/**
3388 * ring_buffer_read_start - start a non consuming read of the buffer
3389 * @iter: The iterator returned by ring_buffer_read_prepare
3390 *
3391 * This finalizes the startup of an iteration through the buffer.
3392 * The iterator comes from a call to ring_buffer_read_prepare and
3393 * an intervening ring_buffer_read_prepare_sync must have been
3394 * performed.
3395 *
3396 * Must be paired with ring_buffer_finish.
3397 */
3398void
3399ring_buffer_read_start(struct ring_buffer_iter *iter)
3400{
3401 struct ring_buffer_per_cpu *cpu_buffer;
3402 unsigned long flags;
3403
3404 if (!iter)
3405 return;
3406
3407 cpu_buffer = iter->cpu_buffer;
3310 3408
3311 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3409 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3312 arch_spin_lock(&cpu_buffer->lock); 3410 arch_spin_lock(&cpu_buffer->lock);
3313 rb_iter_reset(iter); 3411 rb_iter_reset(iter);
3314 arch_spin_unlock(&cpu_buffer->lock); 3412 arch_spin_unlock(&cpu_buffer->lock);
3315 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3413 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3316
3317 return iter;
3318} 3414}
3319EXPORT_SYMBOL_GPL(ring_buffer_read_start); 3415EXPORT_SYMBOL_GPL(ring_buffer_read_start);
3320 3416
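Splitting the old ring_buffer_read_start() into prepare, prepare_sync and start means a caller that iterates over many CPUs pays for synchronize_sched() once instead of once per CPU. A sketch of the intended calling sequence, modelled on the __tracing_open() change further down in this patch; the wrapper function and the iters array are illustrative scaffolding, not part of the patch:

/* Sketch only: start non-consuming iterators on every online CPU. */
static void start_all_iters(struct ring_buffer *buffer,
			    struct ring_buffer_iter **iters)
{
	int cpu;

	/* 1) Allocate each iterator and disable recording on its buffer. */
	for_each_online_cpu(cpu)
		iters[cpu] = ring_buffer_read_prepare(buffer, cpu);

	/* 2) One synchronize_sched() covers all of the prepare calls. */
	ring_buffer_read_prepare_sync();

	/* 3) Finish the startup; the iterators may now be read from. */
	for_each_online_cpu(cpu)
		ring_buffer_read_start(iters[cpu]);
}

Teardown is unchanged: each iterator is still paired with the finish call named in the comment above.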
@@ -3408,6 +3504,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
3408 cpu_buffer->write_stamp = 0; 3504 cpu_buffer->write_stamp = 0;
3409 cpu_buffer->read_stamp = 0; 3505 cpu_buffer->read_stamp = 0;
3410 3506
3507 cpu_buffer->lost_events = 0;
3508 cpu_buffer->last_overrun = 0;
3509
3411 rb_head_page_activate(cpu_buffer); 3510 rb_head_page_activate(cpu_buffer);
3412} 3511}
3413 3512
@@ -3683,6 +3782,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3683 struct ring_buffer_event *event; 3782 struct ring_buffer_event *event;
3684 struct buffer_data_page *bpage; 3783 struct buffer_data_page *bpage;
3685 struct buffer_page *reader; 3784 struct buffer_page *reader;
3785 unsigned long missed_events;
3686 unsigned long flags; 3786 unsigned long flags;
3687 unsigned int commit; 3787 unsigned int commit;
3688 unsigned int read; 3788 unsigned int read;
@@ -3719,6 +3819,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3719 read = reader->read; 3819 read = reader->read;
3720 commit = rb_page_commit(reader); 3820 commit = rb_page_commit(reader);
3721 3821
3822 /* Check if any events were dropped */
3823 missed_events = cpu_buffer->lost_events;
3824
3722 /* 3825 /*
3723 * If this page has been partially read or 3826 * If this page has been partially read or
3724 * if len is not big enough to read the rest of the page or 3827 * if len is not big enough to read the rest of the page or
@@ -3779,9 +3882,35 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3779 local_set(&reader->entries, 0); 3882 local_set(&reader->entries, 0);
3780 reader->read = 0; 3883 reader->read = 0;
3781 *data_page = bpage; 3884 *data_page = bpage;
3885
3886 /*
3887 * Use the real_end for the data size,
3888 * This gives us a chance to store the lost events
3889 * on the page.
3890 */
3891 if (reader->real_end)
3892 local_set(&bpage->commit, reader->real_end);
3782 } 3893 }
3783 ret = read; 3894 ret = read;
3784 3895
3896 cpu_buffer->lost_events = 0;
3897 /*
3898 * Set a flag in the commit field if we lost events
3899 */
3900 if (missed_events) {
3901 commit = local_read(&bpage->commit);
3902
3903 /* If there is room at the end of the page to save the
3904 * missed events, then record it there.
3905 */
3906 if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) {
3907 memcpy(&bpage->data[commit], &missed_events,
3908 sizeof(missed_events));
3909 local_add(RB_MISSED_STORED, &bpage->commit);
3910 }
3911 local_add(RB_MISSED_EVENTS, &bpage->commit);
3912 }
3913
3785 out_unlock: 3914 out_unlock:
3786 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3915 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3787 3916
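When ring_buffer_read_page() hands a page out, the dropped-event count is appended directly after the committed data if there is room, and the flag bits record whether that happened. A hedged sketch of how a reader of such a page could recover the count; the helper is hypothetical and assumes the RB_MISSED_* layout defined earlier in this file:

/* Hypothetical reader-side helper; layout taken from the hunk above. */
static unsigned long page_missed_events(struct buffer_data_page *bpage)
{
	unsigned int commit = local_read(&bpage->commit);
	unsigned long missed = 0;

	if (!(commit & RB_MISSED_EVENTS))
		return 0;			/* nothing was dropped */

	if (commit & RB_MISSED_STORED) {
		/* the count sits right after the committed data */
		commit &= ~(RB_MISSED_EVENTS | RB_MISSED_STORED);
		memcpy(&missed, &bpage->data[commit], sizeof(missed));
	}

	return missed;	/* 0 here means "dropped, but no room for the count" */
}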
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index df74c7982255..302f8a614635 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -81,7 +81,7 @@ static enum event_status read_event(int cpu)
81 int *entry; 81 int *entry;
82 u64 ts; 82 u64 ts;
83 83
84 event = ring_buffer_consume(buffer, cpu, &ts); 84 event = ring_buffer_consume(buffer, cpu, &ts, NULL);
85 if (!event) 85 if (!event)
86 return EVENT_DROPPED; 86 return EVENT_DROPPED;
87 87
@@ -113,7 +113,8 @@ static enum event_status read_page(int cpu)
113 ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); 113 ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
114 if (ret >= 0) { 114 if (ret >= 0) {
115 rpage = bpage; 115 rpage = bpage;
116 commit = local_read(&rpage->commit); 116 /* The commit may have missed event flags set, clear them */
117 commit = local_read(&rpage->commit) & 0xfffff;
117 for (i = 0; i < commit && !kill_test; i += inc) { 118 for (i = 0; i < commit && !kill_test; i += inc) {
118 119
119 if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) { 120 if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
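Because the RB_MISSED_* macros are private to ring_buffer.c, the benchmark masks the raw commit value with a literal instead: 0xfffff keeps 20 bits, comfortably above BUF_PAGE_SIZE and safely below bits 30/31 where the flags live. Inside ring_buffer.c the same masking could be spelled out explicitly; for illustration only:

	/* equivalent for any page-sized buffer: keep the byte count,
	 * drop the RB_MISSED_EVENTS / RB_MISSED_STORED flag bits */
	commit = local_read(&rpage->commit) &
		 ~(RB_MISSED_EVENTS | RB_MISSED_STORED);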
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 44f916a04065..ba0ec81158b2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -117,9 +117,12 @@ static cpumask_var_t __read_mostly tracing_buffer_mask;
117 * 117 *
118 * It is default off, but you can enable it with either specifying 118 * It is default off, but you can enable it with either specifying
119 * "ftrace_dump_on_oops" in the kernel command line, or setting 119 * "ftrace_dump_on_oops" in the kernel command line, or setting
120 * /proc/sys/kernel/ftrace_dump_on_oops to true. 120 * /proc/sys/kernel/ftrace_dump_on_oops
121 * Set 1 if you want to dump buffers of all CPUs
122 * Set 2 if you want to dump the buffer of the CPU that triggered oops
121 */ 123 */
122int ftrace_dump_on_oops; 124
125enum ftrace_dump_mode ftrace_dump_on_oops;
123 126
124static int tracing_set_tracer(const char *buf); 127static int tracing_set_tracer(const char *buf);
125 128
@@ -139,8 +142,17 @@ __setup("ftrace=", set_cmdline_ftrace);
139 142
140static int __init set_ftrace_dump_on_oops(char *str) 143static int __init set_ftrace_dump_on_oops(char *str)
141{ 144{
142 ftrace_dump_on_oops = 1; 145 if (*str++ != '=' || !*str) {
143 return 1; 146 ftrace_dump_on_oops = DUMP_ALL;
147 return 1;
148 }
149
150 if (!strcmp("orig_cpu", str)) {
151 ftrace_dump_on_oops = DUMP_ORIG;
152 return 1;
153 }
154
155 return 0;
144} 156}
145__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); 157__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
146 158
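With the parser above, the boot parameter now selects a dump mode instead of acting as a plain boolean. Illustrative command-line forms and how set_ftrace_dump_on_oops() maps them (these are examples of the two accepted spellings, not additional options):

	ftrace_dump_on_oops             -> DUMP_ALL  (dump the buffers of every CPU)
	ftrace_dump_on_oops=orig_cpu    -> DUMP_ORIG (dump only the CPU that triggered the oops)
	ftrace_dump_on_oops=<anything else> is not accepted; the handler returns 0.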
@@ -1545,7 +1557,8 @@ static void trace_iterator_increment(struct trace_iterator *iter)
1545} 1557}
1546 1558
1547static struct trace_entry * 1559static struct trace_entry *
1548peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts) 1560peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
1561 unsigned long *lost_events)
1549{ 1562{
1550 struct ring_buffer_event *event; 1563 struct ring_buffer_event *event;
1551 struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; 1564 struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu];
@@ -1556,7 +1569,8 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
1556 if (buf_iter) 1569 if (buf_iter)
1557 event = ring_buffer_iter_peek(buf_iter, ts); 1570 event = ring_buffer_iter_peek(buf_iter, ts);
1558 else 1571 else
1559 event = ring_buffer_peek(iter->tr->buffer, cpu, ts); 1572 event = ring_buffer_peek(iter->tr->buffer, cpu, ts,
1573 lost_events);
1560 1574
1561 ftrace_enable_cpu(); 1575 ftrace_enable_cpu();
1562 1576
@@ -1564,10 +1578,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
1564} 1578}
1565 1579
1566static struct trace_entry * 1580static struct trace_entry *
1567__find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) 1581__find_next_entry(struct trace_iterator *iter, int *ent_cpu,
1582 unsigned long *missing_events, u64 *ent_ts)
1568{ 1583{
1569 struct ring_buffer *buffer = iter->tr->buffer; 1584 struct ring_buffer *buffer = iter->tr->buffer;
1570 struct trace_entry *ent, *next = NULL; 1585 struct trace_entry *ent, *next = NULL;
1586 unsigned long lost_events = 0, next_lost = 0;
1571 int cpu_file = iter->cpu_file; 1587 int cpu_file = iter->cpu_file;
1572 u64 next_ts = 0, ts; 1588 u64 next_ts = 0, ts;
1573 int next_cpu = -1; 1589 int next_cpu = -1;
@@ -1580,7 +1596,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1580 if (cpu_file > TRACE_PIPE_ALL_CPU) { 1596 if (cpu_file > TRACE_PIPE_ALL_CPU) {
1581 if (ring_buffer_empty_cpu(buffer, cpu_file)) 1597 if (ring_buffer_empty_cpu(buffer, cpu_file))
1582 return NULL; 1598 return NULL;
1583 ent = peek_next_entry(iter, cpu_file, ent_ts); 1599 ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events);
1584 if (ent_cpu) 1600 if (ent_cpu)
1585 *ent_cpu = cpu_file; 1601 *ent_cpu = cpu_file;
1586 1602
@@ -1592,7 +1608,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1592 if (ring_buffer_empty_cpu(buffer, cpu)) 1608 if (ring_buffer_empty_cpu(buffer, cpu))
1593 continue; 1609 continue;
1594 1610
1595 ent = peek_next_entry(iter, cpu, &ts); 1611 ent = peek_next_entry(iter, cpu, &ts, &lost_events);
1596 1612
1597 /* 1613 /*
1598 * Pick the entry with the smallest timestamp: 1614 * Pick the entry with the smallest timestamp:
@@ -1601,6 +1617,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1601 next = ent; 1617 next = ent;
1602 next_cpu = cpu; 1618 next_cpu = cpu;
1603 next_ts = ts; 1619 next_ts = ts;
1620 next_lost = lost_events;
1604 } 1621 }
1605 } 1622 }
1606 1623
@@ -1610,6 +1627,9 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1610 if (ent_ts) 1627 if (ent_ts)
1611 *ent_ts = next_ts; 1628 *ent_ts = next_ts;
1612 1629
1630 if (missing_events)
1631 *missing_events = next_lost;
1632
1613 return next; 1633 return next;
1614} 1634}
1615 1635
@@ -1617,13 +1637,14 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1617struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, 1637struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
1618 int *ent_cpu, u64 *ent_ts) 1638 int *ent_cpu, u64 *ent_ts)
1619{ 1639{
1620 return __find_next_entry(iter, ent_cpu, ent_ts); 1640 return __find_next_entry(iter, ent_cpu, NULL, ent_ts);
1621} 1641}
1622 1642
1623/* Find the next real entry, and increment the iterator to the next entry */ 1643/* Find the next real entry, and increment the iterator to the next entry */
1624static void *find_next_entry_inc(struct trace_iterator *iter) 1644static void *find_next_entry_inc(struct trace_iterator *iter)
1625{ 1645{
1626 iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts); 1646 iter->ent = __find_next_entry(iter, &iter->cpu,
1647 &iter->lost_events, &iter->ts);
1627 1648
1628 if (iter->ent) 1649 if (iter->ent)
1629 trace_iterator_increment(iter); 1650 trace_iterator_increment(iter);
@@ -1635,7 +1656,8 @@ static void trace_consume(struct trace_iterator *iter)
1635{ 1656{
1636 /* Don't allow ftrace to trace into the ring buffers */ 1657 /* Don't allow ftrace to trace into the ring buffers */
1637 ftrace_disable_cpu(); 1658 ftrace_disable_cpu();
1638 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts); 1659 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts,
1660 &iter->lost_events);
1639 ftrace_enable_cpu(); 1661 ftrace_enable_cpu();
1640} 1662}
1641 1663
@@ -1786,7 +1808,7 @@ static void print_func_help_header(struct seq_file *m)
1786} 1808}
1787 1809
1788 1810
1789static void 1811void
1790print_trace_header(struct seq_file *m, struct trace_iterator *iter) 1812print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1791{ 1813{
1792 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); 1814 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
@@ -1914,7 +1936,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
1914 } 1936 }
1915 1937
1916 if (event) 1938 if (event)
1917 return event->trace(iter, sym_flags); 1939 return event->funcs->trace(iter, sym_flags, event);
1918 1940
1919 if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) 1941 if (!trace_seq_printf(s, "Unknown type %d\n", entry->type))
1920 goto partial; 1942 goto partial;
@@ -1940,7 +1962,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
1940 1962
1941 event = ftrace_find_event(entry->type); 1963 event = ftrace_find_event(entry->type);
1942 if (event) 1964 if (event)
1943 return event->raw(iter, 0); 1965 return event->funcs->raw(iter, 0, event);
1944 1966
1945 if (!trace_seq_printf(s, "%d ?\n", entry->type)) 1967 if (!trace_seq_printf(s, "%d ?\n", entry->type))
1946 goto partial; 1968 goto partial;
@@ -1967,7 +1989,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
1967 1989
1968 event = ftrace_find_event(entry->type); 1990 event = ftrace_find_event(entry->type);
1969 if (event) { 1991 if (event) {
1970 enum print_line_t ret = event->hex(iter, 0); 1992 enum print_line_t ret = event->funcs->hex(iter, 0, event);
1971 if (ret != TRACE_TYPE_HANDLED) 1993 if (ret != TRACE_TYPE_HANDLED)
1972 return ret; 1994 return ret;
1973 } 1995 }
@@ -1992,10 +2014,11 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
1992 } 2014 }
1993 2015
1994 event = ftrace_find_event(entry->type); 2016 event = ftrace_find_event(entry->type);
1995 return event ? event->binary(iter, 0) : TRACE_TYPE_HANDLED; 2017 return event ? event->funcs->binary(iter, 0, event) :
2018 TRACE_TYPE_HANDLED;
1996} 2019}
1997 2020
1998static int trace_empty(struct trace_iterator *iter) 2021int trace_empty(struct trace_iterator *iter)
1999{ 2022{
2000 int cpu; 2023 int cpu;
2001 2024
@@ -2030,6 +2053,10 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
2030{ 2053{
2031 enum print_line_t ret; 2054 enum print_line_t ret;
2032 2055
2056 if (iter->lost_events)
2057 trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
2058 iter->cpu, iter->lost_events);
2059
2033 if (iter->trace && iter->trace->print_line) { 2060 if (iter->trace && iter->trace->print_line) {
2034 ret = iter->trace->print_line(iter); 2061 ret = iter->trace->print_line(iter);
2035 if (ret != TRACE_TYPE_UNHANDLED) 2062 if (ret != TRACE_TYPE_UNHANDLED)
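Before the normal per-entry formatting, the iterator now emits a marker whenever the ring buffer reported drops for the entry being printed. With hypothetical values, the extra output line looks like:

	CPU:1 [LOST 37 EVENTS]

followed by the first entry read after the gap.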
@@ -2058,6 +2085,23 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
2058 return print_trace_fmt(iter); 2085 return print_trace_fmt(iter);
2059} 2086}
2060 2087
2088void trace_default_header(struct seq_file *m)
2089{
2090 struct trace_iterator *iter = m->private;
2091
2092 if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
2093 /* print nothing if the buffers are empty */
2094 if (trace_empty(iter))
2095 return;
2096 print_trace_header(m, iter);
2097 if (!(trace_flags & TRACE_ITER_VERBOSE))
2098 print_lat_help_header(m);
2099 } else {
2100 if (!(trace_flags & TRACE_ITER_VERBOSE))
2101 print_func_help_header(m);
2102 }
2103}
2104
2061static int s_show(struct seq_file *m, void *v) 2105static int s_show(struct seq_file *m, void *v)
2062{ 2106{
2063 struct trace_iterator *iter = v; 2107 struct trace_iterator *iter = v;
@@ -2070,17 +2114,9 @@ static int s_show(struct seq_file *m, void *v)
2070 } 2114 }
2071 if (iter->trace && iter->trace->print_header) 2115 if (iter->trace && iter->trace->print_header)
2072 iter->trace->print_header(m); 2116 iter->trace->print_header(m);
2073 else if (iter->iter_flags & TRACE_FILE_LAT_FMT) { 2117 else
2074 /* print nothing if the buffers are empty */ 2118 trace_default_header(m);
2075 if (trace_empty(iter)) 2119
2076 return 0;
2077 print_trace_header(m, iter);
2078 if (!(trace_flags & TRACE_ITER_VERBOSE))
2079 print_lat_help_header(m);
2080 } else {
2081 if (!(trace_flags & TRACE_ITER_VERBOSE))
2082 print_func_help_header(m);
2083 }
2084 } else if (iter->leftover) { 2120 } else if (iter->leftover) {
2085 /* 2121 /*
2086 * If we filled the seq_file buffer earlier, we 2122 * If we filled the seq_file buffer earlier, we
@@ -2166,15 +2202,20 @@ __tracing_open(struct inode *inode, struct file *file)
2166 2202
2167 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { 2203 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
2168 for_each_tracing_cpu(cpu) { 2204 for_each_tracing_cpu(cpu) {
2169
2170 iter->buffer_iter[cpu] = 2205 iter->buffer_iter[cpu] =
2171 ring_buffer_read_start(iter->tr->buffer, cpu); 2206 ring_buffer_read_prepare(iter->tr->buffer, cpu);
2207 }
2208 ring_buffer_read_prepare_sync();
2209 for_each_tracing_cpu(cpu) {
2210 ring_buffer_read_start(iter->buffer_iter[cpu]);
2172 tracing_iter_reset(iter, cpu); 2211 tracing_iter_reset(iter, cpu);
2173 } 2212 }
2174 } else { 2213 } else {
2175 cpu = iter->cpu_file; 2214 cpu = iter->cpu_file;
2176 iter->buffer_iter[cpu] = 2215 iter->buffer_iter[cpu] =
2177 ring_buffer_read_start(iter->tr->buffer, cpu); 2216 ring_buffer_read_prepare(iter->tr->buffer, cpu);
2217 ring_buffer_read_prepare_sync();
2218 ring_buffer_read_start(iter->buffer_iter[cpu]);
2178 tracing_iter_reset(iter, cpu); 2219 tracing_iter_reset(iter, cpu);
2179 } 2220 }
2180 2221
@@ -4324,7 +4365,7 @@ static int trace_panic_handler(struct notifier_block *this,
4324 unsigned long event, void *unused) 4365 unsigned long event, void *unused)
4325{ 4366{
4326 if (ftrace_dump_on_oops) 4367 if (ftrace_dump_on_oops)
4327 ftrace_dump(); 4368 ftrace_dump(ftrace_dump_on_oops);
4328 return NOTIFY_OK; 4369 return NOTIFY_OK;
4329} 4370}
4330 4371
@@ -4341,7 +4382,7 @@ static int trace_die_handler(struct notifier_block *self,
4341 switch (val) { 4382 switch (val) {
4342 case DIE_OOPS: 4383 case DIE_OOPS:
4343 if (ftrace_dump_on_oops) 4384 if (ftrace_dump_on_oops)
4344 ftrace_dump(); 4385 ftrace_dump(ftrace_dump_on_oops);
4345 break; 4386 break;
4346 default: 4387 default:
4347 break; 4388 break;
@@ -4382,7 +4423,8 @@ trace_printk_seq(struct trace_seq *s)
4382 trace_seq_init(s); 4423 trace_seq_init(s);
4383} 4424}
4384 4425
4385static void __ftrace_dump(bool disable_tracing) 4426static void
4427__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4386{ 4428{
4387 static arch_spinlock_t ftrace_dump_lock = 4429 static arch_spinlock_t ftrace_dump_lock =
4388 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 4430 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
@@ -4415,12 +4457,25 @@ static void __ftrace_dump(bool disable_tracing)
4415 /* don't look at user memory in panic mode */ 4457 /* don't look at user memory in panic mode */
4416 trace_flags &= ~TRACE_ITER_SYM_USEROBJ; 4458 trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
4417 4459
4418 printk(KERN_TRACE "Dumping ftrace buffer:\n");
4419
4420 /* Simulate the iterator */ 4460 /* Simulate the iterator */
4421 iter.tr = &global_trace; 4461 iter.tr = &global_trace;
4422 iter.trace = current_trace; 4462 iter.trace = current_trace;
4423 iter.cpu_file = TRACE_PIPE_ALL_CPU; 4463
4464 switch (oops_dump_mode) {
4465 case DUMP_ALL:
4466 iter.cpu_file = TRACE_PIPE_ALL_CPU;
4467 break;
4468 case DUMP_ORIG:
4469 iter.cpu_file = raw_smp_processor_id();
4470 break;
4471 case DUMP_NONE:
4472 goto out_enable;
4473 default:
4474 printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n");
4475 iter.cpu_file = TRACE_PIPE_ALL_CPU;
4476 }
4477
4478 printk(KERN_TRACE "Dumping ftrace buffer:\n");
4424 4479
4425 /* 4480 /*
4426 * We need to stop all tracing on all CPUS to read the 4481 * We need to stop all tracing on all CPUS to read the
@@ -4459,6 +4514,7 @@ static void __ftrace_dump(bool disable_tracing)
4459 else 4514 else
4460 printk(KERN_TRACE "---------------------------------\n"); 4515 printk(KERN_TRACE "---------------------------------\n");
4461 4516
4517 out_enable:
4462 /* Re-enable tracing if requested */ 4518 /* Re-enable tracing if requested */
4463 if (!disable_tracing) { 4519 if (!disable_tracing) {
4464 trace_flags |= old_userobj; 4520 trace_flags |= old_userobj;
@@ -4475,9 +4531,9 @@ static void __ftrace_dump(bool disable_tracing)
4475} 4531}
4476 4532
4477/* By default: disable tracing after the dump */ 4533/* By default: disable tracing after the dump */
4478void ftrace_dump(void) 4534void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
4479{ 4535{
4480 __ftrace_dump(true); 4536 __ftrace_dump(true, oops_dump_mode);
4481} 4537}
4482 4538
4483__init static int tracer_alloc_buffers(void) 4539__init static int tracer_alloc_buffers(void)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 3ebdb6bd2362..2cd96399463f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -364,6 +364,9 @@ void trace_function(struct trace_array *tr,
364 unsigned long ip, 364 unsigned long ip,
365 unsigned long parent_ip, 365 unsigned long parent_ip,
366 unsigned long flags, int pc); 366 unsigned long flags, int pc);
367void trace_default_header(struct seq_file *m);
368void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
369int trace_empty(struct trace_iterator *iter);
367 370
368void trace_graph_return(struct ftrace_graph_ret *trace); 371void trace_graph_return(struct ftrace_graph_ret *trace);
369int trace_graph_entry(struct ftrace_graph_ent *trace); 372int trace_graph_entry(struct ftrace_graph_ent *trace);
@@ -402,12 +405,12 @@ void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
402void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, 405void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
403 int pc); 406 int pc);
404#else 407#else
405static inline void ftrace_trace_stack(struct trace_array *tr, 408static inline void ftrace_trace_stack(struct ring_buffer *buffer,
406 unsigned long flags, int skip, int pc) 409 unsigned long flags, int skip, int pc)
407{ 410{
408} 411}
409 412
410static inline void ftrace_trace_userstack(struct trace_array *tr, 413static inline void ftrace_trace_userstack(struct ring_buffer *buffer,
411 unsigned long flags, int pc) 414 unsigned long flags, int pc)
412{ 415{
413} 416}
@@ -475,9 +478,29 @@ extern int trace_clock_id;
475 478
476/* Standard output formatting function used for function return traces */ 479/* Standard output formatting function used for function return traces */
477#ifdef CONFIG_FUNCTION_GRAPH_TRACER 480#ifdef CONFIG_FUNCTION_GRAPH_TRACER
478extern enum print_line_t print_graph_function(struct trace_iterator *iter); 481
482/* Flag options */
483#define TRACE_GRAPH_PRINT_OVERRUN 0x1
484#define TRACE_GRAPH_PRINT_CPU 0x2
485#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
486#define TRACE_GRAPH_PRINT_PROC 0x8
487#define TRACE_GRAPH_PRINT_DURATION 0x10
488#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
489
490extern enum print_line_t
491print_graph_function_flags(struct trace_iterator *iter, u32 flags);
492extern void print_graph_headers_flags(struct seq_file *s, u32 flags);
479extern enum print_line_t 493extern enum print_line_t
480trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); 494trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
495extern void graph_trace_open(struct trace_iterator *iter);
496extern void graph_trace_close(struct trace_iterator *iter);
497extern int __trace_graph_entry(struct trace_array *tr,
498 struct ftrace_graph_ent *trace,
499 unsigned long flags, int pc);
500extern void __trace_graph_return(struct trace_array *tr,
501 struct ftrace_graph_ret *trace,
502 unsigned long flags, int pc);
503
481 504
482#ifdef CONFIG_DYNAMIC_FTRACE 505#ifdef CONFIG_DYNAMIC_FTRACE
483/* TODO: make this variable */ 506/* TODO: make this variable */
@@ -508,7 +531,7 @@ static inline int ftrace_graph_addr(unsigned long addr)
508#endif /* CONFIG_DYNAMIC_FTRACE */ 531#endif /* CONFIG_DYNAMIC_FTRACE */
509#else /* CONFIG_FUNCTION_GRAPH_TRACER */ 532#else /* CONFIG_FUNCTION_GRAPH_TRACER */
510static inline enum print_line_t 533static inline enum print_line_t
511print_graph_function(struct trace_iterator *iter) 534print_graph_function_flags(struct trace_iterator *iter, u32 flags)
512{ 535{
513 return TRACE_TYPE_UNHANDLED; 536 return TRACE_TYPE_UNHANDLED;
514} 537}
@@ -755,12 +778,15 @@ extern void print_subsystem_event_filter(struct event_subsystem *system,
755 struct trace_seq *s); 778 struct trace_seq *s);
756extern int filter_assign_type(const char *type); 779extern int filter_assign_type(const char *type);
757 780
781struct list_head *
782trace_get_fields(struct ftrace_event_call *event_call);
783
758static inline int 784static inline int
759filter_check_discard(struct ftrace_event_call *call, void *rec, 785filter_check_discard(struct ftrace_event_call *call, void *rec,
760 struct ring_buffer *buffer, 786 struct ring_buffer *buffer,
761 struct ring_buffer_event *event) 787 struct ring_buffer_event *event)
762{ 788{
763 if (unlikely(call->filter_active) && 789 if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
764 !filter_match_preds(call->filter, rec)) { 790 !filter_match_preds(call->filter, rec)) {
765 ring_buffer_discard_commit(buffer, event); 791 ring_buffer_discard_commit(buffer, event);
766 return 1; 792 return 1;
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index b9bc4d470177..8d3538b4ea5f 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -143,7 +143,7 @@ static void branch_trace_reset(struct trace_array *tr)
143} 143}
144 144
145static enum print_line_t trace_branch_print(struct trace_iterator *iter, 145static enum print_line_t trace_branch_print(struct trace_iterator *iter,
146 int flags) 146 int flags, struct trace_event *event)
147{ 147{
148 struct trace_branch *field; 148 struct trace_branch *field;
149 149
@@ -167,9 +167,13 @@ static void branch_print_header(struct seq_file *s)
167 " |\n"); 167 " |\n");
168} 168}
169 169
170static struct trace_event_functions trace_branch_funcs = {
171 .trace = trace_branch_print,
172};
173
170static struct trace_event trace_branch_event = { 174static struct trace_event trace_branch_event = {
171 .type = TRACE_BRANCH, 175 .type = TRACE_BRANCH,
172 .trace = trace_branch_print, 176 .funcs = &trace_branch_funcs,
173}; 177};
174 178
175static struct tracer branch_trace __read_mostly = 179static struct tracer branch_trace __read_mostly =
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 0565bb42566f..0a47e8d6b491 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -49,7 +49,12 @@ static int perf_trace_event_enable(struct ftrace_event_call *event)
49 rcu_assign_pointer(perf_trace_buf_nmi, buf); 49 rcu_assign_pointer(perf_trace_buf_nmi, buf);
50 } 50 }
51 51
52 ret = event->perf_event_enable(event); 52 if (event->class->reg)
53 ret = event->class->reg(event, TRACE_REG_PERF_REGISTER);
54 else
55 ret = tracepoint_probe_register(event->name,
56 event->class->perf_probe,
57 event);
53 if (!ret) { 58 if (!ret) {
54 total_ref_count++; 59 total_ref_count++;
55 return 0; 60 return 0;
@@ -75,7 +80,8 @@ int perf_trace_enable(int event_id)
75 80
76 mutex_lock(&event_mutex); 81 mutex_lock(&event_mutex);
77 list_for_each_entry(event, &ftrace_events, list) { 82 list_for_each_entry(event, &ftrace_events, list) {
78 if (event->id == event_id && event->perf_event_enable && 83 if (event->event.type == event_id &&
84 event->class && event->class->perf_probe &&
79 try_module_get(event->mod)) { 85 try_module_get(event->mod)) {
80 ret = perf_trace_event_enable(event); 86 ret = perf_trace_event_enable(event);
81 break; 87 break;
@@ -93,7 +99,10 @@ static void perf_trace_event_disable(struct ftrace_event_call *event)
93 if (--event->perf_refcount > 0) 99 if (--event->perf_refcount > 0)
94 return; 100 return;
95 101
96 event->perf_event_disable(event); 102 if (event->class->reg)
103 event->class->reg(event, TRACE_REG_PERF_UNREGISTER);
104 else
105 tracepoint_probe_unregister(event->name, event->class->perf_probe, event);
97 106
98 if (!--total_ref_count) { 107 if (!--total_ref_count) {
99 buf = perf_trace_buf; 108 buf = perf_trace_buf;
@@ -119,7 +128,7 @@ void perf_trace_disable(int event_id)
119 128
120 mutex_lock(&event_mutex); 129 mutex_lock(&event_mutex);
121 list_for_each_entry(event, &ftrace_events, list) { 130 list_for_each_entry(event, &ftrace_events, list) {
122 if (event->id == event_id) { 131 if (event->event.type == event_id) {
123 perf_trace_event_disable(event); 132 perf_trace_event_disable(event);
124 module_put(event->mod); 133 module_put(event->mod);
125 break; 134 break;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index c697c7043349..53cffc0b0801 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -29,11 +29,23 @@ DEFINE_MUTEX(event_mutex);
29 29
30LIST_HEAD(ftrace_events); 30LIST_HEAD(ftrace_events);
31 31
32struct list_head *
33trace_get_fields(struct ftrace_event_call *event_call)
34{
35 if (!event_call->class->get_fields)
36 return &event_call->class->fields;
37 return event_call->class->get_fields(event_call);
38}
39
32int trace_define_field(struct ftrace_event_call *call, const char *type, 40int trace_define_field(struct ftrace_event_call *call, const char *type,
33 const char *name, int offset, int size, int is_signed, 41 const char *name, int offset, int size, int is_signed,
34 int filter_type) 42 int filter_type)
35{ 43{
36 struct ftrace_event_field *field; 44 struct ftrace_event_field *field;
45 struct list_head *head;
46
47 if (WARN_ON(!call->class))
48 return 0;
37 49
38 field = kzalloc(sizeof(*field), GFP_KERNEL); 50 field = kzalloc(sizeof(*field), GFP_KERNEL);
39 if (!field) 51 if (!field)
@@ -56,7 +68,8 @@ int trace_define_field(struct ftrace_event_call *call, const char *type,
56 field->size = size; 68 field->size = size;
57 field->is_signed = is_signed; 69 field->is_signed = is_signed;
58 70
59 list_add(&field->link, &call->fields); 71 head = trace_get_fields(call);
72 list_add(&field->link, head);
60 73
61 return 0; 74 return 0;
62 75
@@ -94,8 +107,10 @@ static int trace_define_common_fields(struct ftrace_event_call *call)
94void trace_destroy_fields(struct ftrace_event_call *call) 107void trace_destroy_fields(struct ftrace_event_call *call)
95{ 108{
96 struct ftrace_event_field *field, *next; 109 struct ftrace_event_field *field, *next;
110 struct list_head *head;
97 111
98 list_for_each_entry_safe(field, next, &call->fields, link) { 112 head = trace_get_fields(call);
113 list_for_each_entry_safe(field, next, head, link) {
99 list_del(&field->link); 114 list_del(&field->link);
100 kfree(field->type); 115 kfree(field->type);
101 kfree(field->name); 116 kfree(field->name);
@@ -107,11 +122,9 @@ int trace_event_raw_init(struct ftrace_event_call *call)
107{ 122{
108 int id; 123 int id;
109 124
110 id = register_ftrace_event(call->event); 125 id = register_ftrace_event(&call->event);
111 if (!id) 126 if (!id)
112 return -ENODEV; 127 return -ENODEV;
113 call->id = id;
114 INIT_LIST_HEAD(&call->fields);
115 128
116 return 0; 129 return 0;
117} 130}
@@ -124,23 +137,33 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
124 137
125 switch (enable) { 138 switch (enable) {
126 case 0: 139 case 0:
127 if (call->enabled) { 140 if (call->flags & TRACE_EVENT_FL_ENABLED) {
128 call->enabled = 0; 141 call->flags &= ~TRACE_EVENT_FL_ENABLED;
129 tracing_stop_cmdline_record(); 142 tracing_stop_cmdline_record();
130 call->unregfunc(call); 143 if (call->class->reg)
144 call->class->reg(call, TRACE_REG_UNREGISTER);
145 else
146 tracepoint_probe_unregister(call->name,
147 call->class->probe,
148 call);
131 } 149 }
132 break; 150 break;
133 case 1: 151 case 1:
134 if (!call->enabled) { 152 if (!(call->flags & TRACE_EVENT_FL_ENABLED)) {
135 tracing_start_cmdline_record(); 153 tracing_start_cmdline_record();
136 ret = call->regfunc(call); 154 if (call->class->reg)
155 ret = call->class->reg(call, TRACE_REG_REGISTER);
156 else
157 ret = tracepoint_probe_register(call->name,
158 call->class->probe,
159 call);
137 if (ret) { 160 if (ret) {
138 tracing_stop_cmdline_record(); 161 tracing_stop_cmdline_record();
139 pr_info("event trace: Could not enable event " 162 pr_info("event trace: Could not enable event "
140 "%s\n", call->name); 163 "%s\n", call->name);
141 break; 164 break;
142 } 165 }
143 call->enabled = 1; 166 call->flags |= TRACE_EVENT_FL_ENABLED;
144 } 167 }
145 break; 168 break;
146 } 169 }
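Enabling an event now prefers the class-level reg() callback and only falls back to registering the class probe against the tracepoint directly; the same dispatch appears in the perf path earlier in this patch. A condensed sketch of the register side (the wrapper name is made up and error handling is trimmed):

/* Hypothetical wrapper illustrating the dispatch introduced above. */
static int event_register(struct ftrace_event_call *call)
{
	if (call->class->reg)
		return call->class->reg(call, TRACE_REG_REGISTER);

	return tracepoint_probe_register(call->name,
					 call->class->probe, call);
}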
@@ -171,15 +194,16 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
171 mutex_lock(&event_mutex); 194 mutex_lock(&event_mutex);
172 list_for_each_entry(call, &ftrace_events, list) { 195 list_for_each_entry(call, &ftrace_events, list) {
173 196
174 if (!call->name || !call->regfunc) 197 if (!call->name || !call->class ||
198 (!call->class->probe && !call->class->reg))
175 continue; 199 continue;
176 200
177 if (match && 201 if (match &&
178 strcmp(match, call->name) != 0 && 202 strcmp(match, call->name) != 0 &&
179 strcmp(match, call->system) != 0) 203 strcmp(match, call->class->system) != 0)
180 continue; 204 continue;
181 205
182 if (sub && strcmp(sub, call->system) != 0) 206 if (sub && strcmp(sub, call->class->system) != 0)
183 continue; 207 continue;
184 208
185 if (event && strcmp(event, call->name) != 0) 209 if (event && strcmp(event, call->name) != 0)
@@ -297,7 +321,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
297 * The ftrace subsystem is for showing formats only. 321 * The ftrace subsystem is for showing formats only.
298 * They can not be enabled or disabled via the event files. 322 * They can not be enabled or disabled via the event files.
299 */ 323 */
300 if (call->regfunc) 324 if (call->class && (call->class->probe || call->class->reg))
301 return call; 325 return call;
302 } 326 }
303 327
@@ -328,7 +352,7 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
328 (*pos)++; 352 (*pos)++;
329 353
330 list_for_each_entry_continue(call, &ftrace_events, list) { 354 list_for_each_entry_continue(call, &ftrace_events, list) {
331 if (call->enabled) 355 if (call->flags & TRACE_EVENT_FL_ENABLED)
332 return call; 356 return call;
333 } 357 }
334 358
@@ -355,8 +379,8 @@ static int t_show(struct seq_file *m, void *v)
355{ 379{
356 struct ftrace_event_call *call = v; 380 struct ftrace_event_call *call = v;
357 381
358 if (strcmp(call->system, TRACE_SYSTEM) != 0) 382 if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
359 seq_printf(m, "%s:", call->system); 383 seq_printf(m, "%s:", call->class->system);
360 seq_printf(m, "%s\n", call->name); 384 seq_printf(m, "%s\n", call->name);
361 385
362 return 0; 386 return 0;
@@ -387,7 +411,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
387 struct ftrace_event_call *call = filp->private_data; 411 struct ftrace_event_call *call = filp->private_data;
388 char *buf; 412 char *buf;
389 413
390 if (call->enabled) 414 if (call->flags & TRACE_EVENT_FL_ENABLED)
391 buf = "1\n"; 415 buf = "1\n";
392 else 416 else
393 buf = "0\n"; 417 buf = "0\n";
@@ -450,10 +474,11 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
450 474
451 mutex_lock(&event_mutex); 475 mutex_lock(&event_mutex);
452 list_for_each_entry(call, &ftrace_events, list) { 476 list_for_each_entry(call, &ftrace_events, list) {
453 if (!call->name || !call->regfunc) 477 if (!call->name || !call->class ||
478 (!call->class->probe && !call->class->reg))
454 continue; 479 continue;
455 480
456 if (system && strcmp(call->system, system) != 0) 481 if (system && strcmp(call->class->system, system) != 0)
457 continue; 482 continue;
458 483
459 /* 484 /*
@@ -461,7 +486,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 461 * or if all events are cleared, or if we have 486
462 * a mixture. 487 * a mixture.
463 */ 488 */
464 set |= (1 << !!call->enabled); 489 set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED));
465 490
466 /* 491 /*
467 * If we have a mixture, no need to look further. 492 * If we have a mixture, no need to look further.
@@ -525,6 +550,7 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
525{ 550{
526 struct ftrace_event_call *call = filp->private_data; 551 struct ftrace_event_call *call = filp->private_data;
527 struct ftrace_event_field *field; 552 struct ftrace_event_field *field;
553 struct list_head *head;
528 struct trace_seq *s; 554 struct trace_seq *s;
529 int common_field_count = 5; 555 int common_field_count = 5;
530 char *buf; 556 char *buf;
@@ -540,10 +566,11 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
540 trace_seq_init(s); 566 trace_seq_init(s);
541 567
542 trace_seq_printf(s, "name: %s\n", call->name); 568 trace_seq_printf(s, "name: %s\n", call->name);
543 trace_seq_printf(s, "ID: %d\n", call->id); 569 trace_seq_printf(s, "ID: %d\n", call->event.type);
544 trace_seq_printf(s, "format:\n"); 570 trace_seq_printf(s, "format:\n");
545 571
546 list_for_each_entry_reverse(field, &call->fields, link) { 572 head = trace_get_fields(call);
573 list_for_each_entry_reverse(field, head, link) {
547 /* 574 /*
548 * Smartly shows the array type(except dynamic array). 575 * Smartly shows the array type(except dynamic array).
549 * Normal: 576 * Normal:
@@ -613,7 +640,7 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
613 return -ENOMEM; 640 return -ENOMEM;
614 641
615 trace_seq_init(s); 642 trace_seq_init(s);
616 trace_seq_printf(s, "%d\n", call->id); 643 trace_seq_printf(s, "%d\n", call->event.type);
617 644
618 r = simple_read_from_buffer(ubuf, cnt, ppos, 645 r = simple_read_from_buffer(ubuf, cnt, ppos,
619 s->buffer, s->len); 646 s->buffer, s->len);
@@ -919,14 +946,15 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
919 const struct file_operations *filter, 946 const struct file_operations *filter,
920 const struct file_operations *format) 947 const struct file_operations *format)
921{ 948{
949 struct list_head *head;
922 int ret; 950 int ret;
923 951
924 /* 952 /*
925 * If the trace point header did not define TRACE_SYSTEM 953 * If the trace point header did not define TRACE_SYSTEM
926 * then the system would be called "TRACE_SYSTEM". 954 * then the system would be called "TRACE_SYSTEM".
927 */ 955 */
928 if (strcmp(call->system, TRACE_SYSTEM) != 0) 956 if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
929 d_events = event_subsystem_dir(call->system, d_events); 957 d_events = event_subsystem_dir(call->class->system, d_events);
930 958
931 call->dir = debugfs_create_dir(call->name, d_events); 959 call->dir = debugfs_create_dir(call->name, d_events);
932 if (!call->dir) { 960 if (!call->dir) {
@@ -935,22 +963,31 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
935 return -1; 963 return -1;
936 } 964 }
937 965
938 if (call->regfunc) 966 if (call->class->probe || call->class->reg)
939 trace_create_file("enable", 0644, call->dir, call, 967 trace_create_file("enable", 0644, call->dir, call,
940 enable); 968 enable);
941 969
942 if (call->id && call->perf_event_enable) 970#ifdef CONFIG_PERF_EVENTS
971 if (call->event.type && (call->class->perf_probe || call->class->reg))
943 trace_create_file("id", 0444, call->dir, call, 972 trace_create_file("id", 0444, call->dir, call,
944 id); 973 id);
974#endif
945 975
946 if (call->define_fields) { 976 if (call->class->define_fields) {
947 ret = trace_define_common_fields(call); 977 /*
948 if (!ret) 978 * Other events may have the same class. Only update
949 ret = call->define_fields(call); 979 * the fields if they are not already defined.
950 if (ret < 0) { 980 */
951 pr_warning("Could not initialize trace point" 981 head = trace_get_fields(call);
952 " events/%s\n", call->name); 982 if (list_empty(head)) {
953 return ret; 983 ret = trace_define_common_fields(call);
984 if (!ret)
985 ret = call->class->define_fields(call);
986 if (ret < 0) {
987 pr_warning("Could not initialize trace point"
988 " events/%s\n", call->name);
989 return ret;
990 }
954 } 991 }
955 trace_create_file("filter", 0644, call->dir, call, 992 trace_create_file("filter", 0644, call->dir, call,
956 filter); 993 filter);
@@ -970,8 +1007,8 @@ static int __trace_add_event_call(struct ftrace_event_call *call)
970 if (!call->name) 1007 if (!call->name)
971 return -EINVAL; 1008 return -EINVAL;
972 1009
973 if (call->raw_init) { 1010 if (call->class->raw_init) {
974 ret = call->raw_init(call); 1011 ret = call->class->raw_init(call);
975 if (ret < 0) { 1012 if (ret < 0) {
976 if (ret != -ENOSYS) 1013 if (ret != -ENOSYS)
977 pr_warning("Could not initialize trace " 1014 pr_warning("Could not initialize trace "
@@ -1035,13 +1072,13 @@ static void remove_subsystem_dir(const char *name)
1035static void __trace_remove_event_call(struct ftrace_event_call *call) 1072static void __trace_remove_event_call(struct ftrace_event_call *call)
1036{ 1073{
1037 ftrace_event_enable_disable(call, 0); 1074 ftrace_event_enable_disable(call, 0);
1038 if (call->event) 1075 if (call->event.funcs)
1039 __unregister_ftrace_event(call->event); 1076 __unregister_ftrace_event(&call->event);
1040 debugfs_remove_recursive(call->dir); 1077 debugfs_remove_recursive(call->dir);
1041 list_del(&call->list); 1078 list_del(&call->list);
1042 trace_destroy_fields(call); 1079 trace_destroy_fields(call);
1043 destroy_preds(call); 1080 destroy_preds(call);
1044 remove_subsystem_dir(call->system); 1081 remove_subsystem_dir(call->class->system);
1045} 1082}
1046 1083
1047/* Remove an event_call */ 1084/* Remove an event_call */
@@ -1132,8 +1169,8 @@ static void trace_module_add_events(struct module *mod)
1132 /* The linker may leave blanks */ 1169 /* The linker may leave blanks */
1133 if (!call->name) 1170 if (!call->name)
1134 continue; 1171 continue;
1135 if (call->raw_init) { 1172 if (call->class->raw_init) {
1136 ret = call->raw_init(call); 1173 ret = call->class->raw_init(call);
1137 if (ret < 0) { 1174 if (ret < 0) {
1138 if (ret != -ENOSYS) 1175 if (ret != -ENOSYS)
1139 pr_warning("Could not initialize trace " 1176 pr_warning("Could not initialize trace "
@@ -1286,8 +1323,8 @@ static __init int event_trace_init(void)
1286 /* The linker may leave blanks */ 1323 /* The linker may leave blanks */
1287 if (!call->name) 1324 if (!call->name)
1288 continue; 1325 continue;
1289 if (call->raw_init) { 1326 if (call->class->raw_init) {
1290 ret = call->raw_init(call); 1327 ret = call->class->raw_init(call);
1291 if (ret < 0) { 1328 if (ret < 0) {
1292 if (ret != -ENOSYS) 1329 if (ret != -ENOSYS)
1293 pr_warning("Could not initialize trace " 1330 pr_warning("Could not initialize trace "
@@ -1388,8 +1425,8 @@ static __init void event_trace_self_tests(void)
1388 1425
1389 list_for_each_entry(call, &ftrace_events, list) { 1426 list_for_each_entry(call, &ftrace_events, list) {
1390 1427
1391 /* Only test those that have a regfunc */ 1428 /* Only test those that have a probe */
1392 if (!call->regfunc) 1429 if (!call->class || !call->class->probe)
1393 continue; 1430 continue;
1394 1431
1395/* 1432/*
@@ -1399,8 +1436,8 @@ static __init void event_trace_self_tests(void)
1399 * syscalls as we test. 1436 * syscalls as we test.
1400 */ 1437 */
1401#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS 1438#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS
1402 if (call->system && 1439 if (call->class->system &&
1403 strcmp(call->system, "syscalls") == 0) 1440 strcmp(call->class->system, "syscalls") == 0)
1404 continue; 1441 continue;
1405#endif 1442#endif
1406 1443
@@ -1410,7 +1447,7 @@ static __init void event_trace_self_tests(void)
1410 * If an event is already enabled, someone is using 1447 * If an event is already enabled, someone is using
1411 * it and the self test should not be on. 1448 * it and the self test should not be on.
1412 */ 1449 */
1413 if (call->enabled) { 1450 if (call->flags & TRACE_EVENT_FL_ENABLED) {
1414 pr_warning("Enabled event during self test!\n"); 1451 pr_warning("Enabled event during self test!\n");
1415 WARN_ON_ONCE(1); 1452 WARN_ON_ONCE(1);
1416 continue; 1453 continue;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 58092d844a1f..57bb1bb32999 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -500,8 +500,10 @@ static struct ftrace_event_field *
500find_event_field(struct ftrace_event_call *call, char *name) 500find_event_field(struct ftrace_event_call *call, char *name)
501{ 501{
502 struct ftrace_event_field *field; 502 struct ftrace_event_field *field;
503 struct list_head *head;
503 504
504 list_for_each_entry(field, &call->fields, link) { 505 head = trace_get_fields(call);
506 list_for_each_entry(field, head, link) {
505 if (!strcmp(field->name, name)) 507 if (!strcmp(field->name, name))
506 return field; 508 return field;
507 } 509 }
@@ -545,7 +547,7 @@ static void filter_disable_preds(struct ftrace_event_call *call)
545 struct event_filter *filter = call->filter; 547 struct event_filter *filter = call->filter;
546 int i; 548 int i;
547 549
548 call->filter_active = 0; 550 call->flags &= ~TRACE_EVENT_FL_FILTERED;
549 filter->n_preds = 0; 551 filter->n_preds = 0;
550 552
551 for (i = 0; i < MAX_FILTER_PRED; i++) 553 for (i = 0; i < MAX_FILTER_PRED; i++)
@@ -572,7 +574,7 @@ void destroy_preds(struct ftrace_event_call *call)
572{ 574{
573 __free_preds(call->filter); 575 __free_preds(call->filter);
574 call->filter = NULL; 576 call->filter = NULL;
575 call->filter_active = 0; 577 call->flags &= ~TRACE_EVENT_FL_FILTERED;
576} 578}
577 579
578static struct event_filter *__alloc_preds(void) 580static struct event_filter *__alloc_preds(void)
@@ -611,7 +613,7 @@ static int init_preds(struct ftrace_event_call *call)
611 if (call->filter) 613 if (call->filter)
612 return 0; 614 return 0;
613 615
614 call->filter_active = 0; 616 call->flags &= ~TRACE_EVENT_FL_FILTERED;
615 call->filter = __alloc_preds(); 617 call->filter = __alloc_preds();
616 if (IS_ERR(call->filter)) 618 if (IS_ERR(call->filter))
617 return PTR_ERR(call->filter); 619 return PTR_ERR(call->filter);
@@ -625,10 +627,10 @@ static int init_subsystem_preds(struct event_subsystem *system)
625 int err; 627 int err;
626 628
627 list_for_each_entry(call, &ftrace_events, list) { 629 list_for_each_entry(call, &ftrace_events, list) {
628 if (!call->define_fields) 630 if (!call->class || !call->class->define_fields)
629 continue; 631 continue;
630 632
631 if (strcmp(call->system, system->name) != 0) 633 if (strcmp(call->class->system, system->name) != 0)
632 continue; 634 continue;
633 635
634 err = init_preds(call); 636 err = init_preds(call);
@@ -644,10 +646,10 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
644 struct ftrace_event_call *call; 646 struct ftrace_event_call *call;
645 647
646 list_for_each_entry(call, &ftrace_events, list) { 648 list_for_each_entry(call, &ftrace_events, list) {
647 if (!call->define_fields) 649 if (!call->class || !call->class->define_fields)
648 continue; 650 continue;
649 651
650 if (strcmp(call->system, system->name) != 0) 652 if (strcmp(call->class->system, system->name) != 0)
651 continue; 653 continue;
652 654
653 filter_disable_preds(call); 655 filter_disable_preds(call);
@@ -1249,10 +1251,10 @@ static int replace_system_preds(struct event_subsystem *system,
1249 list_for_each_entry(call, &ftrace_events, list) { 1251 list_for_each_entry(call, &ftrace_events, list) {
1250 struct event_filter *filter = call->filter; 1252 struct event_filter *filter = call->filter;
1251 1253
1252 if (!call->define_fields) 1254 if (!call->class || !call->class->define_fields)
1253 continue; 1255 continue;
1254 1256
1255 if (strcmp(call->system, system->name) != 0) 1257 if (strcmp(call->class->system, system->name) != 0)
1256 continue; 1258 continue;
1257 1259
1258 /* try to see if the filter can be applied */ 1260 /* try to see if the filter can be applied */
@@ -1266,7 +1268,7 @@ static int replace_system_preds(struct event_subsystem *system,
1266 if (err) 1268 if (err)
1267 filter_disable_preds(call); 1269 filter_disable_preds(call);
1268 else { 1270 else {
1269 call->filter_active = 1; 1271 call->flags |= TRACE_EVENT_FL_FILTERED;
1270 replace_filter_string(filter, filter_string); 1272 replace_filter_string(filter, filter_string);
1271 } 1273 }
1272 fail = false; 1274 fail = false;
@@ -1315,7 +1317,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1315 if (err) 1317 if (err)
1316 append_filter_err(ps, call->filter); 1318 append_filter_err(ps, call->filter);
1317 else 1319 else
1318 call->filter_active = 1; 1320 call->flags |= TRACE_EVENT_FL_FILTERED;
1319out: 1321out:
1320 filter_opstack_clear(ps); 1322 filter_opstack_clear(ps);
1321 postfix_clear(ps); 1323 postfix_clear(ps);
@@ -1393,7 +1395,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1393 mutex_lock(&event_mutex); 1395 mutex_lock(&event_mutex);
1394 1396
1395 list_for_each_entry(call, &ftrace_events, list) { 1397 list_for_each_entry(call, &ftrace_events, list) {
1396 if (call->id == event_id) 1398 if (call->event.type == event_id)
1397 break; 1399 break;
1398 } 1400 }
1399 1401
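The filter code above stops walking call->fields directly: lookups go through trace_get_fields(), the fields list hangs off the event's class, and the old filter_active field becomes the TRACE_EVENT_FL_FILTERED bit. Below is a minimal sketch of that lookup-through-a-helper shape, using a plain singly linked list and stand-in struct names rather than the kernel's list_head machinery.

/*
 * Minimal sketch, not kernel code: field lookup once the fields list
 * is reached via the event's class.  "trace_get_fields" is a stand-in
 * for the helper the diff introduces.
 */
#include <stdio.h>
#include <string.h>

struct field {
	const char	*name;
	struct field	*next;
};

struct event_class {			/* stand-in for ftrace_event_class */
	struct field	*fields;
};

struct event_call {			/* stand-in for ftrace_event_call */
	struct event_class	*class;
};

/* lookups go through a helper instead of touching call->fields directly */
static struct field *trace_get_fields(struct event_call *call)
{
	return call->class->fields;
}

static struct field *find_event_field(struct event_call *call, const char *name)
{
	struct field *f;

	for (f = trace_get_fields(call); f; f = f->next)
		if (!strcmp(f->name, name))
			return f;
	return NULL;
}

int main(void)
{
	struct field pid = { "pid", NULL };
	struct field comm = { "comm", &pid };
	struct event_class class = { &comm };
	struct event_call call = { &class };

	printf("found field: %s\n", find_event_field(&call, "pid")->name);
	return 0;
}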
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index e091f64ba6ce..8536e2a65969 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -127,7 +127,7 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
127 127
128static int ftrace_raw_init_event(struct ftrace_event_call *call) 128static int ftrace_raw_init_event(struct ftrace_event_call *call)
129{ 129{
130 INIT_LIST_HEAD(&call->fields); 130 INIT_LIST_HEAD(&call->class->fields);
131 return 0; 131 return 0;
132} 132}
133 133
@@ -153,17 +153,21 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
153#define F_printk(fmt, args...) #fmt ", " __stringify(args) 153#define F_printk(fmt, args...) #fmt ", " __stringify(args)
154 154
155#undef FTRACE_ENTRY 155#undef FTRACE_ENTRY
156#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ 156#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \
157 \
158struct ftrace_event_class event_class_ftrace_##call = { \
159 .system = __stringify(TRACE_SYSTEM), \
160 .define_fields = ftrace_define_fields_##call, \
161 .raw_init = ftrace_raw_init_event, \
162}; \
157 \ 163 \
158struct ftrace_event_call __used \ 164struct ftrace_event_call __used \
159__attribute__((__aligned__(4))) \ 165__attribute__((__aligned__(4))) \
160__attribute__((section("_ftrace_events"))) event_##call = { \ 166__attribute__((section("_ftrace_events"))) event_##call = { \
161 .name = #call, \ 167 .name = #call, \
162 .id = type, \ 168 .event.type = etype, \
163 .system = __stringify(TRACE_SYSTEM), \ 169 .class = &event_class_ftrace_##call, \
164 .raw_init = ftrace_raw_init_event, \
165 .print_fmt = print, \ 170 .print_fmt = print, \
166 .define_fields = ftrace_define_fields_##call, \
167}; \ 171}; \
168 172
169#include "trace_entries.h" 173#include "trace_entries.h"
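The updated FTRACE_ENTRY macro now emits two objects per entry: a shared ftrace_event_class carrying system, raw_init and define_fields, and the ftrace_event_call that points at it, with the numeric id moved to .event.type. A toy version of that two-struct macro pattern is sketched below; the type and member names are simplified stand-ins, not the kernel definitions.

/*
 * Illustrative only: the "one class object plus one call that points
 * at it" expansion pattern, with trimmed stand-in types.
 */
#include <stdio.h>

struct toy_event_class {
	const char	*system;
};

struct toy_event {
	int		type;
};

struct toy_event_call {
	const char		*name;
	struct toy_event	event;		/* numeric id now lives in .event.type */
	struct toy_event_class	*class;		/* shared per-class data */
	const char		*print_fmt;
};

#define TOY_ENTRY(call, etype, fmt)				\
	struct toy_event_class event_class_##call = {		\
		.system	= "toy",				\
	};							\
	struct toy_event_call event_##call = {			\
		.name		= #call,			\
		.event.type	= etype,			\
		.class		= &event_class_##call,		\
		.print_fmt	= fmt,				\
	}

TOY_ENTRY(function, 1, "ip=%lx");

int main(void)
{
	printf("%s: type=%d system=%s\n", event_function.name,
	       event_function.event.type, event_function.class->system);
	return 0;
}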
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 9aed1a5cf553..79f4bac99a94 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -40,7 +40,7 @@ struct fgraph_data {
40#define TRACE_GRAPH_PRINT_OVERHEAD 0x4 40#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
41#define TRACE_GRAPH_PRINT_PROC 0x8 41#define TRACE_GRAPH_PRINT_PROC 0x8
42#define TRACE_GRAPH_PRINT_DURATION 0x10 42#define TRACE_GRAPH_PRINT_DURATION 0x10
43#define TRACE_GRAPH_PRINT_ABS_TIME 0X20 43#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
44 44
45static struct tracer_opt trace_opts[] = { 45static struct tracer_opt trace_opts[] = {
46 /* Display overruns? (for self-debug purpose) */ 46 /* Display overruns? (for self-debug purpose) */
@@ -179,7 +179,7 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
179 return ret; 179 return ret;
180} 180}
181 181
182static int __trace_graph_entry(struct trace_array *tr, 182int __trace_graph_entry(struct trace_array *tr,
183 struct ftrace_graph_ent *trace, 183 struct ftrace_graph_ent *trace,
184 unsigned long flags, 184 unsigned long flags,
185 int pc) 185 int pc)
@@ -246,7 +246,7 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
246 return trace_graph_entry(trace); 246 return trace_graph_entry(trace);
247} 247}
248 248
249static void __trace_graph_return(struct trace_array *tr, 249void __trace_graph_return(struct trace_array *tr,
250 struct ftrace_graph_ret *trace, 250 struct ftrace_graph_ret *trace,
251 unsigned long flags, 251 unsigned long flags,
252 int pc) 252 int pc)
@@ -490,9 +490,10 @@ get_return_for_leaf(struct trace_iterator *iter,
490 * We need to consume the current entry to see 490 * We need to consume the current entry to see
491 * the next one. 491 * the next one.
492 */ 492 */
493 ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); 493 ring_buffer_consume(iter->tr->buffer, iter->cpu,
494 NULL, NULL);
494 event = ring_buffer_peek(iter->tr->buffer, iter->cpu, 495 event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
495 NULL); 496 NULL, NULL);
496 } 497 }
497 498
498 if (!event) 499 if (!event)
@@ -526,17 +527,18 @@ get_return_for_leaf(struct trace_iterator *iter,
526 527
527/* Signal a overhead of time execution to the output */ 528/* Signal a overhead of time execution to the output */
528static int 529static int
529print_graph_overhead(unsigned long long duration, struct trace_seq *s) 530print_graph_overhead(unsigned long long duration, struct trace_seq *s,
531 u32 flags)
530{ 532{
531 /* If duration disappear, we don't need anything */ 533 /* If duration disappear, we don't need anything */
532 if (!(tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)) 534 if (!(flags & TRACE_GRAPH_PRINT_DURATION))
533 return 1; 535 return 1;
534 536
535 /* Non nested entry or return */ 537 /* Non nested entry or return */
536 if (duration == -1) 538 if (duration == -1)
537 return trace_seq_printf(s, " "); 539 return trace_seq_printf(s, " ");
538 540
539 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { 541 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
540 /* Duration exceeded 100 msecs */ 542 /* Duration exceeded 100 msecs */
541 if (duration > 100000ULL) 543 if (duration > 100000ULL)
542 return trace_seq_printf(s, "! "); 544 return trace_seq_printf(s, "! ");
@@ -562,7 +564,7 @@ static int print_graph_abs_time(u64 t, struct trace_seq *s)
562 564
563static enum print_line_t 565static enum print_line_t
564print_graph_irq(struct trace_iterator *iter, unsigned long addr, 566print_graph_irq(struct trace_iterator *iter, unsigned long addr,
565 enum trace_type type, int cpu, pid_t pid) 567 enum trace_type type, int cpu, pid_t pid, u32 flags)
566{ 568{
567 int ret; 569 int ret;
568 struct trace_seq *s = &iter->seq; 570 struct trace_seq *s = &iter->seq;
@@ -572,21 +574,21 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
572 return TRACE_TYPE_UNHANDLED; 574 return TRACE_TYPE_UNHANDLED;
573 575
574 /* Absolute time */ 576 /* Absolute time */
575 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { 577 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
576 ret = print_graph_abs_time(iter->ts, s); 578 ret = print_graph_abs_time(iter->ts, s);
577 if (!ret) 579 if (!ret)
578 return TRACE_TYPE_PARTIAL_LINE; 580 return TRACE_TYPE_PARTIAL_LINE;
579 } 581 }
580 582
581 /* Cpu */ 583 /* Cpu */
582 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { 584 if (flags & TRACE_GRAPH_PRINT_CPU) {
583 ret = print_graph_cpu(s, cpu); 585 ret = print_graph_cpu(s, cpu);
584 if (ret == TRACE_TYPE_PARTIAL_LINE) 586 if (ret == TRACE_TYPE_PARTIAL_LINE)
585 return TRACE_TYPE_PARTIAL_LINE; 587 return TRACE_TYPE_PARTIAL_LINE;
586 } 588 }
587 589
588 /* Proc */ 590 /* Proc */
589 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { 591 if (flags & TRACE_GRAPH_PRINT_PROC) {
590 ret = print_graph_proc(s, pid); 592 ret = print_graph_proc(s, pid);
591 if (ret == TRACE_TYPE_PARTIAL_LINE) 593 if (ret == TRACE_TYPE_PARTIAL_LINE)
592 return TRACE_TYPE_PARTIAL_LINE; 594 return TRACE_TYPE_PARTIAL_LINE;
@@ -596,7 +598,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
596 } 598 }
597 599
598 /* No overhead */ 600 /* No overhead */
599 ret = print_graph_overhead(-1, s); 601 ret = print_graph_overhead(-1, s, flags);
600 if (!ret) 602 if (!ret)
601 return TRACE_TYPE_PARTIAL_LINE; 603 return TRACE_TYPE_PARTIAL_LINE;
602 604
@@ -609,7 +611,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
609 return TRACE_TYPE_PARTIAL_LINE; 611 return TRACE_TYPE_PARTIAL_LINE;
610 612
611 /* Don't close the duration column if haven't one */ 613 /* Don't close the duration column if haven't one */
612 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 614 if (flags & TRACE_GRAPH_PRINT_DURATION)
613 trace_seq_printf(s, " |"); 615 trace_seq_printf(s, " |");
614 ret = trace_seq_printf(s, "\n"); 616 ret = trace_seq_printf(s, "\n");
615 617
@@ -679,7 +681,8 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
679static enum print_line_t 681static enum print_line_t
680print_graph_entry_leaf(struct trace_iterator *iter, 682print_graph_entry_leaf(struct trace_iterator *iter,
681 struct ftrace_graph_ent_entry *entry, 683 struct ftrace_graph_ent_entry *entry,
682 struct ftrace_graph_ret_entry *ret_entry, struct trace_seq *s) 684 struct ftrace_graph_ret_entry *ret_entry,
685 struct trace_seq *s, u32 flags)
683{ 686{
684 struct fgraph_data *data = iter->private; 687 struct fgraph_data *data = iter->private;
685 struct ftrace_graph_ret *graph_ret; 688 struct ftrace_graph_ret *graph_ret;
@@ -711,12 +714,12 @@ print_graph_entry_leaf(struct trace_iterator *iter,
711 } 714 }
712 715
713 /* Overhead */ 716 /* Overhead */
714 ret = print_graph_overhead(duration, s); 717 ret = print_graph_overhead(duration, s, flags);
715 if (!ret) 718 if (!ret)
716 return TRACE_TYPE_PARTIAL_LINE; 719 return TRACE_TYPE_PARTIAL_LINE;
717 720
718 /* Duration */ 721 /* Duration */
719 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 722 if (flags & TRACE_GRAPH_PRINT_DURATION) {
720 ret = print_graph_duration(duration, s); 723 ret = print_graph_duration(duration, s);
721 if (ret == TRACE_TYPE_PARTIAL_LINE) 724 if (ret == TRACE_TYPE_PARTIAL_LINE)
722 return TRACE_TYPE_PARTIAL_LINE; 725 return TRACE_TYPE_PARTIAL_LINE;
@@ -739,7 +742,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
739static enum print_line_t 742static enum print_line_t
740print_graph_entry_nested(struct trace_iterator *iter, 743print_graph_entry_nested(struct trace_iterator *iter,
741 struct ftrace_graph_ent_entry *entry, 744 struct ftrace_graph_ent_entry *entry,
742 struct trace_seq *s, int cpu) 745 struct trace_seq *s, int cpu, u32 flags)
743{ 746{
744 struct ftrace_graph_ent *call = &entry->graph_ent; 747 struct ftrace_graph_ent *call = &entry->graph_ent;
745 struct fgraph_data *data = iter->private; 748 struct fgraph_data *data = iter->private;
@@ -759,12 +762,12 @@ print_graph_entry_nested(struct trace_iterator *iter,
759 } 762 }
760 763
761 /* No overhead */ 764 /* No overhead */
762 ret = print_graph_overhead(-1, s); 765 ret = print_graph_overhead(-1, s, flags);
763 if (!ret) 766 if (!ret)
764 return TRACE_TYPE_PARTIAL_LINE; 767 return TRACE_TYPE_PARTIAL_LINE;
765 768
766 /* No time */ 769 /* No time */
767 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 770 if (flags & TRACE_GRAPH_PRINT_DURATION) {
768 ret = trace_seq_printf(s, " | "); 771 ret = trace_seq_printf(s, " | ");
769 if (!ret) 772 if (!ret)
770 return TRACE_TYPE_PARTIAL_LINE; 773 return TRACE_TYPE_PARTIAL_LINE;
@@ -790,7 +793,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
790 793
791static enum print_line_t 794static enum print_line_t
792print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, 795print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
793 int type, unsigned long addr) 796 int type, unsigned long addr, u32 flags)
794{ 797{
795 struct fgraph_data *data = iter->private; 798 struct fgraph_data *data = iter->private;
796 struct trace_entry *ent = iter->ent; 799 struct trace_entry *ent = iter->ent;
@@ -803,27 +806,27 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
803 806
804 if (type) { 807 if (type) {
805 /* Interrupt */ 808 /* Interrupt */
806 ret = print_graph_irq(iter, addr, type, cpu, ent->pid); 809 ret = print_graph_irq(iter, addr, type, cpu, ent->pid, flags);
807 if (ret == TRACE_TYPE_PARTIAL_LINE) 810 if (ret == TRACE_TYPE_PARTIAL_LINE)
808 return TRACE_TYPE_PARTIAL_LINE; 811 return TRACE_TYPE_PARTIAL_LINE;
809 } 812 }
810 813
811 /* Absolute time */ 814 /* Absolute time */
812 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { 815 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
813 ret = print_graph_abs_time(iter->ts, s); 816 ret = print_graph_abs_time(iter->ts, s);
814 if (!ret) 817 if (!ret)
815 return TRACE_TYPE_PARTIAL_LINE; 818 return TRACE_TYPE_PARTIAL_LINE;
816 } 819 }
817 820
818 /* Cpu */ 821 /* Cpu */
819 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { 822 if (flags & TRACE_GRAPH_PRINT_CPU) {
820 ret = print_graph_cpu(s, cpu); 823 ret = print_graph_cpu(s, cpu);
821 if (ret == TRACE_TYPE_PARTIAL_LINE) 824 if (ret == TRACE_TYPE_PARTIAL_LINE)
822 return TRACE_TYPE_PARTIAL_LINE; 825 return TRACE_TYPE_PARTIAL_LINE;
823 } 826 }
824 827
825 /* Proc */ 828 /* Proc */
826 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { 829 if (flags & TRACE_GRAPH_PRINT_PROC) {
827 ret = print_graph_proc(s, ent->pid); 830 ret = print_graph_proc(s, ent->pid);
828 if (ret == TRACE_TYPE_PARTIAL_LINE) 831 if (ret == TRACE_TYPE_PARTIAL_LINE)
829 return TRACE_TYPE_PARTIAL_LINE; 832 return TRACE_TYPE_PARTIAL_LINE;
@@ -845,7 +848,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
845 848
846static enum print_line_t 849static enum print_line_t
847print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, 850print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
848 struct trace_iterator *iter) 851 struct trace_iterator *iter, u32 flags)
849{ 852{
850 struct fgraph_data *data = iter->private; 853 struct fgraph_data *data = iter->private;
851 struct ftrace_graph_ent *call = &field->graph_ent; 854 struct ftrace_graph_ent *call = &field->graph_ent;
@@ -853,14 +856,14 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
853 static enum print_line_t ret; 856 static enum print_line_t ret;
854 int cpu = iter->cpu; 857 int cpu = iter->cpu;
855 858
856 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func)) 859 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags))
857 return TRACE_TYPE_PARTIAL_LINE; 860 return TRACE_TYPE_PARTIAL_LINE;
858 861
859 leaf_ret = get_return_for_leaf(iter, field); 862 leaf_ret = get_return_for_leaf(iter, field);
860 if (leaf_ret) 863 if (leaf_ret)
861 ret = print_graph_entry_leaf(iter, field, leaf_ret, s); 864 ret = print_graph_entry_leaf(iter, field, leaf_ret, s, flags);
862 else 865 else
863 ret = print_graph_entry_nested(iter, field, s, cpu); 866 ret = print_graph_entry_nested(iter, field, s, cpu, flags);
864 867
865 if (data) { 868 if (data) {
866 /* 869 /*
@@ -879,7 +882,8 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
879 882
880static enum print_line_t 883static enum print_line_t
881print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, 884print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
882 struct trace_entry *ent, struct trace_iterator *iter) 885 struct trace_entry *ent, struct trace_iterator *iter,
886 u32 flags)
883{ 887{
884 unsigned long long duration = trace->rettime - trace->calltime; 888 unsigned long long duration = trace->rettime - trace->calltime;
885 struct fgraph_data *data = iter->private; 889 struct fgraph_data *data = iter->private;
@@ -909,16 +913,16 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
909 } 913 }
910 } 914 }
911 915
912 if (print_graph_prologue(iter, s, 0, 0)) 916 if (print_graph_prologue(iter, s, 0, 0, flags))
913 return TRACE_TYPE_PARTIAL_LINE; 917 return TRACE_TYPE_PARTIAL_LINE;
914 918
915 /* Overhead */ 919 /* Overhead */
916 ret = print_graph_overhead(duration, s); 920 ret = print_graph_overhead(duration, s, flags);
917 if (!ret) 921 if (!ret)
918 return TRACE_TYPE_PARTIAL_LINE; 922 return TRACE_TYPE_PARTIAL_LINE;
919 923
920 /* Duration */ 924 /* Duration */
921 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 925 if (flags & TRACE_GRAPH_PRINT_DURATION) {
922 ret = print_graph_duration(duration, s); 926 ret = print_graph_duration(duration, s);
923 if (ret == TRACE_TYPE_PARTIAL_LINE) 927 if (ret == TRACE_TYPE_PARTIAL_LINE)
924 return TRACE_TYPE_PARTIAL_LINE; 928 return TRACE_TYPE_PARTIAL_LINE;
@@ -948,14 +952,15 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
948 } 952 }
949 953
950 /* Overrun */ 954 /* Overrun */
951 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { 955 if (flags & TRACE_GRAPH_PRINT_OVERRUN) {
952 ret = trace_seq_printf(s, " (Overruns: %lu)\n", 956 ret = trace_seq_printf(s, " (Overruns: %lu)\n",
953 trace->overrun); 957 trace->overrun);
954 if (!ret) 958 if (!ret)
955 return TRACE_TYPE_PARTIAL_LINE; 959 return TRACE_TYPE_PARTIAL_LINE;
956 } 960 }
957 961
958 ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, cpu, pid); 962 ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET,
963 cpu, pid, flags);
959 if (ret == TRACE_TYPE_PARTIAL_LINE) 964 if (ret == TRACE_TYPE_PARTIAL_LINE)
960 return TRACE_TYPE_PARTIAL_LINE; 965 return TRACE_TYPE_PARTIAL_LINE;
961 966
@@ -963,8 +968,8 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
963} 968}
964 969
965static enum print_line_t 970static enum print_line_t
966print_graph_comment(struct trace_seq *s, struct trace_entry *ent, 971print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
967 struct trace_iterator *iter) 972 struct trace_iterator *iter, u32 flags)
968{ 973{
969 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); 974 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
970 struct fgraph_data *data = iter->private; 975 struct fgraph_data *data = iter->private;
@@ -976,16 +981,16 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
976 if (data) 981 if (data)
977 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth; 982 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
978 983
979 if (print_graph_prologue(iter, s, 0, 0)) 984 if (print_graph_prologue(iter, s, 0, 0, flags))
980 return TRACE_TYPE_PARTIAL_LINE; 985 return TRACE_TYPE_PARTIAL_LINE;
981 986
982 /* No overhead */ 987 /* No overhead */
983 ret = print_graph_overhead(-1, s); 988 ret = print_graph_overhead(-1, s, flags);
984 if (!ret) 989 if (!ret)
985 return TRACE_TYPE_PARTIAL_LINE; 990 return TRACE_TYPE_PARTIAL_LINE;
986 991
987 /* No time */ 992 /* No time */
988 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 993 if (flags & TRACE_GRAPH_PRINT_DURATION) {
989 ret = trace_seq_printf(s, " | "); 994 ret = trace_seq_printf(s, " | ");
990 if (!ret) 995 if (!ret)
991 return TRACE_TYPE_PARTIAL_LINE; 996 return TRACE_TYPE_PARTIAL_LINE;
@@ -1020,7 +1025,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1020 if (!event) 1025 if (!event)
1021 return TRACE_TYPE_UNHANDLED; 1026 return TRACE_TYPE_UNHANDLED;
1022 1027
1023 ret = event->trace(iter, sym_flags); 1028 ret = event->funcs->trace(iter, sym_flags, event);
1024 if (ret != TRACE_TYPE_HANDLED) 1029 if (ret != TRACE_TYPE_HANDLED)
1025 return ret; 1030 return ret;
1026 } 1031 }
@@ -1040,7 +1045,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1040 1045
1041 1046
1042enum print_line_t 1047enum print_line_t
1043print_graph_function(struct trace_iterator *iter) 1048print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1044{ 1049{
1045 struct ftrace_graph_ent_entry *field; 1050 struct ftrace_graph_ent_entry *field;
1046 struct fgraph_data *data = iter->private; 1051 struct fgraph_data *data = iter->private;
@@ -1061,7 +1066,7 @@ print_graph_function(struct trace_iterator *iter)
1061 if (data && data->failed) { 1066 if (data && data->failed) {
1062 field = &data->ent; 1067 field = &data->ent;
1063 iter->cpu = data->cpu; 1068 iter->cpu = data->cpu;
1064 ret = print_graph_entry(field, s, iter); 1069 ret = print_graph_entry(field, s, iter, flags);
1065 if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) { 1070 if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) {
1066 per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1; 1071 per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1;
1067 ret = TRACE_TYPE_NO_CONSUME; 1072 ret = TRACE_TYPE_NO_CONSUME;
@@ -1081,32 +1086,50 @@ print_graph_function(struct trace_iterator *iter)
1081 struct ftrace_graph_ent_entry saved; 1086 struct ftrace_graph_ent_entry saved;
1082 trace_assign_type(field, entry); 1087 trace_assign_type(field, entry);
1083 saved = *field; 1088 saved = *field;
1084 return print_graph_entry(&saved, s, iter); 1089 return print_graph_entry(&saved, s, iter, flags);
1085 } 1090 }
1086 case TRACE_GRAPH_RET: { 1091 case TRACE_GRAPH_RET: {
1087 struct ftrace_graph_ret_entry *field; 1092 struct ftrace_graph_ret_entry *field;
1088 trace_assign_type(field, entry); 1093 trace_assign_type(field, entry);
1089 return print_graph_return(&field->ret, s, entry, iter); 1094 return print_graph_return(&field->ret, s, entry, iter, flags);
1090 } 1095 }
1096 case TRACE_STACK:
1097 case TRACE_FN:
1098 /* dont trace stack and functions as comments */
1099 return TRACE_TYPE_UNHANDLED;
1100
1091 default: 1101 default:
1092 return print_graph_comment(s, entry, iter); 1102 return print_graph_comment(s, entry, iter, flags);
1093 } 1103 }
1094 1104
1095 return TRACE_TYPE_HANDLED; 1105 return TRACE_TYPE_HANDLED;
1096} 1106}
1097 1107
1098static void print_lat_header(struct seq_file *s) 1108static enum print_line_t
1109print_graph_function(struct trace_iterator *iter)
1110{
1111 return print_graph_function_flags(iter, tracer_flags.val);
1112}
1113
1114static enum print_line_t
1115print_graph_function_event(struct trace_iterator *iter, int flags,
1116 struct trace_event *event)
1117{
1118 return print_graph_function(iter);
1119}
1120
1121static void print_lat_header(struct seq_file *s, u32 flags)
1099{ 1122{
1100 static const char spaces[] = " " /* 16 spaces */ 1123 static const char spaces[] = " " /* 16 spaces */
1101 " " /* 4 spaces */ 1124 " " /* 4 spaces */
1102 " "; /* 17 spaces */ 1125 " "; /* 17 spaces */
1103 int size = 0; 1126 int size = 0;
1104 1127
1105 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1128 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1106 size += 16; 1129 size += 16;
1107 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1130 if (flags & TRACE_GRAPH_PRINT_CPU)
1108 size += 4; 1131 size += 4;
1109 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1132 if (flags & TRACE_GRAPH_PRINT_PROC)
1110 size += 17; 1133 size += 17;
1111 1134
1112 seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces); 1135 seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces);
@@ -1117,43 +1140,48 @@ static void print_lat_header(struct seq_file *s)
1117 seq_printf(s, "#%.*s|||| / \n", size, spaces); 1140 seq_printf(s, "#%.*s|||| / \n", size, spaces);
1118} 1141}
1119 1142
1120static void print_graph_headers(struct seq_file *s) 1143void print_graph_headers_flags(struct seq_file *s, u32 flags)
1121{ 1144{
1122 int lat = trace_flags & TRACE_ITER_LATENCY_FMT; 1145 int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
1123 1146
1124 if (lat) 1147 if (lat)
1125 print_lat_header(s); 1148 print_lat_header(s, flags);
1126 1149
1127 /* 1st line */ 1150 /* 1st line */
1128 seq_printf(s, "#"); 1151 seq_printf(s, "#");
1129 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1152 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1130 seq_printf(s, " TIME "); 1153 seq_printf(s, " TIME ");
1131 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1154 if (flags & TRACE_GRAPH_PRINT_CPU)
1132 seq_printf(s, " CPU"); 1155 seq_printf(s, " CPU");
1133 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1156 if (flags & TRACE_GRAPH_PRINT_PROC)
1134 seq_printf(s, " TASK/PID "); 1157 seq_printf(s, " TASK/PID ");
1135 if (lat) 1158 if (lat)
1136 seq_printf(s, "|||||"); 1159 seq_printf(s, "|||||");
1137 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1160 if (flags & TRACE_GRAPH_PRINT_DURATION)
1138 seq_printf(s, " DURATION "); 1161 seq_printf(s, " DURATION ");
1139 seq_printf(s, " FUNCTION CALLS\n"); 1162 seq_printf(s, " FUNCTION CALLS\n");
1140 1163
1141 /* 2nd line */ 1164 /* 2nd line */
1142 seq_printf(s, "#"); 1165 seq_printf(s, "#");
1143 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1166 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1144 seq_printf(s, " | "); 1167 seq_printf(s, " | ");
1145 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1168 if (flags & TRACE_GRAPH_PRINT_CPU)
1146 seq_printf(s, " | "); 1169 seq_printf(s, " | ");
1147 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1170 if (flags & TRACE_GRAPH_PRINT_PROC)
1148 seq_printf(s, " | | "); 1171 seq_printf(s, " | | ");
1149 if (lat) 1172 if (lat)
1150 seq_printf(s, "|||||"); 1173 seq_printf(s, "|||||");
1151 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1174 if (flags & TRACE_GRAPH_PRINT_DURATION)
1152 seq_printf(s, " | | "); 1175 seq_printf(s, " | | ");
1153 seq_printf(s, " | | | |\n"); 1176 seq_printf(s, " | | | |\n");
1154} 1177}
1155 1178
1156static void graph_trace_open(struct trace_iterator *iter) 1179void print_graph_headers(struct seq_file *s)
1180{
1181 print_graph_headers_flags(s, tracer_flags.val);
1182}
1183
1184void graph_trace_open(struct trace_iterator *iter)
1157{ 1185{
1158 /* pid and depth on the last trace processed */ 1186 /* pid and depth on the last trace processed */
1159 struct fgraph_data *data; 1187 struct fgraph_data *data;
@@ -1188,7 +1216,7 @@ static void graph_trace_open(struct trace_iterator *iter)
1188 pr_warning("function graph tracer: not enough memory\n"); 1216 pr_warning("function graph tracer: not enough memory\n");
1189} 1217}
1190 1218
1191static void graph_trace_close(struct trace_iterator *iter) 1219void graph_trace_close(struct trace_iterator *iter)
1192{ 1220{
1193 struct fgraph_data *data = iter->private; 1221 struct fgraph_data *data = iter->private;
1194 1222
@@ -1198,6 +1226,20 @@ static void graph_trace_close(struct trace_iterator *iter)
1198 } 1226 }
1199} 1227}
1200 1228
1229static struct trace_event_functions graph_functions = {
1230 .trace = print_graph_function_event,
1231};
1232
1233static struct trace_event graph_trace_entry_event = {
1234 .type = TRACE_GRAPH_ENT,
1235 .funcs = &graph_functions,
1236};
1237
1238static struct trace_event graph_trace_ret_event = {
1239 .type = TRACE_GRAPH_RET,
1240 .funcs = &graph_functions
1241};
1242
1201static struct tracer graph_trace __read_mostly = { 1243static struct tracer graph_trace __read_mostly = {
1202 .name = "function_graph", 1244 .name = "function_graph",
1203 .open = graph_trace_open, 1245 .open = graph_trace_open,
@@ -1219,6 +1261,16 @@ static __init int init_graph_trace(void)
1219{ 1261{
1220 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); 1262 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
1221 1263
1264 if (!register_ftrace_event(&graph_trace_entry_event)) {
1265 pr_warning("Warning: could not register graph trace events\n");
1266 return 1;
1267 }
1268
1269 if (!register_ftrace_event(&graph_trace_ret_event)) {
1270 pr_warning("Warning: could not register graph trace events\n");
1271 return 1;
1272 }
1273
1222 return register_tracer(&graph_trace); 1274 return register_tracer(&graph_trace);
1223} 1275}
1224 1276
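Two changes dominate the file above: the print routines take an explicit u32 flags argument instead of reading tracer_flags.val, so other tracers can reuse them, and the graph entry/return records are exported as trace_event objects that share one trace_event_functions table and are registered with register_ftrace_event(). A sketch of that shared-funcs registration pattern follows; the structures are trimmed stand-ins, not the kernel's, and the type numbers are arbitrary.

/*
 * Sketch: several trace_event instances pointing at a single
 * trace_event_functions table, as the diff does for the graph
 * entry and return events.
 */
#include <stdio.h>

struct toy_iterator {
	const char	*seq;
};

struct toy_event;

struct toy_event_functions {
	int (*trace)(struct toy_iterator *iter, int flags,
		     struct toy_event *event);
};

struct toy_event {
	int				type;
	struct toy_event_functions	*funcs;
};

static int graph_trace_line(struct toy_iterator *iter, int flags,
			    struct toy_event *event)
{
	printf("event %d: %s (flags=%#x)\n",
	       event->type, iter->seq, (unsigned int)flags);
	return 0;
}

/* one table shared by the entry and return events, as in the diff */
static struct toy_event_functions graph_funcs = {
	.trace	= graph_trace_line,
};

static struct toy_event graph_ent = { .type = 11, .funcs = &graph_funcs };
static struct toy_event graph_ret = { .type = 12, .funcs = &graph_funcs };

int main(void)
{
	struct toy_iterator iter = { "funcgraph sample" };

	graph_ent.funcs->trace(&iter, 0x20, &graph_ent);
	graph_ret.funcs->trace(&iter, 0x20, &graph_ret);
	return 0;
}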
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 2974bc7538c7..6fd486e0cef4 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -34,6 +34,9 @@ static int trace_type __read_mostly;
34 34
35static int save_lat_flag; 35static int save_lat_flag;
36 36
37static void stop_irqsoff_tracer(struct trace_array *tr, int graph);
38static int start_irqsoff_tracer(struct trace_array *tr, int graph);
39
37#ifdef CONFIG_PREEMPT_TRACER 40#ifdef CONFIG_PREEMPT_TRACER
38static inline int 41static inline int
39preempt_trace(void) 42preempt_trace(void)
@@ -55,6 +58,23 @@ irq_trace(void)
55# define irq_trace() (0) 58# define irq_trace() (0)
56#endif 59#endif
57 60
61#define TRACE_DISPLAY_GRAPH 1
62
63static struct tracer_opt trace_opts[] = {
64#ifdef CONFIG_FUNCTION_GRAPH_TRACER
65 /* display latency trace as call graph */
66 { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
67#endif
68 { } /* Empty entry */
69};
70
71static struct tracer_flags tracer_flags = {
72 .val = 0,
73 .opts = trace_opts,
74};
75
76#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
77
58/* 78/*
59 * Sequence count - we record it when starting a measurement and 79 * Sequence count - we record it when starting a measurement and
60 * skip the latency if the sequence has changed - some other section 80 * skip the latency if the sequence has changed - some other section
@@ -108,6 +128,202 @@ static struct ftrace_ops trace_ops __read_mostly =
108}; 128};
109#endif /* CONFIG_FUNCTION_TRACER */ 129#endif /* CONFIG_FUNCTION_TRACER */
110 130
131#ifdef CONFIG_FUNCTION_GRAPH_TRACER
132static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
133{
134 int cpu;
135
136 if (!(bit & TRACE_DISPLAY_GRAPH))
137 return -EINVAL;
138
139 if (!(is_graph() ^ set))
140 return 0;
141
142 stop_irqsoff_tracer(irqsoff_trace, !set);
143
144 for_each_possible_cpu(cpu)
145 per_cpu(tracing_cpu, cpu) = 0;
146
147 tracing_max_latency = 0;
148 tracing_reset_online_cpus(irqsoff_trace);
149
150 return start_irqsoff_tracer(irqsoff_trace, set);
151}
152
153static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
154{
155 struct trace_array *tr = irqsoff_trace;
156 struct trace_array_cpu *data;
157 unsigned long flags;
158 long disabled;
159 int ret;
160 int cpu;
161 int pc;
162
163 cpu = raw_smp_processor_id();
164 if (likely(!per_cpu(tracing_cpu, cpu)))
165 return 0;
166
167 local_save_flags(flags);
168 /* slight chance to get a false positive on tracing_cpu */
169 if (!irqs_disabled_flags(flags))
170 return 0;
171
172 data = tr->data[cpu];
173 disabled = atomic_inc_return(&data->disabled);
174
175 if (likely(disabled == 1)) {
176 pc = preempt_count();
177 ret = __trace_graph_entry(tr, trace, flags, pc);
178 } else
179 ret = 0;
180
181 atomic_dec(&data->disabled);
182 return ret;
183}
184
185static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
186{
187 struct trace_array *tr = irqsoff_trace;
188 struct trace_array_cpu *data;
189 unsigned long flags;
190 long disabled;
191 int cpu;
192 int pc;
193
194 cpu = raw_smp_processor_id();
195 if (likely(!per_cpu(tracing_cpu, cpu)))
196 return;
197
198 local_save_flags(flags);
199 /* slight chance to get a false positive on tracing_cpu */
200 if (!irqs_disabled_flags(flags))
201 return;
202
203 data = tr->data[cpu];
204 disabled = atomic_inc_return(&data->disabled);
205
206 if (likely(disabled == 1)) {
207 pc = preempt_count();
208 __trace_graph_return(tr, trace, flags, pc);
209 }
210
211 atomic_dec(&data->disabled);
212}
213
214static void irqsoff_trace_open(struct trace_iterator *iter)
215{
216 if (is_graph())
217 graph_trace_open(iter);
218
219}
220
221static void irqsoff_trace_close(struct trace_iterator *iter)
222{
223 if (iter->private)
224 graph_trace_close(iter);
225}
226
227#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \
228 TRACE_GRAPH_PRINT_PROC)
229
230static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
231{
232 u32 flags = GRAPH_TRACER_FLAGS;
233
234 if (trace_flags & TRACE_ITER_LATENCY_FMT)
235 flags |= TRACE_GRAPH_PRINT_DURATION;
236 else
237 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
238
239 /*
240 * In graph mode call the graph tracer output function,
241 * otherwise go with the TRACE_FN event handler
242 */
243 if (is_graph())
244 return print_graph_function_flags(iter, flags);
245
246 return TRACE_TYPE_UNHANDLED;
247}
248
249static void irqsoff_print_header(struct seq_file *s)
250{
251 if (is_graph()) {
252 struct trace_iterator *iter = s->private;
253 u32 flags = GRAPH_TRACER_FLAGS;
254
255 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
256 /* print nothing if the buffers are empty */
257 if (trace_empty(iter))
258 return;
259
260 print_trace_header(s, iter);
261 flags |= TRACE_GRAPH_PRINT_DURATION;
262 } else
263 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
264
265 print_graph_headers_flags(s, flags);
266 } else
267 trace_default_header(s);
268}
269
270static void
271trace_graph_function(struct trace_array *tr,
272 unsigned long ip, unsigned long flags, int pc)
273{
274 u64 time = trace_clock_local();
275 struct ftrace_graph_ent ent = {
276 .func = ip,
277 .depth = 0,
278 };
279 struct ftrace_graph_ret ret = {
280 .func = ip,
281 .depth = 0,
282 .calltime = time,
283 .rettime = time,
284 };
285
286 __trace_graph_entry(tr, &ent, flags, pc);
287 __trace_graph_return(tr, &ret, flags, pc);
288}
289
290static void
291__trace_function(struct trace_array *tr,
292 unsigned long ip, unsigned long parent_ip,
293 unsigned long flags, int pc)
294{
295 if (!is_graph())
296 trace_function(tr, ip, parent_ip, flags, pc);
297 else {
298 trace_graph_function(tr, parent_ip, flags, pc);
299 trace_graph_function(tr, ip, flags, pc);
300 }
301}
302
303#else
304#define __trace_function trace_function
305
306static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
307{
308 return -EINVAL;
309}
310
311static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
312{
313 return -1;
314}
315
316static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
317{
318 return TRACE_TYPE_UNHANDLED;
319}
320
321static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { }
322static void irqsoff_print_header(struct seq_file *s) { }
323static void irqsoff_trace_open(struct trace_iterator *iter) { }
324static void irqsoff_trace_close(struct trace_iterator *iter) { }
325#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
326
111/* 327/*
112 * Should this new latency be reported/recorded? 328 * Should this new latency be reported/recorded?
113 */ 329 */
@@ -150,7 +366,7 @@ check_critical_timing(struct trace_array *tr,
150 if (!report_latency(delta)) 366 if (!report_latency(delta))
151 goto out_unlock; 367 goto out_unlock;
152 368
153 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 369 __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
154 /* Skip 5 functions to get to the irq/preempt enable function */ 370 /* Skip 5 functions to get to the irq/preempt enable function */
155 __trace_stack(tr, flags, 5, pc); 371 __trace_stack(tr, flags, 5, pc);
156 372
@@ -172,7 +388,7 @@ out_unlock:
172out: 388out:
173 data->critical_sequence = max_sequence; 389 data->critical_sequence = max_sequence;
174 data->preempt_timestamp = ftrace_now(cpu); 390 data->preempt_timestamp = ftrace_now(cpu);
175 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 391 __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
176} 392}
177 393
178static inline void 394static inline void
@@ -204,7 +420,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
204 420
205 local_save_flags(flags); 421 local_save_flags(flags);
206 422
207 trace_function(tr, ip, parent_ip, flags, preempt_count()); 423 __trace_function(tr, ip, parent_ip, flags, preempt_count());
208 424
209 per_cpu(tracing_cpu, cpu) = 1; 425 per_cpu(tracing_cpu, cpu) = 1;
210 426
@@ -238,7 +454,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
238 atomic_inc(&data->disabled); 454 atomic_inc(&data->disabled);
239 455
240 local_save_flags(flags); 456 local_save_flags(flags);
241 trace_function(tr, ip, parent_ip, flags, preempt_count()); 457 __trace_function(tr, ip, parent_ip, flags, preempt_count());
242 check_critical_timing(tr, data, parent_ip ? : ip, cpu); 458 check_critical_timing(tr, data, parent_ip ? : ip, cpu);
243 data->critical_start = 0; 459 data->critical_start = 0;
244 atomic_dec(&data->disabled); 460 atomic_dec(&data->disabled);
@@ -347,19 +563,32 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
347} 563}
348#endif /* CONFIG_PREEMPT_TRACER */ 564#endif /* CONFIG_PREEMPT_TRACER */
349 565
350static void start_irqsoff_tracer(struct trace_array *tr) 566static int start_irqsoff_tracer(struct trace_array *tr, int graph)
351{ 567{
352 register_ftrace_function(&trace_ops); 568 int ret = 0;
353 if (tracing_is_enabled()) 569
570 if (!graph)
571 ret = register_ftrace_function(&trace_ops);
572 else
573 ret = register_ftrace_graph(&irqsoff_graph_return,
574 &irqsoff_graph_entry);
575
576 if (!ret && tracing_is_enabled())
354 tracer_enabled = 1; 577 tracer_enabled = 1;
355 else 578 else
356 tracer_enabled = 0; 579 tracer_enabled = 0;
580
581 return ret;
357} 582}
358 583
359static void stop_irqsoff_tracer(struct trace_array *tr) 584static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
360{ 585{
361 tracer_enabled = 0; 586 tracer_enabled = 0;
362 unregister_ftrace_function(&trace_ops); 587
588 if (!graph)
589 unregister_ftrace_function(&trace_ops);
590 else
591 unregister_ftrace_graph();
363} 592}
364 593
365static void __irqsoff_tracer_init(struct trace_array *tr) 594static void __irqsoff_tracer_init(struct trace_array *tr)
@@ -372,12 +601,14 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
372 /* make sure that the tracer is visible */ 601 /* make sure that the tracer is visible */
373 smp_wmb(); 602 smp_wmb();
374 tracing_reset_online_cpus(tr); 603 tracing_reset_online_cpus(tr);
375 start_irqsoff_tracer(tr); 604
605 if (start_irqsoff_tracer(tr, is_graph()))
606 printk(KERN_ERR "failed to start irqsoff tracer\n");
376} 607}
377 608
378static void irqsoff_tracer_reset(struct trace_array *tr) 609static void irqsoff_tracer_reset(struct trace_array *tr)
379{ 610{
380 stop_irqsoff_tracer(tr); 611 stop_irqsoff_tracer(tr, is_graph());
381 612
382 if (!save_lat_flag) 613 if (!save_lat_flag)
383 trace_flags &= ~TRACE_ITER_LATENCY_FMT; 614 trace_flags &= ~TRACE_ITER_LATENCY_FMT;
@@ -409,9 +640,15 @@ static struct tracer irqsoff_tracer __read_mostly =
409 .start = irqsoff_tracer_start, 640 .start = irqsoff_tracer_start,
410 .stop = irqsoff_tracer_stop, 641 .stop = irqsoff_tracer_stop,
411 .print_max = 1, 642 .print_max = 1,
643 .print_header = irqsoff_print_header,
644 .print_line = irqsoff_print_line,
645 .flags = &tracer_flags,
646 .set_flag = irqsoff_set_flag,
412#ifdef CONFIG_FTRACE_SELFTEST 647#ifdef CONFIG_FTRACE_SELFTEST
413 .selftest = trace_selftest_startup_irqsoff, 648 .selftest = trace_selftest_startup_irqsoff,
414#endif 649#endif
650 .open = irqsoff_trace_open,
651 .close = irqsoff_trace_close,
415}; 652};
416# define register_irqsoff(trace) register_tracer(&trace) 653# define register_irqsoff(trace) register_tracer(&trace)
417#else 654#else
@@ -435,9 +672,15 @@ static struct tracer preemptoff_tracer __read_mostly =
435 .start = irqsoff_tracer_start, 672 .start = irqsoff_tracer_start,
436 .stop = irqsoff_tracer_stop, 673 .stop = irqsoff_tracer_stop,
437 .print_max = 1, 674 .print_max = 1,
675 .print_header = irqsoff_print_header,
676 .print_line = irqsoff_print_line,
677 .flags = &tracer_flags,
678 .set_flag = irqsoff_set_flag,
438#ifdef CONFIG_FTRACE_SELFTEST 679#ifdef CONFIG_FTRACE_SELFTEST
439 .selftest = trace_selftest_startup_preemptoff, 680 .selftest = trace_selftest_startup_preemptoff,
440#endif 681#endif
682 .open = irqsoff_trace_open,
683 .close = irqsoff_trace_close,
441}; 684};
442# define register_preemptoff(trace) register_tracer(&trace) 685# define register_preemptoff(trace) register_tracer(&trace)
443#else 686#else
@@ -463,9 +706,15 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
463 .start = irqsoff_tracer_start, 706 .start = irqsoff_tracer_start,
464 .stop = irqsoff_tracer_stop, 707 .stop = irqsoff_tracer_stop,
465 .print_max = 1, 708 .print_max = 1,
709 .print_header = irqsoff_print_header,
710 .print_line = irqsoff_print_line,
711 .flags = &tracer_flags,
712 .set_flag = irqsoff_set_flag,
466#ifdef CONFIG_FTRACE_SELFTEST 713#ifdef CONFIG_FTRACE_SELFTEST
467 .selftest = trace_selftest_startup_preemptirqsoff, 714 .selftest = trace_selftest_startup_preemptirqsoff,
468#endif 715#endif
716 .open = irqsoff_trace_open,
717 .close = irqsoff_trace_close,
469}; 718};
470 719
471# define register_preemptirqsoff(trace) register_tracer(&trace) 720# define register_preemptirqsoff(trace) register_tracer(&trace)
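The irqsoff tracer above gains a display-graph option; __trace_function() dispatches on it, recording a plain function event in flat mode and a synthetic entry/return pair with calltime == rettime in graph mode. A minimal sketch of that dispatch follows, with every kernel type and call replaced by a stand-in.

/*
 * Minimal sketch of the __trace_function() dispatch: with the
 * display-graph option on, a function hit becomes a zero-duration
 * entry/return pair; otherwise the ordinary function event is used.
 */
#include <stdio.h>

static int graph_mode;				/* stand-in for is_graph() */

static void trace_function(unsigned long ip, unsigned long parent_ip)
{
	printf("fn  ip=%#lx parent=%#lx\n", ip, parent_ip);
}

static void trace_graph_function(unsigned long ip)
{
	/* calltime == rettime, so the pair renders as a zero-duration leaf */
	printf("ent ip=%#lx\n", ip);
	printf("ret ip=%#lx duration=0\n", ip);
}

static void __trace_function(unsigned long ip, unsigned long parent_ip)
{
	if (!graph_mode) {
		trace_function(ip, parent_ip);
	} else {
		trace_graph_function(parent_ip);
		trace_graph_function(ip);
	}
}

int main(void)
{
	__trace_function(0x1000, 0x2000);	/* flat function output */
	graph_mode = 1;
	__trace_function(0x1000, 0x2000);	/* graph-style output */
	return 0;
}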
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index a7514326052b..9a082bba9537 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -324,8 +324,8 @@ struct trace_probe {
324 unsigned long nhit; 324 unsigned long nhit;
325 unsigned int flags; /* For TP_FLAG_* */ 325 unsigned int flags; /* For TP_FLAG_* */
326 const char *symbol; /* symbol name */ 326 const char *symbol; /* symbol name */
327 struct ftrace_event_class class;
327 struct ftrace_event_call call; 328 struct ftrace_event_call call;
328 struct trace_event event;
329 ssize_t size; /* trace entry size */ 329 ssize_t size; /* trace entry size */
330 unsigned int nr_args; 330 unsigned int nr_args;
331 struct probe_arg args[]; 331 struct probe_arg args[];
@@ -404,6 +404,7 @@ static struct trace_probe *alloc_trace_probe(const char *group,
404 goto error; 404 goto error;
405 } 405 }
406 406
407 tp->call.class = &tp->class;
407 tp->call.name = kstrdup(event, GFP_KERNEL); 408 tp->call.name = kstrdup(event, GFP_KERNEL);
408 if (!tp->call.name) 409 if (!tp->call.name)
409 goto error; 410 goto error;
@@ -413,8 +414,8 @@ static struct trace_probe *alloc_trace_probe(const char *group,
413 goto error; 414 goto error;
414 } 415 }
415 416
416 tp->call.system = kstrdup(group, GFP_KERNEL); 417 tp->class.system = kstrdup(group, GFP_KERNEL);
417 if (!tp->call.system) 418 if (!tp->class.system)
418 goto error; 419 goto error;
419 420
420 INIT_LIST_HEAD(&tp->list); 421 INIT_LIST_HEAD(&tp->list);
@@ -443,7 +444,7 @@ static void free_trace_probe(struct trace_probe *tp)
443 for (i = 0; i < tp->nr_args; i++) 444 for (i = 0; i < tp->nr_args; i++)
444 free_probe_arg(&tp->args[i]); 445 free_probe_arg(&tp->args[i]);
445 446
446 kfree(tp->call.system); 447 kfree(tp->call.class->system);
447 kfree(tp->call.name); 448 kfree(tp->call.name);
448 kfree(tp->symbol); 449 kfree(tp->symbol);
449 kfree(tp); 450 kfree(tp);
@@ -456,7 +457,7 @@ static struct trace_probe *find_probe_event(const char *event,
456 457
457 list_for_each_entry(tp, &probe_list, list) 458 list_for_each_entry(tp, &probe_list, list)
458 if (strcmp(tp->call.name, event) == 0 && 459 if (strcmp(tp->call.name, event) == 0 &&
459 strcmp(tp->call.system, group) == 0) 460 strcmp(tp->call.class->system, group) == 0)
460 return tp; 461 return tp;
461 return NULL; 462 return NULL;
462} 463}
@@ -481,7 +482,7 @@ static int register_trace_probe(struct trace_probe *tp)
481 mutex_lock(&probe_lock); 482 mutex_lock(&probe_lock);
482 483
483 /* register as an event */ 484 /* register as an event */
484 old_tp = find_probe_event(tp->call.name, tp->call.system); 485 old_tp = find_probe_event(tp->call.name, tp->call.class->system);
485 if (old_tp) { 486 if (old_tp) {
486 /* delete old event */ 487 /* delete old event */
487 unregister_trace_probe(old_tp); 488 unregister_trace_probe(old_tp);
@@ -904,7 +905,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
904 int i; 905 int i;
905 906
906 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); 907 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
907 seq_printf(m, ":%s/%s", tp->call.system, tp->call.name); 908 seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name);
908 909
909 if (!tp->symbol) 910 if (!tp->symbol)
910 seq_printf(m, " 0x%p", tp->rp.kp.addr); 911 seq_printf(m, " 0x%p", tp->rp.kp.addr);
@@ -1061,8 +1062,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
1061 1062
1062 size = sizeof(*entry) + tp->size; 1063 size = sizeof(*entry) + tp->size;
1063 1064
1064 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 1065 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
1065 irq_flags, pc); 1066 size, irq_flags, pc);
1066 if (!event) 1067 if (!event)
1067 return; 1068 return;
1068 1069
@@ -1094,8 +1095,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
1094 1095
1095 size = sizeof(*entry) + tp->size; 1096 size = sizeof(*entry) + tp->size;
1096 1097
1097 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 1098 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
1098 irq_flags, pc); 1099 size, irq_flags, pc);
1099 if (!event) 1100 if (!event)
1100 return; 1101 return;
1101 1102
@@ -1112,18 +1113,17 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
1112 1113
1113/* Event entry printers */ 1114/* Event entry printers */
1114enum print_line_t 1115enum print_line_t
1115print_kprobe_event(struct trace_iterator *iter, int flags) 1116print_kprobe_event(struct trace_iterator *iter, int flags,
1117 struct trace_event *event)
1116{ 1118{
1117 struct kprobe_trace_entry_head *field; 1119 struct kprobe_trace_entry_head *field;
1118 struct trace_seq *s = &iter->seq; 1120 struct trace_seq *s = &iter->seq;
1119 struct trace_event *event;
1120 struct trace_probe *tp; 1121 struct trace_probe *tp;
1121 u8 *data; 1122 u8 *data;
1122 int i; 1123 int i;
1123 1124
1124 field = (struct kprobe_trace_entry_head *)iter->ent; 1125 field = (struct kprobe_trace_entry_head *)iter->ent;
1125 event = ftrace_find_event(field->ent.type); 1126 tp = container_of(event, struct trace_probe, call.event);
1126 tp = container_of(event, struct trace_probe, event);
1127 1127
1128 if (!trace_seq_printf(s, "%s: (", tp->call.name)) 1128 if (!trace_seq_printf(s, "%s: (", tp->call.name))
1129 goto partial; 1129 goto partial;
@@ -1149,18 +1149,17 @@ partial:
1149} 1149}
1150 1150
1151enum print_line_t 1151enum print_line_t
1152print_kretprobe_event(struct trace_iterator *iter, int flags) 1152print_kretprobe_event(struct trace_iterator *iter, int flags,
1153 struct trace_event *event)
1153{ 1154{
1154 struct kretprobe_trace_entry_head *field; 1155 struct kretprobe_trace_entry_head *field;
1155 struct trace_seq *s = &iter->seq; 1156 struct trace_seq *s = &iter->seq;
1156 struct trace_event *event;
1157 struct trace_probe *tp; 1157 struct trace_probe *tp;
1158 u8 *data; 1158 u8 *data;
1159 int i; 1159 int i;
1160 1160
1161 field = (struct kretprobe_trace_entry_head *)iter->ent; 1161 field = (struct kretprobe_trace_entry_head *)iter->ent;
1162 event = ftrace_find_event(field->ent.type); 1162 tp = container_of(event, struct trace_probe, call.event);
1163 tp = container_of(event, struct trace_probe, event);
1164 1163
1165 if (!trace_seq_printf(s, "%s: (", tp->call.name)) 1164 if (!trace_seq_printf(s, "%s: (", tp->call.name))
1166 goto partial; 1165 goto partial;
@@ -1217,8 +1216,6 @@ static void probe_event_disable(struct ftrace_event_call *call)
1217 1216
1218static int probe_event_raw_init(struct ftrace_event_call *event_call) 1217static int probe_event_raw_init(struct ftrace_event_call *event_call)
1219{ 1218{
1220 INIT_LIST_HEAD(&event_call->fields);
1221
1222 return 0; 1219 return 0;
1223} 1220}
1224 1221
@@ -1353,7 +1350,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
1353 "profile buffer not large enough")) 1350 "profile buffer not large enough"))
1354 return; 1351 return;
1355 1352
1356 entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags); 1353 entry = perf_trace_buf_prepare(size, call->event.type,
1354 &rctx, &irq_flags);
1357 if (!entry) 1355 if (!entry)
1358 return; 1356 return;
1359 1357
@@ -1384,7 +1382,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1384 "profile buffer not large enough")) 1382 "profile buffer not large enough"))
1385 return; 1383 return;
1386 1384
1387 entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags); 1385 entry = perf_trace_buf_prepare(size, call->event.type,
1386 &rctx, &irq_flags);
1388 if (!entry) 1387 if (!entry)
1389 return; 1388 return;
1390 1389
@@ -1425,6 +1424,26 @@ static void probe_perf_disable(struct ftrace_event_call *call)
1425} 1424}
1426#endif /* CONFIG_PERF_EVENTS */ 1425#endif /* CONFIG_PERF_EVENTS */
1427 1426
1427static __kprobes
1428int kprobe_register(struct ftrace_event_call *event, enum trace_reg type)
1429{
1430 switch (type) {
1431 case TRACE_REG_REGISTER:
1432 return probe_event_enable(event);
1433 case TRACE_REG_UNREGISTER:
1434 probe_event_disable(event);
1435 return 0;
1436
1437#ifdef CONFIG_PERF_EVENTS
1438 case TRACE_REG_PERF_REGISTER:
1439 return probe_perf_enable(event);
1440 case TRACE_REG_PERF_UNREGISTER:
1441 probe_perf_disable(event);
1442 return 0;
1443#endif
1444 }
1445 return 0;
1446}
1428 1447
1429static __kprobes 1448static __kprobes
1430int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) 1449int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
@@ -1454,6 +1473,14 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1454 return 0; /* We don't tweek kernel, so just return 0 */ 1473 return 0; /* We don't tweek kernel, so just return 0 */
1455} 1474}
1456 1475
1476static struct trace_event_functions kretprobe_funcs = {
1477 .trace = print_kretprobe_event
1478};
1479
1480static struct trace_event_functions kprobe_funcs = {
1481 .trace = print_kprobe_event
1482};
1483
1457static int register_probe_event(struct trace_probe *tp) 1484static int register_probe_event(struct trace_probe *tp)
1458{ 1485{
1459 struct ftrace_event_call *call = &tp->call; 1486 struct ftrace_event_call *call = &tp->call;
@@ -1461,36 +1488,31 @@ static int register_probe_event(struct trace_probe *tp)
1461 1488
1462 /* Initialize ftrace_event_call */ 1489 /* Initialize ftrace_event_call */
1463 if (probe_is_return(tp)) { 1490 if (probe_is_return(tp)) {
1464 tp->event.trace = print_kretprobe_event; 1491 INIT_LIST_HEAD(&call->class->fields);
1465 call->raw_init = probe_event_raw_init; 1492 call->event.funcs = &kretprobe_funcs;
1466 call->define_fields = kretprobe_event_define_fields; 1493 call->class->raw_init = probe_event_raw_init;
1494 call->class->define_fields = kretprobe_event_define_fields;
1467 } else { 1495 } else {
1468 tp->event.trace = print_kprobe_event; 1496 INIT_LIST_HEAD(&call->class->fields);
1469 call->raw_init = probe_event_raw_init; 1497 call->event.funcs = &kprobe_funcs;
1470 call->define_fields = kprobe_event_define_fields; 1498 call->class->raw_init = probe_event_raw_init;
1499 call->class->define_fields = kprobe_event_define_fields;
1471 } 1500 }
1472 if (set_print_fmt(tp) < 0) 1501 if (set_print_fmt(tp) < 0)
1473 return -ENOMEM; 1502 return -ENOMEM;
1474 call->event = &tp->event; 1503 ret = register_ftrace_event(&call->event);
1475 call->id = register_ftrace_event(&tp->event); 1504 if (!ret) {
1476 if (!call->id) {
1477 kfree(call->print_fmt); 1505 kfree(call->print_fmt);
1478 return -ENODEV; 1506 return -ENODEV;
1479 } 1507 }
1480 call->enabled = 0; 1508 call->flags = 0;
1481 call->regfunc = probe_event_enable; 1509 call->class->reg = kprobe_register;
1482 call->unregfunc = probe_event_disable;
1483
1484#ifdef CONFIG_PERF_EVENTS
1485 call->perf_event_enable = probe_perf_enable;
1486 call->perf_event_disable = probe_perf_disable;
1487#endif
1488 call->data = tp; 1510 call->data = tp;
1489 ret = trace_add_event_call(call); 1511 ret = trace_add_event_call(call);
1490 if (ret) { 1512 if (ret) {
1491 pr_info("Failed to register kprobe event: %s\n", call->name); 1513 pr_info("Failed to register kprobe event: %s\n", call->name);
1492 kfree(call->print_fmt); 1514 kfree(call->print_fmt);
1493 unregister_ftrace_event(&tp->event); 1515 unregister_ftrace_event(&call->event);
1494 } 1516 }
1495 return ret; 1517 return ret;
1496} 1518}
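With struct trace_probe now embedding an ftrace_event_class and the trace_event living inside the call, the printers above receive the trace_event pointer directly and recover the probe with container_of(event, struct trace_probe, call.event), while registration funnels through the single kprobe_register() callback. Below is a sketch of that container_of step; the struct layouts are illustrative stand-ins, not the kernel's definitions.

/*
 * Sketch: the print callback is handed the embedded trace_event and
 * walks back to the enclosing probe via container_of.
 */
#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct toy_event {
	int	type;
};

struct toy_event_call {
	const char		*name;
	struct toy_event	event;		/* embedded, as in ftrace_event_call */
};

struct toy_probe {
	const char		*symbol;
	struct toy_event_call	call;		/* the probe embeds its event call */
};

static void print_probe_event(struct toy_event *event)
{
	/* naming the nested member folds the two hops into one */
	struct toy_probe *tp = container_of(event, struct toy_probe, call.event);

	printf("%s: (%s)\n", tp->call.name, tp->symbol);
}

int main(void)
{
	struct toy_probe tp = {
		.symbol	= "do_sys_open",
		.call	= { .name = "myprobe", .event = { .type = 42 } },
	};

	print_probe_event(&tp.call.event);
	return 0;
}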
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 8e46b3323cdc..fc9d4dbb089e 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -253,7 +253,7 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)
253 void *ret; 253 void *ret;
254 254
255 if (s->full) 255 if (s->full)
256 return 0; 256 return NULL;
257 257
258 if (len > ((PAGE_SIZE - 1) - s->len)) { 258 if (len > ((PAGE_SIZE - 1) - s->len)) {
259 s->full = 1; 259 s->full = 1;
@@ -726,6 +726,9 @@ int register_ftrace_event(struct trace_event *event)
726 if (WARN_ON(!event)) 726 if (WARN_ON(!event))
727 goto out; 727 goto out;
728 728
729 if (WARN_ON(!event->funcs))
730 goto out;
731
729 INIT_LIST_HEAD(&event->list); 732 INIT_LIST_HEAD(&event->list);
730 733
731 if (!event->type) { 734 if (!event->type) {
@@ -758,14 +761,14 @@ int register_ftrace_event(struct trace_event *event)
758 goto out; 761 goto out;
759 } 762 }
760 763
761 if (event->trace == NULL) 764 if (event->funcs->trace == NULL)
762 event->trace = trace_nop_print; 765 event->funcs->trace = trace_nop_print;
763 if (event->raw == NULL) 766 if (event->funcs->raw == NULL)
764 event->raw = trace_nop_print; 767 event->funcs->raw = trace_nop_print;
765 if (event->hex == NULL) 768 if (event->funcs->hex == NULL)
766 event->hex = trace_nop_print; 769 event->funcs->hex = trace_nop_print;
767 if (event->binary == NULL) 770 if (event->funcs->binary == NULL)
768 event->binary = trace_nop_print; 771 event->funcs->binary = trace_nop_print;
769 772
770 key = event->type & (EVENT_HASHSIZE - 1); 773 key = event->type & (EVENT_HASHSIZE - 1);
771 774
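In the hunk just above, register_ftrace_event() now refuses an event without a funcs table and fills any callback left NULL with trace_nop_print. A compact sketch of that defaulting step follows; the types are trimmed stand-ins for the kernel's.

/*
 * Sketch: refuse registration without a funcs table, then default any
 * NULL callback to a no-op printer.
 */
#include <stdio.h>

struct toy_event;

typedef int (*print_fn)(struct toy_event *event);

struct toy_event_functions {
	print_fn	trace, raw, hex, binary;
};

struct toy_event {
	int				type;
	struct toy_event_functions	*funcs;
};

static int toy_nop_print(struct toy_event *event)
{
	(void)event;
	return 0;				/* handled, prints nothing */
}

static int toy_register_event(struct toy_event *event)
{
	if (!event->funcs)
		return 0;			/* refuse, as the WARN_ON path does */

	if (!event->funcs->trace)
		event->funcs->trace = toy_nop_print;
	if (!event->funcs->raw)
		event->funcs->raw = toy_nop_print;
	if (!event->funcs->hex)
		event->funcs->hex = toy_nop_print;
	if (!event->funcs->binary)
		event->funcs->binary = toy_nop_print;

	return event->type;
}

int main(void)
{
	static struct toy_event_functions funcs;	/* all callbacks NULL */
	struct toy_event ev = { .type = 42, .funcs = &funcs };

	printf("registered type %d\n", toy_register_event(&ev));
	printf("trace() returned %d\n", ev.funcs->trace(&ev));
	return 0;
}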
@@ -807,13 +810,15 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event);
807 * Standard events 810 * Standard events
808 */ 811 */
809 812
810enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags) 813enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
814 struct trace_event *event)
811{ 815{
812 return TRACE_TYPE_HANDLED; 816 return TRACE_TYPE_HANDLED;
813} 817}
814 818
815/* TRACE_FN */ 819/* TRACE_FN */
816static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags) 820static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
821 struct trace_event *event)
817{ 822{
818 struct ftrace_entry *field; 823 struct ftrace_entry *field;
819 struct trace_seq *s = &iter->seq; 824 struct trace_seq *s = &iter->seq;
@@ -840,7 +845,8 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags)
840 return TRACE_TYPE_PARTIAL_LINE; 845 return TRACE_TYPE_PARTIAL_LINE;
841} 846}
842 847
843static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags) 848static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags,
849 struct trace_event *event)
844{ 850{
845 struct ftrace_entry *field; 851 struct ftrace_entry *field;
846 852
@@ -854,7 +860,8 @@ static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags)
854 return TRACE_TYPE_HANDLED; 860 return TRACE_TYPE_HANDLED;
855} 861}
856 862
857static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags) 863static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags,
864 struct trace_event *event)
858{ 865{
859 struct ftrace_entry *field; 866 struct ftrace_entry *field;
860 struct trace_seq *s = &iter->seq; 867 struct trace_seq *s = &iter->seq;
@@ -867,7 +874,8 @@ static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags)
867 return TRACE_TYPE_HANDLED; 874 return TRACE_TYPE_HANDLED;
868} 875}
869 876
870static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags) 877static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags,
878 struct trace_event *event)
871{ 879{
872 struct ftrace_entry *field; 880 struct ftrace_entry *field;
873 struct trace_seq *s = &iter->seq; 881 struct trace_seq *s = &iter->seq;
@@ -880,14 +888,18 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags)
880 return TRACE_TYPE_HANDLED; 888 return TRACE_TYPE_HANDLED;
881} 889}
882 890
883static struct trace_event trace_fn_event = { 891static struct trace_event_functions trace_fn_funcs = {
884 .type = TRACE_FN,
885 .trace = trace_fn_trace, 892 .trace = trace_fn_trace,
886 .raw = trace_fn_raw, 893 .raw = trace_fn_raw,
887 .hex = trace_fn_hex, 894 .hex = trace_fn_hex,
888 .binary = trace_fn_bin, 895 .binary = trace_fn_bin,
889}; 896};
890 897
898static struct trace_event trace_fn_event = {
899 .type = TRACE_FN,
900 .funcs = &trace_fn_funcs,
901};
902
891/* TRACE_CTX an TRACE_WAKE */ 903/* TRACE_CTX an TRACE_WAKE */
892static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, 904static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
893 char *delim) 905 char *delim)
@@ -916,13 +928,14 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
916 return TRACE_TYPE_HANDLED; 928 return TRACE_TYPE_HANDLED;
917} 929}
918 930
919static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags) 931static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags,
932 struct trace_event *event)
920{ 933{
921 return trace_ctxwake_print(iter, "==>"); 934 return trace_ctxwake_print(iter, "==>");
922} 935}
923 936
924static enum print_line_t trace_wake_print(struct trace_iterator *iter, 937static enum print_line_t trace_wake_print(struct trace_iterator *iter,
925 int flags) 938 int flags, struct trace_event *event)
926{ 939{
927 return trace_ctxwake_print(iter, " +"); 940 return trace_ctxwake_print(iter, " +");
928} 941}
@@ -950,12 +963,14 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
950 return TRACE_TYPE_HANDLED; 963 return TRACE_TYPE_HANDLED;
951} 964}
952 965
953static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags) 966static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags,
967 struct trace_event *event)
954{ 968{
955 return trace_ctxwake_raw(iter, 0); 969 return trace_ctxwake_raw(iter, 0);
956} 970}
957 971
958static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags) 972static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags,
973 struct trace_event *event)
959{ 974{
960 return trace_ctxwake_raw(iter, '+'); 975 return trace_ctxwake_raw(iter, '+');
961} 976}
@@ -984,18 +999,20 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
984 return TRACE_TYPE_HANDLED; 999 return TRACE_TYPE_HANDLED;
985} 1000}
986 1001
987static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags) 1002static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags,
1003 struct trace_event *event)
988{ 1004{
989 return trace_ctxwake_hex(iter, 0); 1005 return trace_ctxwake_hex(iter, 0);
990} 1006}
991 1007
992static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags) 1008static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags,
1009 struct trace_event *event)
993{ 1010{
994 return trace_ctxwake_hex(iter, '+'); 1011 return trace_ctxwake_hex(iter, '+');
995} 1012}
996 1013
997static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter, 1014static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
998 int flags) 1015 int flags, struct trace_event *event)
999{ 1016{
1000 struct ctx_switch_entry *field; 1017 struct ctx_switch_entry *field;
1001 struct trace_seq *s = &iter->seq; 1018 struct trace_seq *s = &iter->seq;
@@ -1012,25 +1029,33 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
1012 return TRACE_TYPE_HANDLED; 1029 return TRACE_TYPE_HANDLED;
1013} 1030}
1014 1031
1015static struct trace_event trace_ctx_event = { 1032static struct trace_event_functions trace_ctx_funcs = {
1016 .type = TRACE_CTX,
1017 .trace = trace_ctx_print, 1033 .trace = trace_ctx_print,
1018 .raw = trace_ctx_raw, 1034 .raw = trace_ctx_raw,
1019 .hex = trace_ctx_hex, 1035 .hex = trace_ctx_hex,
1020 .binary = trace_ctxwake_bin, 1036 .binary = trace_ctxwake_bin,
1021}; 1037};
1022 1038
1023static struct trace_event trace_wake_event = { 1039static struct trace_event trace_ctx_event = {
1024 .type = TRACE_WAKE, 1040 .type = TRACE_CTX,
1041 .funcs = &trace_ctx_funcs,
1042};
1043
1044static struct trace_event_functions trace_wake_funcs = {
1025 .trace = trace_wake_print, 1045 .trace = trace_wake_print,
1026 .raw = trace_wake_raw, 1046 .raw = trace_wake_raw,
1027 .hex = trace_wake_hex, 1047 .hex = trace_wake_hex,
1028 .binary = trace_ctxwake_bin, 1048 .binary = trace_ctxwake_bin,
1029}; 1049};
1030 1050
1051static struct trace_event trace_wake_event = {
1052 .type = TRACE_WAKE,
1053 .funcs = &trace_wake_funcs,
1054};
1055
1031/* TRACE_SPECIAL */ 1056/* TRACE_SPECIAL */
1032static enum print_line_t trace_special_print(struct trace_iterator *iter, 1057static enum print_line_t trace_special_print(struct trace_iterator *iter,
1033 int flags) 1058 int flags, struct trace_event *event)
1034{ 1059{
1035 struct special_entry *field; 1060 struct special_entry *field;
1036 1061
@@ -1046,7 +1071,7 @@ static enum print_line_t trace_special_print(struct trace_iterator *iter,
1046} 1071}
1047 1072
1048static enum print_line_t trace_special_hex(struct trace_iterator *iter, 1073static enum print_line_t trace_special_hex(struct trace_iterator *iter,
1049 int flags) 1074 int flags, struct trace_event *event)
1050{ 1075{
1051 struct special_entry *field; 1076 struct special_entry *field;
1052 struct trace_seq *s = &iter->seq; 1077 struct trace_seq *s = &iter->seq;
@@ -1061,7 +1086,7 @@ static enum print_line_t trace_special_hex(struct trace_iterator *iter,
1061} 1086}
1062 1087
1063static enum print_line_t trace_special_bin(struct trace_iterator *iter, 1088static enum print_line_t trace_special_bin(struct trace_iterator *iter,
1064 int flags) 1089 int flags, struct trace_event *event)
1065{ 1090{
1066 struct special_entry *field; 1091 struct special_entry *field;
1067 struct trace_seq *s = &iter->seq; 1092 struct trace_seq *s = &iter->seq;
@@ -1075,18 +1100,22 @@ static enum print_line_t trace_special_bin(struct trace_iterator *iter,
1075 return TRACE_TYPE_HANDLED; 1100 return TRACE_TYPE_HANDLED;
1076} 1101}
1077 1102
1078static struct trace_event trace_special_event = { 1103static struct trace_event_functions trace_special_funcs = {
1079 .type = TRACE_SPECIAL,
1080 .trace = trace_special_print, 1104 .trace = trace_special_print,
1081 .raw = trace_special_print, 1105 .raw = trace_special_print,
1082 .hex = trace_special_hex, 1106 .hex = trace_special_hex,
1083 .binary = trace_special_bin, 1107 .binary = trace_special_bin,
1084}; 1108};
1085 1109
1110static struct trace_event trace_special_event = {
1111 .type = TRACE_SPECIAL,
1112 .funcs = &trace_special_funcs,
1113};
1114
1086/* TRACE_STACK */ 1115/* TRACE_STACK */
1087 1116
1088static enum print_line_t trace_stack_print(struct trace_iterator *iter, 1117static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1089 int flags) 1118 int flags, struct trace_event *event)
1090{ 1119{
1091 struct stack_entry *field; 1120 struct stack_entry *field;
1092 struct trace_seq *s = &iter->seq; 1121 struct trace_seq *s = &iter->seq;
@@ -1114,17 +1143,21 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1114 return TRACE_TYPE_PARTIAL_LINE; 1143 return TRACE_TYPE_PARTIAL_LINE;
1115} 1144}
1116 1145
1117static struct trace_event trace_stack_event = { 1146static struct trace_event_functions trace_stack_funcs = {
1118 .type = TRACE_STACK,
1119 .trace = trace_stack_print, 1147 .trace = trace_stack_print,
1120 .raw = trace_special_print, 1148 .raw = trace_special_print,
1121 .hex = trace_special_hex, 1149 .hex = trace_special_hex,
1122 .binary = trace_special_bin, 1150 .binary = trace_special_bin,
1123}; 1151};
1124 1152
1153static struct trace_event trace_stack_event = {
1154 .type = TRACE_STACK,
1155 .funcs = &trace_stack_funcs,
1156};
1157
1125/* TRACE_USER_STACK */ 1158/* TRACE_USER_STACK */
1126static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, 1159static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
1127 int flags) 1160 int flags, struct trace_event *event)
1128{ 1161{
1129 struct userstack_entry *field; 1162 struct userstack_entry *field;
1130 struct trace_seq *s = &iter->seq; 1163 struct trace_seq *s = &iter->seq;
@@ -1143,17 +1176,22 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
1143 return TRACE_TYPE_PARTIAL_LINE; 1176 return TRACE_TYPE_PARTIAL_LINE;
1144} 1177}
1145 1178
1146static struct trace_event trace_user_stack_event = { 1179static struct trace_event_functions trace_user_stack_funcs = {
1147 .type = TRACE_USER_STACK,
1148 .trace = trace_user_stack_print, 1180 .trace = trace_user_stack_print,
1149 .raw = trace_special_print, 1181 .raw = trace_special_print,
1150 .hex = trace_special_hex, 1182 .hex = trace_special_hex,
1151 .binary = trace_special_bin, 1183 .binary = trace_special_bin,
1152}; 1184};
1153 1185
1186static struct trace_event trace_user_stack_event = {
1187 .type = TRACE_USER_STACK,
1188 .funcs = &trace_user_stack_funcs,
1189};
1190
1154/* TRACE_BPRINT */ 1191/* TRACE_BPRINT */
1155static enum print_line_t 1192static enum print_line_t
1156trace_bprint_print(struct trace_iterator *iter, int flags) 1193trace_bprint_print(struct trace_iterator *iter, int flags,
1194 struct trace_event *event)
1157{ 1195{
1158 struct trace_entry *entry = iter->ent; 1196 struct trace_entry *entry = iter->ent;
1159 struct trace_seq *s = &iter->seq; 1197 struct trace_seq *s = &iter->seq;
@@ -1178,7 +1216,8 @@ trace_bprint_print(struct trace_iterator *iter, int flags)
1178 1216
1179 1217
1180static enum print_line_t 1218static enum print_line_t
1181trace_bprint_raw(struct trace_iterator *iter, int flags) 1219trace_bprint_raw(struct trace_iterator *iter, int flags,
1220 struct trace_event *event)
1182{ 1221{
1183 struct bprint_entry *field; 1222 struct bprint_entry *field;
1184 struct trace_seq *s = &iter->seq; 1223 struct trace_seq *s = &iter->seq;
@@ -1197,16 +1236,19 @@ trace_bprint_raw(struct trace_iterator *iter, int flags)
1197 return TRACE_TYPE_PARTIAL_LINE; 1236 return TRACE_TYPE_PARTIAL_LINE;
1198} 1237}
1199 1238
1239static struct trace_event_functions trace_bprint_funcs = {
1240 .trace = trace_bprint_print,
1241 .raw = trace_bprint_raw,
1242};
1200 1243
1201static struct trace_event trace_bprint_event = { 1244static struct trace_event trace_bprint_event = {
1202 .type = TRACE_BPRINT, 1245 .type = TRACE_BPRINT,
1203 .trace = trace_bprint_print, 1246 .funcs = &trace_bprint_funcs,
1204 .raw = trace_bprint_raw,
1205}; 1247};
1206 1248
1207/* TRACE_PRINT */ 1249/* TRACE_PRINT */
1208static enum print_line_t trace_print_print(struct trace_iterator *iter, 1250static enum print_line_t trace_print_print(struct trace_iterator *iter,
1209 int flags) 1251 int flags, struct trace_event *event)
1210{ 1252{
1211 struct print_entry *field; 1253 struct print_entry *field;
1212 struct trace_seq *s = &iter->seq; 1254 struct trace_seq *s = &iter->seq;
@@ -1225,7 +1267,8 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
1225 return TRACE_TYPE_PARTIAL_LINE; 1267 return TRACE_TYPE_PARTIAL_LINE;
1226} 1268}
1227 1269
1228static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) 1270static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
1271 struct trace_event *event)
1229{ 1272{
1230 struct print_entry *field; 1273 struct print_entry *field;
1231 1274
@@ -1240,12 +1283,16 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
1240 return TRACE_TYPE_PARTIAL_LINE; 1283 return TRACE_TYPE_PARTIAL_LINE;
1241} 1284}
1242 1285
1243static struct trace_event trace_print_event = { 1286static struct trace_event_functions trace_print_funcs = {
1244 .type = TRACE_PRINT,
1245 .trace = trace_print_print, 1287 .trace = trace_print_print,
1246 .raw = trace_print_raw, 1288 .raw = trace_print_raw,
1247}; 1289};
1248 1290
1291static struct trace_event trace_print_event = {
1292 .type = TRACE_PRINT,
1293 .funcs = &trace_print_funcs,
1294};
1295
1249 1296
1250static struct trace_event *events[] __initdata = { 1297static struct trace_event *events[] __initdata = {
1251 &trace_fn_event, 1298 &trace_fn_event,
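[Note] The trace_output.c hunks above split each event in two: the per-format callbacks move into a shared struct trace_event_functions, and struct trace_event keeps only its type plus a pointer to that table, so every callback now also receives the registered trace_event that fired. A minimal sketch of the resulting pattern; the my_event* names are illustrative, and the header is an assumption about where these structures are declared:

#include <linux/ftrace_event.h>	/* assumed home of trace_event / trace_event_functions */

/* Illustrative event, mirroring the trace_fn/trace_ctx conversions above. */
static enum print_line_t my_event_output(struct trace_iterator *iter, int flags,
					 struct trace_event *event)
{
	/* callbacks can now inspect which registered trace_event fired */
	return TRACE_TYPE_HANDLED;
}

static struct trace_event_functions my_event_funcs = {
	.trace	= my_event_output,
	.raw	= my_event_output,
	.hex	= my_event_output,
	.binary	= my_event_output,
};

static struct trace_event my_event = {
	.type	= 0,			/* the !event->type path above assigns a type */
	.funcs	= &my_event_funcs,	/* must be non-NULL or the new WARN_ON fires */
};

/* ... later: register_ftrace_event(&my_event); */

Because the function table is shared, several events (as trace_ctx_event and trace_wake_event do above with trace_ctxwake_bin) can reuse the same callbacks without duplicating them per event.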
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 9d91c72ba38b..c038eba0492b 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -25,7 +25,7 @@ extern void trace_event_read_unlock(void);
25extern struct trace_event *ftrace_find_event(int type); 25extern struct trace_event *ftrace_find_event(int type);
26 26
27extern enum print_line_t trace_nop_print(struct trace_iterator *iter, 27extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
28 int flags); 28 int flags, struct trace_event *event);
29extern int 29extern int
30trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry); 30trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
31 31
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 5fca0f51fde4..8f758d070c43 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -50,8 +50,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
50} 50}
51 51
52static void 52static void
53probe_sched_switch(struct rq *__rq, struct task_struct *prev, 53probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next)
54 struct task_struct *next)
55{ 54{
56 struct trace_array_cpu *data; 55 struct trace_array_cpu *data;
57 unsigned long flags; 56 unsigned long flags;
@@ -109,7 +108,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
109} 108}
110 109
111static void 110static void
112probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) 111probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
113{ 112{
114 struct trace_array_cpu *data; 113 struct trace_array_cpu *data;
115 unsigned long flags; 114 unsigned long flags;
@@ -139,21 +138,21 @@ static int tracing_sched_register(void)
139{ 138{
140 int ret; 139 int ret;
141 140
142 ret = register_trace_sched_wakeup(probe_sched_wakeup); 141 ret = register_trace_sched_wakeup(probe_sched_wakeup, NULL);
143 if (ret) { 142 if (ret) {
144 pr_info("wakeup trace: Couldn't activate tracepoint" 143 pr_info("wakeup trace: Couldn't activate tracepoint"
145 " probe to kernel_sched_wakeup\n"); 144 " probe to kernel_sched_wakeup\n");
146 return ret; 145 return ret;
147 } 146 }
148 147
149 ret = register_trace_sched_wakeup_new(probe_sched_wakeup); 148 ret = register_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
150 if (ret) { 149 if (ret) {
151 pr_info("wakeup trace: Couldn't activate tracepoint" 150 pr_info("wakeup trace: Couldn't activate tracepoint"
152 " probe to kernel_sched_wakeup_new\n"); 151 " probe to kernel_sched_wakeup_new\n");
153 goto fail_deprobe; 152 goto fail_deprobe;
154 } 153 }
155 154
156 ret = register_trace_sched_switch(probe_sched_switch); 155 ret = register_trace_sched_switch(probe_sched_switch, NULL);
157 if (ret) { 156 if (ret) {
158 pr_info("sched trace: Couldn't activate tracepoint" 157 pr_info("sched trace: Couldn't activate tracepoint"
159 " probe to kernel_sched_switch\n"); 158 " probe to kernel_sched_switch\n");
@@ -162,17 +161,17 @@ static int tracing_sched_register(void)
162 161
163 return ret; 162 return ret;
164fail_deprobe_wake_new: 163fail_deprobe_wake_new:
165 unregister_trace_sched_wakeup_new(probe_sched_wakeup); 164 unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
166fail_deprobe: 165fail_deprobe:
167 unregister_trace_sched_wakeup(probe_sched_wakeup); 166 unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
168 return ret; 167 return ret;
169} 168}
170 169
171static void tracing_sched_unregister(void) 170static void tracing_sched_unregister(void)
172{ 171{
173 unregister_trace_sched_switch(probe_sched_switch); 172 unregister_trace_sched_switch(probe_sched_switch, NULL);
174 unregister_trace_sched_wakeup_new(probe_sched_wakeup); 173 unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
175 unregister_trace_sched_wakeup(probe_sched_wakeup); 174 unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
176} 175}
177 176
178static void tracing_start_sched_switch(void) 177static void tracing_start_sched_switch(void)
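[Note] As the sched_switch conversions above show, tracepoint probes now take an opaque void * as their first argument, and the generated register_trace_*()/unregister_trace_*() helpers take a matching data pointer, passed as NULL throughout this file. A hedged sketch of a probe that actually uses the slot; the counter and its registration are illustrative, not part of this patch:

#include <linux/sched.h>		/* struct task_struct */
#include <trace/events/sched.h>		/* register_trace_sched_switch() and friends */

static unsigned long my_switch_count;	/* unsynchronized, fine for a sketch */

/* The first argument is whatever pointer was supplied at registration time. */
static void my_probe_sched_switch(void *data,
				  struct task_struct *prev,
				  struct task_struct *next)
{
	(*(unsigned long *)data)++;
}

static int my_probe_register(void)
{
	/* this file passes NULL; any per-probe cookie works instead */
	return register_trace_sched_switch(my_probe_sched_switch, &my_switch_count);
}

static void my_probe_unregister(void)
{
	/* removal matches on the same (probe, data) pair, per kernel/tracepoint.c below */
	unregister_trace_sched_switch(my_probe_sched_switch, &my_switch_count);
}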
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 0271742abb8d..0e73bc2ef8c5 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -98,7 +98,8 @@ static int report_latency(cycle_t delta)
98 return 1; 98 return 1;
99} 99}
100 100
101static void probe_wakeup_migrate_task(struct task_struct *task, int cpu) 101static void
102probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu)
102{ 103{
103 if (task != wakeup_task) 104 if (task != wakeup_task)
104 return; 105 return;
@@ -107,8 +108,8 @@ static void probe_wakeup_migrate_task(struct task_struct *task, int cpu)
107} 108}
108 109
109static void notrace 110static void notrace
110probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, 111probe_wakeup_sched_switch(void *ignore,
111 struct task_struct *next) 112 struct task_struct *prev, struct task_struct *next)
112{ 113{
113 struct trace_array_cpu *data; 114 struct trace_array_cpu *data;
114 cycle_t T0, T1, delta; 115 cycle_t T0, T1, delta;
@@ -200,7 +201,7 @@ static void wakeup_reset(struct trace_array *tr)
200} 201}
201 202
202static void 203static void
203probe_wakeup(struct rq *rq, struct task_struct *p, int success) 204probe_wakeup(void *ignore, struct task_struct *p, int success)
204{ 205{
205 struct trace_array_cpu *data; 206 struct trace_array_cpu *data;
206 int cpu = smp_processor_id(); 207 int cpu = smp_processor_id();
@@ -264,28 +265,28 @@ static void start_wakeup_tracer(struct trace_array *tr)
264{ 265{
265 int ret; 266 int ret;
266 267
267 ret = register_trace_sched_wakeup(probe_wakeup); 268 ret = register_trace_sched_wakeup(probe_wakeup, NULL);
268 if (ret) { 269 if (ret) {
269 pr_info("wakeup trace: Couldn't activate tracepoint" 270 pr_info("wakeup trace: Couldn't activate tracepoint"
270 " probe to kernel_sched_wakeup\n"); 271 " probe to kernel_sched_wakeup\n");
271 return; 272 return;
272 } 273 }
273 274
274 ret = register_trace_sched_wakeup_new(probe_wakeup); 275 ret = register_trace_sched_wakeup_new(probe_wakeup, NULL);
275 if (ret) { 276 if (ret) {
276 pr_info("wakeup trace: Couldn't activate tracepoint" 277 pr_info("wakeup trace: Couldn't activate tracepoint"
277 " probe to kernel_sched_wakeup_new\n"); 278 " probe to kernel_sched_wakeup_new\n");
278 goto fail_deprobe; 279 goto fail_deprobe;
279 } 280 }
280 281
281 ret = register_trace_sched_switch(probe_wakeup_sched_switch); 282 ret = register_trace_sched_switch(probe_wakeup_sched_switch, NULL);
282 if (ret) { 283 if (ret) {
283 pr_info("sched trace: Couldn't activate tracepoint" 284 pr_info("sched trace: Couldn't activate tracepoint"
284 " probe to kernel_sched_switch\n"); 285 " probe to kernel_sched_switch\n");
285 goto fail_deprobe_wake_new; 286 goto fail_deprobe_wake_new;
286 } 287 }
287 288
288 ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task); 289 ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);
289 if (ret) { 290 if (ret) {
290 pr_info("wakeup trace: Couldn't activate tracepoint" 291 pr_info("wakeup trace: Couldn't activate tracepoint"
291 " probe to kernel_sched_migrate_task\n"); 292 " probe to kernel_sched_migrate_task\n");
@@ -312,19 +313,19 @@ static void start_wakeup_tracer(struct trace_array *tr)
312 313
313 return; 314 return;
314fail_deprobe_wake_new: 315fail_deprobe_wake_new:
315 unregister_trace_sched_wakeup_new(probe_wakeup); 316 unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
316fail_deprobe: 317fail_deprobe:
317 unregister_trace_sched_wakeup(probe_wakeup); 318 unregister_trace_sched_wakeup(probe_wakeup, NULL);
318} 319}
319 320
320static void stop_wakeup_tracer(struct trace_array *tr) 321static void stop_wakeup_tracer(struct trace_array *tr)
321{ 322{
322 tracer_enabled = 0; 323 tracer_enabled = 0;
323 unregister_ftrace_function(&trace_ops); 324 unregister_ftrace_function(&trace_ops);
324 unregister_trace_sched_switch(probe_wakeup_sched_switch); 325 unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
325 unregister_trace_sched_wakeup_new(probe_wakeup); 326 unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
326 unregister_trace_sched_wakeup(probe_wakeup); 327 unregister_trace_sched_wakeup(probe_wakeup, NULL);
327 unregister_trace_sched_migrate_task(probe_wakeup_migrate_task); 328 unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);
328} 329}
329 330
330static int __wakeup_tracer_init(struct trace_array *tr) 331static int __wakeup_tracer_init(struct trace_array *tr)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 1cc9858258b3..250e7f9bd2f0 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -29,7 +29,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
29 struct trace_entry *entry; 29 struct trace_entry *entry;
30 unsigned int loops = 0; 30 unsigned int loops = 0;
31 31
32 while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) { 32 while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) {
33 entry = ring_buffer_event_data(event); 33 entry = ring_buffer_event_data(event);
34 34
35 /* 35 /*
@@ -255,7 +255,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
255/* Maximum number of functions to trace before diagnosing a hang */ 255/* Maximum number of functions to trace before diagnosing a hang */
256#define GRAPH_MAX_FUNC_TEST 100000000 256#define GRAPH_MAX_FUNC_TEST 100000000
257 257
258static void __ftrace_dump(bool disable_tracing); 258static void
259__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode);
259static unsigned int graph_hang_thresh; 260static unsigned int graph_hang_thresh;
260 261
261/* Wrap the real function entry probe to avoid possible hanging */ 262/* Wrap the real function entry probe to avoid possible hanging */
@@ -266,7 +267,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
266 ftrace_graph_stop(); 267 ftrace_graph_stop();
267 printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); 268 printk(KERN_WARNING "BUG: Function graph tracer hang!\n");
268 if (ftrace_dump_on_oops) 269 if (ftrace_dump_on_oops)
269 __ftrace_dump(false); 270 __ftrace_dump(false, DUMP_ALL);
270 return 0; 271 return 0;
271 } 272 }
272 273
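[Note] Two interface changes surface in this selftest hunk: ring_buffer_consume() grows a fourth argument, which the test leaves NULL alongside the existing timestamp pointer, and __ftrace_dump() now takes an enum ftrace_dump_mode (DUMP_ALL here). A small consumer loop in the same style; the reading of the third and fourth parameters as a timestamp and a lost-events count is inferred from this series, not quoted from ring_buffer.h:

/* Sketch: drain one CPU's entries the way trace_test_buffer_cpu() does above.
 * Assumes the kernel/trace/trace.h context used by trace_selftest.c. */
static int my_count_entries(struct trace_array *tr, int cpu)
{
	struct ring_buffer_event *event;
	u64 ts;			/* per-event timestamp */
	unsigned long lost;	/* events dropped before this one (new argument) */
	int count = 0;

	while ((event = ring_buffer_consume(tr->buffer, cpu, &ts, &lost)))
		count++;

	return count;
}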
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 4d6d711717f2..9d358301ae3e 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -15,6 +15,54 @@ static int sys_refcount_exit;
15static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); 15static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
16static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); 16static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
17 17
18static int syscall_enter_register(struct ftrace_event_call *event,
19 enum trace_reg type);
20static int syscall_exit_register(struct ftrace_event_call *event,
21 enum trace_reg type);
22
23static int syscall_enter_define_fields(struct ftrace_event_call *call);
24static int syscall_exit_define_fields(struct ftrace_event_call *call);
25
26static struct list_head *
27syscall_get_enter_fields(struct ftrace_event_call *call)
28{
29 struct syscall_metadata *entry = call->data;
30
31 return &entry->enter_fields;
32}
33
34static struct list_head *
35syscall_get_exit_fields(struct ftrace_event_call *call)
36{
37 struct syscall_metadata *entry = call->data;
38
39 return &entry->exit_fields;
40}
41
42struct trace_event_functions enter_syscall_print_funcs = {
43 .trace = print_syscall_enter,
44};
45
46struct trace_event_functions exit_syscall_print_funcs = {
47 .trace = print_syscall_exit,
48};
49
50struct ftrace_event_class event_class_syscall_enter = {
51 .system = "syscalls",
52 .reg = syscall_enter_register,
53 .define_fields = syscall_enter_define_fields,
54 .get_fields = syscall_get_enter_fields,
55 .raw_init = init_syscall_trace,
56};
57
58struct ftrace_event_class event_class_syscall_exit = {
59 .system = "syscalls",
60 .reg = syscall_exit_register,
61 .define_fields = syscall_exit_define_fields,
62 .get_fields = syscall_get_exit_fields,
63 .raw_init = init_syscall_trace,
64};
65
18extern unsigned long __start_syscalls_metadata[]; 66extern unsigned long __start_syscalls_metadata[];
19extern unsigned long __stop_syscalls_metadata[]; 67extern unsigned long __stop_syscalls_metadata[];
20 68
@@ -53,7 +101,8 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr)
53} 101}
54 102
55enum print_line_t 103enum print_line_t
56print_syscall_enter(struct trace_iterator *iter, int flags) 104print_syscall_enter(struct trace_iterator *iter, int flags,
105 struct trace_event *event)
57{ 106{
58 struct trace_seq *s = &iter->seq; 107 struct trace_seq *s = &iter->seq;
59 struct trace_entry *ent = iter->ent; 108 struct trace_entry *ent = iter->ent;
@@ -68,7 +117,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags)
68 if (!entry) 117 if (!entry)
69 goto end; 118 goto end;
70 119
71 if (entry->enter_event->id != ent->type) { 120 if (entry->enter_event->event.type != ent->type) {
72 WARN_ON_ONCE(1); 121 WARN_ON_ONCE(1);
73 goto end; 122 goto end;
74 } 123 }
@@ -105,7 +154,8 @@ end:
105} 154}
106 155
107enum print_line_t 156enum print_line_t
108print_syscall_exit(struct trace_iterator *iter, int flags) 157print_syscall_exit(struct trace_iterator *iter, int flags,
158 struct trace_event *event)
109{ 159{
110 struct trace_seq *s = &iter->seq; 160 struct trace_seq *s = &iter->seq;
111 struct trace_entry *ent = iter->ent; 161 struct trace_entry *ent = iter->ent;
@@ -123,7 +173,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
123 return TRACE_TYPE_HANDLED; 173 return TRACE_TYPE_HANDLED;
124 } 174 }
125 175
126 if (entry->exit_event->id != ent->type) { 176 if (entry->exit_event->event.type != ent->type) {
127 WARN_ON_ONCE(1); 177 WARN_ON_ONCE(1);
128 return TRACE_TYPE_UNHANDLED; 178 return TRACE_TYPE_UNHANDLED;
129 } 179 }
@@ -205,7 +255,7 @@ static void free_syscall_print_fmt(struct ftrace_event_call *call)
205 kfree(call->print_fmt); 255 kfree(call->print_fmt);
206} 256}
207 257
208int syscall_enter_define_fields(struct ftrace_event_call *call) 258static int syscall_enter_define_fields(struct ftrace_event_call *call)
209{ 259{
210 struct syscall_trace_enter trace; 260 struct syscall_trace_enter trace;
211 struct syscall_metadata *meta = call->data; 261 struct syscall_metadata *meta = call->data;
@@ -228,7 +278,7 @@ int syscall_enter_define_fields(struct ftrace_event_call *call)
228 return ret; 278 return ret;
229} 279}
230 280
231int syscall_exit_define_fields(struct ftrace_event_call *call) 281static int syscall_exit_define_fields(struct ftrace_event_call *call)
232{ 282{
233 struct syscall_trace_exit trace; 283 struct syscall_trace_exit trace;
234 int ret; 284 int ret;
@@ -243,7 +293,7 @@ int syscall_exit_define_fields(struct ftrace_event_call *call)
243 return ret; 293 return ret;
244} 294}
245 295
246void ftrace_syscall_enter(struct pt_regs *regs, long id) 296void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
247{ 297{
248 struct syscall_trace_enter *entry; 298 struct syscall_trace_enter *entry;
249 struct syscall_metadata *sys_data; 299 struct syscall_metadata *sys_data;
@@ -265,7 +315,7 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
265 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 315 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
266 316
267 event = trace_current_buffer_lock_reserve(&buffer, 317 event = trace_current_buffer_lock_reserve(&buffer,
268 sys_data->enter_event->id, size, 0, 0); 318 sys_data->enter_event->event.type, size, 0, 0);
269 if (!event) 319 if (!event)
270 return; 320 return;
271 321
@@ -278,7 +328,7 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
278 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 328 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
279} 329}
280 330
281void ftrace_syscall_exit(struct pt_regs *regs, long ret) 331void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
282{ 332{
283 struct syscall_trace_exit *entry; 333 struct syscall_trace_exit *entry;
284 struct syscall_metadata *sys_data; 334 struct syscall_metadata *sys_data;
@@ -297,7 +347,7 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
297 return; 347 return;
298 348
299 event = trace_current_buffer_lock_reserve(&buffer, 349 event = trace_current_buffer_lock_reserve(&buffer,
300 sys_data->exit_event->id, sizeof(*entry), 0, 0); 350 sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
301 if (!event) 351 if (!event)
302 return; 352 return;
303 353
@@ -320,7 +370,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
320 return -ENOSYS; 370 return -ENOSYS;
321 mutex_lock(&syscall_trace_lock); 371 mutex_lock(&syscall_trace_lock);
322 if (!sys_refcount_enter) 372 if (!sys_refcount_enter)
323 ret = register_trace_sys_enter(ftrace_syscall_enter); 373 ret = register_trace_sys_enter(ftrace_syscall_enter, NULL);
324 if (!ret) { 374 if (!ret) {
325 set_bit(num, enabled_enter_syscalls); 375 set_bit(num, enabled_enter_syscalls);
326 sys_refcount_enter++; 376 sys_refcount_enter++;
@@ -340,7 +390,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
340 sys_refcount_enter--; 390 sys_refcount_enter--;
341 clear_bit(num, enabled_enter_syscalls); 391 clear_bit(num, enabled_enter_syscalls);
342 if (!sys_refcount_enter) 392 if (!sys_refcount_enter)
343 unregister_trace_sys_enter(ftrace_syscall_enter); 393 unregister_trace_sys_enter(ftrace_syscall_enter, NULL);
344 mutex_unlock(&syscall_trace_lock); 394 mutex_unlock(&syscall_trace_lock);
345} 395}
346 396
@@ -354,7 +404,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
354 return -ENOSYS; 404 return -ENOSYS;
355 mutex_lock(&syscall_trace_lock); 405 mutex_lock(&syscall_trace_lock);
356 if (!sys_refcount_exit) 406 if (!sys_refcount_exit)
357 ret = register_trace_sys_exit(ftrace_syscall_exit); 407 ret = register_trace_sys_exit(ftrace_syscall_exit, NULL);
358 if (!ret) { 408 if (!ret) {
359 set_bit(num, enabled_exit_syscalls); 409 set_bit(num, enabled_exit_syscalls);
360 sys_refcount_exit++; 410 sys_refcount_exit++;
@@ -374,7 +424,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
374 sys_refcount_exit--; 424 sys_refcount_exit--;
375 clear_bit(num, enabled_exit_syscalls); 425 clear_bit(num, enabled_exit_syscalls);
376 if (!sys_refcount_exit) 426 if (!sys_refcount_exit)
377 unregister_trace_sys_exit(ftrace_syscall_exit); 427 unregister_trace_sys_exit(ftrace_syscall_exit, NULL);
378 mutex_unlock(&syscall_trace_lock); 428 mutex_unlock(&syscall_trace_lock);
379} 429}
380 430
@@ -434,7 +484,7 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
434static int sys_perf_refcount_enter; 484static int sys_perf_refcount_enter;
435static int sys_perf_refcount_exit; 485static int sys_perf_refcount_exit;
436 486
437static void perf_syscall_enter(struct pt_regs *regs, long id) 487static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
438{ 488{
439 struct syscall_metadata *sys_data; 489 struct syscall_metadata *sys_data;
440 struct syscall_trace_enter *rec; 490 struct syscall_trace_enter *rec;
@@ -461,7 +511,8 @@ static void perf_syscall_enter(struct pt_regs *regs, long id)
461 return; 511 return;
462 512
463 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, 513 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
464 sys_data->enter_event->id, &rctx, &flags); 514 sys_data->enter_event->event.type,
515 &rctx, &flags);
465 if (!rec) 516 if (!rec)
466 return; 517 return;
467 518
@@ -480,7 +531,7 @@ int perf_sysenter_enable(struct ftrace_event_call *call)
480 531
481 mutex_lock(&syscall_trace_lock); 532 mutex_lock(&syscall_trace_lock);
482 if (!sys_perf_refcount_enter) 533 if (!sys_perf_refcount_enter)
483 ret = register_trace_sys_enter(perf_syscall_enter); 534 ret = register_trace_sys_enter(perf_syscall_enter, NULL);
484 if (ret) { 535 if (ret) {
485 pr_info("event trace: Could not activate" 536 pr_info("event trace: Could not activate"
486 "syscall entry trace point"); 537 "syscall entry trace point");
@@ -502,11 +553,11 @@ void perf_sysenter_disable(struct ftrace_event_call *call)
502 sys_perf_refcount_enter--; 553 sys_perf_refcount_enter--;
503 clear_bit(num, enabled_perf_enter_syscalls); 554 clear_bit(num, enabled_perf_enter_syscalls);
504 if (!sys_perf_refcount_enter) 555 if (!sys_perf_refcount_enter)
505 unregister_trace_sys_enter(perf_syscall_enter); 556 unregister_trace_sys_enter(perf_syscall_enter, NULL);
506 mutex_unlock(&syscall_trace_lock); 557 mutex_unlock(&syscall_trace_lock);
507} 558}
508 559
509static void perf_syscall_exit(struct pt_regs *regs, long ret) 560static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
510{ 561{
511 struct syscall_metadata *sys_data; 562 struct syscall_metadata *sys_data;
512 struct syscall_trace_exit *rec; 563 struct syscall_trace_exit *rec;
@@ -536,7 +587,8 @@ static void perf_syscall_exit(struct pt_regs *regs, long ret)
536 return; 587 return;
537 588
538 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, 589 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
539 sys_data->exit_event->id, &rctx, &flags); 590 sys_data->exit_event->event.type,
591 &rctx, &flags);
540 if (!rec) 592 if (!rec)
541 return; 593 return;
542 594
@@ -555,7 +607,7 @@ int perf_sysexit_enable(struct ftrace_event_call *call)
555 607
556 mutex_lock(&syscall_trace_lock); 608 mutex_lock(&syscall_trace_lock);
557 if (!sys_perf_refcount_exit) 609 if (!sys_perf_refcount_exit)
558 ret = register_trace_sys_exit(perf_syscall_exit); 610 ret = register_trace_sys_exit(perf_syscall_exit, NULL);
559 if (ret) { 611 if (ret) {
560 pr_info("event trace: Could not activate" 612 pr_info("event trace: Could not activate"
561 "syscall exit trace point"); 613 "syscall exit trace point");
@@ -577,9 +629,50 @@ void perf_sysexit_disable(struct ftrace_event_call *call)
577 sys_perf_refcount_exit--; 629 sys_perf_refcount_exit--;
578 clear_bit(num, enabled_perf_exit_syscalls); 630 clear_bit(num, enabled_perf_exit_syscalls);
579 if (!sys_perf_refcount_exit) 631 if (!sys_perf_refcount_exit)
580 unregister_trace_sys_exit(perf_syscall_exit); 632 unregister_trace_sys_exit(perf_syscall_exit, NULL);
581 mutex_unlock(&syscall_trace_lock); 633 mutex_unlock(&syscall_trace_lock);
582} 634}
583 635
584#endif /* CONFIG_PERF_EVENTS */ 636#endif /* CONFIG_PERF_EVENTS */
585 637
638static int syscall_enter_register(struct ftrace_event_call *event,
639 enum trace_reg type)
640{
641 switch (type) {
642 case TRACE_REG_REGISTER:
643 return reg_event_syscall_enter(event);
644 case TRACE_REG_UNREGISTER:
645 unreg_event_syscall_enter(event);
646 return 0;
647
648#ifdef CONFIG_PERF_EVENTS
649 case TRACE_REG_PERF_REGISTER:
650 return perf_sysenter_enable(event);
651 case TRACE_REG_PERF_UNREGISTER:
652 perf_sysenter_disable(event);
653 return 0;
654#endif
655 }
656 return 0;
657}
658
659static int syscall_exit_register(struct ftrace_event_call *event,
660 enum trace_reg type)
661{
662 switch (type) {
663 case TRACE_REG_REGISTER:
664 return reg_event_syscall_exit(event);
665 case TRACE_REG_UNREGISTER:
666 unreg_event_syscall_exit(event);
667 return 0;
668
669#ifdef CONFIG_PERF_EVENTS
670 case TRACE_REG_PERF_REGISTER:
671 return perf_sysexit_enable(event);
672 case TRACE_REG_PERF_UNREGISTER:
673 perf_sysexit_disable(event);
674 return 0;
675#endif
676 }
677 return 0;
678}
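[Note] The syscall events stop exporting their reg/unreg helpers and instead describe themselves through struct ftrace_event_class: .reg is a single dispatcher that multiplexes ftrace and perf registration on enum trace_reg, and .get_fields points the core at the per-direction field lists. How the core drives that hook is not shown in this hunk; the sketch below assumes the event's class is reachable as call->class, so treat it as illustrative only:

/* Sketch, under the assumption that an ftrace_event_call carries a 'class'
 * pointer to structures like event_class_syscall_enter above. */
static int my_enable_event(struct ftrace_event_call *call)
{
	return call->class->reg(call, TRACE_REG_REGISTER);
}

static void my_disable_event(struct ftrace_event_call *call)
{
	call->class->reg(call, TRACE_REG_UNREGISTER);
}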
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index cc2d2faa7d9e..a7cc3793baf6 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -49,7 +49,8 @@ static void cpu_workqueue_stat_free(struct kref *kref)
49 49
50/* Insertion of a work */ 50/* Insertion of a work */
51static void 51static void
52probe_workqueue_insertion(struct task_struct *wq_thread, 52probe_workqueue_insertion(void *ignore,
53 struct task_struct *wq_thread,
53 struct work_struct *work) 54 struct work_struct *work)
54{ 55{
55 int cpu = cpumask_first(&wq_thread->cpus_allowed); 56 int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -70,7 +71,8 @@ found:
70 71
71/* Execution of a work */ 72/* Execution of a work */
72static void 73static void
73probe_workqueue_execution(struct task_struct *wq_thread, 74probe_workqueue_execution(void *ignore,
75 struct task_struct *wq_thread,
74 struct work_struct *work) 76 struct work_struct *work)
75{ 77{
76 int cpu = cpumask_first(&wq_thread->cpus_allowed); 78 int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -90,7 +92,8 @@ found:
90} 92}
91 93
92/* Creation of a cpu workqueue thread */ 94/* Creation of a cpu workqueue thread */
93static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu) 95static void probe_workqueue_creation(void *ignore,
96 struct task_struct *wq_thread, int cpu)
94{ 97{
95 struct cpu_workqueue_stats *cws; 98 struct cpu_workqueue_stats *cws;
96 unsigned long flags; 99 unsigned long flags;
@@ -114,7 +117,8 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
114} 117}
115 118
116/* Destruction of a cpu workqueue thread */ 119/* Destruction of a cpu workqueue thread */
117static void probe_workqueue_destruction(struct task_struct *wq_thread) 120static void
121probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread)
118{ 122{
119 /* Workqueue only execute on one cpu */ 123 /* Workqueue only execute on one cpu */
120 int cpu = cpumask_first(&wq_thread->cpus_allowed); 124 int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -259,19 +263,19 @@ int __init trace_workqueue_early_init(void)
259{ 263{
260 int ret, cpu; 264 int ret, cpu;
261 265
262 ret = register_trace_workqueue_insertion(probe_workqueue_insertion); 266 ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
263 if (ret) 267 if (ret)
264 goto out; 268 goto out;
265 269
266 ret = register_trace_workqueue_execution(probe_workqueue_execution); 270 ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL);
267 if (ret) 271 if (ret)
268 goto no_insertion; 272 goto no_insertion;
269 273
270 ret = register_trace_workqueue_creation(probe_workqueue_creation); 274 ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL);
271 if (ret) 275 if (ret)
272 goto no_execution; 276 goto no_execution;
273 277
274 ret = register_trace_workqueue_destruction(probe_workqueue_destruction); 278 ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL);
275 if (ret) 279 if (ret)
276 goto no_creation; 280 goto no_creation;
277 281
@@ -283,11 +287,11 @@ int __init trace_workqueue_early_init(void)
283 return 0; 287 return 0;
284 288
285no_creation: 289no_creation:
286 unregister_trace_workqueue_creation(probe_workqueue_creation); 290 unregister_trace_workqueue_creation(probe_workqueue_creation, NULL);
287no_execution: 291no_execution:
288 unregister_trace_workqueue_execution(probe_workqueue_execution); 292 unregister_trace_workqueue_execution(probe_workqueue_execution, NULL);
289no_insertion: 293no_insertion:
290 unregister_trace_workqueue_insertion(probe_workqueue_insertion); 294 unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
291out: 295out:
292 pr_warning("trace_workqueue: unable to trace workqueues\n"); 296 pr_warning("trace_workqueue: unable to trace workqueues\n");
293 297
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index cc89be5bc0f8..c77f3eceea25 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -54,7 +54,7 @@ static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
54 */ 54 */
55struct tracepoint_entry { 55struct tracepoint_entry {
56 struct hlist_node hlist; 56 struct hlist_node hlist;
57 void **funcs; 57 struct tracepoint_func *funcs;
58 int refcount; /* Number of times armed. 0 if disarmed. */ 58 int refcount; /* Number of times armed. 0 if disarmed. */
59 char name[0]; 59 char name[0];
60}; 60};
@@ -64,12 +64,12 @@ struct tp_probes {
64 struct rcu_head rcu; 64 struct rcu_head rcu;
65 struct list_head list; 65 struct list_head list;
66 } u; 66 } u;
67 void *probes[0]; 67 struct tracepoint_func probes[0];
68}; 68};
69 69
70static inline void *allocate_probes(int count) 70static inline void *allocate_probes(int count)
71{ 71{
72 struct tp_probes *p = kmalloc(count * sizeof(void *) 72 struct tp_probes *p = kmalloc(count * sizeof(struct tracepoint_func)
73 + sizeof(struct tp_probes), GFP_KERNEL); 73 + sizeof(struct tp_probes), GFP_KERNEL);
74 return p == NULL ? NULL : p->probes; 74 return p == NULL ? NULL : p->probes;
75} 75}
@@ -79,7 +79,7 @@ static void rcu_free_old_probes(struct rcu_head *head)
79 kfree(container_of(head, struct tp_probes, u.rcu)); 79 kfree(container_of(head, struct tp_probes, u.rcu));
80} 80}
81 81
82static inline void release_probes(void *old) 82static inline void release_probes(struct tracepoint_func *old)
83{ 83{
84 if (old) { 84 if (old) {
85 struct tp_probes *tp_probes = container_of(old, 85 struct tp_probes *tp_probes = container_of(old,
@@ -95,15 +95,16 @@ static void debug_print_probes(struct tracepoint_entry *entry)
95 if (!tracepoint_debug || !entry->funcs) 95 if (!tracepoint_debug || !entry->funcs)
96 return; 96 return;
97 97
98 for (i = 0; entry->funcs[i]; i++) 98 for (i = 0; entry->funcs[i].func; i++)
99 printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i]); 99 printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i].func);
100} 100}
101 101
102static void * 102static struct tracepoint_func *
103tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe) 103tracepoint_entry_add_probe(struct tracepoint_entry *entry,
104 void *probe, void *data)
104{ 105{
105 int nr_probes = 0; 106 int nr_probes = 0;
106 void **old, **new; 107 struct tracepoint_func *old, *new;
107 108
108 WARN_ON(!probe); 109 WARN_ON(!probe);
109 110
@@ -111,8 +112,9 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
111 old = entry->funcs; 112 old = entry->funcs;
112 if (old) { 113 if (old) {
113 /* (N -> N+1), (N != 0, 1) probes */ 114 /* (N -> N+1), (N != 0, 1) probes */
114 for (nr_probes = 0; old[nr_probes]; nr_probes++) 115 for (nr_probes = 0; old[nr_probes].func; nr_probes++)
115 if (old[nr_probes] == probe) 116 if (old[nr_probes].func == probe &&
117 old[nr_probes].data == data)
116 return ERR_PTR(-EEXIST); 118 return ERR_PTR(-EEXIST);
117 } 119 }
118 /* + 2 : one for new probe, one for NULL func */ 120 /* + 2 : one for new probe, one for NULL func */
@@ -120,9 +122,10 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
120 if (new == NULL) 122 if (new == NULL)
121 return ERR_PTR(-ENOMEM); 123 return ERR_PTR(-ENOMEM);
122 if (old) 124 if (old)
123 memcpy(new, old, nr_probes * sizeof(void *)); 125 memcpy(new, old, nr_probes * sizeof(struct tracepoint_func));
124 new[nr_probes] = probe; 126 new[nr_probes].func = probe;
125 new[nr_probes + 1] = NULL; 127 new[nr_probes].data = data;
128 new[nr_probes + 1].func = NULL;
126 entry->refcount = nr_probes + 1; 129 entry->refcount = nr_probes + 1;
127 entry->funcs = new; 130 entry->funcs = new;
128 debug_print_probes(entry); 131 debug_print_probes(entry);
@@ -130,10 +133,11 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
130} 133}
131 134
132static void * 135static void *
133tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe) 136tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
137 void *probe, void *data)
134{ 138{
135 int nr_probes = 0, nr_del = 0, i; 139 int nr_probes = 0, nr_del = 0, i;
136 void **old, **new; 140 struct tracepoint_func *old, *new;
137 141
138 old = entry->funcs; 142 old = entry->funcs;
139 143
@@ -142,8 +146,10 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
142 146
143 debug_print_probes(entry); 147 debug_print_probes(entry);
144 /* (N -> M), (N > 1, M >= 0) probes */ 148 /* (N -> M), (N > 1, M >= 0) probes */
145 for (nr_probes = 0; old[nr_probes]; nr_probes++) { 149 for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
146 if ((!probe || old[nr_probes] == probe)) 150 if (!probe ||
151 (old[nr_probes].func == probe &&
152 old[nr_probes].data == data))
147 nr_del++; 153 nr_del++;
148 } 154 }
149 155
@@ -160,10 +166,11 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
160 new = allocate_probes(nr_probes - nr_del + 1); 166 new = allocate_probes(nr_probes - nr_del + 1);
161 if (new == NULL) 167 if (new == NULL)
162 return ERR_PTR(-ENOMEM); 168 return ERR_PTR(-ENOMEM);
163 for (i = 0; old[i]; i++) 169 for (i = 0; old[i].func; i++)
164 if ((probe && old[i] != probe)) 170 if (probe &&
171 (old[i].func != probe || old[i].data != data))
165 new[j++] = old[i]; 172 new[j++] = old[i];
166 new[nr_probes - nr_del] = NULL; 173 new[nr_probes - nr_del].func = NULL;
167 entry->refcount = nr_probes - nr_del; 174 entry->refcount = nr_probes - nr_del;
168 entry->funcs = new; 175 entry->funcs = new;
169 } 176 }
@@ -315,18 +322,19 @@ static void tracepoint_update_probes(void)
315 module_update_tracepoints(); 322 module_update_tracepoints();
316} 323}
317 324
318static void *tracepoint_add_probe(const char *name, void *probe) 325static struct tracepoint_func *
326tracepoint_add_probe(const char *name, void *probe, void *data)
319{ 327{
320 struct tracepoint_entry *entry; 328 struct tracepoint_entry *entry;
321 void *old; 329 struct tracepoint_func *old;
322 330
323 entry = get_tracepoint(name); 331 entry = get_tracepoint(name);
324 if (!entry) { 332 if (!entry) {
325 entry = add_tracepoint(name); 333 entry = add_tracepoint(name);
326 if (IS_ERR(entry)) 334 if (IS_ERR(entry))
327 return entry; 335 return (struct tracepoint_func *)entry;
328 } 336 }
329 old = tracepoint_entry_add_probe(entry, probe); 337 old = tracepoint_entry_add_probe(entry, probe, data);
330 if (IS_ERR(old) && !entry->refcount) 338 if (IS_ERR(old) && !entry->refcount)
331 remove_tracepoint(entry); 339 remove_tracepoint(entry);
332 return old; 340 return old;
@@ -340,12 +348,12 @@ static void *tracepoint_add_probe(const char *name, void *probe)
340 * Returns 0 if ok, error value on error. 348 * Returns 0 if ok, error value on error.
341 * The probe address must at least be aligned on the architecture pointer size. 349 * The probe address must at least be aligned on the architecture pointer size.
342 */ 350 */
343int tracepoint_probe_register(const char *name, void *probe) 351int tracepoint_probe_register(const char *name, void *probe, void *data)
344{ 352{
345 void *old; 353 struct tracepoint_func *old;
346 354
347 mutex_lock(&tracepoints_mutex); 355 mutex_lock(&tracepoints_mutex);
348 old = tracepoint_add_probe(name, probe); 356 old = tracepoint_add_probe(name, probe, data);
349 mutex_unlock(&tracepoints_mutex); 357 mutex_unlock(&tracepoints_mutex);
350 if (IS_ERR(old)) 358 if (IS_ERR(old))
351 return PTR_ERR(old); 359 return PTR_ERR(old);
@@ -356,15 +364,16 @@ int tracepoint_probe_register(const char *name, void *probe)
356} 364}
357EXPORT_SYMBOL_GPL(tracepoint_probe_register); 365EXPORT_SYMBOL_GPL(tracepoint_probe_register);
358 366
359static void *tracepoint_remove_probe(const char *name, void *probe) 367static struct tracepoint_func *
368tracepoint_remove_probe(const char *name, void *probe, void *data)
360{ 369{
361 struct tracepoint_entry *entry; 370 struct tracepoint_entry *entry;
362 void *old; 371 struct tracepoint_func *old;
363 372
364 entry = get_tracepoint(name); 373 entry = get_tracepoint(name);
365 if (!entry) 374 if (!entry)
366 return ERR_PTR(-ENOENT); 375 return ERR_PTR(-ENOENT);
367 old = tracepoint_entry_remove_probe(entry, probe); 376 old = tracepoint_entry_remove_probe(entry, probe, data);
368 if (IS_ERR(old)) 377 if (IS_ERR(old))
369 return old; 378 return old;
370 if (!entry->refcount) 379 if (!entry->refcount)
@@ -382,12 +391,12 @@ static void *tracepoint_remove_probe(const char *name, void *probe)
382 * itself uses stop_machine(), which insures that every preempt disabled section 391 * itself uses stop_machine(), which insures that every preempt disabled section
383 * have finished. 392 * have finished.
384 */ 393 */
385int tracepoint_probe_unregister(const char *name, void *probe) 394int tracepoint_probe_unregister(const char *name, void *probe, void *data)
386{ 395{
387 void *old; 396 struct tracepoint_func *old;
388 397
389 mutex_lock(&tracepoints_mutex); 398 mutex_lock(&tracepoints_mutex);
390 old = tracepoint_remove_probe(name, probe); 399 old = tracepoint_remove_probe(name, probe, data);
391 mutex_unlock(&tracepoints_mutex); 400 mutex_unlock(&tracepoints_mutex);
392 if (IS_ERR(old)) 401 if (IS_ERR(old))
393 return PTR_ERR(old); 402 return PTR_ERR(old);
@@ -418,12 +427,13 @@ static void tracepoint_add_old_probes(void *old)
418 * 427 *
419 * caller must call tracepoint_probe_update_all() 428 * caller must call tracepoint_probe_update_all()
420 */ 429 */
421int tracepoint_probe_register_noupdate(const char *name, void *probe) 430int tracepoint_probe_register_noupdate(const char *name, void *probe,
431 void *data)
422{ 432{
423 void *old; 433 struct tracepoint_func *old;
424 434
425 mutex_lock(&tracepoints_mutex); 435 mutex_lock(&tracepoints_mutex);
426 old = tracepoint_add_probe(name, probe); 436 old = tracepoint_add_probe(name, probe, data);
427 if (IS_ERR(old)) { 437 if (IS_ERR(old)) {
428 mutex_unlock(&tracepoints_mutex); 438 mutex_unlock(&tracepoints_mutex);
429 return PTR_ERR(old); 439 return PTR_ERR(old);
@@ -441,12 +451,13 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate);
441 * 451 *
442 * caller must call tracepoint_probe_update_all() 452 * caller must call tracepoint_probe_update_all()
443 */ 453 */
444int tracepoint_probe_unregister_noupdate(const char *name, void *probe) 454int tracepoint_probe_unregister_noupdate(const char *name, void *probe,
455 void *data)
445{ 456{
446 void *old; 457 struct tracepoint_func *old;
447 458
448 mutex_lock(&tracepoints_mutex); 459 mutex_lock(&tracepoints_mutex);
449 old = tracepoint_remove_probe(name, probe); 460 old = tracepoint_remove_probe(name, probe, data);
450 if (IS_ERR(old)) { 461 if (IS_ERR(old)) {
451 mutex_unlock(&tracepoints_mutex); 462 mutex_unlock(&tracepoints_mutex);
452 return PTR_ERR(old); 463 return PTR_ERR(old);
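[Note] kernel/tracepoint.c now stores struct tracepoint_func pairs instead of bare function pointers: the array is still terminated by a NULL .func, and both the duplicate check and removal compare the (func, data) pair, so one function can be attached several times with different data. The call side lives in the DECLARE_TRACE machinery rather than in this file, so the dispatcher below is only a sketch of how such an array would be walked, with the registered data handed to each probe first:

/* Illustrative dispatcher over a NULL-terminated tracepoint_func array. */
typedef void (*my_probe_t)(void *data, int arg);	/* made-up probe signature */

static void my_call_probes(struct tracepoint_func *funcs, int arg)
{
	struct tracepoint_func *it;

	for (it = funcs; it && it->func; it++)
		((my_probe_t)it->func)(it->data, arg);
}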
diff --git a/kernel/user.c b/kernel/user.c
index 766467b3bcb7..7e72614b736d 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,7 +16,6 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19#include "cred-internals.h"
20 19
21struct user_namespace init_user_ns = { 20struct user_namespace init_user_ns = {
22 .kref = { 21 .kref = {
@@ -137,9 +136,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
137 struct hlist_head *hashent = uidhashentry(ns, uid); 136 struct hlist_head *hashent = uidhashentry(ns, uid);
138 struct user_struct *up, *new; 137 struct user_struct *up, *new;
139 138
140 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
141 * atomic.
142 */
143 spin_lock_irq(&uidhash_lock); 139 spin_lock_irq(&uidhash_lock);
144 up = uid_hash_find(uid, hashent); 140 up = uid_hash_find(uid, hashent);
145 spin_unlock_irq(&uidhash_lock); 141 spin_unlock_irq(&uidhash_lock);
@@ -161,11 +157,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
161 spin_lock_irq(&uidhash_lock); 157 spin_lock_irq(&uidhash_lock);
162 up = uid_hash_find(uid, hashent); 158 up = uid_hash_find(uid, hashent);
163 if (up) { 159 if (up) {
164 /* This case is not possible when CONFIG_USER_SCHED
165 * is defined, since we serialize alloc_uid() using
166 * uids_mutex. Hence no need to call
167 * sched_destroy_user() or remove_user_sysfs_dir().
168 */
169 key_put(new->uid_keyring); 160 key_put(new->uid_keyring);
170 key_put(new->session_keyring); 161 key_put(new->session_keyring);
171 kmem_cache_free(uid_cachep, new); 162 kmem_cache_free(uid_cachep, new);
@@ -178,8 +169,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
178 169
179 return up; 170 return up;
180 171
181 put_user_ns(new->user_ns);
182 kmem_cache_free(uid_cachep, new);
183out_unlock: 172out_unlock:
184 return NULL; 173 return NULL;
185} 174}
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index cf208d8042b1..ad41529fb60f 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -172,12 +172,12 @@ out:
172 return; 172 return;
173} 173}
174 174
175static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) 175static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location)
176{ 176{
177 trace_drop_common(skb, location); 177 trace_drop_common(skb, location);
178} 178}
179 179
180static void trace_napi_poll_hit(struct napi_struct *napi) 180static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi)
181{ 181{
182 struct dm_hw_stat_delta *new_stat; 182 struct dm_hw_stat_delta *new_stat;
183 183
@@ -225,12 +225,12 @@ static int set_all_monitor_traces(int state)
225 225
226 switch (state) { 226 switch (state) {
227 case TRACE_ON: 227 case TRACE_ON:
228 rc |= register_trace_kfree_skb(trace_kfree_skb_hit); 228 rc |= register_trace_kfree_skb(trace_kfree_skb_hit, NULL);
229 rc |= register_trace_napi_poll(trace_napi_poll_hit); 229 rc |= register_trace_napi_poll(trace_napi_poll_hit, NULL);
230 break; 230 break;
231 case TRACE_OFF: 231 case TRACE_OFF:
232 rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit); 232 rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit, NULL);
233 rc |= unregister_trace_napi_poll(trace_napi_poll_hit); 233 rc |= unregister_trace_napi_poll(trace_napi_poll_hit, NULL);
234 234
235 tracepoint_synchronize_unregister(); 235 tracepoint_synchronize_unregister();
236 236
diff --git a/samples/tracepoints/tp-samples-trace.h b/samples/tracepoints/tp-samples-trace.h
index dffdc49878af..4d46be965961 100644
--- a/samples/tracepoints/tp-samples-trace.h
+++ b/samples/tracepoints/tp-samples-trace.h
@@ -7,7 +7,5 @@
7DECLARE_TRACE(subsys_event, 7DECLARE_TRACE(subsys_event,
8 TP_PROTO(struct inode *inode, struct file *file), 8 TP_PROTO(struct inode *inode, struct file *file),
9 TP_ARGS(inode, file)); 9 TP_ARGS(inode, file));
10DECLARE_TRACE(subsys_eventb, 10DECLARE_TRACE_NOARGS(subsys_eventb);
11 TP_PROTO(void),
12 TP_ARGS());
13#endif 11#endif
diff --git a/samples/tracepoints/tracepoint-probe-sample.c b/samples/tracepoints/tracepoint-probe-sample.c
index 9e60eb6ca2d8..744c0b9652a7 100644
--- a/samples/tracepoints/tracepoint-probe-sample.c
+++ b/samples/tracepoints/tracepoint-probe-sample.c
@@ -13,7 +13,8 @@
13 * Here the caller only guarantees locking for struct file and struct inode. 13 * Here the caller only guarantees locking for struct file and struct inode.
14 * Locking must therefore be done in the probe to use the dentry. 14 * Locking must therefore be done in the probe to use the dentry.
15 */ 15 */
16static void probe_subsys_event(struct inode *inode, struct file *file) 16static void probe_subsys_event(void *ignore,
17 struct inode *inode, struct file *file)
17{ 18{
18 path_get(&file->f_path); 19 path_get(&file->f_path);
19 dget(file->f_path.dentry); 20 dget(file->f_path.dentry);
@@ -23,7 +24,7 @@ static void probe_subsys_event(struct inode *inode, struct file *file)
23 path_put(&file->f_path); 24 path_put(&file->f_path);
24} 25}
25 26
26static void probe_subsys_eventb(void) 27static void probe_subsys_eventb(void *ignore)
27{ 28{
28 printk(KERN_INFO "Event B is encountered\n"); 29 printk(KERN_INFO "Event B is encountered\n");
29} 30}
@@ -32,9 +33,9 @@ static int __init tp_sample_trace_init(void)
32{ 33{
33 int ret; 34 int ret;
34 35
35 ret = register_trace_subsys_event(probe_subsys_event); 36 ret = register_trace_subsys_event(probe_subsys_event, NULL);
36 WARN_ON(ret); 37 WARN_ON(ret);
37 ret = register_trace_subsys_eventb(probe_subsys_eventb); 38 ret = register_trace_subsys_eventb(probe_subsys_eventb, NULL);
38 WARN_ON(ret); 39 WARN_ON(ret);
39 40
40 return 0; 41 return 0;
@@ -44,8 +45,8 @@ module_init(tp_sample_trace_init);
44 45
45static void __exit tp_sample_trace_exit(void) 46static void __exit tp_sample_trace_exit(void)
46{ 47{
47 unregister_trace_subsys_eventb(probe_subsys_eventb); 48 unregister_trace_subsys_eventb(probe_subsys_eventb, NULL);
48 unregister_trace_subsys_event(probe_subsys_event); 49 unregister_trace_subsys_event(probe_subsys_event, NULL);
49 tracepoint_synchronize_unregister(); 50 tracepoint_synchronize_unregister();
50} 51}
51 52
diff --git a/samples/tracepoints/tracepoint-probe-sample2.c b/samples/tracepoints/tracepoint-probe-sample2.c
index be2a960573f1..9fcf990e5d4b 100644
--- a/samples/tracepoints/tracepoint-probe-sample2.c
+++ b/samples/tracepoints/tracepoint-probe-sample2.c
@@ -12,7 +12,8 @@
12 * Here the caller only guarantees locking for struct file and struct inode. 12 * Here the caller only guarantees locking for struct file and struct inode.
13 * Locking must therefore be done in the probe to use the dentry. 13 * Locking must therefore be done in the probe to use the dentry.
14 */ 14 */
15static void probe_subsys_event(struct inode *inode, struct file *file) 15static void probe_subsys_event(void *ignore,
16 struct inode *inode, struct file *file)
16{ 17{
17 printk(KERN_INFO "Event is encountered with inode number %lu\n", 18 printk(KERN_INFO "Event is encountered with inode number %lu\n",
18 inode->i_ino); 19 inode->i_ino);
@@ -22,7 +23,7 @@ static int __init tp_sample_trace_init(void)
22{ 23{
23 int ret; 24 int ret;
24 25
25 ret = register_trace_subsys_event(probe_subsys_event); 26 ret = register_trace_subsys_event(probe_subsys_event, NULL);
26 WARN_ON(ret); 27 WARN_ON(ret);
27 28
28 return 0; 29 return 0;
@@ -32,7 +33,7 @@ module_init(tp_sample_trace_init);
32 33
33static void __exit tp_sample_trace_exit(void) 34static void __exit tp_sample_trace_exit(void)
34{ 35{
35 unregister_trace_subsys_event(probe_subsys_event); 36 unregister_trace_subsys_event(probe_subsys_event, NULL);
36 tracepoint_synchronize_unregister(); 37 tracepoint_synchronize_unregister();
37} 38}
38 39
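[Note] Both samples pass NULL as the new data argument, so their probes ignore the first parameter. Since kernel/tracepoint.c now keys registrations on the (probe, data) pair, one probe body can be attached more than once and told apart by its cookie; a closing sketch along those lines, with the tags and names being illustrative:

#include <linux/module.h>
#include <linux/fs.h>			/* struct inode, struct file */
#include "tp-samples-trace.h"

static char my_tag_a[] = "first";
static char my_tag_b[] = "second";

/* One probe body, registered twice with different cookies. */
static void probe_subsys_event_tagged(void *data,
				      struct inode *inode, struct file *file)
{
	printk(KERN_INFO "%s: event on inode %lu\n",
	       (char *)data, inode->i_ino);
}

static int __init my_tp_init(void)
{
	int ret;

	ret = register_trace_subsys_event(probe_subsys_event_tagged, my_tag_a);
	WARN_ON(ret);
	ret = register_trace_subsys_event(probe_subsys_event_tagged, my_tag_b);
	WARN_ON(ret);
	return 0;
}

Teardown would pass the same data pointer for each instance to unregister_trace_subsys_event() and then call tracepoint_synchronize_unregister(), as the samples above do.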