30 files changed, 1295 insertions, 2051 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 3d9c7e27e3f9..7c9b0a585502 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -87,7 +87,6 @@ obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
-obj-$(CONFIG_MARKERS) += marker.o
 obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
 obj-$(CONFIG_LATENCYTOP) += latencytop.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace/
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index abb6e17505e2..ead9b610aa71 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -15,6 +15,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/taskstats.h>
 #include <linux/time.h>
 #include <linux/sysctl.h>
 #include <linux/delayacct.h>
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 05071bf6a37b..c03f221fee44 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -48,37 +48,6 @@
 #include <asm/uaccess.h>
-/**
- * ktime_get - get the monotonic time in ktime_t format
- *
- * returns the time in ktime_t format
- */
-ktime_t ktime_get(void)
-{
-        struct timespec now;
-        ktime_get_ts(&now);
-        return timespec_to_ktime(now);
-}
-EXPORT_SYMBOL_GPL(ktime_get);
-/**
- * ktime_get_real - get the real (wall-) time in ktime_t format
- *
- * returns the time in ktime_t format
- */
-ktime_t ktime_get_real(void)
-{
-        struct timespec now;
-        getnstimeofday(&now);
-        return timespec_to_ktime(now);
-}
-EXPORT_SYMBOL_GPL(ktime_get_real);
 /*
 * The timer bases:
 *
@@ -106,31 +75,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
        }
 };
-/**
- * ktime_get_ts - get the monotonic clock in timespec format
- * @ts:         pointer to timespec variable
- *
- * The function calculates the monotonic clock from the realtime
- * clock and the wall_to_monotonic offset and stores the result
- * in normalized timespec format in the variable pointed to by @ts.
- */
-void ktime_get_ts(struct timespec *ts)
-{
-        struct timespec tomono;
-        unsigned long seq;
-        do {
-                seq = read_seqbegin(&xtime_lock);
-                getnstimeofday(ts);
-                tomono = wall_to_monotonic;
-        } while (read_seqretry(&xtime_lock, seq));
-        set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
-                                ts->tv_nsec + tomono.tv_nsec);
-}
-EXPORT_SYMBOL_GPL(ktime_get_ts);
 /*
 * Get the coarse grained time at the softirq based on xtime and
 * wall_to_monotonic.
@@ -1155,7 +1099,6 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
                clock_id = CLOCK_MONOTONIC;
        timer->base = &cpu_base->clock_base[clock_id];
-        INIT_LIST_HEAD(&timer->cb_entry);
        hrtimer_init_timer_hres(timer);
 #ifdef CONFIG_TIMER_STATS
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 26539e3228e5..3765ff3c1bbe 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -117,7 +117,7 @@ EXPORT_SYMBOL(kfifo_free);
 * writer, you don't need extra locking to use these functions.
 */
 unsigned int __kfifo_put(struct kfifo *fifo,
-                         unsigned char *buffer, unsigned int len)
+                        const unsigned char *buffer, unsigned int len)
 {
        unsigned int l;
diff --git a/kernel/marker.c b/kernel/marker.c
deleted file mode 100644
index ea54f2647868..000000000000
--- a/kernel/marker.c
+++ /dev/null
@@ -1,930 +0,0 @@
-/*
- * Copyright (C) 2007 Mathieu Desnoyers
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- */
-#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/types.h>
-#include <linux/jhash.h>
-#include <linux/list.h>
-#include <linux/rcupdate.h>
-#include <linux/marker.h>
-#include <linux/err.h>
-#include <linux/slab.h>
-extern struct marker __start___markers[];
-extern struct marker __stop___markers[];
-/* Set to 1 to enable marker debug output */
-static const int marker_debug;
-/*
- * markers_mutex nests inside module_mutex. Markers mutex protects the builtin
- * and module markers and the hash table.
- */
-static DEFINE_MUTEX(markers_mutex);
-/*
- * Marker hash table, containing the active markers.
- * Protected by module_mutex.
- */
-#define MARKER_HASH_BITS 6
-#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
-static struct hlist_head marker_table[MARKER_TABLE_SIZE];
-/*
- * Note about RCU :
- * It is used to make sure every handler has finished using its private data
- * between two consecutive operation (add or remove) on a given marker.  It is
- * also used to delay the free of multiple probes array until a quiescent state
- * is reached.
- * marker entries modifications are protected by the markers_mutex.
- */
-struct marker_entry {
-        struct hlist_node hlist;
-        char *format;
-                        /* Probe wrapper */
-        void (*call)(const struct marker *mdata, void *call_private, ...);
-        struct marker_probe_closure single;
-        struct marker_probe_closure *multi;
-        int refcount;   /* Number of times armed. 0 if disarmed. */
-        struct rcu_head rcu;
-        void *oldptr;
-        int rcu_pending;
-        unsigned char ptype:1;
-        unsigned char format_allocated:1;
-        char name[0];   /* Contains name'\0'format'\0' */
-};
-/**
- * __mark_empty_function - Empty probe callback
- * @probe_private: probe private data
- * @call_private: call site private data
- * @fmt: format string
- * @...: variable argument list
- *
- * Empty callback provided as a probe to the markers. By providing this to a
- * disabled marker, we make sure the  execution flow is always valid even
- * though the function pointer change and the marker enabling are two distinct
- * operations that modifies the execution flow of preemptible code.
- */
-notrace void __mark_empty_function(void *probe_private, void *call_private,
-        const char *fmt, va_list *args)
-{
-}
-EXPORT_SYMBOL_GPL(__mark_empty_function);
-/*
- * marker_probe_cb Callback that prepares the variable argument list for probes.
- * @mdata: pointer of type struct marker
- * @call_private: caller site private data
- * @...:  Variable argument list.
- *
- * Since we do not use "typical" pointer based RCU in the 1 argument case, we
- * need to put a full smp_rmb() in this branch. This is why we do not use
- * rcu_dereference() for the pointer read.
- */
-notrace void marker_probe_cb(const struct marker *mdata,
-                void *call_private, ...)
-{
-        va_list args;
-        char ptype;
-        /*
-         * rcu_read_lock_sched does two things : disabling preemption to make
-         * sure the teardown of the callbacks can be done correctly when they
-         * are in modules and they insure RCU read coherency.
-         */
-        rcu_read_lock_sched_notrace();
-        ptype = mdata->ptype;
-        if (likely(!ptype)) {
-                marker_probe_func *func;
-                /* Must read the ptype before ptr. They are not data dependant,
-                 * so we put an explicit smp_rmb() here. */
-                smp_rmb();
-                func = mdata->single.func;
-                /* Must read the ptr before private data. They are not data
-                 * dependant, so we put an explicit smp_rmb() here. */
-                smp_rmb();
-                va_start(args, call_private);
-                func(mdata->single.probe_private, call_private, mdata->format,
-                        &args);
-                va_end(args);
-        } else {
-                struct marker_probe_closure *multi;
-                int i;
-                /*
-                 * Read mdata->ptype before mdata->multi.
-                 */
-                smp_rmb();
-                multi = mdata->multi;
-                /*
-                 * multi points to an array, therefore accessing the array
-                 * depends on reading multi. However, even in this case,
-                 * we must insure that the pointer is read _before_ the array
-                 * data. Same as rcu_dereference, but we need a full smp_rmb()
-                 * in the fast path, so put the explicit barrier here.
-                 */
-                smp_read_barrier_depends();
-                for (i = 0; multi[i].func; i++) {
-                        va_start(args, call_private);
-                        multi[i].func(multi[i].probe_private, call_private,
-                                mdata->format, &args);
-                        va_end(args);
-                }
-        }
-        rcu_read_unlock_sched_notrace();
-}
-EXPORT_SYMBOL_GPL(marker_probe_cb);
-/*
- * marker_probe_cb Callback that does not prepare the variable argument list.
- * @mdata: pointer of type struct marker
- * @call_private: caller site private data
- * @...:  Variable argument list.
- *
- * Should be connected to markers "MARK_NOARGS".
- */
-static notrace void marker_probe_cb_noarg(const struct marker *mdata,
-                void *call_private, ...)
-{
-        va_list args;   /* not initialized */
-        char ptype;
-        rcu_read_lock_sched_notrace();
-        ptype = mdata->ptype;
-        if (likely(!ptype)) {
-                marker_probe_func *func;
-                /* Must read the ptype before ptr. They are not data dependant,
-                 * so we put an explicit smp_rmb() here. */
-                smp_rmb();
-                func = mdata->single.func;
-                /* Must read the ptr before private data. They are not data
-                 * dependant, so we put an explicit smp_rmb() here. */
-                smp_rmb();
-                func(mdata->single.probe_private, call_private, mdata->format,
-                        &args);
-        } else {
-                struct marker_probe_closure *multi;
-                int i;
-                /*
-                 * Read mdata->ptype before mdata->multi.
-                 */
-                smp_rmb();
-                multi = mdata->multi;
-                /*
-                 * multi points to an array, therefore accessing the array
-                 * depends on reading multi. However, even in this case,
-                 * we must insure that the pointer is read _before_ the array
-                 * data. Same as rcu_dereference, but we need a full smp_rmb()
-                 * in the fast path, so put the explicit barrier here.
-                 */
-                smp_read_barrier_depends();
-                for (i = 0; multi[i].func; i++)
-                        multi[i].func(multi[i].probe_private, call_private,
-                                mdata->format, &args);
-        }
-        rcu_read_unlock_sched_notrace();
-}
-static void free_old_closure(struct rcu_head *head)
-{
-        struct marker_entry *entry = container_of(head,
-                struct marker_entry, rcu);
-        kfree(entry->oldptr);
-        /* Make sure we free the data before setting the pending flag to 0 */
-        smp_wmb();
-        entry->rcu_pending = 0;
-}
-static void debug_print_probes(struct marker_entry *entry)
-{
-        int i;
-        if (!marker_debug)
-                return;
-        if (!entry->ptype) {
-                printk(KERN_DEBUG "Single probe : %p %p\n",
-                        entry->single.func,
-                        entry->single.probe_private);
-        } else {
-                for (i = 0; entry->multi[i].func; i++)
-                        printk(KERN_DEBUG "Multi probe %d : %p %p\n", i,
-                                entry->multi[i].func,
-                                entry->multi[i].probe_private);
-        }
-}
-static struct marker_probe_closure *
-marker_entry_add_probe(struct marker_entry *entry,
-                marker_probe_func *probe, void *probe_private)
-{
-        int nr_probes = 0;
-        struct marker_probe_closure *old, *new;
-        WARN_ON(!probe);
-        debug_print_probes(entry);
-        old = entry->multi;
-        if (!entry->ptype) {
-                if (entry->single.func == probe &&
-                                entry->single.probe_private == probe_private)
-                        return ERR_PTR(-EBUSY);
-                if (entry->single.func == __mark_empty_function) {
-                        /* 0 -> 1 probes */
-                        entry->single.func = probe;
-                        entry->single.probe_private = probe_private;
-                        entry->refcount = 1;
-                        entry->ptype = 0;
-                        debug_print_probes(entry);
-                        return NULL;
-                } else {
-                        /* 1 -> 2 probes */
-                        nr_probes = 1;
-                        old = NULL;
-                }
-        } else {
-                /* (N -> N+1), (N != 0, 1) probes */
-                for (nr_probes = 0; old[nr_probes].func; nr_probes++)
-                        if (old[nr_probes].func == probe
-                                        && old[nr_probes].probe_private
-                                                == probe_private)
-                                return ERR_PTR(-EBUSY);
-        }
-        /* + 2 : one for new probe, one for NULL func */
-        new = kzalloc((nr_probes + 2) * sizeof(struct marker_probe_closure),
-                        GFP_KERNEL);
-        if (new == NULL)
-                return ERR_PTR(-ENOMEM);
-        if (!old)
-                new[0] = entry->single;
-        else
-                memcpy(new, old,
-                        nr_probes * sizeof(struct marker_probe_closure));
-        new[nr_probes].func = probe;
-        new[nr_probes].probe_private = probe_private;
-        entry->refcount = nr_probes + 1;
-        entry->multi = new;
-        entry->ptype = 1;
-        debug_print_probes(entry);
-        return old;
-}
-static struct marker_probe_closure *
-marker_entry_remove_probe(struct marker_entry *entry,
-                marker_probe_func *probe, void *probe_private)
-{
-        int nr_probes = 0, nr_del = 0, i;
-        struct marker_probe_closure *old, *new;
-        old = entry->multi;
-        debug_print_probes(entry);
-        if (!entry->ptype) {
-                /* 0 -> N is an error */
-                WARN_ON(entry->single.func == __mark_empty_function);
-                /* 1 -> 0 probes */
-                WARN_ON(probe && entry->single.func != probe);
-                WARN_ON(entry->single.probe_private != probe_private);
-                entry->single.func = __mark_empty_function;
-                entry->refcount = 0;
-                entry->ptype = 0;
-                debug_print_probes(entry);
-                return NULL;
-        } else {
-                /* (N -> M), (N > 1, M >= 0) probes */
-                for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
-                        if ((!probe || old[nr_probes].func == probe)
-                                        && old[nr_probes].probe_private
-                                                == probe_private)
-                                nr_del++;
-                }
-        }
-        if (nr_probes - nr_del == 0) {
-                /* N -> 0, (N > 1) */
-                entry->single.func = __mark_empty_function;
-                entry->refcount = 0;
-                entry->ptype = 0;
-        } else if (nr_probes - nr_del == 1) {
-                /* N -> 1, (N > 1) */
-                for (i = 0; old[i].func; i++)
-                        if ((probe && old[i].func != probe) ||
-                                        old[i].probe_private != probe_private)
-                                entry->single = old[i];
-                entry->refcount = 1;
-                entry->ptype = 0;
-        } else {
-                int j = 0;
-                /* N -> M, (N > 1, M > 1) */
-                /* + 1 for NULL */
-                new = kzalloc((nr_probes - nr_del + 1)
-                        * sizeof(struct marker_probe_closure), GFP_KERNEL);
-                if (new == NULL)
-                        return ERR_PTR(-ENOMEM);
-                for (i = 0; old[i].func; i++)
-                        if ((probe && old[i].func != probe) ||
-                                        old[i].probe_private != probe_private)
-                                new[j++] = old[i];
-                entry->refcount = nr_probes - nr_del;
-                entry->ptype = 1;
-                entry->multi = new;
-        }
-        debug_print_probes(entry);
-        return old;
-}
-/*
- * Get marker if the marker is present in the marker hash table.
- * Must be called with markers_mutex held.
- * Returns NULL if not present.
- */
-static struct marker_entry *get_marker(const char *name)
-{
-        struct hlist_head *head;
-        struct hlist_node *node;
-        struct marker_entry *e;
-        u32 hash = jhash(name, strlen(name), 0);
-        head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
-        hlist_for_each_entry(e, node, head, hlist) {
-                if (!strcmp(name, e->name))
-                        return e;
-        }
-        return NULL;
-}
-/*
- * Add the marker to the marker hash table. Must be called with markers_mutex
- * held.
- */
-static struct marker_entry *add_marker(const char *name, const char *format)
-{
-        struct hlist_head *head;
-        struct hlist_node *node;
-        struct marker_entry *e;
-        size_t name_len = strlen(name) + 1;
-        size_t format_len = 0;
-        u32 hash = jhash(name, name_len-1, 0);
-        if (format)
-                format_len = strlen(format) + 1;
-        head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
-        hlist_for_each_entry(e, node, head, hlist) {
-                if (!strcmp(name, e->name)) {
-                        printk(KERN_NOTICE
-                                "Marker %s busy\n", name);
-                        return ERR_PTR(-EBUSY); /* Already there */
-                }
-        }
-        /*
-         * Using kmalloc here to allocate a variable length element. Could
-         * cause some memory fragmentation if overused.
-         */
-        e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
-                        GFP_KERNEL);
-        if (!e)
-                return ERR_PTR(-ENOMEM);
-        memcpy(&e->name[0], name, name_len);
-        if (format) {
-                e->format = &e->name[name_len];
-                memcpy(e->format, format, format_len);
-                if (strcmp(e->format, MARK_NOARGS) == 0)
-                        e->call = marker_probe_cb_noarg;
-                else
-                        e->call = marker_probe_cb;
-                trace_mark(core_marker_format, "name %s format %s",
-                                e->name, e->format);
-        } else {
-                e->format = NULL;
-                e->call = marker_probe_cb;
-        }
-        e->single.func = __mark_empty_function;
-        e->single.probe_private = NULL;
-        e->multi = NULL;
-        e->ptype = 0;
-        e->format_allocated = 0;
-        e->refcount = 0;
-        e->rcu_pending = 0;
-        hlist_add_head(&e->hlist, head);
-        return e;
-}
-/*
- * Remove the marker from the marker hash table. Must be called with mutex_lock
- * held.
- */
-static int remove_marker(const char *name)
-{
-        struct hlist_head *head;
-        struct hlist_node *node;
-        struct marker_entry *e;
-        int found = 0;
-        size_t len = strlen(name) + 1;
-        u32 hash = jhash(name, len-1, 0);
-        head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
-        hlist_for_each_entry(e, node, head, hlist) {
-                if (!strcmp(name, e->name)) {
-                        found = 1;
-                        break;
-                }
-        }
-        if (!found)
-                return -ENOENT;
-        if (e->single.func != __mark_empty_function)
-                return -EBUSY;
-        hlist_del(&e->hlist);
-        if (e->format_allocated)
-                kfree(e->format);
-        /* Make sure the call_rcu has been executed */
-        if (e->rcu_pending)
-                rcu_barrier_sched();
-        kfree(e);
-        return 0;
-}
-/*
- * Set the mark_entry format to the format found in the element.
- */
-static int marker_set_format(struct marker_entry *entry, const char *format)
-{
-        entry->format = kstrdup(format, GFP_KERNEL);
-        if (!entry->format)
-                return -ENOMEM;
-        entry->format_allocated = 1;
-        trace_mark(core_marker_format, "name %s format %s",
-                        entry->name, entry->format);
-        return 0;
-}
-/*
- * Sets the probe callback corresponding to one marker.
- */
-static int set_marker(struct marker_entry *entry, struct marker *elem,
-                int active)
-{
-        int ret = 0;
-        WARN_ON(strcmp(entry->name, elem->name) != 0);
-        if (entry->format) {
-                if (strcmp(entry->format, elem->format) != 0) {
-                        printk(KERN_NOTICE
-                                "Format mismatch for probe %s "
-                                "(%s), marker (%s)\n",
-                                entry->name,
-                                entry->format,
-                                elem->format);
-                        return -EPERM;
-                }
-        } else {
-                ret = marker_set_format(entry, elem->format);
-                if (ret)
-                        return ret;
-        }
-        /*
-         * probe_cb setup (statically known) is done here. It is
-         * asynchronous with the rest of execution, therefore we only
-         * pass from a "safe" callback (with argument) to an "unsafe"
-         * callback (does not set arguments).
-         */
-        elem->call = entry->call;
-        /*
-         * Sanity check :
-         * We only update the single probe private data when the ptr is
-         * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1)
-         */
-        WARN_ON(elem->single.func != __mark_empty_function
-                && elem->single.probe_private != entry->single.probe_private
-                && !elem->ptype);
-        elem->single.probe_private = entry->single.probe_private;
-        /*
-         * Make sure the private data is valid when we update the
-         * single probe ptr.
-         */
-        smp_wmb();
-        elem->single.func = entry->single.func;
-        /*
-         * We also make sure that the new probe callbacks array is consistent
-         * before setting a pointer to it.
-         */
-        rcu_assign_pointer(elem->multi, entry->multi);
-        /*
-         * Update the function or multi probe array pointer before setting the
-         * ptype.
-         */
-        smp_wmb();
-        elem->ptype = entry->ptype;
-        if (elem->tp_name && (active ^ elem->state)) {
-                WARN_ON(!elem->tp_cb);
-                /*
-                 * It is ok to directly call the probe registration because type
-                 * checking has been done in the __trace_mark_tp() macro.
-                 */
-                if (active) {
-                        /*
-                         * try_module_get should always succeed because we hold
-                         * lock_module() to get the tp_cb address.
-                         */
-                        ret = try_module_get(__module_text_address(
-                                (unsigned long)elem->tp_cb));
-                        BUG_ON(!ret);
-                        ret = tracepoint_probe_register_noupdate(
-                                elem->tp_name,
-                                elem->tp_cb);
-                } else {
-                        ret = tracepoint_probe_unregister_noupdate(
-                                elem->tp_name,
-                                elem->tp_cb);
-                        /*
-                         * tracepoint_probe_update_all() must be called
-                         * before the module containing tp_cb is unloaded.
-                         */
-                        module_put(__module_text_address(
-                                (unsigned long)elem->tp_cb));
-                }
-        }
-        elem->state = active;
-        return ret;
-}
-/*
- * Disable a marker and its probe callback.
- * Note: only waiting an RCU period after setting elem->call to the empty
- * function insures that the original callback is not used anymore. This insured
- * by rcu_read_lock_sched around the call site.
- */
-static void disable_marker(struct marker *elem)
-{
-        int ret;
-        /* leave "call" as is. It is known statically. */
-        if (elem->tp_name && elem->state) {
-                WARN_ON(!elem->tp_cb);
-                /*
-                 * It is ok to directly call the probe registration because type
-                 * checking has been done in the __trace_mark_tp() macro.
-                 */
-                ret = tracepoint_probe_unregister_noupdate(elem->tp_name,
-                        elem->tp_cb);
-                WARN_ON(ret);
-                /*
-                 * tracepoint_probe_update_all() must be called
-                 * before the module containing tp_cb is unloaded.
-                 */
-                module_put(__module_text_address((unsigned long)elem->tp_cb));
-        }
-        elem->state = 0;
-        elem->single.func = __mark_empty_function;
-        /* Update the function before setting the ptype */
-        smp_wmb();
-        elem->ptype = 0;        /* single probe */
-        /*
-         * Leave the private data and id there, because removal is racy and
-         * should be done only after an RCU period. These are never used until
-         * the next initialization anyway.
-         */
-}
-/**
- * marker_update_probe_range - Update a probe range
- * @begin: beginning of the range
- * @end: end of the range
- *
- * Updates the probe callback corresponding to a range of markers.
- */
-void marker_update_probe_range(struct marker *begin,
-        struct marker *end)
-{
-        struct marker *iter;
-        struct marker_entry *mark_entry;
-        mutex_lock(&markers_mutex);
-        for (iter = begin; iter < end; iter++) {
-                mark_entry = get_marker(iter->name);
-                if (mark_entry) {
-                        set_marker(mark_entry, iter, !!mark_entry->refcount);
-                        /*
-                         * ignore error, continue
-                         */
-                } else {
-                        disable_marker(iter);
-                }
-        }
-        mutex_unlock(&markers_mutex);
-}
-/*
- * Update probes, removing the faulty probes.
- *
- * Internal callback only changed before the first probe is connected to it.
- * Single probe private data can only be changed on 0 -> 1 and 2 -> 1
- * transitions.  All other transitions will leave the old private data valid.
- * This makes the non-atomicity of the callback/private data updates valid.
- *
- * "special case" updates :
- * 0 -> 1 callback
- * 1 -> 0 callback
- * 1 -> 2 callbacks
- * 2 -> 1 callbacks
- * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates.
- * Site effect : marker_set_format may delete the marker entry (creating a
- * replacement).
- */
-static void marker_update_probes(void)
-{
-        /* Core kernel markers */
-        marker_update_probe_range(__start___markers, __stop___markers);
-        /* Markers in modules. */
-        module_update_markers();
-        tracepoint_probe_update_all();
-}
-/**
- * marker_probe_register -  Connect a probe to a marker
- * @name: marker name
- * @format: format string
- * @probe: probe handler
- * @probe_private: probe private data
- *
- * private data must be a valid allocated memory address, or NULL.
- * Returns 0 if ok, error value on error.
- * The probe address must at least be aligned on the architecture pointer size.
- */
-int marker_probe_register(const char *name, const char *format,
-                        marker_probe_func *probe, void *probe_private)
-{
-        struct marker_entry *entry;
-        int ret = 0;
-        struct marker_probe_closure *old;
-        mutex_lock(&markers_mutex);
-        entry = get_marker(name);
-        if (!entry) {
-                entry = add_marker(name, format);
-                if (IS_ERR(entry))
-                        ret = PTR_ERR(entry);
-        } else if (format) {
-                if (!entry->format)
-                        ret = marker_set_format(entry, format);
-                else if (strcmp(entry->format, format))
-                        ret = -EPERM;
-        }
-        if (ret)
-                goto end;
-        /*
-         * If we detect that a call_rcu is pending for this marker,
-         * make sure it's executed now.
-         */
-        if (entry->rcu_pending)
-                rcu_barrier_sched();
-        old = marker_entry_add_probe(entry, probe, probe_private);
-        if (IS_ERR(old)) {
-                ret = PTR_ERR(old);
-                goto end;
-        }
-        mutex_unlock(&markers_mutex);
-        marker_update_probes();
-        mutex_lock(&markers_mutex);
-        entry = get_marker(name);
-        if (!entry)
-                goto end;
-        if (entry->rcu_pending)
-                rcu_barrier_sched();
-        entry->oldptr = old;
-        entry->rcu_pending = 1;
-        /* write rcu_pending before calling the RCU callback */
-        smp_wmb();
-        call_rcu_sched(&entry->rcu, free_old_closure);
-end:
-        mutex_unlock(&markers_mutex);
-        return ret;
-}
-EXPORT_SYMBOL_GPL(marker_probe_register);
-/**
- * marker_probe_unregister -  Disconnect a probe from a marker
- * @name: marker name
- * @probe: probe function pointer
- * @probe_private: probe private data
- *
- * Returns the private data given to marker_probe_register, or an ERR_PTR().
- * We do not need to call a synchronize_sched to make sure the probes have
- * finished running before doing a module unload, because the module unload
- * itself uses stop_machine(), which insures that every preempt disabled section
- * have finished.
- */
-int marker_probe_unregister(const char *name,
-        marker_probe_func *probe, void *probe_private)
-{
-        struct marker_entry *entry;
-        struct marker_probe_closure *old;
-        int ret = -ENOENT;
-        mutex_lock(&markers_mutex);
-        entry = get_marker(name);
-        if (!entry)
-                goto end;
-        if (entry->rcu_pending)
-                rcu_barrier_sched();
-        old = marker_entry_remove_probe(entry, probe, probe_private);
-        mutex_unlock(&markers_mutex);
-        marker_update_probes();
-        mutex_lock(&markers_mutex);
-        entry = get_marker(name);
-        if (!entry)
-                goto end;
-        if (entry->rcu_pending)
-                rcu_barrier_sched();
-        entry->oldptr = old;
-        entry->rcu_pending = 1;
-        /* write rcu_pending before calling the RCU callback */
-        smp_wmb();
-        call_rcu_sched(&entry->rcu, free_old_closure);
-        remove_marker(name);    /* Ignore busy error message */
-        ret = 0;
-end:
-        mutex_unlock(&markers_mutex);
-        return ret;
-}
-EXPORT_SYMBOL_GPL(marker_probe_unregister);
-static struct marker_entry *
-get_marker_from_private_data(marker_probe_func *probe, void *probe_private)
-{
-        struct marker_entry *entry;
-        unsigned int i;
-        struct hlist_head *head;
-        struct hlist_node *node;
-        for (i = 0; i < MARKER_TABLE_SIZE; i++) {
-                head = &marker_table[i];
-                hlist_for_each_entry(entry, node, head, hlist) {
-                        if (!entry->ptype) {
-                                if (entry->single.func == probe
-                                                && entry->single.probe_private
-                                                == probe_private)
-                                        return entry;
-                        } else {
-                                struct marker_probe_closure *closure;
-                                closure = entry->multi;
-                                for (i = 0; closure[i].func; i++) {
-                                        if (closure[i].func == probe &&
-                                                        closure[i].probe_private
-                                                        == probe_private)
-                                                return entry;
-                                }
-                        }
-                }
-        }
-        return NULL;
-}
-/**
- * marker_probe_unregister_private_data -  Disconnect a probe from a marker
- * @probe: probe function
- * @probe_private: probe private data
- *
- * Unregister a probe by providing the registered private data.
- * Only removes the first marker found in hash table.
- * Return 0 on success or error value.
- * We do not need to call a synchronize_sched to make sure the probes have
- * finished running before doing a module unload, because the module unload
- * itself uses stop_machine(), which insures that every preempt disabled section
- * have finished.
- */
-int marker_probe_unregister_private_data(marker_probe_func *probe,
-                void *probe_private)
-{
-        struct marker_entry *entry;
-        int ret = 0;
-        struct marker_probe_closure *old;
-        mutex_lock(&markers_mutex);
-        entry = get_marker_from_private_data(probe, probe_private);
-        if (!entry) {
-                ret = -ENOENT;
-                goto end;
-        }
-        if (entry->rcu_pending)
-                rcu_barrier_sched();
-        old = marker_entry_remove_probe(entry, NULL, probe_private);
-        mutex_unlock(&markers_mutex);
-        marker_update_probes();
-        mutex_lock(&markers_mutex);
-        entry = get_marker_from_private_data(probe, probe_private);
-        if (!entry)
-                goto end;
-        if (entry->rcu_pending)
-                rcu_barrier_sched();
-        entry->oldptr = old;
-        entry->rcu_pending = 1;
-        /* write rcu_pending before calling the RCU callback */
-        smp_wmb();
-        call_rcu_sched(&entry->rcu, free_old_closure);
-        remove_marker(entry->name);     /* Ignore busy error message */
-end:
-        mutex_unlock(&markers_mutex);
-        return ret;
-}
-EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);
-/**
- * marker_get_private_data - Get a marker's probe private data
- * @name: marker name
- * @probe: probe to match
- * @num: get the nth matching probe's private data
- *
- * Returns the nth private data pointer (starting from 0) matching, or an
- * ERR_PTR.
- * Returns the private data pointer, or an ERR_PTR.
- * The private data pointer should _only_ be dereferenced if the caller is the
- * owner of the data, or its content could vanish. This is mostly used to
- * confirm that a caller is the owner of a registered probe.
- */
-void *marker_get_private_data(const char *name, marker_probe_func *probe,
-                int num)
-{
-        struct hlist_head *head;
-        struct hlist_node *node;
-        struct marker_entry *e;
-        size_t name_len = strlen(name) + 1;
-        u32 hash = jhash(name, name_len-1, 0);
-        int i;
-        head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
-        hlist_for_each_entry(e, node, head, hlist) {
-                if (!strcmp(name, e->name)) {
-                        if (!e->ptype) {
-                                if (num == 0 && e->single.func == probe)
-                                        return e->single.probe_private;
-                        } else {
-                                struct marker_probe_closure *closure;
-                                int match = 0;
-                                closure = e->multi;
-                                for (i = 0; closure[i].func; i++) {
-                                        if (closure[i].func != probe)
-                                                continue;
-                                        if (match++ == num)
-                                                return closure[i].probe_private;
-                                }
-                        }
-                        break;
-                }
-        }
-        return ERR_PTR(-ENOENT);
-}
-EXPORT_SYMBOL_GPL(marker_get_private_data);
-#ifdef CONFIG_MODULES
-int marker_module_notify(struct notifier_block *self,
-                         unsigned long val, void *data)
-{
-        struct module *mod = data;
-        switch (val) {
-        case MODULE_STATE_COMING:
-                marker_update_probe_range(mod->markers,
-                        mod->markers + mod->num_markers);
-                break;
-        case MODULE_STATE_GOING:
-                marker_update_probe_range(mod->markers,
-                        mod->markers + mod->num_markers);
-                break;
-        }
-        return 0;
-}
-struct notifier_block marker_module_nb = {
-        .notifier_call = marker_module_notify,
-        .priority = 0,
-};
-static int init_markers(void)
-{
-        return register_module_notifier(&marker_module_nb);
-}
-__initcall(init_markers);
-#endif /* CONFIG_MODULES */
diff --git a/kernel/module.c b/kernel/module.c
index 05ce49ced8f6..b6ee424245dd 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2237,10 +2237,6 @@ static noinline struct module *load_module(void __user *umod,
                                  sizeof(*mod->ctors), &mod->num_ctors);
 #endif
-#ifdef CONFIG_MARKERS
-        mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers",
-                                    sizeof(*mod->markers), &mod->num_markers);
-#endif
 #ifdef CONFIG_TRACEPOINTS
        mod->tracepoints = section_objs(hdr, sechdrs, secstrings,
                                        "__tracepoints",
@@ -2958,20 +2954,6 @@ void module_layout(struct module *mod,
 EXPORT_SYMBOL(module_layout);
 #endif
-#ifdef CONFIG_MARKERS
-void module_update_markers(void)
-{
-        struct module *mod;
-        mutex_lock(&module_mutex);
-        list_for_each_entry(mod, &modules, list)
-                if (!mod->taints)
-                        marker_update_probe_range(mod->markers,
-                                mod->markers + mod->num_markers);
-        mutex_unlock(&module_mutex);
-}
-#endif
 #ifdef CONFIG_TRACEPOINTS
 void module_update_tracepoints(void)
 {
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 8cb94a52d1bb..cc768ab81ac8 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2176,6 +2176,13 @@ static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
        data->nr_pages = nr_pages;
        atomic_set(&data->lock, -1);
+        if (counter->attr.watermark) {
+                data->watermark = min_t(long, PAGE_SIZE * nr_pages,
+                                      counter->attr.wakeup_watermark);
+        }
+        if (!data->watermark)
+                data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4);
        rcu_assign_pointer(counter->data, data);
        return 0;
@@ -2315,7 +2322,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
        lock_limit >>= PAGE_SHIFT;
        locked = vma->vm_mm->locked_vm + extra;
-        if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
+        if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
+                !capable(CAP_IPC_LOCK)) {
                ret = -EPERM;
                goto unlock;
        }
@@ -2504,35 +2512,15 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 /*
 * Output
 */
+static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
-struct perf_output_handle {
+                              unsigned long offset, unsigned long head)
-        struct perf_counter     *counter;
-        struct perf_mmap_data   *data;
-        unsigned long           head;
-        unsigned long           offset;
-        int                     nmi;
-        int                     sample;
-        int                     locked;
-        unsigned long           flags;
-};
-static bool perf_output_space(struct perf_mmap_data *data,
-                              unsigned int offset, unsigned int head)
 {
-        unsigned long tail;
        unsigned long mask;
        if (!data->writable)
                return true;
        mask = (data->nr_pages << PAGE_SHIFT) - 1;
-        /*
-         * Userspace could choose to issue a mb() before updating the tail
-         * pointer. So that all reads will be completed before the write is
-         * issued.
-         */
-        tail = ACCESS_ONCE(data->user_page->data_tail);
-        smp_rmb();
        offset = (offset - tail) & mask;
        head   = (head   - tail) & mask;
@@ -2633,8 +2621,8 @@ out:
        local_irq_restore(handle->flags);
 }
-static void perf_output_copy(struct perf_output_handle *handle,
+void perf_output_copy(struct perf_output_handle *handle,
-                             const void *buf, unsigned int len)
+                      const void *buf, unsigned int len)
 {
        unsigned int pages_mask;
        unsigned int offset;
@@ -2669,16 +2657,13 @@ static void perf_output_copy(struct perf_output_handle *handle,
        WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
 }
-#define perf_output_put(handle, x) \
+int perf_output_begin(struct perf_output_handle *handle,
-        perf_output_copy((handle), &(x), sizeof(x))
+                      struct perf_counter *counter, unsigned int size,
+                      int nmi, int sample)
-static int perf_output_begin(struct perf_output_handle *handle,
-                             struct perf_counter *counter, unsigned int size,
-                             int nmi, int sample)
 {
        struct perf_counter *output_counter;
        struct perf_mmap_data *data;
-        unsigned int offset, head;
+        unsigned long tail, offset, head;
        int have_lost;
        struct {
                struct perf_event_header header;
@@ -2716,16 +2701,23 @@ static int perf_output_begin(struct perf_output_handle *handle,
        perf_output_lock(handle);
        do {
+                /*
+                 * Userspace could choose to issue a mb() before updating the
+                 * tail pointer, so that all reads will be completed before the
+                 * write is issued.
+                 */
+                tail = ACCESS_ONCE(data->user_page->data_tail);
+                smp_rmb();
                offset = head = atomic_long_read(&data->head);
                head += size;
-                if (unlikely(!perf_output_space(data, offset, head)))
+                if (unlikely(!perf_output_space(data, tail, offset, head)))
                        goto fail;
        } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
        handle->offset  = offset;
        handle->head    = head;
-        if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
+        if (head - tail > data->watermark)
                atomic_set(&data->wakeup, 1);
        if (have_lost) {
@@ -2749,7 +2741,7 @@ out:
        return -ENOSPC;
 }
-static void perf_output_end(struct perf_output_handle *handle)
+void perf_output_end(struct perf_output_handle *handle)
 {
        struct perf_counter *counter = handle->counter;
        struct perf_mmap_data *data = handle->data;
@@ -2863,156 +2855,176 @@ static void perf_output_read(struct perf_output_handle *handle,
                perf_output_read_one(handle, counter);
 }
-void perf_counter_output(struct perf_counter *counter, int nmi,
+void perf_output_sample(struct perf_output_handle *handle,
-                                struct perf_sample_data *data)
+                        struct perf_event_header *header,
+                        struct perf_sample_data *data,
+                        struct perf_counter *counter)
 {
-        int ret;
+        u64 sample_type = data->type;
-        u64 sample_type = counter->attr.sample_type;
-        struct perf_output_handle handle;
-        struct perf_event_header header;
-        u64 ip;
-        struct {
-                u32 pid, tid;
-        } tid_entry;
-        struct perf_callchain_entry *callchain = NULL;
-        int callchain_size = 0;
-        u64 time;
-        struct {
-                u32 cpu, reserved;
-        } cpu_entry;
-        header.type = PERF_EVENT_SAMPLE;
+        perf_output_put(handle, *header);
-        header.size = sizeof(header);
-        header.misc = 0;
+        if (sample_type & PERF_SAMPLE_IP)
-        header.misc |= perf_misc_flags(data->regs);
+                perf_output_put(handle, data->ip);
-        if (sample_type & PERF_SAMPLE_IP) {
-                ip = perf_instruction_pointer(data->regs);
-                header.size += sizeof(ip);
-        }
-        if (sample_type & PERF_SAMPLE_TID) {
-                /* namespace issues */
-                tid_entry.pid = perf_counter_pid(counter, current);
-                tid_entry.tid = perf_counter_tid(counter, current);
-                header.size += sizeof(tid_entry);
-        }
-        if (sample_type & PERF_SAMPLE_TIME) {
+        if (sample_type & PERF_SAMPLE_TID)
-                /*
+                perf_output_put(handle, data->tid_entry);
-                 * Maybe do better on x86 and provide cpu_clock_nmi()
-                 */
-                time = sched_clock();
-                header.size += sizeof(u64);
+        if (sample_type & PERF_SAMPLE_TIME)
-        }
+                perf_output_put(handle, data->time);
        if (sample_type & PERF_SAMPLE_ADDR)
-                header.size += sizeof(u64);
+                perf_output_put(handle, data->addr);
        if (sample_type & PERF_SAMPLE_ID)
-                header.size += sizeof(u64);
+                perf_output_put(handle, data->id);
        if (sample_type & PERF_SAMPLE_STREAM_ID)
-                header.size += sizeof(u64);
+                perf_output_put(handle, data->stream_id);
-        if (sample_type & PERF_SAMPLE_CPU) {
-                header.size += sizeof(cpu_entry);
-                cpu_entry.cpu = raw_smp_processor_id();
+        if (sample_type & PERF_SAMPLE_CPU)
-                cpu_entry.reserved = 0;
+                perf_output_put(handle, data->cpu_entry);
-        }
        if (sample_type & PERF_SAMPLE_PERIOD)
-                header.size += sizeof(u64);
+                perf_output_put(handle, data->period);
        if (sample_type & PERF_SAMPLE_READ)
-                header.size += perf_counter_read_size(counter);
+                perf_output_read(handle, counter);
        if (sample_type & PERF_SAMPLE_CALLCHAIN) {
-                callchain = perf_callchain(data->regs);
+                if (data->callchain) {
+                        int size = 1;
-                if (callchain) {
+                        if (data->callchain)
-                        callchain_size = (1 + callchain->nr) * sizeof(u64);
+                                size += data->callchain->nr;
-                        header.size += callchain_size;
-                } else
+                        size *= sizeof(u64);
-                        header.size += sizeof(u64);
+                        perf_output_copy(handle, data->callchain, size);
+                } else {
+                        u64 nr = 0;
+                        perf_output_put(handle, nr);
+                }
        }
        if (sample_type & PERF_SAMPLE_RAW) {
-                int size = sizeof(u32);
+                if (data->raw) {
+                        perf_output_put(handle, data->raw->size);
+                        perf_output_copy(handle, data->raw->data,
+                                         data->raw->size);
+                } else {
+                        struct {
+                                u32     size;
+                                u32     data;
+                        } raw = {
+                                .size = sizeof(u32),
+                                .data = 0,
+                        };
+                        perf_output_put(handle, raw);
+                }
+        }
+}
-                if (data->raw)
+void perf_prepare_sample(struct perf_event_header *header,
-                        size += data->raw->size;
+                         struct perf_sample_data *data,
-                else
+                         struct perf_counter *counter,
-                        size += sizeof(u32);
+                         struct pt_regs *regs)
+{
+        u64 sample_type = counter->attr.sample_type;
-                WARN_ON_ONCE(size & (sizeof(u64)-1));
+        data->type = sample_type;
-                header.size += size;
-        }
-        ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
+        header->type = PERF_EVENT_SAMPLE;
-        if (ret)
+        header->size = sizeof(*header);
-                return;
-        perf_output_put(&handle, header);
+        header->misc = 0;
+        header->misc |= perf_misc_flags(regs);
-        if (sample_type & PERF_SAMPLE_IP)
+        if (sample_type & PERF_SAMPLE_IP) {
-                perf_output_put(&handle, ip);
+                data->ip = perf_instruction_pointer(regs);
-        if (sample_type & PERF_SAMPLE_TID)
+                header->size += sizeof(data->ip);
-                perf_output_put(&handle, tid_entry);
+        }
-        if (sample_type & PERF_SAMPLE_TIME)
+        if (sample_type & PERF_SAMPLE_TID) {
-                perf_output_put(&handle, time);
+                /* namespace issues */
+                data->tid_entry.pid = perf_counter_pid(counter, current);
+                data->tid_entry.tid = perf_counter_tid(counter, current);
+                header->size += sizeof(data->tid_entry);
+        }
+        if (sample_type & PERF_SAMPLE_TIME) {
+                data->time = perf_clock();
+                header->size += sizeof(data->time);
+        }
        if (sample_type & PERF_SAMPLE_ADDR)
-                perf_output_put(&handle, data->addr);
+                header->size += sizeof(data->addr);
        if (sample_type & PERF_SAMPLE_ID) {
-                u64 id = primary_counter_id(counter);
+                data->id = primary_counter_id(counter);
-                perf_output_put(&handle, id);
+                header->size += sizeof(data->id);
        }
-        if (sample_type & PERF_SAMPLE_STREAM_ID)
+        if (sample_type & PERF_SAMPLE_STREAM_ID) {
-                perf_output_put(&handle, counter->id);
+                data->stream_id = counter->id;
-        if (sample_type & PERF_SAMPLE_CPU)
+                header->size += sizeof(data->stream_id);
-                perf_output_put(&handle, cpu_entry);
+        }
+        if (sample_type & PERF_SAMPLE_CPU) {
+                data->cpu_entry.cpu             = raw_smp_processor_id();
+                data->cpu_entry.reserved        = 0;
+                header->size += sizeof(data->cpu_entry);
+        }
        if (sample_type & PERF_SAMPLE_PERIOD)
-                perf_output_put(&handle, data->period);
+                header->size += sizeof(data->period);
        if (sample_type & PERF_SAMPLE_READ)
-                perf_output_read(&handle, counter);
+                header->size += perf_counter_read_size(counter);
        if (sample_type & PERF_SAMPLE_CALLCHAIN) {
-                if (callchain)
+                int size = 1;
-                        perf_output_copy(&handle, callchain, callchain_size);
-                else {
+                data->callchain = perf_callchain(regs);
-                        u64 nr = 0;
-                        perf_output_put(&handle, nr);
+                if (data->callchain)
-                }
+                        size += data->callchain->nr;
+                header->size += size * sizeof(u64);
        }
        if (sample_type & PERF_SAMPLE_RAW) {
-                if (data->raw) {
+                int size = sizeof(u32);
-                        perf_output_put(&handle, data->raw->size);
-                        perf_output_copy(&handle, data->raw->data, data->raw->size);
+                if (data->raw)
-                } else {
+                        size += data->raw->size;
-                        struct {
+                else
-                                u32     size;
+                        size += sizeof(u32);
-                                u32     data;
-                        } raw = {
+                WARN_ON_ONCE(size & (sizeof(u64)-1));
-                                .size = sizeof(u32),
+                header->size += size;
-                                .data = 0,
-                        };
-                        perf_output_put(&handle, raw);
-                }
        }
+}
+static void perf_counter_output(struct perf_counter *counter, int nmi,
+                                struct perf_sample_data *data,
+                                struct pt_regs *regs)
+{
+        struct perf_output_handle handle;
+        struct perf_event_header header;
+        perf_prepare_sample(&header, data, counter, regs);
+        if (perf_output_begin(&handle, counter, header.size, nmi, 1))
+                return;
+        perf_output_sample(&handle, &header, data, counter);
        perf_output_end(&handle);
 }
@@ -3071,6 +3083,7 @@ struct perf_task_event {
                u32                             ppid;
                u32                             tid;
                u32                             ptid;
+                u64                             time;
        } event;
 };
@@ -3078,9 +3091,12 @@ static void perf_counter_task_output(struct perf_counter *counter,
                                     struct perf_task_event *task_event)
 {
        struct perf_output_handle handle;
-        int size = task_event->event.header.size;
+        int size;
        struct task_struct *task = task_event->task;
-        int ret = perf_output_begin(&handle, counter, size, 0, 0);
+        int ret;
+        size  = task_event->event.header.size;
+        ret = perf_output_begin(&handle, counter, size, 0, 0);
        if (ret)
                return;
@@ -3091,7 +3107,10 @@ static void perf_counter_task_output(struct perf_counter *counter,
        task_event->event.tid = perf_counter_tid(counter, task);
        task_event->event.ptid = perf_counter_tid(counter, current);
+        task_event->event.time = perf_clock();
        perf_output_put(&handle, task_event->event);
        perf_output_end(&handle);
 }
@@ -3473,7 +3492,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
                        .misc = 0,
                        .size = sizeof(throttle_event),
                },
-                .time           = sched_clock(),
+                .time           = perf_clock(),
                .id             = primary_counter_id(counter),
                .stream_id      = counter->id,
        };
@@ -3493,14 +3512,16 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
 * Generic counter overflow handling, sampling.
 */
-int perf_counter_overflow(struct perf_counter *counter, int nmi,
+static int __perf_counter_overflow(struct perf_counter *counter, int nmi,
-                          struct perf_sample_data *data)
+                                   int throttle, struct perf_sample_data *data,
+                                   struct pt_regs *regs)
 {
        int events = atomic_read(&counter->event_limit);
-        int throttle = counter->pmu->unthrottle != NULL;
        struct hw_perf_counter *hwc = &counter->hw;
        int ret = 0;
+        throttle = (throttle && counter->pmu->unthrottle != NULL);
        if (!throttle) {
                hwc->interrupts++;
        } else {
@@ -3523,7 +3544,7 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi,
        }
        if (counter->attr.freq) {
-                u64 now = sched_clock();
+                u64 now = perf_clock();
                s64 delta = now - hwc->freq_stamp;
                hwc->freq_stamp = now;
@@ -3549,10 +3570,17 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi,
                        perf_counter_disable(counter);
        }
-        perf_counter_output(counter, nmi, data);
+        perf_counter_output(counter, nmi, data, regs);
        return ret;
 }
+int perf_counter_overflow(struct perf_counter *counter, int nmi,
+                          struct perf_sample_data *data,
+                          struct pt_regs *regs)
+{
+        return __perf_counter_overflow(counter, nmi, 1, data, regs);
+}
 /*
 * Generic software counter infrastructure
 */
@@ -3588,9 +3616,11 @@ again:
 }
 static void perf_swcounter_overflow(struct perf_counter *counter,
-                                    int nmi, struct perf_sample_data *data)
+                                    int nmi, struct perf_sample_data *data,
+                                    struct pt_regs *regs)
 {
        struct hw_perf_counter *hwc = &counter->hw;
+        int throttle = 0;
        u64 overflow;
        data->period = counter->hw.last_period;
@@ -3600,13 +3630,15 @@ static void perf_swcounter_overflow(struct perf_counter *counter,
                return;
        for (; overflow; overflow--) {
-                if (perf_counter_overflow(counter, nmi, data)) {
+                if (__perf_counter_overflow(counter, nmi, throttle,
+                                            data, regs)) {
                        /*
                         * We inhibit the overflow from happening when
                         * hwc->interrupts == MAX_INTERRUPTS.
                         */
                        break;
                }
+                throttle = 1;
        }
 }
@@ -3618,7 +3650,8 @@ static void perf_swcounter_unthrottle(struct perf_counter *counter)
 }
 static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
-                               int nmi, struct perf_sample_data *data)
+                               int nmi, struct perf_sample_data *data,
+                               struct pt_regs *regs)
 {
        struct hw_perf_counter *hwc = &counter->hw;
@@ -3627,11 +3660,11 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
        if (!hwc->sample_period)
                return;
-        if (!data->regs)
+        if (!regs)
                return;
        if (!atomic64_add_negative(nr, &hwc->period_left))
-                perf_swcounter_overflow(counter, nmi, data);
+                perf_swcounter_overflow(counter, nmi, data, regs);
 }
 static int perf_swcounter_is_counting(struct perf_counter *counter)
@@ -3690,7 +3723,8 @@ static int perf_swcounter_match(struct perf_counter *counter,
 static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
                                     enum perf_type_id type,
                                     u32 event, u64 nr, int nmi,
-                                     struct perf_sample_data *data)
+                                     struct perf_sample_data *data,
+                                     struct pt_regs *regs)
 {
        struct perf_counter *counter;
@@ -3699,8 +3733,8 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
        rcu_read_lock();
        list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
-                if (perf_swcounter_match(counter, type, event, data->regs))
+                if (perf_swcounter_match(counter, type, event, regs))
-                        perf_swcounter_add(counter, nr, nmi, data);
+                        perf_swcounter_add(counter, nr, nmi, data, regs);
        }
        rcu_read_unlock();
 }
@@ -3721,7 +3755,8 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
 static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
                                    u64 nr, int nmi,
-                                    struct perf_sample_data *data)
+                                    struct perf_sample_data *data,
+                                    struct pt_regs *regs)
 {
        struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
        int *recursion = perf_swcounter_recursion_context(cpuctx);
@@ -3734,7 +3769,7 @@ static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
        barrier();
        perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
-                                 nr, nmi, data);
+                                 nr, nmi, data, regs);
        rcu_read_lock();
        /*
         * doesn't really matter which of the child contexts the
@@ -3742,7 +3777,7 @@ static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
         */
        ctx = rcu_dereference(current->perf_counter_ctxp);
        if (ctx)
-                perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data);
+                perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data, regs);
        rcu_read_unlock();
        barrier();
@@ -3756,11 +3791,11 @@ void __perf_swcounter_event(u32 event, u64 nr, int nmi,
                            struct pt_regs *regs, u64 addr)
 {
        struct perf_sample_data data = {
-                .regs = regs,
                .addr = addr,
        };
-        do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data);
+        do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi,
+                                &data, regs);
 }
 static void perf_swcounter_read(struct perf_counter *counter)
@@ -3797,6 +3832,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 {
        enum hrtimer_restart ret = HRTIMER_RESTART;
        struct perf_sample_data data;
+        struct pt_regs *regs;
        struct perf_counter *counter;
        u64 period;
@@ -3804,17 +3840,17 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
        counter->pmu->read(counter);
        data.addr = 0;
-        data.regs = get_irq_regs();
+        regs = get_irq_regs();
        /*
         * In case we exclude kernel IPs or are somehow not in interrupt
         * context, provide the next best thing, the user IP.
         */
-        if ((counter->attr.exclude_kernel || !data.regs) &&
+        if ((counter->attr.exclude_kernel || !regs) &&
                        !counter->attr.exclude_user)
-                data.regs = task_pt_regs(current);
+                regs = task_pt_regs(current);
-        if (data.regs) {
+        if (regs) {
-                if (perf_counter_overflow(counter, 0, &data))
+                if (perf_counter_overflow(counter, 0, &data, regs))
                        ret = HRTIMER_NORESTART;
        }
@@ -3950,15 +3986,17 @@ void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
        };
        struct perf_sample_data data = {
-                .regs = get_irq_regs(),
                .addr = addr,
                .raw = &raw,
        };
-        if (!data.regs)
+        struct pt_regs *regs = get_irq_regs();
-                data.regs = task_pt_regs(current);
+        if (!regs)
+                regs = task_pt_regs(current);
-        do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);
+        do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
+                                &data, regs);
 }
 EXPORT_SYMBOL_GPL(perf_tpcounter_event);
@@ -4170,8 +4208,8 @@ done:
 static int perf_copy_attr(struct perf_counter_attr __user *uattr,
                          struct perf_counter_attr *attr)
 {
-        int ret;
        u32 size;
+        int ret;
        if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
                return -EFAULT;
@@ -4196,19 +4234,19 @@ static int perf_copy_attr(struct perf_counter_attr __user *uattr,
        /*
         * If we're handed a bigger struct than we know of,
-         * ensure all the unknown bits are 0.
+         * ensure all the unknown bits are 0 - i.e. new
+         * user-space does not rely on any kernel feature
+         * extensions we don't know about yet.
         */
        if (size > sizeof(*attr)) {
-                unsigned long val;
+                unsigned char __user *addr;
-                unsigned long __user *addr;
+                unsigned char __user *end;
-                unsigned long __user *end;
+                unsigned char val;
-                addr = PTR_ALIGN((void __user *)uattr + sizeof(*attr),
+                addr = (void __user *)uattr + sizeof(*attr);
-                                sizeof(unsigned long));
+                end  = (void __user *)uattr + size;
-                end  = PTR_ALIGN((void __user *)uattr + size,
-                                sizeof(unsigned long));
-                for (; addr < end; addr += sizeof(unsigned long)) {
+                for (; addr < end; addr++) {
                        ret = get_user(val, addr);
                        if (ret)
                                return ret;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index d089d052c4a9..495440779ce3 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -242,6 +242,25 @@ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
        return 0;
 }
+static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp)
+{
+        *tp = current_kernel_time();
+        return 0;
+}
+static int posix_get_monotonic_coarse(clockid_t which_clock,
+                                                struct timespec *tp)
+{
+        *tp = get_monotonic_coarse();
+        return 0;
+}
+int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
+{
+        *tp = ktime_to_timespec(KTIME_LOW_RES);
+        return 0;
+}
 /*
 * Initialize everything, well, just everything in Posix clocks/timers ;)
 */
@@ -262,10 +281,26 @@ static __init int init_posix_timers(void)
                .timer_create = no_timer_create,
                .nsleep = no_nsleep,
        };
+        struct k_clock clock_realtime_coarse = {
+                .clock_getres = posix_get_coarse_res,
+                .clock_get = posix_get_realtime_coarse,
+                .clock_set = do_posix_clock_nosettime,
+                .timer_create = no_timer_create,
+                .nsleep = no_nsleep,
+        };
+        struct k_clock clock_monotonic_coarse = {
+                .clock_getres = posix_get_coarse_res,
+                .clock_get = posix_get_monotonic_coarse,
+                .clock_set = do_posix_clock_nosettime,
+                .timer_create = no_timer_create,
+                .nsleep = no_nsleep,
+        };
        register_posix_clock(CLOCK_REALTIME, &clock_realtime);
        register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
        register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
+        register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
+        register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
        posix_timers_cache = kmem_cache_create("posix_timers_cache",
                                        sizeof (struct k_itimer), 0, SLAB_PANIC,
diff --git a/kernel/power/console.c b/kernel/power/console.c
index a3961b205de7..5187136fe1de 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -14,56 +14,13 @@
 #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
 static int orig_fgconsole, orig_kmsg;
-static int disable_vt_switch;
-/*
- * Normally during a suspend, we allocate a new console and switch to it.
- * When we resume, we switch back to the original console.  This switch
- * can be slow, so on systems where the framebuffer can handle restoration
- * of video registers anyways, there's little point in doing the console
- * switch.  This function allows you to disable it by passing it '0'.
- */
-void pm_set_vt_switch(int do_switch)
-{
-        acquire_console_sem();
-        disable_vt_switch = !do_switch;
-        release_console_sem();
-}
-EXPORT_SYMBOL(pm_set_vt_switch);
 int pm_prepare_console(void)
 {
-        acquire_console_sem();
+        orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1);
+        if (orig_fgconsole < 0)
-        if (disable_vt_switch) {
-                release_console_sem();
-                return 0;
-        }
-        orig_fgconsole = fg_console;
-        if (vc_allocate(SUSPEND_CONSOLE)) {
-          /* we can't have a free VC for now. Too bad,
-           * we don't want to mess the screen for now. */
-                release_console_sem();
                return 1;
-        }
-        if (set_console(SUSPEND_CONSOLE)) {
-                /*
-                 * We're unable to switch to the SUSPEND_CONSOLE.
-                 * Let the calling function know so it can decide
-                 * what to do.
-                 */
-                release_console_sem();
-                return 1;
-        }
-        release_console_sem();
-        if (vt_waitactive(SUSPEND_CONSOLE)) {
-                pr_debug("Suspend: Can't switch VCs.");
-                return 1;
-        }
        orig_kmsg = kmsg_redirect;
        kmsg_redirect = SUSPEND_CONSOLE;
        return 0;
@@ -71,19 +28,9 @@ int pm_prepare_console(void)
 void pm_restore_console(void)
 {
-        acquire_console_sem();
+        if (orig_fgconsole >= 0) {
-        if (disable_vt_switch) {
+                vt_move_to_console(orig_fgconsole, 0);
-                release_console_sem();
+                kmsg_redirect = orig_kmsg;
-                return;
-        }
-        set_console(orig_fgconsole);
-        release_console_sem();
-        if (vt_waitactive(orig_fgconsole)) {
-                pr_debug("Resume: Can't switch VCs.");
-                return;
        }
-        kmsg_redirect = orig_kmsg;
 }
 #endif
diff --git a/kernel/profile.c b/kernel/profile.c
index 419250ebec4d..a55d3a367ae8 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -442,48 +442,51 @@ void profile_tick(int type)
 #ifdef CONFIG_PROC_FS
 #include <linux/proc_fs.h>
+#include <linux/seq_file.h>
 #include <asm/uaccess.h>
-static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
+static int prof_cpu_mask_proc_show(struct seq_file *m, void *v)
-                        int count, int *eof, void *data)
 {
-        int len = cpumask_scnprintf(page, count, data);
+        seq_cpumask(m, prof_cpu_mask);
-        if (count - len < 2)
+        seq_putc(m, '\n');
-                return -EINVAL;
+        return 0;
-        len += sprintf(page + len, "\n");
-        return len;
 }
-static int prof_cpu_mask_write_proc(struct file *file,
+static int prof_cpu_mask_proc_open(struct inode *inode, struct file *file)
-        const char __user *buffer,  unsigned long count, void *data)
+{
+        return single_open(file, prof_cpu_mask_proc_show, NULL);
+}
+static ssize_t prof_cpu_mask_proc_write(struct file *file,
+        const char __user *buffer, size_t count, loff_t *pos)
 {
-        struct cpumask *mask = data;
-        unsigned long full_count = count, err;
        cpumask_var_t new_value;
+        int err;
        if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
                return -ENOMEM;
        err = cpumask_parse_user(buffer, count, new_value);
        if (!err) {
-                cpumask_copy(mask, new_value);
+                cpumask_copy(prof_cpu_mask, new_value);
-                err = full_count;
+                err = count;
        }
        free_cpumask_var(new_value);
        return err;
 }
+static const struct file_operations prof_cpu_mask_proc_fops = {
+        .open           = prof_cpu_mask_proc_open,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = single_release,
+        .write          = prof_cpu_mask_proc_write,
+};
 void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
 {
-        struct proc_dir_entry *entry;
        /* create /proc/irq/prof_cpu_mask */
-        entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir);
+        proc_create("prof_cpu_mask", 0600, root_irq_dir, &prof_cpu_mask_proc_fops);
-        if (!entry)
-                return;
-        entry->data = prof_cpu_mask;
-        entry->read_proc = prof_cpu_mask_read_proc;
-        entry->write_proc = prof_cpu_mask_write_proc;
 }
 /*
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index e1d16c9a7680..ac2e1dc708bd 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -48,13 +48,6 @@ static __read_mostly int sched_clock_running;
 __read_mostly int sched_clock_stable;
 struct sched_clock_data {
-        /*
-         * Raw spinlock - this is a special case: this might be called
-         * from within instrumentation code so we dont want to do any
-         * instrumentation ourselves.
-         */
-        raw_spinlock_t          lock;
        u64                     tick_raw;
        u64                     tick_gtod;
        u64                     clock;
@@ -80,7 +73,6 @@ void sched_clock_init(void)
        for_each_possible_cpu(cpu) {
                struct sched_clock_data *scd = cpu_sdc(cpu);
-                scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
                scd->tick_raw = 0;
                scd->tick_gtod = ktime_now;
                scd->clock = ktime_now;
@@ -109,14 +101,19 @@ static inline u64 wrap_max(u64 x, u64 y)
 *  - filter out backward motion
 *  - use the GTOD tick value to create a window to filter crazy TSC values
 */
-static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
+static u64 sched_clock_local(struct sched_clock_data *scd)
 {
-        s64 delta = now - scd->tick_raw;
+        u64 now, clock, old_clock, min_clock, max_clock;
-        u64 clock, min_clock, max_clock;
+        s64 delta;
+again:
+        now = sched_clock();
+        delta = now - scd->tick_raw;
        if (unlikely(delta < 0))
                delta = 0;
+        old_clock = scd->clock;
        /*
         * scd->clock = clamp(scd->tick_gtod + delta,
         *                    max(scd->tick_gtod, scd->clock),
@@ -124,84 +121,73 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
         */
        clock = scd->tick_gtod + delta;
-        min_clock = wrap_max(scd->tick_gtod, scd->clock);
+        min_clock = wrap_max(scd->tick_gtod, old_clock);
-        max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC);
+        max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC);
        clock = wrap_max(clock, min_clock);
        clock = wrap_min(clock, max_clock);
-        scd->clock = clock;
+        if (cmpxchg(&scd->clock, old_clock, clock) != old_clock)
+                goto again;
-        return scd->clock;
+        return clock;
 }
-static void lock_double_clock(struct sched_clock_data *data1,
+static u64 sched_clock_remote(struct sched_clock_data *scd)
-                                struct sched_clock_data *data2)
 {
-        if (data1 < data2) {
+        struct sched_clock_data *my_scd = this_scd();
-                __raw_spin_lock(&data1->lock);
+        u64 this_clock, remote_clock;
-                __raw_spin_lock(&data2->lock);
+        u64 *ptr, old_val, val;
+        sched_clock_local(my_scd);
+again:
+        this_clock = my_scd->clock;
+        remote_clock = scd->clock;
+        /*
+         * Use the opportunity that we have sampled both
+         * clocks to couple them: we take the larger time
+         * as the latest time for both runqueues. (this
+         * creates monotonic movement)
+         */
+        if (likely((s64)(remote_clock - this_clock) < 0)) {
+                ptr = &scd->clock;
+                old_val = remote_clock;
+                val = this_clock;
        } else {
-                __raw_spin_lock(&data2->lock);
+                /*
-                __raw_spin_lock(&data1->lock);
+                 * Should be rare, but possible:
+                 */
+                ptr = &my_scd->clock;
+                old_val = this_clock;
+                val = remote_clock;
        }
+        if (cmpxchg(ptr, old_val, val) != old_val)
+                goto again;
+        return val;
 }
 u64 sched_clock_cpu(int cpu)
 {
-        u64 now, clock, this_clock, remote_clock;
        struct sched_clock_data *scd;
+        u64 clock;
+        WARN_ON_ONCE(!irqs_disabled());
        if (sched_clock_stable)
                return sched_clock();
-        scd = cpu_sdc(cpu);
-        /*
-         * Normally this is not called in NMI context - but if it is,
-         * trying to do any locking here is totally lethal.
-         */
-        if (unlikely(in_nmi()))
-                return scd->clock;
        if (unlikely(!sched_clock_running))
                return 0ull;
-        WARN_ON_ONCE(!irqs_disabled());
+        scd = cpu_sdc(cpu);
-        now = sched_clock();
-        if (cpu != raw_smp_processor_id()) {
-                struct sched_clock_data *my_scd = this_scd();
-                lock_double_clock(scd, my_scd);
-                this_clock = __update_sched_clock(my_scd, now);
-                remote_clock = scd->clock;
-                /*
-                 * Use the opportunity that we have both locks
-                 * taken to couple the two clocks: we take the
-                 * larger time as the latest time for both
-                 * runqueues. (this creates monotonic movement)
-                 */
-                if (likely((s64)(remote_clock - this_clock) < 0)) {
-                        clock = this_clock;
-                        scd->clock = clock;
-                } else {
-                        /*
-                         * Should be rare, but possible:
-                         */
-                        clock = remote_clock;
-                        my_scd->clock = remote_clock;
-                }
-                __raw_spin_unlock(&my_scd->lock);
-        } else {
-                __raw_spin_lock(&scd->lock);
-                clock = __update_sched_clock(scd, now);
-        }
-        __raw_spin_unlock(&scd->lock);
+        if (cpu != smp_processor_id())
+                clock = sched_clock_remote(scd);
+        else
+                clock = sched_clock_local(scd);
        return clock;
 }
@@ -223,11 +209,9 @@ void sched_clock_tick(void)
        now_gtod = ktime_to_ns(ktime_get());
        now = sched_clock();
-        __raw_spin_lock(&scd->lock);
        scd->tick_raw = now;
        scd->tick_gtod = now_gtod;
-        __update_sched_clock(scd, now);
+        sched_clock_local(scd);
-        __raw_spin_unlock(&scd->lock);
 }
 /*
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index cd73738f0d5f..ecc637a0d591 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -513,6 +513,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
        if (entity_is_task(curr)) {
                struct task_struct *curtask = task_of(curr);
+                trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
                cpuacct_charge(curtask, delta_exec);
                account_group_exec_runtime(curtask, delta_exec);
        }
diff --git a/kernel/time.c b/kernel/time.c
index 29511943871a..2e2e469a7fec 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -370,13 +370,20 @@ EXPORT_SYMBOL(mktime);
 *      0 <= tv_nsec < NSEC_PER_SEC
 * For negative values only the tv_sec field is negative !
 */
-void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec)
+void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec)
 {
        while (nsec >= NSEC_PER_SEC) {
+                /*
+                 * The following asm() prevents the compiler from
+                 * optimising this loop into a modulo operation. See
+                 * also __iter_div_u64_rem() in include/linux/time.h
+                 */
+                asm("" : "+rm"(nsec));
                nsec -= NSEC_PER_SEC;
                ++sec;
        }
        while (nsec < 0) {
+                asm("" : "+rm"(nsec));
                nsec += NSEC_PER_SEC;
                --sec;
        }
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 7466cb811251..09113347d328 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -21,7 +21,6 @@
 *
 * TODO WishList:
 *   o Allow clocksource drivers to be unregistered
- *   o get rid of clocksource_jiffies extern
 */
 #include <linux/clocksource.h>
@@ -30,6 +29,7 @@
 #include <linux/module.h>
 #include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
 #include <linux/tick.h>
+#include <linux/kthread.h>
 void timecounter_init(struct timecounter *tc,
                      const struct cyclecounter *cc,
@@ -107,50 +107,35 @@ u64 timecounter_cyc2time(struct timecounter *tc,
 }
 EXPORT_SYMBOL(timecounter_cyc2time);
-/* XXX - Would like a better way for initializing curr_clocksource */
-extern struct clocksource clocksource_jiffies;
 /*[Clocksource internal variables]---------
 * curr_clocksource:
- *      currently selected clocksource. Initialized to clocksource_jiffies.
+ *      currently selected clocksource.
- * next_clocksource:
- *      pending next selected clocksource.
 * clocksource_list:
 *      linked list with the registered clocksources
- * clocksource_lock:
+ * clocksource_mutex:
- *      protects manipulations to curr_clocksource and next_clocksource
+ *      protects manipulations to curr_clocksource and the clocksource_list
- *      and the clocksource_list
 * override_name:
 *      Name of the user-specified clocksource.
 */
-static struct clocksource *curr_clocksource = &clocksource_jiffies;
+static struct clocksource *curr_clocksource;
-static struct clocksource *next_clocksource;
-static struct clocksource *clocksource_override;
 static LIST_HEAD(clocksource_list);
-static DEFINE_SPINLOCK(clocksource_lock);
+static DEFINE_MUTEX(clocksource_mutex);
 static char override_name[32];
 static int finished_booting;
-/* clocksource_done_booting - Called near the end of core bootup
- *
- * Hack to avoid lots of clocksource churn at boot time.
- * We use fs_initcall because we want this to start before
- * device_initcall but after subsys_initcall.
- */
-static int __init clocksource_done_booting(void)
-{
-        finished_booting = 1;
-        return 0;
-}
-fs_initcall(clocksource_done_booting);
 #ifdef CONFIG_CLOCKSOURCE_WATCHDOG
+static void clocksource_watchdog_work(struct work_struct *work);
 static LIST_HEAD(watchdog_list);
 static struct clocksource *watchdog;
 static struct timer_list watchdog_timer;
+static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
 static DEFINE_SPINLOCK(watchdog_lock);
 static cycle_t watchdog_last;
-static unsigned long watchdog_resumed;
+static int watchdog_running;
+static int clocksource_watchdog_kthread(void *data);
+static void __clocksource_change_rating(struct clocksource *cs, int rating);
 /*
 * Interval: 0.5sec Threshold: 0.0625s
@@ -158,135 +143,249 @@ static unsigned long watchdog_resumed;
 #define WATCHDOG_INTERVAL (HZ >> 1)
 #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
-static void clocksource_ratewd(struct clocksource *cs, int64_t delta)
+static void clocksource_watchdog_work(struct work_struct *work)
 {
-        if (delta > -WATCHDOG_THRESHOLD && delta < WATCHDOG_THRESHOLD)
+        /*
-                return;
+         * If kthread_run fails, the next watchdog scan over the
+         * watchdog_list will find the unstable clock again.
+         */
+        kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog");
+}
+static void __clocksource_unstable(struct clocksource *cs)
+{
+        cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
+        cs->flags |= CLOCK_SOURCE_UNSTABLE;
+        if (finished_booting)
+                schedule_work(&watchdog_work);
+}
+static void clocksource_unstable(struct clocksource *cs, int64_t delta)
+{
        printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
               cs->name, delta);
-        cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
+        __clocksource_unstable(cs);
-        clocksource_change_rating(cs, 0);
+}
-        list_del(&cs->wd_list);
+/**
+ * clocksource_mark_unstable - mark clocksource unstable via watchdog
+ * @cs:         clocksource to be marked unstable
+ *
+ * This function is called instead of clocksource_change_rating from
+ * cpu hotplug code to avoid a deadlock between the clocksource mutex
+ * and the cpu hotplug mutex. It defers the update of the clocksource
+ * to the watchdog thread.
+ */
+void clocksource_mark_unstable(struct clocksource *cs)
+{
+        unsigned long flags;
+        spin_lock_irqsave(&watchdog_lock, flags);
+        if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) {
+                if (list_empty(&cs->wd_list))
+                        list_add(&cs->wd_list, &watchdog_list);
+                __clocksource_unstable(cs);
+        }
+        spin_unlock_irqrestore(&watchdog_lock, flags);
 }
 static void clocksource_watchdog(unsigned long data)
 {
-        struct clocksource *cs, *tmp;
+        struct clocksource *cs;
        cycle_t csnow, wdnow;
        int64_t wd_nsec, cs_nsec;
-        int resumed;
+        int next_cpu;
        spin_lock(&watchdog_lock);
+        if (!watchdog_running)
-        resumed = test_and_clear_bit(0, &watchdog_resumed);
+                goto out;
        wdnow = watchdog->read(watchdog);
-        wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask);
+        wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask,
+                                     watchdog->mult, watchdog->shift);
        watchdog_last = wdnow;
-        list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
+        list_for_each_entry(cs, &watchdog_list, wd_list) {
-                csnow = cs->read(cs);
-                if (unlikely(resumed)) {
+                /* Clocksource already marked unstable? */
-                        cs->wd_last = csnow;
+                if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
+                        if (finished_booting)
+                                schedule_work(&watchdog_work);
                        continue;
                }
-                /* Initialized ? */
+                csnow = cs->read(cs);
+                /* Clocksource initialized ? */
                if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
-                        if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
-                            (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
-                                cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
-                                /*
-                                 * We just marked the clocksource as
-                                 * highres-capable, notify the rest of the
-                                 * system as well so that we transition
-                                 * into high-res mode:
-                                 */
-                                tick_clock_notify();
-                        }
                        cs->flags |= CLOCK_SOURCE_WATCHDOG;
                        cs->wd_last = csnow;
-                } else {
+                        continue;
-                        cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask);
-                        cs->wd_last = csnow;
-                        /* Check the delta. Might remove from the list ! */
-                        clocksource_ratewd(cs, cs_nsec - wd_nsec);
                }
-        }
-        if (!list_empty(&watchdog_list)) {
+                /* Check the deviation from the watchdog clocksource. */
-                /*
+                cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) &
-                 * Cycle through CPUs to check if the CPUs stay
+                                             cs->mask, cs->mult, cs->shift);
-                 * synchronized to each other.
+                cs->wd_last = csnow;
-                 */
+                if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
-                int next_cpu = cpumask_next(raw_smp_processor_id(),
+                        clocksource_unstable(cs, cs_nsec - wd_nsec);
-                                            cpu_online_mask);
+                        continue;
+                }
-                if (next_cpu >= nr_cpu_ids)
+                if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
-                        next_cpu = cpumask_first(cpu_online_mask);
+                    (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
-                watchdog_timer.expires += WATCHDOG_INTERVAL;
+                    (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
-                add_timer_on(&watchdog_timer, next_cpu);
+                        cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
+                        /*
+                         * We just marked the clocksource as highres-capable,
+                         * notify the rest of the system as well so that we
+                         * transition into high-res mode:
+                         */
+                        tick_clock_notify();
+                }
        }
+        /*
+         * Cycle through CPUs to check if the CPUs stay synchronized
+         * to each other.
+         */
+        next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
+        if (next_cpu >= nr_cpu_ids)
+                next_cpu = cpumask_first(cpu_online_mask);
+        watchdog_timer.expires += WATCHDOG_INTERVAL;
+        add_timer_on(&watchdog_timer, next_cpu);
+out:
        spin_unlock(&watchdog_lock);
 }
+static inline void clocksource_start_watchdog(void)
+{
+        if (watchdog_running || !watchdog || list_empty(&watchdog_list))
+                return;
+        init_timer(&watchdog_timer);
+        watchdog_timer.function = clocksource_watchdog;
+        watchdog_last = watchdog->read(watchdog);
+        watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
+        add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
+        watchdog_running = 1;
+}
+static inline void clocksource_stop_watchdog(void)
+{
+        if (!watchdog_running || (watchdog && !list_empty(&watchdog_list)))
+                return;
+        del_timer(&watchdog_timer);
+        watchdog_running = 0;
+}
+static inline void clocksource_reset_watchdog(void)
+{
+        struct clocksource *cs;
+        list_for_each_entry(cs, &watchdog_list, wd_list)
+                cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
+}
 static void clocksource_resume_watchdog(void)
 {
-        set_bit(0, &watchdog_resumed);
+        unsigned long flags;
+        spin_lock_irqsave(&watchdog_lock, flags);
+        clocksource_reset_watchdog();
+        spin_unlock_irqrestore(&watchdog_lock, flags);
 }
-static void clocksource_check_watchdog(struct clocksource *cs)
+static void clocksource_enqueue_watchdog(struct clocksource *cs)
 {
-        struct clocksource *cse;
        unsigned long flags;
        spin_lock_irqsave(&watchdog_lock, flags);
        if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
-                int started = !list_empty(&watchdog_list);
+                /* cs is a clocksource to be watched. */
                list_add(&cs->wd_list, &watchdog_list);
-                if (!started && watchdog) {
+                cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
-                        watchdog_last = watchdog->read(watchdog);
-                        watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
-                        add_timer_on(&watchdog_timer,
-                                     cpumask_first(cpu_online_mask));
-                }
        } else {
+                /* cs is a watchdog. */
                if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
                        cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
+                /* Pick the best watchdog. */
                if (!watchdog || cs->rating > watchdog->rating) {
-                        if (watchdog)
-                                del_timer(&watchdog_timer);
                        watchdog = cs;
-                        init_timer(&watchdog_timer);
-                        watchdog_timer.function = clocksource_watchdog;
                        /* Reset watchdog cycles */
-                        list_for_each_entry(cse, &watchdog_list, wd_list)
+                        clocksource_reset_watchdog();
-                                cse->flags &= ~CLOCK_SOURCE_WATCHDOG;
+                }
-                        /* Start if list is not empty */
+        }
-                        if (!list_empty(&watchdog_list)) {
+        /* Check if the watchdog timer needs to be started. */
-                                watchdog_last = watchdog->read(watchdog);
+        clocksource_start_watchdog();
-                                watchdog_timer.expires =
+        spin_unlock_irqrestore(&watchdog_lock, flags);
-                                        jiffies + WATCHDOG_INTERVAL;
+}
-                                add_timer_on(&watchdog_timer,
-                                             cpumask_first(cpu_online_mask));
+static void clocksource_dequeue_watchdog(struct clocksource *cs)
-                        }
+{
+        struct clocksource *tmp;
+        unsigned long flags;
+        spin_lock_irqsave(&watchdog_lock, flags);
+        if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
+                /* cs is a watched clocksource. */
+                list_del_init(&cs->wd_list);
+        } else if (cs == watchdog) {
+                /* Reset watchdog cycles */
+                clocksource_reset_watchdog();
+                /* Current watchdog is removed. Find an alternative. */
+                watchdog = NULL;
+                list_for_each_entry(tmp, &clocksource_list, list) {
+                        if (tmp == cs || tmp->flags & CLOCK_SOURCE_MUST_VERIFY)
+                                continue;
+                        if (!watchdog || tmp->rating > watchdog->rating)
+                                watchdog = tmp;
                }
        }
+        cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
+        /* Check if the watchdog timer needs to be stopped. */
+        clocksource_stop_watchdog();
        spin_unlock_irqrestore(&watchdog_lock, flags);
 }
-#else
-static void clocksource_check_watchdog(struct clocksource *cs)
+static int clocksource_watchdog_kthread(void *data)
+{
+        struct clocksource *cs, *tmp;
+        unsigned long flags;
+        LIST_HEAD(unstable);
+        mutex_lock(&clocksource_mutex);
+        spin_lock_irqsave(&watchdog_lock, flags);
+        list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list)
+                if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
+                        list_del_init(&cs->wd_list);
+                        list_add(&cs->wd_list, &unstable);
+                }
+        /* Check if the watchdog timer needs to be stopped. */
+        clocksource_stop_watchdog();
+        spin_unlock_irqrestore(&watchdog_lock, flags);
+        /* Needs to be done outside of watchdog lock */
+        list_for_each_entry_safe(cs, tmp, &unstable, wd_list) {
+                list_del_init(&cs->wd_list);
+                __clocksource_change_rating(cs, 0);
+        }
+        mutex_unlock(&clocksource_mutex);
+        return 0;
+}
+#else /* CONFIG_CLOCKSOURCE_WATCHDOG */
+static void clocksource_enqueue_watchdog(struct clocksource *cs)
 {
        if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
                cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
 }
+static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
 static inline void clocksource_resume_watchdog(void) { }
-#endif
+static inline int clocksource_watchdog_kthread(void *data) { return 0; }
+#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
 /**
 * clocksource_resume - resume the clocksource(s)
@@ -294,18 +393,16 @@ static inline void clocksource_resume_watchdog(void) { }
 void clocksource_resume(void)
 {
        struct clocksource *cs;
-        unsigned long flags;
-        spin_lock_irqsave(&clocksource_lock, flags);
+        mutex_lock(&clocksource_mutex);
-        list_for_each_entry(cs, &clocksource_list, list) {
+        list_for_each_entry(cs, &clocksource_list, list)
                if (cs->resume)
                        cs->resume();
-        }
        clocksource_resume_watchdog();
-        spin_unlock_irqrestore(&clocksource_lock, flags);
+        mutex_unlock(&clocksource_mutex);
 }
 /**
@@ -320,75 +417,94 @@ void clocksource_touch_watchdog(void)
        clocksource_resume_watchdog();
 }
+#ifdef CONFIG_GENERIC_TIME
 /**
- * clocksource_get_next - Returns the selected clocksource
+ * clocksource_select - Select the best clocksource available
+ *
+ * Private function. Must hold clocksource_mutex when called.
 *
+ * Select the clocksource with the best rating, or the clocksource,
+ * which is selected by userspace override.
 */
-struct clocksource *clocksource_get_next(void)
+static void clocksource_select(void)
 {
-        unsigned long flags;
+        struct clocksource *best, *cs;
-        spin_lock_irqsave(&clocksource_lock, flags);
+        if (!finished_booting || list_empty(&clocksource_list))
-        if (next_clocksource && finished_booting) {
+                return;
-                curr_clocksource = next_clocksource;
+        /* First clocksource on the list has the best rating. */
-                next_clocksource = NULL;
+        best = list_first_entry(&clocksource_list, struct clocksource, list);
+        /* Check for the override clocksource. */
+        list_for_each_entry(cs, &clocksource_list, list) {
+                if (strcmp(cs->name, override_name) != 0)
+                        continue;
+                /*
+                 * Check to make sure we don't switch to a non-highres
+                 * capable clocksource if the tick code is in oneshot
+                 * mode (highres or nohz)
+                 */
+                if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
+                    tick_oneshot_mode_active()) {
+                        /* Override clocksource cannot be used. */
+                        printk(KERN_WARNING "Override clocksource %s is not "
+                               "HRT compatible. Cannot switch while in "
+                               "HRT/NOHZ mode\n", cs->name);
+                        override_name[0] = 0;
+                } else
+                        /* Override clocksource can be used. */
+                        best = cs;
+                break;
+        }
+        if (curr_clocksource != best) {
+                printk(KERN_INFO "Switching to clocksource %s\n", best->name);
+                curr_clocksource = best;
+                timekeeping_notify(curr_clocksource);
        }
-        spin_unlock_irqrestore(&clocksource_lock, flags);
-        return curr_clocksource;
 }
-/**
+#else /* CONFIG_GENERIC_TIME */
- * select_clocksource - Selects the best registered clocksource.
- *
+static inline void clocksource_select(void) { }
- * Private function. Must hold clocksource_lock when called.
+#endif
+/*
+ * clocksource_done_booting - Called near the end of core bootup
 *
- * Select the clocksource with the best rating, or the clocksource,
+ * Hack to avoid lots of clocksource churn at boot time.
- * which is selected by userspace override.
+ * We use fs_initcall because we want this to start before
+ * device_initcall but after subsys_initcall.
 */
-static struct clocksource *select_clocksource(void)
+static int __init clocksource_done_booting(void)
 {
-        struct clocksource *next;
+        finished_booting = 1;
-        if (list_empty(&clocksource_list))
-                return NULL;
-        if (clocksource_override)
-                next = clocksource_override;
-        else
-                next = list_entry(clocksource_list.next, struct clocksource,
-                                  list);
-        if (next == curr_clocksource)
+        /*
-                return NULL;
+         * Run the watchdog first to eliminate unstable clock sources
+         */
+        clocksource_watchdog_kthread(NULL);
-        return next;
+        mutex_lock(&clocksource_mutex);
+        clocksource_select();
+        mutex_unlock(&clocksource_mutex);
+        return 0;
 }
+fs_initcall(clocksource_done_booting);
 /*
 * Enqueue the clocksource sorted by rating
 */
-static int clocksource_enqueue(struct clocksource *c)
+static void clocksource_enqueue(struct clocksource *cs)
 {
-        struct list_head *tmp, *entry = &clocksource_list;
+        struct list_head *entry = &clocksource_list;
+        struct clocksource *tmp;
-        list_for_each(tmp, &clocksource_list) {
+        list_for_each_entry(tmp, &clocksource_list, list)
-                struct clocksource *cs;
-                cs = list_entry(tmp, struct clocksource, list);
-                if (cs == c)
-                        return -EBUSY;
                /* Keep track of the place, where to insert */
-                if (cs->rating >= c->rating)
+                if (tmp->rating >= cs->rating)
-                        entry = tmp;
+                        entry = &tmp->list;
-        }
+        list_add(&cs->list, entry);
-        list_add(&c->list, entry);
-        if (strlen(c->name) == strlen(override_name) &&
-            !strcmp(c->name, override_name))
-                clocksource_override = c;
-        return 0;
 }
 /**
@@ -397,52 +513,48 @@ static int clocksource_enqueue(struct clocksource *c)
 *
- * Returns -EBUSY if registration fails, zero otherwise.
+ * Always returns zero.
 */
-int clocksource_register(struct clocksource *c)
+int clocksource_register(struct clocksource *cs)
 {
-        unsigned long flags;
+        mutex_lock(&clocksource_mutex);
-        int ret;
+        clocksource_enqueue(cs);
+        clocksource_select();
-        spin_lock_irqsave(&clocksource_lock, flags);
+        clocksource_enqueue_watchdog(cs);
-        ret = clocksource_enqueue(c);
+        mutex_unlock(&clocksource_mutex);
-        if (!ret)
+        return 0;
-                next_clocksource = select_clocksource();
-        spin_unlock_irqrestore(&clocksource_lock, flags);
-        if (!ret)
-                clocksource_check_watchdog(c);
-        return ret;
 }
 EXPORT_SYMBOL(clocksource_register);
+/*
+ * Re-rate a registered clocksource: unlink it from the list, store the
+ * new rating, re-insert it with clocksource_enqueue() and re-run
+ * clocksource_select(). Takes no lock itself; the callers in this file
+ * hold clocksource_mutex around the call.
+ */
+static void __clocksource_change_rating(struct clocksource *cs, int rating)
+{
+        list_del(&cs->list);
+        cs->rating = rating;
+        clocksource_enqueue(cs);
+        clocksource_select();
+}
 /**
 * clocksource_change_rating - Change the rating of a registered clocksource
- *
 */
 void clocksource_change_rating(struct clocksource *cs, int rating)
 {
-        unsigned long flags;
+        mutex_lock(&clocksource_mutex);
+        __clocksource_change_rating(cs, rating);
-        spin_lock_irqsave(&clocksource_lock, flags);
+        mutex_unlock(&clocksource_mutex);
-        list_del(&cs->list);
-        cs->rating = rating;
-        clocksource_enqueue(cs);
-        next_clocksource = select_clocksource();
-        spin_unlock_irqrestore(&clocksource_lock, flags);
 }
+EXPORT_SYMBOL(clocksource_change_rating);
 /**
 * clocksource_unregister - remove a registered clocksource
 */
 void clocksource_unregister(struct clocksource *cs)
 {
-        unsigned long flags;
+        mutex_lock(&clocksource_mutex);
+        clocksource_dequeue_watchdog(cs);
-        spin_lock_irqsave(&clocksource_lock, flags);
        list_del(&cs->list);
-        if (clocksource_override == cs)
+        clocksource_select();
-                clocksource_override = NULL;
+        mutex_unlock(&clocksource_mutex);
-        next_clocksource = select_clocksource();
-        spin_unlock_irqrestore(&clocksource_lock, flags);
 }
+EXPORT_SYMBOL(clocksource_unregister);
 #ifdef CONFIG_SYSFS
 /**
@@ -458,9 +570,9 @@ sysfs_show_current_clocksources(struct sys_device *dev,
 {
        ssize_t count = 0;
-        spin_lock_irq(&clocksource_lock);
+        mutex_lock(&clocksource_mutex);
        count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name);
-        spin_unlock_irq(&clocksource_lock);
+        mutex_unlock(&clocksource_mutex);
        return count;
 }
@@ -478,9 +590,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
                                          struct sysdev_attribute *attr,
                                          const char *buf, size_t count)
 {
-        struct clocksource *ovr = NULL;
        size_t ret = count;
-        int len;
        /* strings from sysfs write are not 0 terminated! */
        if (count >= sizeof(override_name))
@@ -490,44 +600,14 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
        if (buf[count-1] == '\n')
                count--;
-        spin_lock_irq(&clocksource_lock);
+        mutex_lock(&clocksource_mutex);
        if (count > 0)
                memcpy(override_name, buf, count);
        override_name[count] = 0;
+        clocksource_select();
-        len = strlen(override_name);
+        mutex_unlock(&clocksource_mutex);
-        if (len) {
-                struct clocksource *cs;
-                ovr = clocksource_override;
-                /* try to select it: */
-                list_for_each_entry(cs, &clocksource_list, list) {
-                        if (strlen(cs->name) == len &&
-                            !strcmp(cs->name, override_name))
-                                ovr = cs;
-                }
-        }
-        /*
-         * Check to make sure we don't switch to a non-highres capable
-         * clocksource if the tick code is in oneshot mode (highres or nohz)
-         */
-        if (tick_oneshot_mode_active() && ovr &&
-            !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) {
-                printk(KERN_WARNING "%s clocksource is not HRT compatible. "
-                        "Cannot switch while in HRT/NOHZ mode\n", ovr->name);
-                ovr = NULL;
-                override_name[0] = 0;
-        }
-        /* Reselect, when the override name has changed */
-        if (ovr != clocksource_override) {
-                clocksource_override = ovr;
-                next_clocksource = select_clocksource();
-        }
-        spin_unlock_irq(&clocksource_lock);
        return ret;
 }
@@ -547,7 +627,7 @@ sysfs_show_available_clocksources(struct sys_device *dev,
        struct clocksource *src;
        ssize_t count = 0;
-        spin_lock_irq(&clocksource_lock);
+        mutex_lock(&clocksource_mutex);
        list_for_each_entry(src, &clocksource_list, list) {
                /*
                 * Don't show non-HRES clocksource if the tick code is
@@ -559,7 +639,7 @@ sysfs_show_available_clocksources(struct sys_device *dev,
                                  max((ssize_t)PAGE_SIZE - count, (ssize_t)0),
                                  "%s ", src->name);
        }
-        spin_unlock_irq(&clocksource_lock);
+        mutex_unlock(&clocksource_mutex);
        count += snprintf(buf + count,
                          max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n");
@@ -614,11 +694,10 @@ device_initcall(init_clocksource_sysfs);
 */
 static int __init boot_override_clocksource(char* str)
 {
-        unsigned long flags;
+        mutex_lock(&clocksource_mutex);
-        spin_lock_irqsave(&clocksource_lock, flags);
        if (str)
                strlcpy(override_name, str, sizeof(override_name));
-        spin_unlock_irqrestore(&clocksource_lock, flags);
+        mutex_unlock(&clocksource_mutex);
        return 1;
 }
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index c3f6c30816e3..5404a8456909 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -61,7 +61,6 @@ struct clocksource clocksource_jiffies = {
        .read           = jiffies_read,
        .mask           = 0xffffffff, /*32bits*/
        .mult           = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
-        .mult_orig      = NSEC_PER_JIFFY << JIFFIES_SHIFT,
        .shift          = JIFFIES_SHIFT,
 };
@@ -71,3 +70,8 @@ static int __init init_jiffies_clocksource(void)
 }
 core_initcall(init_jiffies_clocksource);
+struct clocksource * __init __weak clocksource_default_clock(void)
+{
+        return &clocksource_jiffies;
+}
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 7fc64375ff43..4800f933910e 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -194,8 +194,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
        case TIME_OK:
                break;
        case TIME_INS:
-                xtime.tv_sec--;
+                timekeeping_leap_insert(-1);
-                wall_to_monotonic.tv_sec++;
                time_state = TIME_OOP;
                printk(KERN_NOTICE
                        "Clock: inserting leap second 23:59:60 UTC\n");
@@ -203,9 +202,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
                res = HRTIMER_RESTART;
                break;
        case TIME_DEL:
-                xtime.tv_sec++;
+                timekeeping_leap_insert(1);
                time_tai--;
-                wall_to_monotonic.tv_sec--;
                time_state = TIME_WAIT;
                printk(KERN_NOTICE
                        "Clock: deleting leap second 23:59:59 UTC\n");
@@ -219,7 +217,6 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
                        time_state = TIME_OK;
                break;
        }
-        update_vsyscall(&xtime, clock);
        write_sequnlock(&xtime_lock);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e8c77d9c633a..fb0f46fa1ecd 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -18,7 +18,117 @@
 #include <linux/jiffies.h>
 #include <linux/time.h>
 #include <linux/tick.h>
+#include <linux/stop_machine.h>
+/* Structure holding internal timekeeping values. */
+struct timekeeper {
+        /* Current clocksource used for timekeeping. */
+        struct clocksource *clock;
+        /* The shift value of the current clocksource. */
+        int     shift;
+        /* Number of clock cycles in one NTP interval. */
+        cycle_t cycle_interval;
+        /* Number of clock shifted nano seconds in one NTP interval. */
+        u64     xtime_interval;
+        /* Raw nano seconds accumulated per NTP interval. */
+        u32     raw_interval;
+        /* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */
+        u64     xtime_nsec;
+        /* Difference between accumulated time and NTP time in ntp
+         * shifted nano seconds. */
+        s64     ntp_error;
+        /* Shift conversion between clock shifted nano seconds and
+         * ntp shifted nano seconds. */
+        int     ntp_error_shift;
+        /* NTP adjusted clock multiplier */
+        u32     mult;
+};
+struct timekeeper timekeeper;
+/**
+ * timekeeper_setup_internals - Set up internals to use clocksource clock.
+ *
+ * @clock:              Pointer to clocksource.
+ *
+ * Makes @clock the timekeeper's clocksource, stores a fresh reading in
+ * clock->cycle_last and derives the cycle and nanosecond lengths of one
+ * NTP interval from clock->mult and clock->shift. The accumulated
+ * xtime_nsec remainder and ntp_error are reset to zero.
+ *
+ * Unless you're the timekeeping code, you should not be using this!
+ */
+static void timekeeper_setup_internals(struct clocksource *clock)
+{
+        cycle_t interval;
+        u64 tmp;
+        timekeeper.clock = clock;
+        clock->cycle_last = clock->read(clock);
+        /*
+         * Do the ns -> cycle conversion first, using original mult.
+         * Adding mult/2 before the division rounds to the nearest cycle;
+         * the result is clamped to at least one cycle.
+         */
+        tmp = NTP_INTERVAL_LENGTH;
+        tmp <<= clock->shift;
+        tmp += clock->mult/2;
+        do_div(tmp, clock->mult);
+        if (tmp == 0)
+                tmp = 1;
+        interval = (cycle_t) tmp;
+        timekeeper.cycle_interval = interval;
+        /* Go back from cycles -> shifted ns */
+        timekeeper.xtime_interval = (u64) interval * clock->mult;
+        timekeeper.raw_interval =
+                ((u64) interval * clock->mult) >> clock->shift;
+        timekeeper.xtime_nsec = 0;
+        timekeeper.shift = clock->shift;
+        timekeeper.ntp_error = 0;
+        timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
+        /*
+         * The timekeeper keeps its own mult value for the currently
+         * active clocksource. This value will be adjusted via NTP
+         * to counteract clock drifting.
+         */
+        timekeeper.mult = clock->mult;
+}
+/* Timekeeper helper functions. */
+/*
+ * timekeeping_get_ns - nanoseconds elapsed since clock->cycle_last
+ *
+ * Reads the timekeeper's current clocksource and converts the masked
+ * cycle delta with the timekeeper's own (NTP adjusted) mult and shift.
+ */
+static inline s64 timekeeping_get_ns(void)
+{
+        cycle_t cycle_now, cycle_delta;
+        struct clocksource *clock;
+        /* read clocksource: */
+        clock = timekeeper.clock;
+        cycle_now = clock->read(clock);
+        /* calculate the delta since the last update_wall_time: */
+        cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+        /* return delta convert to nanoseconds using ntp adjusted mult. */
+        return clocksource_cyc2ns(cycle_delta, timekeeper.mult,
+                                  timekeeper.shift);
+}
+/*
+ * timekeeping_get_ns_raw - raw nanoseconds elapsed since clock->cycle_last
+ *
+ * Same as timekeeping_get_ns(), but the cycle delta is converted with
+ * the clocksource's own mult and shift instead of the timekeeper's
+ * NTP adjusted mult.
+ */
+static inline s64 timekeeping_get_ns_raw(void)
+{
+        cycle_t cycle_now, cycle_delta;
+        struct clocksource *clock;
+        /* read clocksource: */
+        clock = timekeeper.clock;
+        cycle_now = clock->read(clock);
+        /* calculate the delta since the last update_wall_time: */
+        cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+        /* return delta convert to nanoseconds using the clocksource's own mult. */
+        return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
+}
 /*
 * This read-write spinlock protects us from races in SMP while
@@ -44,7 +154,12 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
 */
 struct timespec xtime __attribute__ ((aligned (16)));
 struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
-static unsigned long total_sleep_time;          /* seconds */
+static struct timespec total_sleep_time;
+/*
+ * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
+ */
+struct timespec raw_time;
 /* flag for if timekeeping is suspended */
 int __read_mostly timekeeping_suspended;
@@ -56,35 +171,44 @@ void update_xtime_cache(u64 nsec)
        timespec_add_ns(&xtime_cache, nsec);
 }
-struct clocksource *clock;
+/*
+ * timekeeping_leap_insert - shift the wall clock by whole seconds
+ * @leapsecond: seconds added to xtime.tv_sec (may be negative)
+ *
+ * wall_to_monotonic.tv_sec is moved by the opposite amount, so the sum
+ * of xtime and wall_to_monotonic is unchanged. The updated xtime is then
+ * handed to update_vsyscall() together with the current clocksource.
+ *
+ * must hold xtime_lock
+ */
+void timekeeping_leap_insert(int leapsecond)
+{
+        xtime.tv_sec += leapsecond;
+        wall_to_monotonic.tv_sec -= leapsecond;
+        update_vsyscall(&xtime, timekeeper.clock);
+}
 #ifdef CONFIG_GENERIC_TIME
 /**
- * clocksource_forward_now - update clock to the current time
+ * timekeeping_forward_now - update clock to the current time
 *
 * Forward the current clock to update its state since the last call to
 * update_wall_time(). This is useful before significant clock changes,
 * as it avoids having to deal with this time offset explicitly.
 */
-static void clocksource_forward_now(void)
+static void timekeeping_forward_now(void)
 {
        cycle_t cycle_now, cycle_delta;
+        struct clocksource *clock;
        s64 nsec;
-        cycle_now = clocksource_read(clock);
+        clock = timekeeper.clock;
+        cycle_now = clock->read(clock);
        cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
        clock->cycle_last = cycle_now;
-        nsec = cyc2ns(clock, cycle_delta);
+        nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult,
+                                  timekeeper.shift);
        /* If arch requires, add in gettimeoffset() */
        nsec += arch_gettimeoffset();
        timespec_add_ns(&xtime, nsec);
-        nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
+        nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
-        clock->raw_time.tv_nsec += nsec;
+        timespec_add_ns(&raw_time, nsec);
 }
 /**
@@ -95,7 +219,6 @@ static void clocksource_forward_now(void)
 */
 void getnstimeofday(struct timespec *ts)
 {
-        cycle_t cycle_now, cycle_delta;
        unsigned long seq;
        s64 nsecs;
@@ -105,15 +228,7 @@ void getnstimeofday(struct timespec *ts)
                seq = read_seqbegin(&xtime_lock);
                *ts = xtime;
+                nsecs = timekeeping_get_ns();
-                /* read clocksource: */
-                cycle_now = clocksource_read(clock);
-                /* calculate the delta since the last update_wall_time: */
-                cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
-                /* convert to nanoseconds: */
-                nsecs = cyc2ns(clock, cycle_delta);
                /* If arch requires, add in gettimeoffset() */
                nsecs += arch_gettimeoffset();
@@ -125,6 +240,57 @@ void getnstimeofday(struct timespec *ts)
 EXPORT_SYMBOL(getnstimeofday);
+/**
+ * ktime_get - get the monotonic time in ktime_t format
+ *
+ * Adds xtime, wall_to_monotonic and the nanoseconds reported by
+ * timekeeping_get_ns(), retrying the read while xtime_lock reports a
+ * concurrent update. Warns if timekeeping_suspended is set.
+ *
+ * returns the time in ktime_t format
+ */
+ktime_t ktime_get(void)
+{
+        unsigned int seq;
+        s64 secs, nsecs;
+        WARN_ON(timekeeping_suspended);
+        do {
+                seq = read_seqbegin(&xtime_lock);
+                secs = xtime.tv_sec + wall_to_monotonic.tv_sec;
+                nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec;
+                nsecs += timekeeping_get_ns();
+        } while (read_seqretry(&xtime_lock, seq));
+        /*
+         * Use ktime_set/ktime_add_ns to create a proper ktime on
+         * 32-bit architectures without CONFIG_KTIME_SCALAR.
+         */
+        return ktime_add_ns(ktime_set(secs, 0), nsecs);
+}
+EXPORT_SYMBOL_GPL(ktime_get);
+/**
+ * ktime_get_ts - get the monotonic clock in timespec format
+ * @ts:         pointer to timespec variable
+ *
+ * The function calculates the monotonic clock from the realtime
+ * clock and the wall_to_monotonic offset and stores the result
+ * in normalized timespec format in the variable pointed to by @ts.
+ */
+void ktime_get_ts(struct timespec *ts)
+{
+        struct timespec tomono;
+        unsigned int seq;
+        s64 nsecs;
+        WARN_ON(timekeeping_suspended);
+        do {
+                seq = read_seqbegin(&xtime_lock);
+                *ts = xtime;
+                tomono = wall_to_monotonic;
+                nsecs = timekeeping_get_ns();
+        } while (read_seqretry(&xtime_lock, seq));
+        set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
+                                ts->tv_nsec + tomono.tv_nsec + nsecs);
+}
+EXPORT_SYMBOL_GPL(ktime_get_ts);
 /**
 * do_gettimeofday - Returns the time of day in a timeval
 * @tv:         pointer to the timeval to be set
@@ -157,7 +323,7 @@ int do_settimeofday(struct timespec *tv)
        write_seqlock_irqsave(&xtime_lock, flags);
-        clocksource_forward_now();
+        timekeeping_forward_now();
        ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec;
        ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec;
@@ -167,10 +333,10 @@ int do_settimeofday(struct timespec *tv)
        update_xtime_cache(0);
-        clock->error = 0;
+        timekeeper.ntp_error = 0;
        ntp_clear();
-        update_vsyscall(&xtime, clock);
+        update_vsyscall(&xtime, timekeeper.clock);
        write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -187,44 +353,97 @@ EXPORT_SYMBOL(do_settimeofday);
 *
 * Accumulates current time interval and initializes new clocksource
 */
-static void change_clocksource(void)
+static int change_clocksource(void *data)
 {
        struct clocksource *new, *old;
-        new = clocksource_get_next();
+        new = (struct clocksource *) data;
+        timekeeping_forward_now();
+        if (!new->enable || new->enable(new) == 0) {
+                old = timekeeper.clock;
+                timekeeper_setup_internals(new);
+                if (old->disable)
+                        old->disable(old);
+        }
+        return 0;
+}
-        if (clock == new)
+/**
+ * timekeeping_notify - Install a new clock source
+ * @clock:              pointer to the clock source
+ *
+ * This function is called from clocksource.c after a new, better clock
+ * source has been registered. The caller holds the clocksource_mutex.
+ */
+void timekeeping_notify(struct clocksource *clock)
+{
+        if (timekeeper.clock == clock)
                return;
+        stop_machine(change_clocksource, clock, NULL);
+        tick_clock_notify();
+}
-        clocksource_forward_now();
+#else /* GENERIC_TIME */
-        if (clocksource_enable(new))
+static inline void timekeeping_forward_now(void) { }
-                return;
-        new->raw_time = clock->raw_time;
+/**
-        old = clock;
+ * ktime_get - get the monotonic time in ktime_t format
-        clock = new;
+ *
-        clocksource_disable(old);
+ * returns the time in ktime_t format
+ */
+ktime_t ktime_get(void)
+{
+        struct timespec now;
-        clock->cycle_last = 0;
+        ktime_get_ts(&now);
-        clock->cycle_last = clocksource_read(clock);
-        clock->error = 0;
-        clock->xtime_nsec = 0;
-        clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
-        tick_clock_notify();
+        return timespec_to_ktime(now);
+}
+EXPORT_SYMBOL_GPL(ktime_get);
-        /*
+/**
-         * We're holding xtime lock and waking up klogd would deadlock
+ * ktime_get_ts - get the monotonic clock in timespec format
-         * us on enqueue.  So no printing!
+ * @ts:         pointer to timespec variable
-        printk(KERN_INFO "Time: %s clocksource has been installed.\n",
+ *
-               clock->name);
+ * The function calculates the monotonic clock from the realtime
-         */
+ * clock and the wall_to_monotonic offset and stores the result
+ * in normalized timespec format in the variable pointed to by @ts.
+ */
+void ktime_get_ts(struct timespec *ts)
+{
+        struct timespec tomono;
+        unsigned long seq;
+        do {
+                seq = read_seqbegin(&xtime_lock);
+                getnstimeofday(ts);
+                tomono = wall_to_monotonic;
+        } while (read_seqretry(&xtime_lock, seq));
+        set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
+                                ts->tv_nsec + tomono.tv_nsec);
 }
-#else
+EXPORT_SYMBOL_GPL(ktime_get_ts);
-static inline void clocksource_forward_now(void) { }
-static inline void change_clocksource(void) { }
+#endif /* !GENERIC_TIME */
-#endif
+/**
+ * ktime_get_real - get the real (wall-) time in ktime_t format
+ *
+ * returns the time in ktime_t format
+ */
+ktime_t ktime_get_real(void)
+{
+        struct timespec now;
+        getnstimeofday(&now);
+        return timespec_to_ktime(now);
+}
+EXPORT_SYMBOL_GPL(ktime_get_real);
 /**
 * getrawmonotonic - Returns the raw monotonic time in a timespec
@@ -236,21 +455,11 @@ void getrawmonotonic(struct timespec *ts)
 {
        unsigned long seq;
        s64 nsecs;
-        cycle_t cycle_now, cycle_delta;
        do {
                seq = read_seqbegin(&xtime_lock);
+                nsecs = timekeeping_get_ns_raw();
-                /* read clocksource: */
+                *ts = raw_time;
-                cycle_now = clocksource_read(clock);
-                /* calculate the delta since the last update_wall_time: */
-                cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
-                /* convert to nanoseconds: */
-                nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
-                *ts = clock->raw_time;
        } while (read_seqretry(&xtime_lock, seq));
@@ -270,7 +479,7 @@ int timekeeping_valid_for_hres(void)
        do {
                seq = read_seqbegin(&xtime_lock);
-                ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
+                ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
        } while (read_seqretry(&xtime_lock, seq));
@@ -278,17 +487,33 @@ int timekeeping_valid_for_hres(void)
 }
 /**
- * read_persistent_clock -  Return time in seconds from the persistent clock.
+ * read_persistent_clock -  Return time from the persistent clock.
 *
 * Weak dummy function for arches that do not yet support it.
- * Returns seconds from epoch using the battery backed persistent clock.
+ * Reads the time from the battery backed persistent clock.
- * Returns zero if unsupported.
+ * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
 *
 *  XXX - Do be sure to remove it once all arches implement it.
 */
-unsigned long __attribute__((weak)) read_persistent_clock(void)
+void __attribute__((weak)) read_persistent_clock(struct timespec *ts)
 {
-        return 0;
+        ts->tv_sec = 0;
+        ts->tv_nsec = 0;
+}
+/**
+ * read_boot_clock -  Return time of the system start.
+ *
+ * Weak dummy function for arches that do not yet support it.
+ * Function to read the exact time the system has been started.
+ * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
+ *
+ *  XXX - Do be sure to remove it once all arches implement it.
+ */
+void __attribute__((weak)) read_boot_clock(struct timespec *ts)
+{
+        ts->tv_sec = 0;
+        ts->tv_nsec = 0;
 }
 /*
@@ -296,29 +521,40 @@ unsigned long __attribute__((weak)) read_persistent_clock(void)
 */
 void __init timekeeping_init(void)
 {
+        struct clocksource *clock;
        unsigned long flags;
-        unsigned long sec = read_persistent_clock();
+        struct timespec now, boot;
+        read_persistent_clock(&now);
+        read_boot_clock(&boot);
        write_seqlock_irqsave(&xtime_lock, flags);
        ntp_init();
-        clock = clocksource_get_next();
+        clock = clocksource_default_clock();
-        clocksource_enable(clock);
+        if (clock->enable)
-        clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
+                clock->enable(clock);
-        clock->cycle_last = clocksource_read(clock);
+        timekeeper_setup_internals(clock);
-        xtime.tv_sec = sec;
+        xtime.tv_sec = now.tv_sec;
-        xtime.tv_nsec = 0;
+        xtime.tv_nsec = now.tv_nsec;
+        raw_time.tv_sec = 0;
+        raw_time.tv_nsec = 0;
+        if (boot.tv_sec == 0 && boot.tv_nsec == 0) {
+                boot.tv_sec = xtime.tv_sec;
+                boot.tv_nsec = xtime.tv_nsec;
+        }
        set_normalized_timespec(&wall_to_monotonic,
-                -xtime.tv_sec, -xtime.tv_nsec);
+                                -boot.tv_sec, -boot.tv_nsec);
        update_xtime_cache(0);
-        total_sleep_time = 0;
+        total_sleep_time.tv_sec = 0;
+        total_sleep_time.tv_nsec = 0;
        write_sequnlock_irqrestore(&xtime_lock, flags);
 }
-/* time in seconds when suspend began */
+/* persistent clock reading taken when suspend began */
-static unsigned long timekeeping_suspend_time;
+static struct timespec timekeeping_suspend_time;
 /**
 * timekeeping_resume - Resumes the generic timekeeping subsystem.
@@ -331,24 +567,24 @@ static unsigned long timekeeping_suspend_time;
 static int timekeeping_resume(struct sys_device *dev)
 {
        unsigned long flags;
-        unsigned long now = read_persistent_clock();
+        struct timespec ts;
+        read_persistent_clock(&ts);
        clocksource_resume();
        write_seqlock_irqsave(&xtime_lock, flags);
-        if (now && (now > timekeeping_suspend_time)) {
+        if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
-                unsigned long sleep_length = now - timekeeping_suspend_time;
+                ts = timespec_sub(ts, timekeeping_suspend_time);
+                xtime = timespec_add_safe(xtime, ts);
-                xtime.tv_sec += sleep_length;
+                wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
-                wall_to_monotonic.tv_sec -= sleep_length;
+                total_sleep_time = timespec_add_safe(total_sleep_time, ts);
-                total_sleep_time += sleep_length;
        }
        update_xtime_cache(0);
        /* re-base the last cycle value */
-        clock->cycle_last = 0;
+        timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
-        clock->cycle_last = clocksource_read(clock);
+        timekeeper.ntp_error = 0;
-        clock->error = 0;
        timekeeping_suspended = 0;
        write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -366,10 +602,10 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
 {
        unsigned long flags;
-        timekeeping_suspend_time = read_persistent_clock();
+        read_persistent_clock(&timekeeping_suspend_time);
        write_seqlock_irqsave(&xtime_lock, flags);
-        clocksource_forward_now();
+        timekeeping_forward_now();
        timekeeping_suspended = 1;
        write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -404,7 +640,7 @@ device_initcall(timekeeping_init_device);
 * If the error is already larger, we look ahead even further
 * to compensate for late or lost adjustments.
 */
-static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
+static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
                                                 s64 *offset)
 {
        s64 tick_error, i;
@@ -420,7 +656,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
         * here.  This is tuned so that an error of about 1 msec is adjusted
         * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
         */
-        error2 = clock->error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
+        error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
        error2 = abs(error2);
        for (look_ahead = 0; error2 > 0; look_ahead++)
                error2 >>= 2;
@@ -429,8 +665,8 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
         * Now calculate the error in (1 << look_ahead) ticks, but first
         * remove the single look ahead already included in the error.
         */
-        tick_error = tick_length >> (NTP_SCALE_SHIFT - clock->shift + 1);
+        tick_error = tick_length >> (timekeeper.ntp_error_shift + 1);
-        tick_error -= clock->xtime_interval >> 1;
+        tick_error -= timekeeper.xtime_interval >> 1;
        error = ((error - tick_error) >> look_ahead) + tick_error;
        /* Finally calculate the adjustment shift value.  */
@@ -455,18 +691,18 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
 * this is optimized for the most common adjustments of -1,0,1,
 * for other values we can do a bit more work.
 */
-static void clocksource_adjust(s64 offset)
+static void timekeeping_adjust(s64 offset)
 {
-        s64 error, interval = clock->cycle_interval;
+        s64 error, interval = timekeeper.cycle_interval;
        int adj;
-        error = clock->error >> (NTP_SCALE_SHIFT - clock->shift - 1);
+        error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1);
        if (error > interval) {
                error >>= 2;
                if (likely(error <= interval))
                        adj = 1;
                else
-                        adj = clocksource_bigadjust(error, &interval, &offset);
+                        adj = timekeeping_bigadjust(error, &interval, &offset);
        } else if (error < -interval) {
                error >>= 2;
                if (likely(error >= -interval)) {
@@ -474,15 +710,15 @@ static void clocksource_adjust(s64 offset)
                        interval = -interval;
                        offset = -offset;
                } else
-                        adj = clocksource_bigadjust(error, &interval, &offset);
+                        adj = timekeeping_bigadjust(error, &interval, &offset);
        } else
                return;
-        clock->mult += adj;
+        timekeeper.mult += adj;
-        clock->xtime_interval += interval;
+        timekeeper.xtime_interval += interval;
-        clock->xtime_nsec -= offset;
+        timekeeper.xtime_nsec -= offset;
-        clock->error -= (interval - offset) <<
+        timekeeper.ntp_error -= (interval - offset) <<
-                        (NTP_SCALE_SHIFT - clock->shift);
+                                timekeeper.ntp_error_shift;
 }
 /**
@@ -492,53 +728,59 @@ static void clocksource_adjust(s64 offset)
 */
 void update_wall_time(void)
 {
+        struct clocksource *clock;
        cycle_t offset;
+        u64 nsecs;
        /* Make sure we're fully resumed: */
        if (unlikely(timekeeping_suspended))
                return;
+        clock = timekeeper.clock;
 #ifdef CONFIG_GENERIC_TIME
-        offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
+        offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
 #else
-        offset = clock->cycle_interval;
+        offset = timekeeper.cycle_interval;
 #endif
-        clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift;
+        timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
        /* normally this loop will run just once, however in the
         * case of lost or late ticks, it will accumulate correctly.
         */
-        while (offset >= clock->cycle_interval) {
+        while (offset >= timekeeper.cycle_interval) {
+                u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
                /* accumulate one interval */
-                offset -= clock->cycle_interval;
+                offset -= timekeeper.cycle_interval;
-                clock->cycle_last += clock->cycle_interval;
+                clock->cycle_last += timekeeper.cycle_interval;
-                clock->xtime_nsec += clock->xtime_interval;
+                timekeeper.xtime_nsec += timekeeper.xtime_interval;
-                if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
+                if (timekeeper.xtime_nsec >= nsecps) {
-                        clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
+                        timekeeper.xtime_nsec -= nsecps;
                        xtime.tv_sec++;
                        second_overflow();
                }
-                clock->raw_time.tv_nsec += clock->raw_interval;
+                raw_time.tv_nsec += timekeeper.raw_interval;
-                if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) {
+                if (raw_time.tv_nsec >= NSEC_PER_SEC) {
-                        clock->raw_time.tv_nsec -= NSEC_PER_SEC;
+                        raw_time.tv_nsec -= NSEC_PER_SEC;
-                        clock->raw_time.tv_sec++;
+                        raw_time.tv_sec++;
                }
                /* accumulate error between NTP and clock interval */
-                clock->error += tick_length;
+                timekeeper.ntp_error += tick_length;
-                clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift);
+                timekeeper.ntp_error -= timekeeper.xtime_interval <<
+                                        timekeeper.ntp_error_shift;
        }
        /* correct the clock when NTP error is too big */
-        clocksource_adjust(offset);
+        timekeeping_adjust(offset);
        /*
         * Since in the loop above, we accumulate any amount of time
         * in xtime_nsec over a second into xtime.tv_sec, its possible for
         * xtime_nsec to be fairly small after the loop. Further, if we're
-         * slightly speeding the clocksource up in clocksource_adjust(),
+         * slightly speeding the clocksource up in timekeeping_adjust(),
         * its possible the required corrective factor to xtime_nsec could
         * cause it to underflow.
         *
@@ -550,24 +792,25 @@ void update_wall_time(void)
         * We'll correct this error next time through this function, when
         * xtime_nsec is not as small.
         */
-        if (unlikely((s64)clock->xtime_nsec < 0)) {
+        if (unlikely((s64)timekeeper.xtime_nsec < 0)) {
-                s64 neg = -(s64)clock->xtime_nsec;
+                s64 neg = -(s64)timekeeper.xtime_nsec;
-                clock->xtime_nsec = 0;
+                timekeeper.xtime_nsec = 0;
-                clock->error += neg << (NTP_SCALE_SHIFT - clock->shift);
+                timekeeper.ntp_error += neg << timekeeper.ntp_error_shift;
        }
        /* store full nanoseconds into xtime after rounding it up and
         * add the remainder to the error difference.
         */
-        xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1;
+        xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1;
-        clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
+        timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift;
-        clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift);
+        timekeeper.ntp_error += timekeeper.xtime_nsec <<
+                                timekeeper.ntp_error_shift;
-        update_xtime_cache(cyc2ns(clock, offset));
+        nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift);
+        update_xtime_cache(nsecs);
        /* check to see if there is a new clocksource to use */
-        change_clocksource();
+        update_vsyscall(&xtime, timekeeper.clock);
-        update_vsyscall(&xtime, clock);
 }
 /**
@@ -583,9 +826,12 @@ void update_wall_time(void)
 */
 void getboottime(struct timespec *ts)
 {
-        set_normalized_timespec(ts,
+        struct timespec boottime = {
-                - (wall_to_monotonic.tv_sec + total_sleep_time),
+                .tv_sec = wall_to_monotonic.tv_sec + total_sleep_time.tv_sec,
-                - wall_to_monotonic.tv_nsec);
+                .tv_nsec = wall_to_monotonic.tv_nsec + total_sleep_time.tv_nsec
+        };
+        set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
 }
 /**
@@ -594,7 +840,7 @@ void getboottime(struct timespec *ts)
 */
 void monotonic_to_bootbased(struct timespec *ts)
 {
-        ts->tv_sec += total_sleep_time;
+        *ts = timespec_add_safe(*ts, total_sleep_time);
 }
 unsigned long get_seconds(void)
@@ -603,6 +849,10 @@ unsigned long get_seconds(void)
 }
 EXPORT_SYMBOL(get_seconds);
+/*
+ * Return the current value of xtime_cache.
+ *
+ * Unlike get_monotonic_coarse(), this helper does not wrap the read in
+ * the xtime_lock seqlock retry loop.
+ * NOTE(review): presumably callers are expected to provide their own
+ * serialization against timekeeping updates -- confirm.
+ */
+struct timespec __current_kernel_time(void)
+{
+        return xtime_cache;
+}
 struct timespec current_kernel_time(void)
 {
@@ -618,3 +868,20 @@ struct timespec current_kernel_time(void)
        return now;
 }
 EXPORT_SYMBOL(current_kernel_time);
+/*
+ * Return xtime_cache offset by wall_to_monotonic, normalized.
+ *
+ * Both values are sampled under the xtime_lock seqlock so that they
+ * belong to the same update; the sum is computed outside the loop.
+ */
+struct timespec get_monotonic_coarse(void)
+{
+        struct timespec wall, offset, result;
+        unsigned long seq;
+        /* Retry until both values were read without a concurrent writer. */
+        do {
+                seq = read_seqbegin(&xtime_lock);
+                wall = xtime_cache;
+                offset = wall_to_monotonic;
+        } while (read_seqretry(&xtime_lock, seq));
+        set_normalized_timespec(&result, wall.tv_sec + offset.tv_sec,
+                                wall.tv_nsec + offset.tv_nsec);
+        return result;
+}
diff --git a/kernel/timer.c b/kernel/timer.c
index a3d25f415019..bbb51074680e 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -72,6 +72,7 @@ struct tvec_base {
        spinlock_t lock;
        struct timer_list *running_timer;
        unsigned long timer_jiffies;
+        unsigned long next_timer;
        struct tvec_root tv1;
        struct tvec tv2;
        struct tvec tv3;
@@ -622,6 +623,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
        if (timer_pending(timer)) {
                detach_timer(timer, 0);
+                if (timer->expires == base->next_timer &&
+                    !tbase_get_deferrable(timer->base))
+                        base->next_timer = base->timer_jiffies;
                ret = 1;
        } else {
                if (pending_only)
@@ -663,6 +667,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
        }
        timer->expires = expires;
+        if (time_before(timer->expires, base->next_timer) &&
+            !tbase_get_deferrable(timer->base))
+                base->next_timer = timer->expires;
        internal_add_timer(base, timer);
 out_unlock:
@@ -781,6 +788,9 @@ void add_timer_on(struct timer_list *timer, int cpu)
        spin_lock_irqsave(&base->lock, flags);
        timer_set_base(timer, base);
        debug_timer_activate(timer);
+        if (time_before(timer->expires, base->next_timer) &&
+            !tbase_get_deferrable(timer->base))
+                base->next_timer = timer->expires;
        internal_add_timer(base, timer);
        /*
         * Check whether the other CPU is idle and needs to be
@@ -817,6 +827,9 @@ int del_timer(struct timer_list *timer)
                base = lock_timer_base(timer, &flags);
                if (timer_pending(timer)) {
                        detach_timer(timer, 1);
+                        if (timer->expires == base->next_timer &&
+                            !tbase_get_deferrable(timer->base))
+                                base->next_timer = base->timer_jiffies;
                        ret = 1;
                }
                spin_unlock_irqrestore(&base->lock, flags);
@@ -850,6 +863,9 @@ int try_to_del_timer_sync(struct timer_list *timer)
        ret = 0;
        if (timer_pending(timer)) {
                detach_timer(timer, 1);
+                if (timer->expires == base->next_timer &&
+                    !tbase_get_deferrable(timer->base))
+                        base->next_timer = base->timer_jiffies;
                ret = 1;
        }
 out:
@@ -1007,8 +1023,8 @@ static inline void __run_timers(struct tvec_base *base)
 #ifdef CONFIG_NO_HZ
 /*
 * Find out when the next timer event is due to happen. This
- * is used on S/390 to stop all activity when a cpus is idle.
+ * is used on S/390 to stop all activity when a CPU is idle.
- * This functions needs to be called disabled.
+ * This function needs to be called with interrupts disabled.
 */
 static unsigned long __next_timer_interrupt(struct tvec_base *base)
 {
@@ -1134,7 +1150,9 @@ unsigned long get_next_timer_interrupt(unsigned long now)
        unsigned long expires;
        spin_lock(&base->lock);
-        expires = __next_timer_interrupt(base);
+        if (time_before_eq(base->next_timer, base->timer_jiffies))
+                base->next_timer = __next_timer_interrupt(base);
+        expires = base->next_timer;
        spin_unlock(&base->lock);
        if (time_before_eq(expires, now))
@@ -1522,6 +1540,7 @@ static int __cpuinit init_timers_cpu(int cpu)
                INIT_LIST_HEAD(base->tv1.vec + j);
        base->timer_jiffies = jiffies;
+        base->next_timer = base->timer_jiffies;
        return 0;
 }
@@ -1534,6 +1553,9 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea
                timer = list_first_entry(head, struct timer_list, entry);
                detach_timer(timer, 0);
                timer_set_base(timer, new_base);
+                if (time_before(timer->expires, new_base->next_timer) &&
+                    !tbase_get_deferrable(timer->base))
+                        new_base->next_timer = timer->expires;
                internal_add_timer(new_base, timer);
        }
 }
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 844164dca90a..26f03ac07c2b 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -42,7 +42,6 @@ obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
 obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
 obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
-obj-$(CONFIG_POWER_TRACER) += trace_power.o
 obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
 obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
 obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
@@ -54,5 +53,6 @@ obj-$(CONFIG_EVENT_TRACING) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
 obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
+obj-$(CONFIG_EVENT_TRACING) += power-traces.o
 libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index cc615f84751b..c71e91bf7372 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2414,11 +2414,9 @@ unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
 static void *
 __g_next(struct seq_file *m, loff_t *pos)
 {
-        unsigned long *array = m->private;
        if (*pos >= ftrace_graph_count)
                return NULL;
-        return &array[*pos];
+        return &ftrace_graph_funcs[*pos];
 }
 static void *
@@ -2482,16 +2480,10 @@ ftrace_graph_open(struct inode *inode, struct file *file)
                ftrace_graph_count = 0;
                memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
        }
+        mutex_unlock(&graph_lock);
-        if (file->f_mode & FMODE_READ) {
+        if (file->f_mode & FMODE_READ)
                ret = seq_open(file, &ftrace_graph_seq_ops);
-                if (!ret) {
-                        struct seq_file *m = file->private_data;
-                        m->private = ftrace_graph_funcs;
-                }
-        } else
-                file->private_data = ftrace_graph_funcs;
-        mutex_unlock(&graph_lock);
        return ret;
 }
@@ -2560,7 +2552,6 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
                   size_t cnt, loff_t *ppos)
 {
        struct trace_parser parser;
-        unsigned long *array;
        size_t read = 0;
        ssize_t ret;
@@ -2574,12 +2565,6 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
                goto out;
        }
-        if (file->f_mode & FMODE_READ) {
-                struct seq_file *m = file->private_data;
-                array = m->private;
-        } else
-                array = file->private_data;
        if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
                ret = -ENOMEM;
                goto out;
@@ -2591,7 +2576,7 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
                parser.buffer[parser.idx] = 0;
                /* we allow only one expression at a time */
-                ret = ftrace_set_func(array, &ftrace_graph_count,
+                ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
                                        parser.buffer);
                if (ret)
                        goto out;
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
new file mode 100644
index 000000000000..e06c6e3d56a3
--- /dev/null
+++ b/kernel/trace/power-traces.c
@@ -0,0 +1,20 @@
+/*
+ * Power trace points
+ *
+ * Copyright (C) 2009 Arjan van de Ven <arjan@linux.intel.com>
+ */
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/power.h>
+EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
+EXPORT_TRACEPOINT_SYMBOL_GPL(power_end);
+EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 6eef38923b07..d4ff01970547 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -201,8 +201,6 @@ int tracing_is_on(void)
 }
 EXPORT_SYMBOL_GPL(tracing_is_on);
-#include "trace.h"
 #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
 #define RB_ALIGNMENT            4U
 #define RB_MAX_SMALL_DATA       (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 420232a1fbba..a35925d222ba 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -125,13 +125,13 @@ int ftrace_dump_on_oops;
 static int tracing_set_tracer(const char *buf);
-#define BOOTUP_TRACER_SIZE              100
+#define MAX_TRACER_SIZE         100
-static char bootup_tracer_buf[BOOTUP_TRACER_SIZE] __initdata;
+static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
 static char *default_bootup_tracer;
 static int __init set_ftrace(char *str)
 {
-        strncpy(bootup_tracer_buf, str, BOOTUP_TRACER_SIZE);
+        strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
        default_bootup_tracer = bootup_tracer_buf;
        /* We are using ftrace early, expand it */
        ring_buffer_expanded = 1;
@@ -242,13 +242,6 @@ static struct tracer		*trace_types __read_mostly;
 static struct tracer            *current_trace __read_mostly;
 /*
- * max_tracer_type_len is used to simplify the allocating of
- * buffers to read userspace tracer names. We keep track of
- * the longest tracer name registered.
- */
-static int                      max_tracer_type_len;
-/*
 * trace_types_lock is used to protect the trace_types list.
 * This lock is also used to keep user access serialized.
 * Accesses from userspace will grab this lock while userspace
@@ -625,7 +618,6 @@ __releases(kernel_lock)
 __acquires(kernel_lock)
 {
        struct tracer *t;
-        int len;
        int ret = 0;
        if (!type->name) {
@@ -633,6 +625,11 @@ __acquires(kernel_lock)
                return -1;
        }
+        if (strlen(type->name) > MAX_TRACER_SIZE) {
+                pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE);
+                return -1;
+        }
        /*
         * When this gets called we hold the BKL which means that
         * preemption is disabled. Various trace selftests however
@@ -647,7 +644,7 @@ __acquires(kernel_lock)
        for (t = trace_types; t; t = t->next) {
                if (strcmp(type->name, t->name) == 0) {
                        /* already found */
-                        pr_info("Trace %s already registered\n",
+                        pr_info("Tracer %s already registered\n",
                                type->name);
                        ret = -1;
                        goto out;
@@ -698,9 +695,6 @@ __acquires(kernel_lock)
        type->next = trace_types;
        trace_types = type;
-        len = strlen(type->name);
-        if (len > max_tracer_type_len)
-                max_tracer_type_len = len;
 out:
        tracing_selftest_running = false;
@@ -709,7 +703,7 @@ __acquires(kernel_lock)
        if (ret || !default_bootup_tracer)
                goto out_unlock;
-        if (strncmp(default_bootup_tracer, type->name, BOOTUP_TRACER_SIZE))
+        if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE))
                goto out_unlock;
        printk(KERN_INFO "Starting tracer '%s'\n", type->name);
@@ -731,14 +725,13 @@ __acquires(kernel_lock)
 void unregister_tracer(struct tracer *type)
 {
        struct tracer **t;
-        int len;
        mutex_lock(&trace_types_lock);
        for (t = &trace_types; *t; t = &(*t)->next) {
                if (*t == type)
                        goto found;
        }
-        pr_info("Trace %s not registered\n", type->name);
+        pr_info("Tracer %s not registered\n", type->name);
        goto out;
 found:
@@ -751,17 +744,7 @@ void unregister_tracer(struct tracer *type)
                        current_trace->stop(&global_trace);
                current_trace = &nop_trace;
        }
+out:
-        if (strlen(type->name) != max_tracer_type_len)
-                goto out;
-        max_tracer_type_len = 0;
-        for (t = &trace_types; *t; t = &(*t)->next) {
-                len = strlen((*t)->name);
-                if (len > max_tracer_type_len)
-                        max_tracer_type_len = len;
-        }
- out:
        mutex_unlock(&trace_types_lock);
 }
@@ -2610,7 +2593,7 @@ static ssize_t
 tracing_set_trace_read(struct file *filp, char __user *ubuf,
                       size_t cnt, loff_t *ppos)
 {
-        char buf[max_tracer_type_len+2];
+        char buf[MAX_TRACER_SIZE+2];
        int r;
        mutex_lock(&trace_types_lock);
@@ -2760,15 +2743,15 @@ static ssize_t
 tracing_set_trace_write(struct file *filp, const char __user *ubuf,
                        size_t cnt, loff_t *ppos)
 {
-        char buf[max_tracer_type_len+1];
+        char buf[MAX_TRACER_SIZE+1];
        int i;
        size_t ret;
        int err;
        ret = cnt;
-        if (cnt > max_tracer_type_len)
+        if (cnt > MAX_TRACER_SIZE)
-                cnt = max_tracer_type_len;
+                cnt = MAX_TRACER_SIZE;
        if (copy_from_user(&buf, ubuf, cnt))
                return -EFAULT;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 86bcff94791a..405cb850b75d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -11,7 +11,6 @@
 #include <linux/ftrace.h>
 #include <trace/boot.h>
 #include <linux/kmemtrace.h>
-#include <trace/power.h>
 #include <linux/trace_seq.h>
 #include <linux/ftrace_event.h>
@@ -37,7 +36,6 @@ enum trace_type {
        TRACE_HW_BRANCHES,
        TRACE_KMEM_ALLOC,
        TRACE_KMEM_FREE,
-        TRACE_POWER,
        TRACE_BLK,
        __TRACE_LAST_TYPE,
@@ -207,7 +205,6 @@ extern void __ftrace_bad_type(void);
                IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry,      \
                          TRACE_GRAPH_RET);             \
                IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
-                IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
                IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry,       \
                          TRACE_KMEM_ALLOC);    \
                IF_ASSIGN(var, ent, struct kmemtrace_free_entry,        \
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index a431748ddd6e..ead3d724599d 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -330,23 +330,6 @@ FTRACE_ENTRY(hw_branch, hw_branch_entry,
        F_printk("from: %llx to: %llx", __entry->from, __entry->to)
 );
-FTRACE_ENTRY(power, trace_power,
-        TRACE_POWER,
-        F_STRUCT(
-                __field_struct( struct power_trace,     state_data      )
-                __field_desc(   s64,    state_data,     stamp           )
-                __field_desc(   s64,    state_data,     end             )
-                __field_desc(   int,    state_data,     type            )
-                __field_desc(   int,    state_data,     state           )
-        ),
-        F_printk("%llx->%llx type:%u state:%u",
-                 __entry->stamp, __entry->end,
-                 __entry->type, __entry->state)
-);
 FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
        TRACE_KMEM_ALLOC,
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 55a25c933d15..dd44b8768867 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -8,6 +8,57 @@
 #include <linux/module.h>
 #include "trace.h"
+/*
+ * alloc_percpu() takes a type rather than a size, so define a dummy
+ * type whose size matches the profile buffer we want to allocate.
+ */
+typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t;
+char            *trace_profile_buf;
+EXPORT_SYMBOL_GPL(trace_profile_buf);
+char            *trace_profile_buf_nmi;
+EXPORT_SYMBOL_GPL(trace_profile_buf_nmi);
+/* Count the events in use (per event id, not per instance) */
+static int      total_profile_count;
+/*
+ * Take a profiling reference on @event, enabling it on first use.
+ *
+ * When no event is being profiled yet, the two per-cpu scratch buffers
+ * (trace_profile_buf and trace_profile_buf_nmi) are allocated and
+ * published with rcu_assign_pointer() before the event is enabled.
+ *
+ * Returns 0 on success, -ENOMEM if a buffer allocation fails, or the
+ * error returned by event->profile_enable().
+ */
+static int ftrace_profile_enable_event(struct ftrace_event_call *event)
+{
+        char *buf;
+        int ret = -ENOMEM;
+        /* A non-zero result means this event is already being profiled. */
+        if (atomic_inc_return(&event->profile_count))
+                return 0;
+        if (!total_profile_count) {
+                buf = (char *)alloc_percpu(profile_buf_t);
+                if (!buf)
+                        goto fail_buf;
+                rcu_assign_pointer(trace_profile_buf, buf);
+                buf = (char *)alloc_percpu(profile_buf_t);
+                if (!buf)
+                        goto fail_buf_nmi;
+                rcu_assign_pointer(trace_profile_buf_nmi, buf);
+        }
+        ret = event->profile_enable();
+        if (!ret) {
+                /* Count the event only once it is really enabled. */
+                total_profile_count++;
+                return 0;
+        }
+fail_buf_nmi:
+        /*
+         * The buffers come from alloc_percpu(), so they must be released
+         * with free_percpu(), not kfree().  Only release them when no
+         * other event is counted as using them, and clear the globals so
+         * that no stale pointer is left behind.
+         */
+        if (!total_profile_count) {
+                free_percpu(trace_profile_buf_nmi);
+                free_percpu(trace_profile_buf);
+                trace_profile_buf_nmi = NULL;
+                trace_profile_buf = NULL;
+        }
+fail_buf:
+        atomic_dec(&event->profile_count);
+        return ret;
+}
 int ftrace_profile_enable(int event_id)
 {
        struct ftrace_event_call *event;
@@ -17,7 +68,7 @@ int ftrace_profile_enable(int event_id)
        list_for_each_entry(event, &ftrace_events, list) {
                if (event->id == event_id && event->profile_enable &&
                    try_module_get(event->mod)) {
-                        ret = event->profile_enable(event);
+                        ret = ftrace_profile_enable_event(event);
                        break;
                }
        }
@@ -26,6 +77,33 @@ int ftrace_profile_enable(int event_id)
        return ret;
 }
+/*
+ * Drop one profiling reference on @event.  When the reference count
+ * goes negative the event is disabled, and when no profiled event is
+ * left at all the per-cpu profile buffers are unpublished and freed.
+ */
+static void ftrace_profile_disable_event(struct ftrace_event_call *event)
+{
+        char *old_buf, *old_nmi_buf;
+        /* Nothing to tear down unless the count just went negative. */
+        if (!atomic_add_negative(-1, &event->profile_count))
+                return;
+        event->profile_disable();
+        total_profile_count--;
+        if (total_profile_count)
+                return;
+        /* Remember the buffers, then unpublish them. */
+        old_buf = trace_profile_buf;
+        old_nmi_buf = trace_profile_buf_nmi;
+        rcu_assign_pointer(trace_profile_buf, NULL);
+        rcu_assign_pointer(trace_profile_buf_nmi, NULL);
+        /*
+         * Ensure every event still being profiled has finished before
+         * releasing the buffers
+         */
+        synchronize_sched();
+        free_percpu(old_buf);
+        free_percpu(old_nmi_buf);
+}
 void ftrace_profile_disable(int event_id)
 {
        struct ftrace_event_call *event;
@@ -33,7 +111,7 @@ void ftrace_profile_disable(int event_id)
        mutex_lock(&event_mutex);
        list_for_each_entry(event, &ftrace_events, list) {
                if (event->id == event_id) {
-                        event->profile_disable(event);
+                        ftrace_profile_disable_event(event);
                        module_put(event->mod);
                        break;
                }
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 56c260b83a9c..6f03c8a1105e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -271,42 +271,32 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
 static void *
 t_next(struct seq_file *m, void *v, loff_t *pos)
 {
-        struct list_head *list = m->private;
+        struct ftrace_event_call *call = v;
-        struct ftrace_event_call *call;
        (*pos)++;
-        for (;;) {
+        list_for_each_entry_continue(call, &ftrace_events, list) {
-                if (list == &ftrace_events)
-                        return NULL;
-                call = list_entry(list, struct ftrace_event_call, list);
                /*
                 * The ftrace subsystem is for showing formats only.
                 * They can not be enabled or disabled via the event files.
                 */
                if (call->regfunc)
-                        break;
+                        return call;
-                list = list->next;
        }
-        m->private = list->next;
+        return NULL;
-        return call;
 }
 static void *t_start(struct seq_file *m, loff_t *pos)
 {
-        struct ftrace_event_call *call = NULL;
+        struct ftrace_event_call *call;
        loff_t l;
        mutex_lock(&event_mutex);
-        m->private = ftrace_events.next;
+        call = list_entry(&ftrace_events, struct ftrace_event_call, list);
        for (l = 0; l <= *pos; ) {
-                call = t_next(m, NULL, &l);
+                call = t_next(m, call, &l);
                if (!call)
                        break;
        }
@@ -316,37 +306,28 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 static void *
 s_next(struct seq_file *m, void *v, loff_t *pos)
 {
-        struct list_head *list = m->private;
+        struct ftrace_event_call *call = v;
-        struct ftrace_event_call *call;
        (*pos)++;
- retry:
+        list_for_each_entry_continue(call, &ftrace_events, list) {
-        if (list == &ftrace_events)
+                if (call->enabled)
-                return NULL;
+                        return call;
-        call = list_entry(list, struct ftrace_event_call, list);
-        if (!call->enabled) {
-                list = list->next;
-                goto retry;
        }
-        m->private = list->next;
+        return NULL;
-        return call;
 }
 static void *s_start(struct seq_file *m, loff_t *pos)
 {
-        struct ftrace_event_call *call = NULL;
+        struct ftrace_event_call *call;
        loff_t l;
        mutex_lock(&event_mutex);
-        m->private = ftrace_events.next;
+        call = list_entry(&ftrace_events, struct ftrace_event_call, list);
        for (l = 0; l <= *pos; ) {
-                call = s_next(m, NULL, &l);
+                call = s_next(m, call, &l);
                if (!call)
                        break;
        }
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
deleted file mode 100644
index fe1a00f1445a..000000000000
--- a/kernel/trace/trace_power.c
+++ /dev/null
@@ -1,218 +0,0 @@
-/*
- * ring buffer based C-state tracer
- *
- * Arjan van de Ven <arjan@linux.intel.com>
- * Copyright (C) 2008 Intel Corporation
- *
- * Much is borrowed from trace_boot.c which is
- * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
- *
- */
-#include <linux/init.h>
-#include <linux/debugfs.h>
-#include <trace/power.h>
-#include <linux/kallsyms.h>
-#include <linux/module.h>
-#include "trace.h"
-#include "trace_output.h"
-static struct trace_array *power_trace;
-static int __read_mostly trace_power_enabled;
-static void probe_power_start(struct power_trace *it, unsigned int type,
-                                unsigned int level)
-{
-        if (!trace_power_enabled)
-                return;
-        memset(it, 0, sizeof(struct power_trace));
-        it->state = level;
-        it->type = type;
-        it->stamp = ktime_get();
-}
-static void probe_power_end(struct power_trace *it)
-{
-        struct ftrace_event_call *call = &event_power;
-        struct ring_buffer_event *event;
-        struct ring_buffer *buffer;
-        struct trace_power *entry;
-        struct trace_array_cpu *data;
-        struct trace_array *tr = power_trace;
-        if (!trace_power_enabled)
-                return;
-        buffer = tr->buffer;
-        preempt_disable();
-        it->end = ktime_get();
-        data = tr->data[smp_processor_id()];
-        event = trace_buffer_lock_reserve(buffer, TRACE_POWER,
-                                          sizeof(*entry), 0, 0);
-        if (!event)
-                goto out;
-        entry   = ring_buffer_event_data(event);
-        entry->state_data = *it;
-        if (!filter_check_discard(call, entry, buffer, event))
-                trace_buffer_unlock_commit(buffer, event, 0, 0);
- out:
-        preempt_enable();
-}
-static void probe_power_mark(struct power_trace *it, unsigned int type,
-                                unsigned int level)
-{
-        struct ftrace_event_call *call = &event_power;
-        struct ring_buffer_event *event;
-        struct ring_buffer *buffer;
-        struct trace_power *entry;
-        struct trace_array_cpu *data;
-        struct trace_array *tr = power_trace;
-        if (!trace_power_enabled)
-                return;
-        buffer = tr->buffer;
-        memset(it, 0, sizeof(struct power_trace));
-        it->state = level;
-        it->type = type;
-        it->stamp = ktime_get();
-        preempt_disable();
-        it->end = it->stamp;
-        data = tr->data[smp_processor_id()];
-        event = trace_buffer_lock_reserve(buffer, TRACE_POWER,
-                                          sizeof(*entry), 0, 0);
-        if (!event)
-                goto out;
-        entry   = ring_buffer_event_data(event);
-        entry->state_data = *it;
-        if (!filter_check_discard(call, entry, buffer, event))
-                trace_buffer_unlock_commit(buffer, event, 0, 0);
- out:
-        preempt_enable();
-}
-static int tracing_power_register(void)
-{
-        int ret;
-        ret = register_trace_power_start(probe_power_start);
-        if (ret) {
-                pr_info("power trace: Couldn't activate tracepoint"
-                        " probe to trace_power_start\n");
-                return ret;
-        }
-        ret = register_trace_power_end(probe_power_end);
-        if (ret) {
-                pr_info("power trace: Couldn't activate tracepoint"
-                        " probe to trace_power_end\n");
-                goto fail_start;
-        }
-        ret = register_trace_power_mark(probe_power_mark);
-        if (ret) {
-                pr_info("power trace: Couldn't activate tracepoint"
-                        " probe to trace_power_mark\n");
-                goto fail_end;
-        }
-        return ret;
-fail_end:
-        unregister_trace_power_end(probe_power_end);
-fail_start:
-        unregister_trace_power_start(probe_power_start);
-        return ret;
-}
-static void start_power_trace(struct trace_array *tr)
-{
-        trace_power_enabled = 1;
-}
-static void stop_power_trace(struct trace_array *tr)
-{
-        trace_power_enabled = 0;
-}
-static void power_trace_reset(struct trace_array *tr)
-{
-        trace_power_enabled = 0;
-        unregister_trace_power_start(probe_power_start);
-        unregister_trace_power_end(probe_power_end);
-        unregister_trace_power_mark(probe_power_mark);
-}
-static int power_trace_init(struct trace_array *tr)
-{
-        power_trace = tr;
-        trace_power_enabled = 1;
-        tracing_power_register();
-        tracing_reset_online_cpus(tr);
-        return 0;
-}
-static enum print_line_t power_print_line(struct trace_iterator *iter)
-{
-        int ret = 0;
-        struct trace_entry *entry = iter->ent;
-        struct trace_power *field ;
-        struct power_trace *it;
-        struct trace_seq *s = &iter->seq;
-        struct timespec stamp;
-        struct timespec duration;
-        trace_assign_type(field, entry);
-        it = &field->state_data;
-        stamp = ktime_to_timespec(it->stamp);
-        duration = ktime_to_timespec(ktime_sub(it->end, it->stamp));
-        if (entry->type == TRACE_POWER) {
-                if (it->type == POWER_CSTATE)
-                        ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n",
-                                          stamp.tv_sec,
-                                          stamp.tv_nsec,
-                                          it->state, iter->cpu,
-                                          duration.tv_sec,
-                                          duration.tv_nsec);
-                if (it->type == POWER_PSTATE)
-                        ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n",
-                                          stamp.tv_sec,
-                                          stamp.tv_nsec,
-                                          it->state, iter->cpu);
-                if (!ret)
-                        return TRACE_TYPE_PARTIAL_LINE;
-                return TRACE_TYPE_HANDLED;
-        }
-        return TRACE_TYPE_UNHANDLED;
-}
-static void power_print_header(struct seq_file *s)
-{
-        seq_puts(s, "#   TIMESTAMP      STATE  EVENT\n");
-        seq_puts(s, "#       |            |      |\n");
-}
-static struct tracer power_tracer __read_mostly =
-{
-        .name           = "power",
-        .init           = power_trace_init,
-        .start          = start_power_trace,
-        .stop           = stop_power_trace,
-        .reset          = power_trace_reset,
-        .print_line     = power_print_line,
-        .print_header   = power_print_header,
-};
-static int init_power_trace(void)
-{
-        return register_tracer(&power_tracer);
-}
-device_initcall(init_power_trace);
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 687699d365ae..2547d8813cf0 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -11,7 +11,6 @@
 #include <linux/ftrace.h>
 #include <linux/string.h>
 #include <linux/module.h>
-#include <linux/marker.h>
 #include <linux/mutex.h>
 #include <linux/ctype.h>
 #include <linux/list.h>
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 8712ce3c6a0e..7a3550cf2597 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -384,10 +384,13 @@ static int sys_prof_refcount_exit;
 static void prof_syscall_enter(struct pt_regs *regs, long id)
 {
-        struct syscall_trace_enter *rec;
        struct syscall_metadata *sys_data;
+        struct syscall_trace_enter *rec;
+        unsigned long flags;
+        char *raw_data;
        int syscall_nr;
        int size;
+        int cpu;
        syscall_nr = syscall_get_nr(current, regs);
        if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
@@ -402,20 +405,38 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
        size = ALIGN(size + sizeof(u32), sizeof(u64));
        size -= sizeof(u32);
-        do {
+        if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
-                char raw_data[size];
+                      "profile buffer not large enough"))
+                return;
+        /* Protect the per cpu buffer, begin the rcu read side */
+        local_irq_save(flags);
-                /* zero the dead bytes from align to not leak stack to user */
+        cpu = smp_processor_id();
-                *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+        if (in_nmi())
+                raw_data = rcu_dereference(trace_profile_buf_nmi);
+        else
+                raw_data = rcu_dereference(trace_profile_buf);
+        if (!raw_data)
+                goto end;
-                rec = (struct syscall_trace_enter *) raw_data;
+        raw_data = per_cpu_ptr(raw_data, cpu);
-                tracing_generic_entry_update(&rec->ent, 0, 0);
-                rec->ent.type = sys_data->enter_id;
+        /* zero the dead bytes from align to not leak stack to user */
-                rec->nr = syscall_nr;
+        *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
-                syscall_get_arguments(current, regs, 0, sys_data->nb_args,
-                                       (unsigned long *)&rec->args);
+        rec = (struct syscall_trace_enter *) raw_data;
-                perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size);
+        tracing_generic_entry_update(&rec->ent, 0, 0);
-        } while(0);
+        rec->ent.type = sys_data->enter_id;
+        rec->nr = syscall_nr;
+        syscall_get_arguments(current, regs, 0, sys_data->nb_args,
+                               (unsigned long *)&rec->args);
+        perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size);
+end:
+        local_irq_restore(flags);
 }
 int reg_prof_syscall_enter(char *name)
@@ -460,8 +481,12 @@ void unreg_prof_syscall_enter(char *name)
 static void prof_syscall_exit(struct pt_regs *regs, long ret)
 {
        struct syscall_metadata *sys_data;
-        struct syscall_trace_exit rec;
+        struct syscall_trace_exit *rec;
+        unsigned long flags;
        int syscall_nr;
+        char *raw_data;
+        int size;
+        int cpu;
        syscall_nr = syscall_get_nr(current, regs);
        if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
@@ -471,12 +496,46 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
        if (!sys_data)
                return;
-        tracing_generic_entry_update(&rec.ent, 0, 0);
+        /* We can probably do that at build time */
-        rec.ent.type = sys_data->exit_id;
+        size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
-        rec.nr = syscall_nr;
+        size -= sizeof(u32);
-        rec.ret = syscall_get_return_value(current, regs);
-        perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec));
+        /*
+         * Impossible, but be paranoid with the future
+         * How to put this check outside runtime?
+         */
+        if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
+                "exit event has grown above profile buffer size"))
+                return;
+        /* Protect the per cpu buffer, begin the rcu read side */
+        local_irq_save(flags);
+        cpu = smp_processor_id();
+        if (in_nmi())
+                raw_data = rcu_dereference(trace_profile_buf_nmi);
+        else
+                raw_data = rcu_dereference(trace_profile_buf);
+        if (!raw_data)
+                goto end;
+        raw_data = per_cpu_ptr(raw_data, cpu);
+        /* zero the dead bytes from align to not leak stack to user */
+        *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+        rec = (struct syscall_trace_exit *)raw_data;
+        tracing_generic_entry_update(&rec->ent, 0, 0);
+        rec->ent.type = sys_data->exit_id;
+        rec->nr = syscall_nr;
+        rec->ret = syscall_get_return_value(current, regs);
+        perf_tpcounter_event(sys_data->exit_id, 0, 1, rec, size);
+end:
+        local_irq_restore(flags);
 }
 int reg_prof_syscall_exit(char *name)