Diffstat (limited to 'kernel')
-rw-r--r--   kernel/Makefile             |    1
-rw-r--r--   kernel/marker.c             |  930
-rw-r--r--   kernel/module.c             |   18
-rw-r--r--   kernel/sched.c              |  444
-rw-r--r--   kernel/sched_debug.c        |    1
-rw-r--r--   kernel/sched_fair.c         |  414
-rw-r--r--   kernel/sched_features.h     |  122
-rw-r--r--   kernel/sched_idletask.c     |    4
-rw-r--r--   kernel/sched_rt.c           |    7
-rw-r--r--   kernel/trace/trace_printk.c |    1
10 files changed, 526 insertions, 1416 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 3d9c7e27e3f9..7c9b0a585502 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -87,7 +87,6 @@ obj-$(CONFIG_RELAY) += relay.o
87 | obj-$(CONFIG_SYSCTL) += utsname_sysctl.o | 87 | obj-$(CONFIG_SYSCTL) += utsname_sysctl.o |
88 | obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o | 88 | obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o |
89 | obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o | 89 | obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o |
90 | obj-$(CONFIG_MARKERS) += marker.o | ||
91 | obj-$(CONFIG_TRACEPOINTS) += tracepoint.o | 90 | obj-$(CONFIG_TRACEPOINTS) += tracepoint.o |
92 | obj-$(CONFIG_LATENCYTOP) += latencytop.o | 91 | obj-$(CONFIG_LATENCYTOP) += latencytop.o |
93 | obj-$(CONFIG_FUNCTION_TRACER) += trace/ | 92 | obj-$(CONFIG_FUNCTION_TRACER) += trace/ |
diff --git a/kernel/marker.c b/kernel/marker.c
deleted file mode 100644
index ea54f2647868..000000000000
--- a/kernel/marker.c
+++ /dev/null
@@ -1,930 +0,0 @@
1 | /* | ||
2 | * Copyright (C) 2007 Mathieu Desnoyers | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
17 | */ | ||
18 | #include <linux/module.h> | ||
19 | #include <linux/mutex.h> | ||
20 | #include <linux/types.h> | ||
21 | #include <linux/jhash.h> | ||
22 | #include <linux/list.h> | ||
23 | #include <linux/rcupdate.h> | ||
24 | #include <linux/marker.h> | ||
25 | #include <linux/err.h> | ||
26 | #include <linux/slab.h> | ||
27 | |||
28 | extern struct marker __start___markers[]; | ||
29 | extern struct marker __stop___markers[]; | ||
30 | |||
31 | /* Set to 1 to enable marker debug output */ | ||
32 | static const int marker_debug; | ||
33 | |||
34 | /* | ||
35 | * markers_mutex nests inside module_mutex. Markers mutex protects the builtin | ||
36 | * and module markers and the hash table. | ||
37 | */ | ||
38 | static DEFINE_MUTEX(markers_mutex); | ||
39 | |||
40 | /* | ||
41 | * Marker hash table, containing the active markers. | ||
42 | * Protected by module_mutex. | ||
43 | */ | ||
44 | #define MARKER_HASH_BITS 6 | ||
45 | #define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS) | ||
46 | static struct hlist_head marker_table[MARKER_TABLE_SIZE]; | ||
47 | |||
48 | /* | ||
49 | * Note about RCU : | ||
50 | * It is used to make sure every handler has finished using its private data | ||
51 | * between two consecutive operation (add or remove) on a given marker. It is | ||
52 | * also used to delay the free of multiple probes array until a quiescent state | ||
53 | * is reached. | ||
54 | * marker entries modifications are protected by the markers_mutex. | ||
55 | */ | ||
56 | struct marker_entry { | ||
57 | struct hlist_node hlist; | ||
58 | char *format; | ||
59 | /* Probe wrapper */ | ||
60 | void (*call)(const struct marker *mdata, void *call_private, ...); | ||
61 | struct marker_probe_closure single; | ||
62 | struct marker_probe_closure *multi; | ||
63 | int refcount; /* Number of times armed. 0 if disarmed. */ | ||
64 | struct rcu_head rcu; | ||
65 | void *oldptr; | ||
66 | int rcu_pending; | ||
67 | unsigned char ptype:1; | ||
68 | unsigned char format_allocated:1; | ||
69 | char name[0]; /* Contains name'\0'format'\0' */ | ||
70 | }; | ||
71 | |||
72 | /** | ||
73 | * __mark_empty_function - Empty probe callback | ||
74 | * @probe_private: probe private data | ||
75 | * @call_private: call site private data | ||
76 | * @fmt: format string | ||
77 | * @...: variable argument list | ||
78 | * | ||
79 | * Empty callback provided as a probe to the markers. By providing this to a | ||
80 | * disabled marker, we make sure the execution flow is always valid even | ||
81 | * though the function pointer change and the marker enabling are two distinct | ||
82 | * operations that modify the execution flow of preemptible code. | ||
83 | */ | ||
84 | notrace void __mark_empty_function(void *probe_private, void *call_private, | ||
85 | const char *fmt, va_list *args) | ||
86 | { | ||
87 | } | ||
88 | EXPORT_SYMBOL_GPL(__mark_empty_function); | ||
89 | |||
90 | /* | ||
91 | * marker_probe_cb - Callback that prepares the variable argument list for probes. | ||
92 | * @mdata: pointer of type struct marker | ||
93 | * @call_private: caller site private data | ||
94 | * @...: Variable argument list. | ||
95 | * | ||
96 | * Since we do not use "typical" pointer based RCU in the 1 argument case, we | ||
97 | * need to put a full smp_rmb() in this branch. This is why we do not use | ||
98 | * rcu_dereference() for the pointer read. | ||
99 | */ | ||
100 | notrace void marker_probe_cb(const struct marker *mdata, | ||
101 | void *call_private, ...) | ||
102 | { | ||
103 | va_list args; | ||
104 | char ptype; | ||
105 | |||
106 | /* | ||
107 | * rcu_read_lock_sched does two things: it disables preemption to make | ||
108 | * sure the teardown of the callbacks can be done correctly when they | ||
109 | * are in modules, and it ensures RCU read coherency. | ||
110 | */ | ||
111 | rcu_read_lock_sched_notrace(); | ||
112 | ptype = mdata->ptype; | ||
113 | if (likely(!ptype)) { | ||
114 | marker_probe_func *func; | ||
115 | /* Must read the ptype before ptr. They are not data dependent, | ||
116 | * so we put an explicit smp_rmb() here. */ | ||
117 | smp_rmb(); | ||
118 | func = mdata->single.func; | ||
119 | /* Must read the ptr before private data. They are not data | ||
120 | * dependent, so we put an explicit smp_rmb() here. */ | ||
121 | smp_rmb(); | ||
122 | va_start(args, call_private); | ||
123 | func(mdata->single.probe_private, call_private, mdata->format, | ||
124 | &args); | ||
125 | va_end(args); | ||
126 | } else { | ||
127 | struct marker_probe_closure *multi; | ||
128 | int i; | ||
129 | /* | ||
130 | * Read mdata->ptype before mdata->multi. | ||
131 | */ | ||
132 | smp_rmb(); | ||
133 | multi = mdata->multi; | ||
134 | /* | ||
135 | * multi points to an array, therefore accessing the array | ||
136 | * depends on reading multi. However, even in this case, | ||
137 | * we must ensure that the pointer is read _before_ the array | ||
138 | * data. Same as rcu_dereference, but we need a full smp_rmb() | ||
139 | * in the fast path, so put the explicit barrier here. | ||
140 | */ | ||
141 | smp_read_barrier_depends(); | ||
142 | for (i = 0; multi[i].func; i++) { | ||
143 | va_start(args, call_private); | ||
144 | multi[i].func(multi[i].probe_private, call_private, | ||
145 | mdata->format, &args); | ||
146 | va_end(args); | ||
147 | } | ||
148 | } | ||
149 | rcu_read_unlock_sched_notrace(); | ||
150 | } | ||
151 | EXPORT_SYMBOL_GPL(marker_probe_cb); | ||
152 | |||
153 | /* | ||
154 | * marker_probe_cb_noarg - Callback that does not prepare the variable argument list. | ||
155 | * @mdata: pointer of type struct marker | ||
156 | * @call_private: caller site private data | ||
157 | * @...: Variable argument list. | ||
158 | * | ||
159 | * Should be connected to markers "MARK_NOARGS". | ||
160 | */ | ||
161 | static notrace void marker_probe_cb_noarg(const struct marker *mdata, | ||
162 | void *call_private, ...) | ||
163 | { | ||
164 | va_list args; /* not initialized */ | ||
165 | char ptype; | ||
166 | |||
167 | rcu_read_lock_sched_notrace(); | ||
168 | ptype = mdata->ptype; | ||
169 | if (likely(!ptype)) { | ||
170 | marker_probe_func *func; | ||
171 | /* Must read the ptype before ptr. They are not data dependent, | ||
172 | * so we put an explicit smp_rmb() here. */ | ||
173 | smp_rmb(); | ||
174 | func = mdata->single.func; | ||
175 | /* Must read the ptr before private data. They are not data | ||
176 | * dependent, so we put an explicit smp_rmb() here. */ | ||
177 | smp_rmb(); | ||
178 | func(mdata->single.probe_private, call_private, mdata->format, | ||
179 | &args); | ||
180 | } else { | ||
181 | struct marker_probe_closure *multi; | ||
182 | int i; | ||
183 | /* | ||
184 | * Read mdata->ptype before mdata->multi. | ||
185 | */ | ||
186 | smp_rmb(); | ||
187 | multi = mdata->multi; | ||
188 | /* | ||
189 | * multi points to an array, therefore accessing the array | ||
190 | * depends on reading multi. However, even in this case, | ||
191 | * we must ensure that the pointer is read _before_ the array | ||
192 | * data. Same as rcu_dereference, but we need a full smp_rmb() | ||
193 | * in the fast path, so put the explicit barrier here. | ||
194 | */ | ||
195 | smp_read_barrier_depends(); | ||
196 | for (i = 0; multi[i].func; i++) | ||
197 | multi[i].func(multi[i].probe_private, call_private, | ||
198 | mdata->format, &args); | ||
199 | } | ||
200 | rcu_read_unlock_sched_notrace(); | ||
201 | } | ||
202 | |||
203 | static void free_old_closure(struct rcu_head *head) | ||
204 | { | ||
205 | struct marker_entry *entry = container_of(head, | ||
206 | struct marker_entry, rcu); | ||
207 | kfree(entry->oldptr); | ||
208 | /* Make sure we free the data before setting the pending flag to 0 */ | ||
209 | smp_wmb(); | ||
210 | entry->rcu_pending = 0; | ||
211 | } | ||
212 | |||
213 | static void debug_print_probes(struct marker_entry *entry) | ||
214 | { | ||
215 | int i; | ||
216 | |||
217 | if (!marker_debug) | ||
218 | return; | ||
219 | |||
220 | if (!entry->ptype) { | ||
221 | printk(KERN_DEBUG "Single probe : %p %p\n", | ||
222 | entry->single.func, | ||
223 | entry->single.probe_private); | ||
224 | } else { | ||
225 | for (i = 0; entry->multi[i].func; i++) | ||
226 | printk(KERN_DEBUG "Multi probe %d : %p %p\n", i, | ||
227 | entry->multi[i].func, | ||
228 | entry->multi[i].probe_private); | ||
229 | } | ||
230 | } | ||
231 | |||
232 | static struct marker_probe_closure * | ||
233 | marker_entry_add_probe(struct marker_entry *entry, | ||
234 | marker_probe_func *probe, void *probe_private) | ||
235 | { | ||
236 | int nr_probes = 0; | ||
237 | struct marker_probe_closure *old, *new; | ||
238 | |||
239 | WARN_ON(!probe); | ||
240 | |||
241 | debug_print_probes(entry); | ||
242 | old = entry->multi; | ||
243 | if (!entry->ptype) { | ||
244 | if (entry->single.func == probe && | ||
245 | entry->single.probe_private == probe_private) | ||
246 | return ERR_PTR(-EBUSY); | ||
247 | if (entry->single.func == __mark_empty_function) { | ||
248 | /* 0 -> 1 probes */ | ||
249 | entry->single.func = probe; | ||
250 | entry->single.probe_private = probe_private; | ||
251 | entry->refcount = 1; | ||
252 | entry->ptype = 0; | ||
253 | debug_print_probes(entry); | ||
254 | return NULL; | ||
255 | } else { | ||
256 | /* 1 -> 2 probes */ | ||
257 | nr_probes = 1; | ||
258 | old = NULL; | ||
259 | } | ||
260 | } else { | ||
261 | /* (N -> N+1), (N != 0, 1) probes */ | ||
262 | for (nr_probes = 0; old[nr_probes].func; nr_probes++) | ||
263 | if (old[nr_probes].func == probe | ||
264 | && old[nr_probes].probe_private | ||
265 | == probe_private) | ||
266 | return ERR_PTR(-EBUSY); | ||
267 | } | ||
268 | /* + 2 : one for new probe, one for NULL func */ | ||
269 | new = kzalloc((nr_probes + 2) * sizeof(struct marker_probe_closure), | ||
270 | GFP_KERNEL); | ||
271 | if (new == NULL) | ||
272 | return ERR_PTR(-ENOMEM); | ||
273 | if (!old) | ||
274 | new[0] = entry->single; | ||
275 | else | ||
276 | memcpy(new, old, | ||
277 | nr_probes * sizeof(struct marker_probe_closure)); | ||
278 | new[nr_probes].func = probe; | ||
279 | new[nr_probes].probe_private = probe_private; | ||
280 | entry->refcount = nr_probes + 1; | ||
281 | entry->multi = new; | ||
282 | entry->ptype = 1; | ||
283 | debug_print_probes(entry); | ||
284 | return old; | ||
285 | } | ||
286 | |||
287 | static struct marker_probe_closure * | ||
288 | marker_entry_remove_probe(struct marker_entry *entry, | ||
289 | marker_probe_func *probe, void *probe_private) | ||
290 | { | ||
291 | int nr_probes = 0, nr_del = 0, i; | ||
292 | struct marker_probe_closure *old, *new; | ||
293 | |||
294 | old = entry->multi; | ||
295 | |||
296 | debug_print_probes(entry); | ||
297 | if (!entry->ptype) { | ||
298 | /* 0 -> N is an error */ | ||
299 | WARN_ON(entry->single.func == __mark_empty_function); | ||
300 | /* 1 -> 0 probes */ | ||
301 | WARN_ON(probe && entry->single.func != probe); | ||
302 | WARN_ON(entry->single.probe_private != probe_private); | ||
303 | entry->single.func = __mark_empty_function; | ||
304 | entry->refcount = 0; | ||
305 | entry->ptype = 0; | ||
306 | debug_print_probes(entry); | ||
307 | return NULL; | ||
308 | } else { | ||
309 | /* (N -> M), (N > 1, M >= 0) probes */ | ||
310 | for (nr_probes = 0; old[nr_probes].func; nr_probes++) { | ||
311 | if ((!probe || old[nr_probes].func == probe) | ||
312 | && old[nr_probes].probe_private | ||
313 | == probe_private) | ||
314 | nr_del++; | ||
315 | } | ||
316 | } | ||
317 | |||
318 | if (nr_probes - nr_del == 0) { | ||
319 | /* N -> 0, (N > 1) */ | ||
320 | entry->single.func = __mark_empty_function; | ||
321 | entry->refcount = 0; | ||
322 | entry->ptype = 0; | ||
323 | } else if (nr_probes - nr_del == 1) { | ||
324 | /* N -> 1, (N > 1) */ | ||
325 | for (i = 0; old[i].func; i++) | ||
326 | if ((probe && old[i].func != probe) || | ||
327 | old[i].probe_private != probe_private) | ||
328 | entry->single = old[i]; | ||
329 | entry->refcount = 1; | ||
330 | entry->ptype = 0; | ||
331 | } else { | ||
332 | int j = 0; | ||
333 | /* N -> M, (N > 1, M > 1) */ | ||
334 | /* + 1 for NULL */ | ||
335 | new = kzalloc((nr_probes - nr_del + 1) | ||
336 | * sizeof(struct marker_probe_closure), GFP_KERNEL); | ||
337 | if (new == NULL) | ||
338 | return ERR_PTR(-ENOMEM); | ||
339 | for (i = 0; old[i].func; i++) | ||
340 | if ((probe && old[i].func != probe) || | ||
341 | old[i].probe_private != probe_private) | ||
342 | new[j++] = old[i]; | ||
343 | entry->refcount = nr_probes - nr_del; | ||
344 | entry->ptype = 1; | ||
345 | entry->multi = new; | ||
346 | } | ||
347 | debug_print_probes(entry); | ||
348 | return old; | ||
349 | } | ||
350 | |||
351 | /* | ||
352 | * Get marker if the marker is present in the marker hash table. | ||
353 | * Must be called with markers_mutex held. | ||
354 | * Returns NULL if not present. | ||
355 | */ | ||
356 | static struct marker_entry *get_marker(const char *name) | ||
357 | { | ||
358 | struct hlist_head *head; | ||
359 | struct hlist_node *node; | ||
360 | struct marker_entry *e; | ||
361 | u32 hash = jhash(name, strlen(name), 0); | ||
362 | |||
363 | head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; | ||
364 | hlist_for_each_entry(e, node, head, hlist) { | ||
365 | if (!strcmp(name, e->name)) | ||
366 | return e; | ||
367 | } | ||
368 | return NULL; | ||
369 | } | ||
370 | |||
371 | /* | ||
372 | * Add the marker to the marker hash table. Must be called with markers_mutex | ||
373 | * held. | ||
374 | */ | ||
375 | static struct marker_entry *add_marker(const char *name, const char *format) | ||
376 | { | ||
377 | struct hlist_head *head; | ||
378 | struct hlist_node *node; | ||
379 | struct marker_entry *e; | ||
380 | size_t name_len = strlen(name) + 1; | ||
381 | size_t format_len = 0; | ||
382 | u32 hash = jhash(name, name_len-1, 0); | ||
383 | |||
384 | if (format) | ||
385 | format_len = strlen(format) + 1; | ||
386 | head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; | ||
387 | hlist_for_each_entry(e, node, head, hlist) { | ||
388 | if (!strcmp(name, e->name)) { | ||
389 | printk(KERN_NOTICE | ||
390 | "Marker %s busy\n", name); | ||
391 | return ERR_PTR(-EBUSY); /* Already there */ | ||
392 | } | ||
393 | } | ||
394 | /* | ||
395 | * Using kmalloc here to allocate a variable length element. Could | ||
396 | * cause some memory fragmentation if overused. | ||
397 | */ | ||
398 | e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, | ||
399 | GFP_KERNEL); | ||
400 | if (!e) | ||
401 | return ERR_PTR(-ENOMEM); | ||
402 | memcpy(&e->name[0], name, name_len); | ||
403 | if (format) { | ||
404 | e->format = &e->name[name_len]; | ||
405 | memcpy(e->format, format, format_len); | ||
406 | if (strcmp(e->format, MARK_NOARGS) == 0) | ||
407 | e->call = marker_probe_cb_noarg; | ||
408 | else | ||
409 | e->call = marker_probe_cb; | ||
410 | trace_mark(core_marker_format, "name %s format %s", | ||
411 | e->name, e->format); | ||
412 | } else { | ||
413 | e->format = NULL; | ||
414 | e->call = marker_probe_cb; | ||
415 | } | ||
416 | e->single.func = __mark_empty_function; | ||
417 | e->single.probe_private = NULL; | ||
418 | e->multi = NULL; | ||
419 | e->ptype = 0; | ||
420 | e->format_allocated = 0; | ||
421 | e->refcount = 0; | ||
422 | e->rcu_pending = 0; | ||
423 | hlist_add_head(&e->hlist, head); | ||
424 | return e; | ||
425 | } | ||
426 | |||
427 | /* | ||
428 | * Remove the marker from the marker hash table. Must be called with mutex_lock | ||
429 | * held. | ||
430 | */ | ||
431 | static int remove_marker(const char *name) | ||
432 | { | ||
433 | struct hlist_head *head; | ||
434 | struct hlist_node *node; | ||
435 | struct marker_entry *e; | ||
436 | int found = 0; | ||
437 | size_t len = strlen(name) + 1; | ||
438 | u32 hash = jhash(name, len-1, 0); | ||
439 | |||
440 | head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; | ||
441 | hlist_for_each_entry(e, node, head, hlist) { | ||
442 | if (!strcmp(name, e->name)) { | ||
443 | found = 1; | ||
444 | break; | ||
445 | } | ||
446 | } | ||
447 | if (!found) | ||
448 | return -ENOENT; | ||
449 | if (e->single.func != __mark_empty_function) | ||
450 | return -EBUSY; | ||
451 | hlist_del(&e->hlist); | ||
452 | if (e->format_allocated) | ||
453 | kfree(e->format); | ||
454 | /* Make sure the call_rcu has been executed */ | ||
455 | if (e->rcu_pending) | ||
456 | rcu_barrier_sched(); | ||
457 | kfree(e); | ||
458 | return 0; | ||
459 | } | ||
460 | |||
461 | /* | ||
462 | * Set the mark_entry format to the format found in the element. | ||
463 | */ | ||
464 | static int marker_set_format(struct marker_entry *entry, const char *format) | ||
465 | { | ||
466 | entry->format = kstrdup(format, GFP_KERNEL); | ||
467 | if (!entry->format) | ||
468 | return -ENOMEM; | ||
469 | entry->format_allocated = 1; | ||
470 | |||
471 | trace_mark(core_marker_format, "name %s format %s", | ||
472 | entry->name, entry->format); | ||
473 | return 0; | ||
474 | } | ||
475 | |||
476 | /* | ||
477 | * Sets the probe callback corresponding to one marker. | ||
478 | */ | ||
479 | static int set_marker(struct marker_entry *entry, struct marker *elem, | ||
480 | int active) | ||
481 | { | ||
482 | int ret = 0; | ||
483 | WARN_ON(strcmp(entry->name, elem->name) != 0); | ||
484 | |||
485 | if (entry->format) { | ||
486 | if (strcmp(entry->format, elem->format) != 0) { | ||
487 | printk(KERN_NOTICE | ||
488 | "Format mismatch for probe %s " | ||
489 | "(%s), marker (%s)\n", | ||
490 | entry->name, | ||
491 | entry->format, | ||
492 | elem->format); | ||
493 | return -EPERM; | ||
494 | } | ||
495 | } else { | ||
496 | ret = marker_set_format(entry, elem->format); | ||
497 | if (ret) | ||
498 | return ret; | ||
499 | } | ||
500 | |||
501 | /* | ||
502 | * probe_cb setup (statically known) is done here. It is | ||
503 | * asynchronous with the rest of execution, therefore we only | ||
504 | * pass from a "safe" callback (with argument) to an "unsafe" | ||
505 | * callback (does not set arguments). | ||
506 | */ | ||
507 | elem->call = entry->call; | ||
508 | /* | ||
509 | * Sanity check : | ||
510 | * We only update the single probe private data when the ptr is | ||
511 | * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1) | ||
512 | */ | ||
513 | WARN_ON(elem->single.func != __mark_empty_function | ||
514 | && elem->single.probe_private != entry->single.probe_private | ||
515 | && !elem->ptype); | ||
516 | elem->single.probe_private = entry->single.probe_private; | ||
517 | /* | ||
518 | * Make sure the private data is valid when we update the | ||
519 | * single probe ptr. | ||
520 | */ | ||
521 | smp_wmb(); | ||
522 | elem->single.func = entry->single.func; | ||
523 | /* | ||
524 | * We also make sure that the new probe callbacks array is consistent | ||
525 | * before setting a pointer to it. | ||
526 | */ | ||
527 | rcu_assign_pointer(elem->multi, entry->multi); | ||
528 | /* | ||
529 | * Update the function or multi probe array pointer before setting the | ||
530 | * ptype. | ||
531 | */ | ||
532 | smp_wmb(); | ||
533 | elem->ptype = entry->ptype; | ||
534 | |||
535 | if (elem->tp_name && (active ^ elem->state)) { | ||
536 | WARN_ON(!elem->tp_cb); | ||
537 | /* | ||
538 | * It is ok to directly call the probe registration because type | ||
539 | * checking has been done in the __trace_mark_tp() macro. | ||
540 | */ | ||
541 | |||
542 | if (active) { | ||
543 | /* | ||
544 | * try_module_get should always succeed because we hold | ||
545 | * lock_module() to get the tp_cb address. | ||
546 | */ | ||
547 | ret = try_module_get(__module_text_address( | ||
548 | (unsigned long)elem->tp_cb)); | ||
549 | BUG_ON(!ret); | ||
550 | ret = tracepoint_probe_register_noupdate( | ||
551 | elem->tp_name, | ||
552 | elem->tp_cb); | ||
553 | } else { | ||
554 | ret = tracepoint_probe_unregister_noupdate( | ||
555 | elem->tp_name, | ||
556 | elem->tp_cb); | ||
557 | /* | ||
558 | * tracepoint_probe_update_all() must be called | ||
559 | * before the module containing tp_cb is unloaded. | ||
560 | */ | ||
561 | module_put(__module_text_address( | ||
562 | (unsigned long)elem->tp_cb)); | ||
563 | } | ||
564 | } | ||
565 | elem->state = active; | ||
566 | |||
567 | return ret; | ||
568 | } | ||
569 | |||
570 | /* | ||
571 | * Disable a marker and its probe callback. | ||
572 | * Note: only waiting an RCU period after setting elem->call to the empty | ||
573 | * function ensures that the original callback is not used anymore. This is | ||
574 | * ensured by rcu_read_lock_sched around the call site. | ||
575 | */ | ||
576 | static void disable_marker(struct marker *elem) | ||
577 | { | ||
578 | int ret; | ||
579 | |||
580 | /* leave "call" as is. It is known statically. */ | ||
581 | if (elem->tp_name && elem->state) { | ||
582 | WARN_ON(!elem->tp_cb); | ||
583 | /* | ||
584 | * It is ok to directly call the probe registration because type | ||
585 | * checking has been done in the __trace_mark_tp() macro. | ||
586 | */ | ||
587 | ret = tracepoint_probe_unregister_noupdate(elem->tp_name, | ||
588 | elem->tp_cb); | ||
589 | WARN_ON(ret); | ||
590 | /* | ||
591 | * tracepoint_probe_update_all() must be called | ||
592 | * before the module containing tp_cb is unloaded. | ||
593 | */ | ||
594 | module_put(__module_text_address((unsigned long)elem->tp_cb)); | ||
595 | } | ||
596 | elem->state = 0; | ||
597 | elem->single.func = __mark_empty_function; | ||
598 | /* Update the function before setting the ptype */ | ||
599 | smp_wmb(); | ||
600 | elem->ptype = 0; /* single probe */ | ||
601 | /* | ||
602 | * Leave the private data and id there, because removal is racy and | ||
603 | * should be done only after an RCU period. These are never used until | ||
604 | * the next initialization anyway. | ||
605 | */ | ||
606 | } | ||
607 | |||
608 | /** | ||
609 | * marker_update_probe_range - Update a probe range | ||
610 | * @begin: beginning of the range | ||
611 | * @end: end of the range | ||
612 | * | ||
613 | * Updates the probe callback corresponding to a range of markers. | ||
614 | */ | ||
615 | void marker_update_probe_range(struct marker *begin, | ||
616 | struct marker *end) | ||
617 | { | ||
618 | struct marker *iter; | ||
619 | struct marker_entry *mark_entry; | ||
620 | |||
621 | mutex_lock(&markers_mutex); | ||
622 | for (iter = begin; iter < end; iter++) { | ||
623 | mark_entry = get_marker(iter->name); | ||
624 | if (mark_entry) { | ||
625 | set_marker(mark_entry, iter, !!mark_entry->refcount); | ||
626 | /* | ||
627 | * ignore error, continue | ||
628 | */ | ||
629 | } else { | ||
630 | disable_marker(iter); | ||
631 | } | ||
632 | } | ||
633 | mutex_unlock(&markers_mutex); | ||
634 | } | ||
635 | |||
636 | /* | ||
637 | * Update probes, removing the faulty probes. | ||
638 | * | ||
639 | * Internal callback only changed before the first probe is connected to it. | ||
640 | * Single probe private data can only be changed on 0 -> 1 and 2 -> 1 | ||
641 | * transitions. All other transitions will leave the old private data valid. | ||
642 | * This makes the non-atomicity of the callback/private data updates valid. | ||
643 | * | ||
644 | * "special case" updates : | ||
645 | * 0 -> 1 callback | ||
646 | * 1 -> 0 callback | ||
647 | * 1 -> 2 callbacks | ||
648 | * 2 -> 1 callbacks | ||
649 | * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates. | ||
650 | * Side effect : marker_set_format may delete the marker entry (creating a | ||
651 | * replacement). | ||
652 | */ | ||
653 | static void marker_update_probes(void) | ||
654 | { | ||
655 | /* Core kernel markers */ | ||
656 | marker_update_probe_range(__start___markers, __stop___markers); | ||
657 | /* Markers in modules. */ | ||
658 | module_update_markers(); | ||
659 | tracepoint_probe_update_all(); | ||
660 | } | ||
661 | |||
662 | /** | ||
663 | * marker_probe_register - Connect a probe to a marker | ||
664 | * @name: marker name | ||
665 | * @format: format string | ||
666 | * @probe: probe handler | ||
667 | * @probe_private: probe private data | ||
668 | * | ||
669 | * private data must be a valid allocated memory address, or NULL. | ||
670 | * Returns 0 if ok, error value on error. | ||
671 | * The probe address must at least be aligned on the architecture pointer size. | ||
672 | */ | ||
673 | int marker_probe_register(const char *name, const char *format, | ||
674 | marker_probe_func *probe, void *probe_private) | ||
675 | { | ||
676 | struct marker_entry *entry; | ||
677 | int ret = 0; | ||
678 | struct marker_probe_closure *old; | ||
679 | |||
680 | mutex_lock(&markers_mutex); | ||
681 | entry = get_marker(name); | ||
682 | if (!entry) { | ||
683 | entry = add_marker(name, format); | ||
684 | if (IS_ERR(entry)) | ||
685 | ret = PTR_ERR(entry); | ||
686 | } else if (format) { | ||
687 | if (!entry->format) | ||
688 | ret = marker_set_format(entry, format); | ||
689 | else if (strcmp(entry->format, format)) | ||
690 | ret = -EPERM; | ||
691 | } | ||
692 | if (ret) | ||
693 | goto end; | ||
694 | |||
695 | /* | ||
696 | * If we detect that a call_rcu is pending for this marker, | ||
697 | * make sure it's executed now. | ||
698 | */ | ||
699 | if (entry->rcu_pending) | ||
700 | rcu_barrier_sched(); | ||
701 | old = marker_entry_add_probe(entry, probe, probe_private); | ||
702 | if (IS_ERR(old)) { | ||
703 | ret = PTR_ERR(old); | ||
704 | goto end; | ||
705 | } | ||
706 | mutex_unlock(&markers_mutex); | ||
707 | marker_update_probes(); | ||
708 | mutex_lock(&markers_mutex); | ||
709 | entry = get_marker(name); | ||
710 | if (!entry) | ||
711 | goto end; | ||
712 | if (entry->rcu_pending) | ||
713 | rcu_barrier_sched(); | ||
714 | entry->oldptr = old; | ||
715 | entry->rcu_pending = 1; | ||
716 | /* write rcu_pending before calling the RCU callback */ | ||
717 | smp_wmb(); | ||
718 | call_rcu_sched(&entry->rcu, free_old_closure); | ||
719 | end: | ||
720 | mutex_unlock(&markers_mutex); | ||
721 | return ret; | ||
722 | } | ||
723 | EXPORT_SYMBOL_GPL(marker_probe_register); | ||
724 | |||
725 | /** | ||
726 | * marker_probe_unregister - Disconnect a probe from a marker | ||
727 | * @name: marker name | ||
728 | * @probe: probe function pointer | ||
729 | * @probe_private: probe private data | ||
730 | * | ||
731 | * Returns 0 on success, or -ENOENT if the marker is not found. | ||
732 | * We do not need to call a synchronize_sched to make sure the probes have | ||
733 | * finished running before doing a module unload, because the module unload | ||
734 | * itself uses stop_machine(), which ensures that every preempt-disabled section | ||
735 | * has finished. | ||
736 | */ | ||
737 | int marker_probe_unregister(const char *name, | ||
738 | marker_probe_func *probe, void *probe_private) | ||
739 | { | ||
740 | struct marker_entry *entry; | ||
741 | struct marker_probe_closure *old; | ||
742 | int ret = -ENOENT; | ||
743 | |||
744 | mutex_lock(&markers_mutex); | ||
745 | entry = get_marker(name); | ||
746 | if (!entry) | ||
747 | goto end; | ||
748 | if (entry->rcu_pending) | ||
749 | rcu_barrier_sched(); | ||
750 | old = marker_entry_remove_probe(entry, probe, probe_private); | ||
751 | mutex_unlock(&markers_mutex); | ||
752 | marker_update_probes(); | ||
753 | mutex_lock(&markers_mutex); | ||
754 | entry = get_marker(name); | ||
755 | if (!entry) | ||
756 | goto end; | ||
757 | if (entry->rcu_pending) | ||
758 | rcu_barrier_sched(); | ||
759 | entry->oldptr = old; | ||
760 | entry->rcu_pending = 1; | ||
761 | /* write rcu_pending before calling the RCU callback */ | ||
762 | smp_wmb(); | ||
763 | call_rcu_sched(&entry->rcu, free_old_closure); | ||
764 | remove_marker(name); /* Ignore busy error message */ | ||
765 | ret = 0; | ||
766 | end: | ||
767 | mutex_unlock(&markers_mutex); | ||
768 | return ret; | ||
769 | } | ||
770 | EXPORT_SYMBOL_GPL(marker_probe_unregister); | ||
771 | |||
772 | static struct marker_entry * | ||
773 | get_marker_from_private_data(marker_probe_func *probe, void *probe_private) | ||
774 | { | ||
775 | struct marker_entry *entry; | ||
776 | unsigned int i; | ||
777 | struct hlist_head *head; | ||
778 | struct hlist_node *node; | ||
779 | |||
780 | for (i = 0; i < MARKER_TABLE_SIZE; i++) { | ||
781 | head = &marker_table[i]; | ||
782 | hlist_for_each_entry(entry, node, head, hlist) { | ||
783 | if (!entry->ptype) { | ||
784 | if (entry->single.func == probe | ||
785 | && entry->single.probe_private | ||
786 | == probe_private) | ||
787 | return entry; | ||
788 | } else { | ||
789 | struct marker_probe_closure *closure; | ||
790 | closure = entry->multi; | ||
791 | for (i = 0; closure[i].func; i++) { | ||
792 | if (closure[i].func == probe && | ||
793 | closure[i].probe_private | ||
794 | == probe_private) | ||
795 | return entry; | ||
796 | } | ||
797 | } | ||
798 | } | ||
799 | } | ||
800 | return NULL; | ||
801 | } | ||
802 | |||
803 | /** | ||
804 | * marker_probe_unregister_private_data - Disconnect a probe from a marker | ||
805 | * @probe: probe function | ||
806 | * @probe_private: probe private data | ||
807 | * | ||
808 | * Unregister a probe by providing the registered private data. | ||
809 | * Only removes the first marker found in hash table. | ||
810 | * Return 0 on success or error value. | ||
811 | * We do not need to call a synchronize_sched to make sure the probes have | ||
812 | * finished running before doing a module unload, because the module unload | ||
813 | * itself uses stop_machine(), which ensures that every preempt-disabled section | ||
814 | * has finished. | ||
815 | */ | ||
816 | int marker_probe_unregister_private_data(marker_probe_func *probe, | ||
817 | void *probe_private) | ||
818 | { | ||
819 | struct marker_entry *entry; | ||
820 | int ret = 0; | ||
821 | struct marker_probe_closure *old; | ||
822 | |||
823 | mutex_lock(&markers_mutex); | ||
824 | entry = get_marker_from_private_data(probe, probe_private); | ||
825 | if (!entry) { | ||
826 | ret = -ENOENT; | ||
827 | goto end; | ||
828 | } | ||
829 | if (entry->rcu_pending) | ||
830 | rcu_barrier_sched(); | ||
831 | old = marker_entry_remove_probe(entry, NULL, probe_private); | ||
832 | mutex_unlock(&markers_mutex); | ||
833 | marker_update_probes(); | ||
834 | mutex_lock(&markers_mutex); | ||
835 | entry = get_marker_from_private_data(probe, probe_private); | ||
836 | if (!entry) | ||
837 | goto end; | ||
838 | if (entry->rcu_pending) | ||
839 | rcu_barrier_sched(); | ||
840 | entry->oldptr = old; | ||
841 | entry->rcu_pending = 1; | ||
842 | /* write rcu_pending before calling the RCU callback */ | ||
843 | smp_wmb(); | ||
844 | call_rcu_sched(&entry->rcu, free_old_closure); | ||
845 | remove_marker(entry->name); /* Ignore busy error message */ | ||
846 | end: | ||
847 | mutex_unlock(&markers_mutex); | ||
848 | return ret; | ||
849 | } | ||
850 | EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data); | ||
851 | |||
852 | /** | ||
853 | * marker_get_private_data - Get a marker's probe private data | ||
854 | * @name: marker name | ||
855 | * @probe: probe to match | ||
856 | * @num: get the nth matching probe's private data | ||
857 | * | ||
858 | * Returns the nth private data pointer (starting from 0) matching the given | ||
859 | * probe, or an ERR_PTR. | ||
860 | * | ||
861 | * The private data pointer should _only_ be dereferenced if the caller is the | ||
862 | * owner of the data, or its content could vanish. This is mostly used to | ||
863 | * confirm that a caller is the owner of a registered probe. | ||
864 | */ | ||
865 | void *marker_get_private_data(const char *name, marker_probe_func *probe, | ||
866 | int num) | ||
867 | { | ||
868 | struct hlist_head *head; | ||
869 | struct hlist_node *node; | ||
870 | struct marker_entry *e; | ||
871 | size_t name_len = strlen(name) + 1; | ||
872 | u32 hash = jhash(name, name_len-1, 0); | ||
873 | int i; | ||
874 | |||
875 | head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; | ||
876 | hlist_for_each_entry(e, node, head, hlist) { | ||
877 | if (!strcmp(name, e->name)) { | ||
878 | if (!e->ptype) { | ||
879 | if (num == 0 && e->single.func == probe) | ||
880 | return e->single.probe_private; | ||
881 | } else { | ||
882 | struct marker_probe_closure *closure; | ||
883 | int match = 0; | ||
884 | closure = e->multi; | ||
885 | for (i = 0; closure[i].func; i++) { | ||
886 | if (closure[i].func != probe) | ||
887 | continue; | ||
888 | if (match++ == num) | ||
889 | return closure[i].probe_private; | ||
890 | } | ||
891 | } | ||
892 | break; | ||
893 | } | ||
894 | } | ||
895 | return ERR_PTR(-ENOENT); | ||
896 | } | ||
897 | EXPORT_SYMBOL_GPL(marker_get_private_data); | ||
898 | |||
899 | #ifdef CONFIG_MODULES | ||
900 | |||
901 | int marker_module_notify(struct notifier_block *self, | ||
902 | unsigned long val, void *data) | ||
903 | { | ||
904 | struct module *mod = data; | ||
905 | |||
906 | switch (val) { | ||
907 | case MODULE_STATE_COMING: | ||
908 | marker_update_probe_range(mod->markers, | ||
909 | mod->markers + mod->num_markers); | ||
910 | break; | ||
911 | case MODULE_STATE_GOING: | ||
912 | marker_update_probe_range(mod->markers, | ||
913 | mod->markers + mod->num_markers); | ||
914 | break; | ||
915 | } | ||
916 | return 0; | ||
917 | } | ||
918 | |||
919 | struct notifier_block marker_module_nb = { | ||
920 | .notifier_call = marker_module_notify, | ||
921 | .priority = 0, | ||
922 | }; | ||
923 | |||
924 | static int init_markers(void) | ||
925 | { | ||
926 | return register_module_notifier(&marker_module_nb); | ||
927 | } | ||
928 | __initcall(init_markers); | ||
929 | |||
930 | #endif /* CONFIG_MODULES */ | ||
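
For reference, the interface deleted above was consumed by instrumentation modules roughly as sketched below. The probe name "subsys_event", the "value %d" format string and the module boilerplate are illustrative assumptions, not part of this commit, but the registration and probe signatures match the deleted kernel/marker.c:

#include <linux/init.h>
#include <linux/marker.h>
#include <linux/module.h>

/*
 * Probe callback: receives the probe_private pointer given at registration,
 * the call-site private data, the marker's format string and a va_list
 * holding the arguments of the trace_mark() call.
 */
static void my_probe(void *probe_private, void *call_private,
		     const char *fmt, va_list *args)
{
	/* decode the arguments described by fmt here */
}

static int __init my_probe_init(void)
{
	/* Arm every "subsys_event" marker; format must match the call sites. */
	return marker_probe_register("subsys_event", "value %d",
				     my_probe, NULL);
}

static void __exit my_probe_exit(void)
{
	marker_probe_unregister("subsys_event", my_probe, NULL);
}

module_init(my_probe_init);
module_exit(my_probe_exit);
MODULE_LICENSE("GPL");

/*
 * An instrumented code path would have contained a call site such as:
 *	trace_mark(subsys_event, "value %d", value);
 */

The remaining hunks simply drop the corresponding CONFIG_MARKERS hooks from the Makefile and the module loader while leaving the CONFIG_TRACEPOINTS equivalents in place.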
diff --git a/kernel/module.c b/kernel/module.c
index 05ce49ced8f6..b6ee424245dd 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2237,10 +2237,6 @@ static noinline struct module *load_module(void __user *umod,
2237 | sizeof(*mod->ctors), &mod->num_ctors); | 2237 | sizeof(*mod->ctors), &mod->num_ctors); |
2238 | #endif | 2238 | #endif |
2239 | 2239 | ||
2240 | #ifdef CONFIG_MARKERS | ||
2241 | mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers", | ||
2242 | sizeof(*mod->markers), &mod->num_markers); | ||
2243 | #endif | ||
2244 | #ifdef CONFIG_TRACEPOINTS | 2240 | #ifdef CONFIG_TRACEPOINTS |
2245 | mod->tracepoints = section_objs(hdr, sechdrs, secstrings, | 2241 | mod->tracepoints = section_objs(hdr, sechdrs, secstrings, |
2246 | "__tracepoints", | 2242 | "__tracepoints", |
@@ -2958,20 +2954,6 @@ void module_layout(struct module *mod, | |||
2958 | EXPORT_SYMBOL(module_layout); | 2954 | EXPORT_SYMBOL(module_layout); |
2959 | #endif | 2955 | #endif |
2960 | 2956 | ||
2961 | #ifdef CONFIG_MARKERS | ||
2962 | void module_update_markers(void) | ||
2963 | { | ||
2964 | struct module *mod; | ||
2965 | |||
2966 | mutex_lock(&module_mutex); | ||
2967 | list_for_each_entry(mod, &modules, list) | ||
2968 | if (!mod->taints) | ||
2969 | marker_update_probe_range(mod->markers, | ||
2970 | mod->markers + mod->num_markers); | ||
2971 | mutex_unlock(&module_mutex); | ||
2972 | } | ||
2973 | #endif | ||
2974 | |||
2975 | #ifdef CONFIG_TRACEPOINTS | 2957 | #ifdef CONFIG_TRACEPOINTS |
2976 | void module_update_tracepoints(void) | 2958 | void module_update_tracepoints(void) |
2977 | { | 2959 | { |
diff --git a/kernel/sched.c b/kernel/sched.c
index d9db3fb17573..faf4d463bbff 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -119,8 +119,6 @@
119 | */ | 119 | */ |
120 | #define RUNTIME_INF ((u64)~0ULL) | 120 | #define RUNTIME_INF ((u64)~0ULL) |
121 | 121 | ||
122 | static void double_rq_lock(struct rq *rq1, struct rq *rq2); | ||
123 | |||
124 | static inline int rt_policy(int policy) | 122 | static inline int rt_policy(int policy) |
125 | { | 123 | { |
126 | if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) | 124 | if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) |
@@ -378,13 +376,6 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) | |||
378 | 376 | ||
379 | #else | 377 | #else |
380 | 378 | ||
381 | #ifdef CONFIG_SMP | ||
382 | static int root_task_group_empty(void) | ||
383 | { | ||
384 | return 1; | ||
385 | } | ||
386 | #endif | ||
387 | |||
388 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } | 379 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } |
389 | static inline struct task_group *task_group(struct task_struct *p) | 380 | static inline struct task_group *task_group(struct task_struct *p) |
390 | { | 381 | { |
@@ -514,14 +505,6 @@ struct root_domain { | |||
514 | #ifdef CONFIG_SMP | 505 | #ifdef CONFIG_SMP |
515 | struct cpupri cpupri; | 506 | struct cpupri cpupri; |
516 | #endif | 507 | #endif |
517 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
518 | /* | ||
519 | * Preferred wake up cpu nominated by sched_mc balance that will be | ||
520 | * used when most cpus are idle in the system indicating overall very | ||
521 | * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2) | ||
522 | */ | ||
523 | unsigned int sched_mc_preferred_wakeup_cpu; | ||
524 | #endif | ||
525 | }; | 508 | }; |
526 | 509 | ||
527 | /* | 510 | /* |
@@ -646,9 +629,10 @@ struct rq { | |||
646 | 629 | ||
647 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 630 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
648 | 631 | ||
649 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) | 632 | static inline |
633 | void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | ||
650 | { | 634 | { |
651 | rq->curr->sched_class->check_preempt_curr(rq, p, sync); | 635 | rq->curr->sched_class->check_preempt_curr(rq, p, flags); |
652 | } | 636 | } |
653 | 637 | ||
654 | static inline int cpu_of(struct rq *rq) | 638 | static inline int cpu_of(struct rq *rq) |
@@ -1509,8 +1493,65 @@ static int tg_nop(struct task_group *tg, void *data) | |||
1509 | #endif | 1493 | #endif |
1510 | 1494 | ||
1511 | #ifdef CONFIG_SMP | 1495 | #ifdef CONFIG_SMP |
1512 | static unsigned long source_load(int cpu, int type); | 1496 | /* Used instead of source_load when we know the type == 0 */ |
1513 | static unsigned long target_load(int cpu, int type); | 1497 | static unsigned long weighted_cpuload(const int cpu) |
1498 | { | ||
1499 | return cpu_rq(cpu)->load.weight; | ||
1500 | } | ||
1501 | |||
1502 | /* | ||
1503 | * Return a low guess at the load of a migration-source cpu weighted | ||
1504 | * according to the scheduling class and "nice" value. | ||
1505 | * | ||
1506 | * We want to under-estimate the load of migration sources, to | ||
1507 | * balance conservatively. | ||
1508 | */ | ||
1509 | static unsigned long source_load(int cpu, int type) | ||
1510 | { | ||
1511 | struct rq *rq = cpu_rq(cpu); | ||
1512 | unsigned long total = weighted_cpuload(cpu); | ||
1513 | |||
1514 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
1515 | return total; | ||
1516 | |||
1517 | return min(rq->cpu_load[type-1], total); | ||
1518 | } | ||
1519 | |||
1520 | /* | ||
1521 | * Return a high guess at the load of a migration-target cpu weighted | ||
1522 | * according to the scheduling class and "nice" value. | ||
1523 | */ | ||
1524 | static unsigned long target_load(int cpu, int type) | ||
1525 | { | ||
1526 | struct rq *rq = cpu_rq(cpu); | ||
1527 | unsigned long total = weighted_cpuload(cpu); | ||
1528 | |||
1529 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
1530 | return total; | ||
1531 | |||
1532 | return max(rq->cpu_load[type-1], total); | ||
1533 | } | ||
1534 | |||
1535 | static struct sched_group *group_of(int cpu) | ||
1536 | { | ||
1537 | struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); | ||
1538 | |||
1539 | if (!sd) | ||
1540 | return NULL; | ||
1541 | |||
1542 | return sd->groups; | ||
1543 | } | ||
1544 | |||
1545 | static unsigned long power_of(int cpu) | ||
1546 | { | ||
1547 | struct sched_group *group = group_of(cpu); | ||
1548 | |||
1549 | if (!group) | ||
1550 | return SCHED_LOAD_SCALE; | ||
1551 | |||
1552 | return group->cpu_power; | ||
1553 | } | ||
1554 | |||
1514 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | 1555 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); |
1515 | 1556 | ||
1516 | static unsigned long cpu_avg_load_per_task(int cpu) | 1557 | static unsigned long cpu_avg_load_per_task(int cpu) |
@@ -1695,6 +1736,8 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) | |||
1695 | 1736 | ||
1696 | #ifdef CONFIG_PREEMPT | 1737 | #ifdef CONFIG_PREEMPT |
1697 | 1738 | ||
1739 | static void double_rq_lock(struct rq *rq1, struct rq *rq2); | ||
1740 | |||
1698 | /* | 1741 | /* |
1699 | * fair double_lock_balance: Safely acquires both rq->locks in a fair | 1742 | * fair double_lock_balance: Safely acquires both rq->locks in a fair |
1700 | * way at the expense of forcing extra atomic operations in all | 1743 | * way at the expense of forcing extra atomic operations in all |
@@ -1959,13 +2002,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, | |||
1959 | } | 2002 | } |
1960 | 2003 | ||
1961 | #ifdef CONFIG_SMP | 2004 | #ifdef CONFIG_SMP |
1962 | |||
1963 | /* Used instead of source_load when we know the type == 0 */ | ||
1964 | static unsigned long weighted_cpuload(const int cpu) | ||
1965 | { | ||
1966 | return cpu_rq(cpu)->load.weight; | ||
1967 | } | ||
1968 | |||
1969 | /* | 2005 | /* |
1970 | * Is this task likely cache-hot: | 2006 | * Is this task likely cache-hot: |
1971 | */ | 2007 | */ |
@@ -2239,185 +2275,6 @@ void kick_process(struct task_struct *p) | |||
2239 | preempt_enable(); | 2275 | preempt_enable(); |
2240 | } | 2276 | } |
2241 | EXPORT_SYMBOL_GPL(kick_process); | 2277 | EXPORT_SYMBOL_GPL(kick_process); |
2242 | |||
2243 | /* | ||
2244 | * Return a low guess at the load of a migration-source cpu weighted | ||
2245 | * according to the scheduling class and "nice" value. | ||
2246 | * | ||
2247 | * We want to under-estimate the load of migration sources, to | ||
2248 | * balance conservatively. | ||
2249 | */ | ||
2250 | static unsigned long source_load(int cpu, int type) | ||
2251 | { | ||
2252 | struct rq *rq = cpu_rq(cpu); | ||
2253 | unsigned long total = weighted_cpuload(cpu); | ||
2254 | |||
2255 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
2256 | return total; | ||
2257 | |||
2258 | return min(rq->cpu_load[type-1], total); | ||
2259 | } | ||
2260 | |||
2261 | /* | ||
2262 | * Return a high guess at the load of a migration-target cpu weighted | ||
2263 | * according to the scheduling class and "nice" value. | ||
2264 | */ | ||
2265 | static unsigned long target_load(int cpu, int type) | ||
2266 | { | ||
2267 | struct rq *rq = cpu_rq(cpu); | ||
2268 | unsigned long total = weighted_cpuload(cpu); | ||
2269 | |||
2270 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
2271 | return total; | ||
2272 | |||
2273 | return max(rq->cpu_load[type-1], total); | ||
2274 | } | ||
2275 | |||
2276 | /* | ||
2277 | * find_idlest_group finds and returns the least busy CPU group within the | ||
2278 | * domain. | ||
2279 | */ | ||
2280 | static struct sched_group * | ||
2281 | find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | ||
2282 | { | ||
2283 | struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; | ||
2284 | unsigned long min_load = ULONG_MAX, this_load = 0; | ||
2285 | int load_idx = sd->forkexec_idx; | ||
2286 | int imbalance = 100 + (sd->imbalance_pct-100)/2; | ||
2287 | |||
2288 | do { | ||
2289 | unsigned long load, avg_load; | ||
2290 | int local_group; | ||
2291 | int i; | ||
2292 | |||
2293 | /* Skip over this group if it has no CPUs allowed */ | ||
2294 | if (!cpumask_intersects(sched_group_cpus(group), | ||
2295 | &p->cpus_allowed)) | ||
2296 | continue; | ||
2297 | |||
2298 | local_group = cpumask_test_cpu(this_cpu, | ||
2299 | sched_group_cpus(group)); | ||
2300 | |||
2301 | /* Tally up the load of all CPUs in the group */ | ||
2302 | avg_load = 0; | ||
2303 | |||
2304 | for_each_cpu(i, sched_group_cpus(group)) { | ||
2305 | /* Bias balancing toward cpus of our domain */ | ||
2306 | if (local_group) | ||
2307 | load = source_load(i, load_idx); | ||
2308 | else | ||
2309 | load = target_load(i, load_idx); | ||
2310 | |||
2311 | avg_load += load; | ||
2312 | } | ||
2313 | |||
2314 | /* Adjust by relative CPU power of the group */ | ||
2315 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; | ||
2316 | |||
2317 | if (local_group) { | ||
2318 | this_load = avg_load; | ||
2319 | this = group; | ||
2320 | } else if (avg_load < min_load) { | ||
2321 | min_load = avg_load; | ||
2322 | idlest = group; | ||
2323 | } | ||
2324 | } while (group = group->next, group != sd->groups); | ||
2325 | |||
2326 | if (!idlest || 100*this_load < imbalance*min_load) | ||
2327 | return NULL; | ||
2328 | return idlest; | ||
2329 | } | ||
2330 | |||
2331 | /* | ||
2332 | * find_idlest_cpu - find the idlest cpu among the cpus in group. | ||
2333 | */ | ||
2334 | static int | ||
2335 | find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | ||
2336 | { | ||
2337 | unsigned long load, min_load = ULONG_MAX; | ||
2338 | int idlest = -1; | ||
2339 | int i; | ||
2340 | |||
2341 | /* Traverse only the allowed CPUs */ | ||
2342 | for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { | ||
2343 | load = weighted_cpuload(i); | ||
2344 | |||
2345 | if (load < min_load || (load == min_load && i == this_cpu)) { | ||
2346 | min_load = load; | ||
2347 | idlest = i; | ||
2348 | } | ||
2349 | } | ||
2350 | |||
2351 | return idlest; | ||
2352 | } | ||
2353 | |||
2354 | /* | ||
2355 | * sched_balance_self: balance the current task (running on cpu) in domains | ||
2356 | * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and | ||
2357 | * SD_BALANCE_EXEC. | ||
2358 | * | ||
2359 | * Balance, ie. select the least loaded group. | ||
2360 | * | ||
2361 | * Returns the target CPU number, or the same CPU if no balancing is needed. | ||
2362 | * | ||
2363 | * preempt must be disabled. | ||
2364 | */ | ||
2365 | static int sched_balance_self(int cpu, int flag) | ||
2366 | { | ||
2367 | struct task_struct *t = current; | ||
2368 | struct sched_domain *tmp, *sd = NULL; | ||
2369 | |||
2370 | for_each_domain(cpu, tmp) { | ||
2371 | /* | ||
2372 | * If power savings logic is enabled for a domain, stop there. | ||
2373 | */ | ||
2374 | if (tmp->flags & SD_POWERSAVINGS_BALANCE) | ||
2375 | break; | ||
2376 | if (tmp->flags & flag) | ||
2377 | sd = tmp; | ||
2378 | } | ||
2379 | |||
2380 | if (sd) | ||
2381 | update_shares(sd); | ||
2382 | |||
2383 | while (sd) { | ||
2384 | struct sched_group *group; | ||
2385 | int new_cpu, weight; | ||
2386 | |||
2387 | if (!(sd->flags & flag)) { | ||
2388 | sd = sd->child; | ||
2389 | continue; | ||
2390 | } | ||
2391 | |||
2392 | group = find_idlest_group(sd, t, cpu); | ||
2393 | if (!group) { | ||
2394 | sd = sd->child; | ||
2395 | continue; | ||
2396 | } | ||
2397 | |||
2398 | new_cpu = find_idlest_cpu(group, t, cpu); | ||
2399 | if (new_cpu == -1 || new_cpu == cpu) { | ||
2400 | /* Now try balancing at a lower domain level of cpu */ | ||
2401 | sd = sd->child; | ||
2402 | continue; | ||
2403 | } | ||
2404 | |||
2405 | /* Now try balancing at a lower domain level of new_cpu */ | ||
2406 | cpu = new_cpu; | ||
2407 | weight = cpumask_weight(sched_domain_span(sd)); | ||
2408 | sd = NULL; | ||
2409 | for_each_domain(cpu, tmp) { | ||
2410 | if (weight <= cpumask_weight(sched_domain_span(tmp))) | ||
2411 | break; | ||
2412 | if (tmp->flags & flag) | ||
2413 | sd = tmp; | ||
2414 | } | ||
2415 | /* while loop will break here if sd == NULL */ | ||
2416 | } | ||
2417 | |||
2418 | return cpu; | ||
2419 | } | ||
2420 | |||
2421 | #endif /* CONFIG_SMP */ | 2278 | #endif /* CONFIG_SMP */ |
2422 | 2279 | ||
2423 | /** | 2280 | /** |
@@ -2455,37 +2312,22 @@ void task_oncpu_function_call(struct task_struct *p, | |||
2455 | * | 2312 | * |
2456 | * returns failure only if the task is already active. | 2313 | * returns failure only if the task is already active. |
2457 | */ | 2314 | */ |
2458 | static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | 2315 | static int try_to_wake_up(struct task_struct *p, unsigned int state, |
2316 | int wake_flags) | ||
2459 | { | 2317 | { |
2460 | int cpu, orig_cpu, this_cpu, success = 0; | 2318 | int cpu, orig_cpu, this_cpu, success = 0; |
2461 | unsigned long flags; | 2319 | unsigned long flags; |
2462 | long old_state; | ||
2463 | struct rq *rq; | 2320 | struct rq *rq; |
2464 | 2321 | ||
2465 | if (!sched_feat(SYNC_WAKEUPS)) | 2322 | if (!sched_feat(SYNC_WAKEUPS)) |
2466 | sync = 0; | 2323 | wake_flags &= ~WF_SYNC; |
2467 | |||
2468 | #ifdef CONFIG_SMP | ||
2469 | if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) { | ||
2470 | struct sched_domain *sd; | ||
2471 | 2324 | ||
2472 | this_cpu = raw_smp_processor_id(); | 2325 | this_cpu = get_cpu(); |
2473 | cpu = task_cpu(p); | ||
2474 | |||
2475 | for_each_domain(this_cpu, sd) { | ||
2476 | if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { | ||
2477 | update_shares(sd); | ||
2478 | break; | ||
2479 | } | ||
2480 | } | ||
2481 | } | ||
2482 | #endif | ||
2483 | 2326 | ||
2484 | smp_wmb(); | 2327 | smp_wmb(); |
2485 | rq = task_rq_lock(p, &flags); | 2328 | rq = task_rq_lock(p, &flags); |
2486 | update_rq_clock(rq); | 2329 | update_rq_clock(rq); |
2487 | old_state = p->state; | 2330 | if (!(p->state & state)) |
2488 | if (!(old_state & state)) | ||
2489 | goto out; | 2331 | goto out; |
2490 | 2332 | ||
2491 | if (p->se.on_rq) | 2333 | if (p->se.on_rq) |
@@ -2493,27 +2335,29 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
2493 | 2335 | ||
2494 | cpu = task_cpu(p); | 2336 | cpu = task_cpu(p); |
2495 | orig_cpu = cpu; | 2337 | orig_cpu = cpu; |
2496 | this_cpu = smp_processor_id(); | ||
2497 | 2338 | ||
2498 | #ifdef CONFIG_SMP | 2339 | #ifdef CONFIG_SMP |
2499 | if (unlikely(task_running(rq, p))) | 2340 | if (unlikely(task_running(rq, p))) |
2500 | goto out_activate; | 2341 | goto out_activate; |
2501 | 2342 | ||
2502 | cpu = p->sched_class->select_task_rq(p, sync); | 2343 | /* |
2503 | if (cpu != orig_cpu) { | 2344 | * In order to handle concurrent wakeups and release the rq->lock |
2345 | * we put the task in TASK_WAKING state. | ||
2346 | * | ||
2347 | * First fix up the nr_uninterruptible count: | ||
2348 | */ | ||
2349 | if (task_contributes_to_load(p)) | ||
2350 | rq->nr_uninterruptible--; | ||
2351 | p->state = TASK_WAKING; | ||
2352 | task_rq_unlock(rq, &flags); | ||
2353 | |||
2354 | cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); | ||
2355 | if (cpu != orig_cpu) | ||
2504 | set_task_cpu(p, cpu); | 2356 | set_task_cpu(p, cpu); |
2505 | task_rq_unlock(rq, &flags); | ||
2506 | /* might preempt at this point */ | ||
2507 | rq = task_rq_lock(p, &flags); | ||
2508 | old_state = p->state; | ||
2509 | if (!(old_state & state)) | ||
2510 | goto out; | ||
2511 | if (p->se.on_rq) | ||
2512 | goto out_running; | ||
2513 | 2357 | ||
2514 | this_cpu = smp_processor_id(); | 2358 | rq = task_rq_lock(p, &flags); |
2515 | cpu = task_cpu(p); | 2359 | WARN_ON(p->state != TASK_WAKING); |
2516 | } | 2360 | cpu = task_cpu(p); |
2517 | 2361 | ||
2518 | #ifdef CONFIG_SCHEDSTATS | 2362 | #ifdef CONFIG_SCHEDSTATS |
2519 | schedstat_inc(rq, ttwu_count); | 2363 | schedstat_inc(rq, ttwu_count); |
@@ -2533,7 +2377,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
2533 | out_activate: | 2377 | out_activate: |
2534 | #endif /* CONFIG_SMP */ | 2378 | #endif /* CONFIG_SMP */ |
2535 | schedstat_inc(p, se.nr_wakeups); | 2379 | schedstat_inc(p, se.nr_wakeups); |
2536 | if (sync) | 2380 | if (wake_flags & WF_SYNC) |
2537 | schedstat_inc(p, se.nr_wakeups_sync); | 2381 | schedstat_inc(p, se.nr_wakeups_sync); |
2538 | if (orig_cpu != cpu) | 2382 | if (orig_cpu != cpu) |
2539 | schedstat_inc(p, se.nr_wakeups_migrate); | 2383 | schedstat_inc(p, se.nr_wakeups_migrate); |
@@ -2562,7 +2406,7 @@ out_activate: | |||
2562 | 2406 | ||
2563 | out_running: | 2407 | out_running: |
2564 | trace_sched_wakeup(rq, p, success); | 2408 | trace_sched_wakeup(rq, p, success); |
2565 | check_preempt_curr(rq, p, sync); | 2409 | check_preempt_curr(rq, p, wake_flags); |
2566 | 2410 | ||
2567 | p->state = TASK_RUNNING; | 2411 | p->state = TASK_RUNNING; |
2568 | #ifdef CONFIG_SMP | 2412 | #ifdef CONFIG_SMP |
@@ -2571,6 +2415,7 @@ out_running: | |||
2571 | #endif | 2415 | #endif |
2572 | out: | 2416 | out: |
2573 | task_rq_unlock(rq, &flags); | 2417 | task_rq_unlock(rq, &flags); |
2418 | put_cpu(); | ||
2574 | 2419 | ||
2575 | return success; | 2420 | return success; |
2576 | } | 2421 | } |
@@ -2613,6 +2458,7 @@ static void __sched_fork(struct task_struct *p) | |||
2613 | p->se.avg_overlap = 0; | 2458 | p->se.avg_overlap = 0; |
2614 | p->se.start_runtime = 0; | 2459 | p->se.start_runtime = 0; |
2615 | p->se.avg_wakeup = sysctl_sched_wakeup_granularity; | 2460 | p->se.avg_wakeup = sysctl_sched_wakeup_granularity; |
2461 | p->se.avg_running = 0; | ||
2616 | 2462 | ||
2617 | #ifdef CONFIG_SCHEDSTATS | 2463 | #ifdef CONFIG_SCHEDSTATS |
2618 | p->se.wait_start = 0; | 2464 | p->se.wait_start = 0; |
@@ -2674,11 +2520,6 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
2674 | 2520 | ||
2675 | __sched_fork(p); | 2521 | __sched_fork(p); |
2676 | 2522 | ||
2677 | #ifdef CONFIG_SMP | ||
2678 | cpu = sched_balance_self(cpu, SD_BALANCE_FORK); | ||
2679 | #endif | ||
2680 | set_task_cpu(p, cpu); | ||
2681 | |||
2682 | /* | 2523 | /* |
2683 | * Make sure we do not leak PI boosting priority to the child. | 2524 | * Make sure we do not leak PI boosting priority to the child. |
2684 | */ | 2525 | */ |
@@ -2709,6 +2550,11 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
2709 | if (!rt_prio(p->prio)) | 2550 | if (!rt_prio(p->prio)) |
2710 | p->sched_class = &fair_sched_class; | 2551 | p->sched_class = &fair_sched_class; |
2711 | 2552 | ||
2553 | #ifdef CONFIG_SMP | ||
2554 | cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); | ||
2555 | #endif | ||
2556 | set_task_cpu(p, cpu); | ||
2557 | |||
2712 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 2558 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
2713 | if (likely(sched_info_on())) | 2559 | if (likely(sched_info_on())) |
2714 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | 2560 | memset(&p->sched_info, 0, sizeof(p->sched_info)); |
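Annotation: sched_fork() no longer calls sched_balance_self() directly; fork-time placement is delegated to the scheduling class through select_task_rq(p, SD_BALANCE_FORK, 0), after the class has been chosen. A sketch of how a class plugs into that hook, assuming the kernel-internal struct sched_class layout implied by this diff (only the select_task_rq member is taken from the patch; everything else is illustrative):

/* Sketch only: a scheduling class implementing the per-class placement hook. */
static int select_task_rq_example(struct task_struct *p, int sd_flag, int wake_flags)
{
	if (sd_flag != SD_BALANCE_WAKE)
		return task_cpu(p);     /* fork/exec: keep the current placement */

	return smp_processor_id();      /* wake: run near the waker */
}

static const struct sched_class example_sched_class = {
	.select_task_rq	= select_task_rq_example,
	/* other hooks omitted */
};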
@@ -2754,7 +2600,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
2754 | inc_nr_running(rq); | 2600 | inc_nr_running(rq); |
2755 | } | 2601 | } |
2756 | trace_sched_wakeup_new(rq, p, 1); | 2602 | trace_sched_wakeup_new(rq, p, 1); |
2757 | check_preempt_curr(rq, p, 0); | 2603 | check_preempt_curr(rq, p, WF_FORK); |
2758 | #ifdef CONFIG_SMP | 2604 | #ifdef CONFIG_SMP |
2759 | if (p->sched_class->task_wake_up) | 2605 | if (p->sched_class->task_wake_up) |
2760 | p->sched_class->task_wake_up(rq, p); | 2606 | p->sched_class->task_wake_up(rq, p); |
@@ -3263,7 +3109,7 @@ out: | |||
3263 | void sched_exec(void) | 3109 | void sched_exec(void) |
3264 | { | 3110 | { |
3265 | int new_cpu, this_cpu = get_cpu(); | 3111 | int new_cpu, this_cpu = get_cpu(); |
3266 | new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); | 3112 | new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0); |
3267 | put_cpu(); | 3113 | put_cpu(); |
3268 | if (new_cpu != this_cpu) | 3114 | if (new_cpu != this_cpu) |
3269 | sched_migrate_task(current, new_cpu); | 3115 | sched_migrate_task(current, new_cpu); |
@@ -3683,11 +3529,6 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | |||
3683 | *imbalance = sds->min_load_per_task; | 3529 | *imbalance = sds->min_load_per_task; |
3684 | sds->busiest = sds->group_min; | 3530 | sds->busiest = sds->group_min; |
3685 | 3531 | ||
3686 | if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { | ||
3687 | cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = | ||
3688 | group_first_cpu(sds->group_leader); | ||
3689 | } | ||
3690 | |||
3691 | return 1; | 3532 | return 1; |
3692 | 3533 | ||
3693 | } | 3534 | } |
@@ -3711,7 +3552,18 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | |||
3711 | } | 3552 | } |
3712 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | 3553 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ |
3713 | 3554 | ||
3714 | unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) | 3555 | |
3556 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) | ||
3557 | { | ||
3558 | return SCHED_LOAD_SCALE; | ||
3559 | } | ||
3560 | |||
3561 | unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) | ||
3562 | { | ||
3563 | return default_scale_freq_power(sd, cpu); | ||
3564 | } | ||
3565 | |||
3566 | unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) | ||
3715 | { | 3567 | { |
3716 | unsigned long weight = cpumask_weight(sched_domain_span(sd)); | 3568 | unsigned long weight = cpumask_weight(sched_domain_span(sd)); |
3717 | unsigned long smt_gain = sd->smt_gain; | 3569 | unsigned long smt_gain = sd->smt_gain; |
@@ -3721,6 +3573,11 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) | |||
3721 | return smt_gain; | 3573 | return smt_gain; |
3722 | } | 3574 | } |
3723 | 3575 | ||
3576 | unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) | ||
3577 | { | ||
3578 | return default_scale_smt_power(sd, cpu); | ||
3579 | } | ||
3580 | |||
3724 | unsigned long scale_rt_power(int cpu) | 3581 | unsigned long scale_rt_power(int cpu) |
3725 | { | 3582 | { |
3726 | struct rq *rq = cpu_rq(cpu); | 3583 | struct rq *rq = cpu_rq(cpu); |
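Annotation: default_scale_freq_power() and default_scale_smt_power() are split out so that the arch_scale_*_power() hooks become plain __weak wrappers; an architecture can override the scaling while generic code can still call the default by name (which the ARCH_POWER feature in the next hunk does). A stand-alone sketch of the same weak-symbol pattern:

/* Generic default, always reachable by name. */
unsigned long default_scale_power(int cpu)
{
	return 1024;    /* SCHED_LOAD_SCALE-style fixed point */
}

/* Weak wrapper: an architecture may supply a strong definition instead. */
unsigned long __attribute__((weak)) arch_scale_power(int cpu)
{
	return default_scale_power(cpu);
}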
@@ -3745,10 +3602,19 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) | |||
3745 | unsigned long power = SCHED_LOAD_SCALE; | 3602 | unsigned long power = SCHED_LOAD_SCALE; |
3746 | struct sched_group *sdg = sd->groups; | 3603 | struct sched_group *sdg = sd->groups; |
3747 | 3604 | ||
3748 | /* here we could scale based on cpufreq */ | 3605 | if (sched_feat(ARCH_POWER)) |
3606 | power *= arch_scale_freq_power(sd, cpu); | ||
3607 | else | ||
3608 | power *= default_scale_freq_power(sd, cpu); | ||
3609 | |||
3610 | power >>= SCHED_LOAD_SHIFT; | ||
3749 | 3611 | ||
3750 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { | 3612 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { |
3751 | power *= arch_scale_smt_power(sd, cpu); | 3613 | if (sched_feat(ARCH_POWER)) |
3614 | power *= arch_scale_smt_power(sd, cpu); | ||
3615 | else | ||
3616 | power *= default_scale_smt_power(sd, cpu); | ||
3617 | |||
3752 | power >>= SCHED_LOAD_SHIFT; | 3618 | power >>= SCHED_LOAD_SHIFT; |
3753 | } | 3619 | } |
3754 | 3620 | ||
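Annotation: update_cpu_power() now composes the frequency and SMT factors as fixed-point multiplications, shifting by SCHED_LOAD_SHIFT after each step. A worked example under the usual assumption SCHED_LOAD_SCALE = 1024 (so SCHED_LOAD_SHIFT = 10); the sample values are illustrative only:

/* Sketch of the fixed-point composition performed in update_cpu_power(). */
unsigned long composed_power(unsigned long freq_scale, unsigned long smt_scale)
{
	unsigned long power = 1024;             /* SCHED_LOAD_SCALE */

	power = (power * freq_scale) >> 10;     /* e.g. freq_scale = 1024 -> power stays 1024 */
	power = (power * smt_scale)  >> 10;     /* e.g. smt_scale  =  589 -> power becomes 589 */

	return power;   /* two SMT siblings at ~589 each sum to slightly more than one full core */
}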
@@ -4161,26 +4027,6 @@ ret: | |||
4161 | return NULL; | 4027 | return NULL; |
4162 | } | 4028 | } |
4163 | 4029 | ||
4164 | static struct sched_group *group_of(int cpu) | ||
4165 | { | ||
4166 | struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); | ||
4167 | |||
4168 | if (!sd) | ||
4169 | return NULL; | ||
4170 | |||
4171 | return sd->groups; | ||
4172 | } | ||
4173 | |||
4174 | static unsigned long power_of(int cpu) | ||
4175 | { | ||
4176 | struct sched_group *group = group_of(cpu); | ||
4177 | |||
4178 | if (!group) | ||
4179 | return SCHED_LOAD_SCALE; | ||
4180 | |||
4181 | return group->cpu_power; | ||
4182 | } | ||
4183 | |||
4184 | /* | 4030 | /* |
4185 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 4031 | * find_busiest_queue - find the busiest runqueue among the cpus in group. |
4186 | */ | 4032 | */ |
@@ -5465,14 +5311,13 @@ static inline void schedule_debug(struct task_struct *prev) | |||
5465 | #endif | 5311 | #endif |
5466 | } | 5312 | } |
5467 | 5313 | ||
5468 | static void put_prev_task(struct rq *rq, struct task_struct *prev) | 5314 | static void put_prev_task(struct rq *rq, struct task_struct *p) |
5469 | { | 5315 | { |
5470 | if (prev->state == TASK_RUNNING) { | 5316 | u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime; |
5471 | u64 runtime = prev->se.sum_exec_runtime; | ||
5472 | 5317 | ||
5473 | runtime -= prev->se.prev_sum_exec_runtime; | 5318 | update_avg(&p->se.avg_running, runtime); |
5474 | runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); | ||
5475 | 5319 | ||
5320 | if (p->state == TASK_RUNNING) { | ||
5476 | /* | 5321 | /* |
5477 | * In order to avoid avg_overlap growing stale when we are | 5322 | * In order to avoid avg_overlap growing stale when we are |
5478 | * indeed overlapping and hence not getting put to sleep, grow | 5323 | * indeed overlapping and hence not getting put to sleep, grow |
@@ -5482,9 +5327,12 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev) | |||
5482 | * correlates to the amount of cache footprint a task can | 5327 | * correlates to the amount of cache footprint a task can |
5483 | * build up. | 5328 | * build up. |
5484 | */ | 5329 | */ |
5485 | update_avg(&prev->se.avg_overlap, runtime); | 5330 | runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); |
5331 | update_avg(&p->se.avg_overlap, runtime); | ||
5332 | } else { | ||
5333 | update_avg(&p->se.avg_running, 0); | ||
5486 | } | 5334 | } |
5487 | prev->sched_class->put_prev_task(rq, prev); | 5335 | p->sched_class->put_prev_task(rq, p); |
5488 | } | 5336 | } |
5489 | 5337 | ||
5490 | /* | 5338 | /* |
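Annotation: put_prev_task() now feeds the full runtime of the previous slice into p->se.avg_running (and resets the average toward zero when the task blocks), while avg_overlap keeps its clamped update. update_avg() itself is not shown in this diff; it is assumed to be the usual 1/8-weight exponential moving average, roughly:

/* Assumed shape of update_avg(): an exponentially weighted moving average. */
static void update_avg(u64 *avg, u64 sample)
{
	s64 diff = sample - *avg;

	*avg += diff >> 3;      /* move 1/8th of the way toward the new sample */
}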
@@ -5716,10 +5564,10 @@ asmlinkage void __sched preempt_schedule_irq(void) | |||
5716 | 5564 | ||
5717 | #endif /* CONFIG_PREEMPT */ | 5565 | #endif /* CONFIG_PREEMPT */ |
5718 | 5566 | ||
5719 | int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, | 5567 | int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, |
5720 | void *key) | 5568 | void *key) |
5721 | { | 5569 | { |
5722 | return try_to_wake_up(curr->private, mode, sync); | 5570 | return try_to_wake_up(curr->private, mode, wake_flags); |
5723 | } | 5571 | } |
5724 | EXPORT_SYMBOL(default_wake_function); | 5572 | EXPORT_SYMBOL(default_wake_function); |
5725 | 5573 | ||
@@ -5733,14 +5581,14 @@ EXPORT_SYMBOL(default_wake_function); | |||
5733 | * zero in this (rare) case, and we handle it by continuing to scan the queue. | 5581 | * zero in this (rare) case, and we handle it by continuing to scan the queue. |
5734 | */ | 5582 | */ |
5735 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, | 5583 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, |
5736 | int nr_exclusive, int sync, void *key) | 5584 | int nr_exclusive, int wake_flags, void *key) |
5737 | { | 5585 | { |
5738 | wait_queue_t *curr, *next; | 5586 | wait_queue_t *curr, *next; |
5739 | 5587 | ||
5740 | list_for_each_entry_safe(curr, next, &q->task_list, task_list) { | 5588 | list_for_each_entry_safe(curr, next, &q->task_list, task_list) { |
5741 | unsigned flags = curr->flags; | 5589 | unsigned flags = curr->flags; |
5742 | 5590 | ||
5743 | if (curr->func(curr, mode, sync, key) && | 5591 | if (curr->func(curr, mode, wake_flags, key) && |
5744 | (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) | 5592 | (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) |
5745 | break; | 5593 | break; |
5746 | } | 5594 | } |
@@ -5801,16 +5649,16 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, | |||
5801 | int nr_exclusive, void *key) | 5649 | int nr_exclusive, void *key) |
5802 | { | 5650 | { |
5803 | unsigned long flags; | 5651 | unsigned long flags; |
5804 | int sync = 1; | 5652 | int wake_flags = WF_SYNC; |
5805 | 5653 | ||
5806 | if (unlikely(!q)) | 5654 | if (unlikely(!q)) |
5807 | return; | 5655 | return; |
5808 | 5656 | ||
5809 | if (unlikely(!nr_exclusive)) | 5657 | if (unlikely(!nr_exclusive)) |
5810 | sync = 0; | 5658 | wake_flags = 0; |
5811 | 5659 | ||
5812 | spin_lock_irqsave(&q->lock, flags); | 5660 | spin_lock_irqsave(&q->lock, flags); |
5813 | __wake_up_common(q, mode, nr_exclusive, sync, key); | 5661 | __wake_up_common(q, mode, nr_exclusive, wake_flags, key); |
5814 | spin_unlock_irqrestore(&q->lock, flags); | 5662 | spin_unlock_irqrestore(&q->lock, flags); |
5815 | } | 5663 | } |
5816 | EXPORT_SYMBOL_GPL(__wake_up_sync_key); | 5664 | EXPORT_SYMBOL_GPL(__wake_up_sync_key); |
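Annotation: the wait-queue path simply forwards the new bitmask: __wake_up_sync_key() passes WF_SYNC, __wake_up_common() hands it to each wait-queue entry's callback, and default_wake_function() forwards it into try_to_wake_up(). A custom wake function therefore receives the flags in its third argument; a minimal sketch (the callback name is illustrative, and WF_SYNC is assumed to be visible via linux/sched.h):

#include <linux/wait.h>
#include <linux/sched.h>

/* Sketch: a wait-queue callback that only reacts to synchronous wakeups. */
static int sync_only_wake(wait_queue_t *curr, unsigned mode, int wake_flags, void *key)
{
	if (!(wake_flags & WF_SYNC))
		return 0;                       /* ignore non-sync wakeups */

	return default_wake_function(curr, mode, wake_flags, key);
}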
@@ -8000,9 +7848,7 @@ static int sd_degenerate(struct sched_domain *sd) | |||
8000 | } | 7848 | } |
8001 | 7849 | ||
8002 | /* Following flags don't use groups */ | 7850 | /* Following flags don't use groups */ |
8003 | if (sd->flags & (SD_WAKE_IDLE | | 7851 | if (sd->flags & (SD_WAKE_AFFINE)) |
8004 | SD_WAKE_AFFINE | | ||
8005 | SD_WAKE_BALANCE)) | ||
8006 | return 0; | 7852 | return 0; |
8007 | 7853 | ||
8008 | return 1; | 7854 | return 1; |
@@ -8019,10 +7865,6 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
8019 | if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) | 7865 | if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) |
8020 | return 0; | 7866 | return 0; |
8021 | 7867 | ||
8022 | /* Does parent contain flags not in child? */ | ||
8023 | /* WAKE_BALANCE is a subset of WAKE_AFFINE */ | ||
8024 | if (cflags & SD_WAKE_AFFINE) | ||
8025 | pflags &= ~SD_WAKE_BALANCE; | ||
8026 | /* Flags needing groups don't count if only 1 group in parent */ | 7868 | /* Flags needing groups don't count if only 1 group in parent */ |
8027 | if (parent->groups == parent->groups->next) { | 7869 | if (parent->groups == parent->groups->next) { |
8028 | pflags &= ~(SD_LOAD_BALANCE | | 7870 | pflags &= ~(SD_LOAD_BALANCE | |
@@ -8708,10 +8550,10 @@ static void set_domain_attribute(struct sched_domain *sd, | |||
8708 | request = attr->relax_domain_level; | 8550 | request = attr->relax_domain_level; |
8709 | if (request < sd->level) { | 8551 | if (request < sd->level) { |
8710 | /* turn off idle balance on this domain */ | 8552 | /* turn off idle balance on this domain */ |
8711 | sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); | 8553 | sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); |
8712 | } else { | 8554 | } else { |
8713 | /* turn on idle balance on this domain */ | 8555 | /* turn on idle balance on this domain */ |
8714 | sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); | 8556 | sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); |
8715 | } | 8557 | } |
8716 | } | 8558 | } |
8717 | 8559 | ||
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 5ddbd0891267..efb84409bc43 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c | |||
@@ -395,6 +395,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
395 | PN(se.sum_exec_runtime); | 395 | PN(se.sum_exec_runtime); |
396 | PN(se.avg_overlap); | 396 | PN(se.avg_overlap); |
397 | PN(se.avg_wakeup); | 397 | PN(se.avg_wakeup); |
398 | PN(se.avg_running); | ||
398 | 399 | ||
399 | nr_switches = p->nvcsw + p->nivcsw; | 400 | nr_switches = p->nvcsw + p->nivcsw; |
400 | 401 | ||
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index aa7f84121016..10d218ab69f2 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -711,7 +711,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) | |||
711 | 711 | ||
712 | if (!initial) { | 712 | if (!initial) { |
713 | /* sleeps up to a single latency don't count. */ | 713 | /* sleeps up to a single latency don't count. */ |
714 | if (sched_feat(NEW_FAIR_SLEEPERS)) { | 714 | if (sched_feat(FAIR_SLEEPERS)) { |
715 | unsigned long thresh = sysctl_sched_latency; | 715 | unsigned long thresh = sysctl_sched_latency; |
716 | 716 | ||
717 | /* | 717 | /* |
@@ -725,6 +725,13 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) | |||
725 | task_of(se)->policy != SCHED_IDLE)) | 725 | task_of(se)->policy != SCHED_IDLE)) |
726 | thresh = calc_delta_fair(thresh, se); | 726 | thresh = calc_delta_fair(thresh, se); |
727 | 727 | ||
728 | /* | ||
729 | * Halve their sleep time's effect, to allow | ||
730 | * for a gentler effect of sleepers: | ||
731 | */ | ||
732 | if (sched_feat(GENTLE_FAIR_SLEEPERS)) | ||
733 | thresh >>= 1; | ||
734 | |||
728 | vruntime -= thresh; | 735 | vruntime -= thresh; |
729 | } | 736 | } |
730 | } | 737 | } |
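Annotation: with GENTLE_FAIR_SLEEPERS the sleeper credit is halved before being subtracted from the waking entity's vruntime. For example, with sysctl_sched_latency at 20 ms and a nice-0 task, FAIR_SLEEPERS alone would place the waker 20 ms of virtual runtime ahead of its fair spot; the gentle variant trims that to 10 ms. A worked sketch of the thresh computation above (values illustrative):

/* Worked example of the sleeper-credit calculation in place_entity(). */
unsigned long sleeper_credit(unsigned long sysctl_sched_latency, int gentle)
{
	unsigned long thresh = sysctl_sched_latency;    /* e.g. 20000000 ns */

	if (gentle)
		thresh >>= 1;                           /* 10000000 ns */

	return thresh;  /* subtracted from the entity's vruntime in place_entity() */
}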
@@ -757,10 +764,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) | |||
757 | 764 | ||
758 | static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) | 765 | static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) |
759 | { | 766 | { |
760 | if (cfs_rq->last == se) | 767 | if (!se || cfs_rq->last == se) |
761 | cfs_rq->last = NULL; | 768 | cfs_rq->last = NULL; |
762 | 769 | ||
763 | if (cfs_rq->next == se) | 770 | if (!se || cfs_rq->next == se) |
764 | cfs_rq->next = NULL; | 771 | cfs_rq->next = NULL; |
765 | } | 772 | } |
766 | 773 | ||
@@ -1062,83 +1069,6 @@ static void yield_task_fair(struct rq *rq) | |||
1062 | se->vruntime = rightmost->vruntime + 1; | 1069 | se->vruntime = rightmost->vruntime + 1; |
1063 | } | 1070 | } |
1064 | 1071 | ||
1065 | /* | ||
1066 | * wake_idle() will wake a task on an idle cpu if task->cpu is | ||
1067 | * not idle and an idle cpu is available. The span of cpus to | ||
1068 | * search starts with cpus closest then further out as needed, | ||
1069 | * so we always favor a closer, idle cpu. | ||
1070 | * Domains may include CPUs that are not usable for migration, | ||
1071 | * hence we need to mask them out (rq->rd->online) | ||
1072 | * | ||
1073 | * Returns the CPU we should wake onto. | ||
1074 | */ | ||
1075 | #if defined(ARCH_HAS_SCHED_WAKE_IDLE) | ||
1076 | |||
1077 | #define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online) | ||
1078 | |||
1079 | static int wake_idle(int cpu, struct task_struct *p) | ||
1080 | { | ||
1081 | struct sched_domain *sd; | ||
1082 | int i; | ||
1083 | unsigned int chosen_wakeup_cpu; | ||
1084 | int this_cpu; | ||
1085 | struct rq *task_rq = task_rq(p); | ||
1086 | |||
1087 | /* | ||
1088 | * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu | ||
1089 | * are idle and this is not a kernel thread and this task's affinity | ||
1090 | * allows it to be moved to preferred cpu, then just move! | ||
1091 | */ | ||
1092 | |||
1093 | this_cpu = smp_processor_id(); | ||
1094 | chosen_wakeup_cpu = | ||
1095 | cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu; | ||
1096 | |||
1097 | if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP && | ||
1098 | idle_cpu(cpu) && idle_cpu(this_cpu) && | ||
1099 | p->mm && !(p->flags & PF_KTHREAD) && | ||
1100 | cpu_isset(chosen_wakeup_cpu, p->cpus_allowed)) | ||
1101 | return chosen_wakeup_cpu; | ||
1102 | |||
1103 | /* | ||
1104 | * If it is idle, then it is the best cpu to run this task. | ||
1105 | * | ||
1106 | * This cpu is also the best, if it has more than one task already. | ||
1107 | * Siblings must be also busy(in most cases) as they didn't already | ||
1108 | * pickup the extra load from this cpu and hence we need not check | ||
1109 | * sibling runqueue info. This will avoid the checks and cache miss | ||
1110 | * penalities associated with that. | ||
1111 | */ | ||
1112 | if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1) | ||
1113 | return cpu; | ||
1114 | |||
1115 | for_each_domain(cpu, sd) { | ||
1116 | if ((sd->flags & SD_WAKE_IDLE) | ||
1117 | || ((sd->flags & SD_WAKE_IDLE_FAR) | ||
1118 | && !task_hot(p, task_rq->clock, sd))) { | ||
1119 | for_each_cpu_and(i, sched_domain_span(sd), | ||
1120 | &p->cpus_allowed) { | ||
1121 | if (cpu_rd_active(i, task_rq) && idle_cpu(i)) { | ||
1122 | if (i != task_cpu(p)) { | ||
1123 | schedstat_inc(p, | ||
1124 | se.nr_wakeups_idle); | ||
1125 | } | ||
1126 | return i; | ||
1127 | } | ||
1128 | } | ||
1129 | } else { | ||
1130 | break; | ||
1131 | } | ||
1132 | } | ||
1133 | return cpu; | ||
1134 | } | ||
1135 | #else /* !ARCH_HAS_SCHED_WAKE_IDLE*/ | ||
1136 | static inline int wake_idle(int cpu, struct task_struct *p) | ||
1137 | { | ||
1138 | return cpu; | ||
1139 | } | ||
1140 | #endif | ||
1141 | |||
1142 | #ifdef CONFIG_SMP | 1072 | #ifdef CONFIG_SMP |
1143 | 1073 | ||
1144 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1074 | #ifdef CONFIG_FAIR_GROUP_SCHED |
@@ -1225,25 +1155,34 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, | |||
1225 | 1155 | ||
1226 | #endif | 1156 | #endif |
1227 | 1157 | ||
1228 | static int | 1158 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) |
1229 | wake_affine(struct sched_domain *this_sd, struct rq *this_rq, | ||
1230 | struct task_struct *p, int prev_cpu, int this_cpu, int sync, | ||
1231 | int idx, unsigned long load, unsigned long this_load, | ||
1232 | unsigned int imbalance) | ||
1233 | { | 1159 | { |
1234 | struct task_struct *curr = this_rq->curr; | 1160 | struct task_struct *curr = current; |
1235 | struct task_group *tg; | 1161 | unsigned long this_load, load; |
1236 | unsigned long tl = this_load; | 1162 | int idx, this_cpu, prev_cpu; |
1237 | unsigned long tl_per_task; | 1163 | unsigned long tl_per_task; |
1164 | unsigned int imbalance; | ||
1165 | struct task_group *tg; | ||
1238 | unsigned long weight; | 1166 | unsigned long weight; |
1239 | int balanced; | 1167 | int balanced; |
1240 | 1168 | ||
1241 | if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) | 1169 | idx = sd->wake_idx; |
1242 | return 0; | 1170 | this_cpu = smp_processor_id(); |
1171 | prev_cpu = task_cpu(p); | ||
1172 | load = source_load(prev_cpu, idx); | ||
1173 | this_load = target_load(this_cpu, idx); | ||
1243 | 1174 | ||
1244 | if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost || | 1175 | if (sync) { |
1245 | p->se.avg_overlap > sysctl_sched_migration_cost)) | 1176 | if (sched_feat(SYNC_LESS) && |
1246 | sync = 0; | 1177 | (curr->se.avg_overlap > sysctl_sched_migration_cost || |
1178 | p->se.avg_overlap > sysctl_sched_migration_cost)) | ||
1179 | sync = 0; | ||
1180 | } else { | ||
1181 | if (sched_feat(SYNC_MORE) && | ||
1182 | (curr->se.avg_overlap < sysctl_sched_migration_cost && | ||
1183 | p->se.avg_overlap < sysctl_sched_migration_cost)) | ||
1184 | sync = 1; | ||
1185 | } | ||
1247 | 1186 | ||
1248 | /* | 1187 | /* |
1249 | * If sync wakeup then subtract the (maximum possible) | 1188 | * If sync wakeup then subtract the (maximum possible) |
@@ -1254,24 +1193,26 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, | |||
1254 | tg = task_group(current); | 1193 | tg = task_group(current); |
1255 | weight = current->se.load.weight; | 1194 | weight = current->se.load.weight; |
1256 | 1195 | ||
1257 | tl += effective_load(tg, this_cpu, -weight, -weight); | 1196 | this_load += effective_load(tg, this_cpu, -weight, -weight); |
1258 | load += effective_load(tg, prev_cpu, 0, -weight); | 1197 | load += effective_load(tg, prev_cpu, 0, -weight); |
1259 | } | 1198 | } |
1260 | 1199 | ||
1261 | tg = task_group(p); | 1200 | tg = task_group(p); |
1262 | weight = p->se.load.weight; | 1201 | weight = p->se.load.weight; |
1263 | 1202 | ||
1203 | imbalance = 100 + (sd->imbalance_pct - 100) / 2; | ||
1204 | |||
1264 | /* | 1205 | /* |
1265 | * In low-load situations, where prev_cpu is idle and this_cpu is idle | 1206 | * In low-load situations, where prev_cpu is idle and this_cpu is idle |
1266 | * due to the sync cause above having dropped tl to 0, we'll always have | 1207 | * due to the sync cause above having dropped this_load to 0, we'll |
1267 | * an imbalance, but there's really nothing you can do about that, so | 1208 | * always have an imbalance, but there's really nothing you can do |
1268 | * that's good too. | 1209 | * about that, so that's good too. |
1269 | * | 1210 | * |
1270 | * Otherwise check if either cpus are near enough in load to allow this | 1211 | * Otherwise check if either cpus are near enough in load to allow this |
1271 | * task to be woken on this_cpu. | 1212 | * task to be woken on this_cpu. |
1272 | */ | 1213 | */ |
1273 | balanced = !tl || | 1214 | balanced = !this_load || |
1274 | 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= | 1215 | 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <= |
1275 | imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); | 1216 | imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); |
1276 | 1217 | ||
1277 | /* | 1218 | /* |
@@ -1285,14 +1226,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, | |||
1285 | schedstat_inc(p, se.nr_wakeups_affine_attempts); | 1226 | schedstat_inc(p, se.nr_wakeups_affine_attempts); |
1286 | tl_per_task = cpu_avg_load_per_task(this_cpu); | 1227 | tl_per_task = cpu_avg_load_per_task(this_cpu); |
1287 | 1228 | ||
1288 | if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <= | 1229 | if (balanced || |
1289 | tl_per_task)) { | 1230 | (this_load <= load && |
1231 | this_load + target_load(prev_cpu, idx) <= tl_per_task)) { | ||
1290 | /* | 1232 | /* |
1291 | * This domain has SD_WAKE_AFFINE and | 1233 | * This domain has SD_WAKE_AFFINE and |
1292 | * p is cache cold in this domain, and | 1234 | * p is cache cold in this domain, and |
1293 | * there is no bad imbalance. | 1235 | * there is no bad imbalance. |
1294 | */ | 1236 | */ |
1295 | schedstat_inc(this_sd, ttwu_move_affine); | 1237 | schedstat_inc(sd, ttwu_move_affine); |
1296 | schedstat_inc(p, se.nr_wakeups_affine); | 1238 | schedstat_inc(p, se.nr_wakeups_affine); |
1297 | 1239 | ||
1298 | return 1; | 1240 | return 1; |
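Annotation: wake_affine() now computes the load figures itself but keeps the old acceptance test: the wakeup is pulled to this_cpu when the domain's halved imbalance percentage still covers the resulting load difference. A worked instance of that predicate, assuming imbalance_pct = 125 (a common value for SD_WAKE_AFFINE domains), giving imbalance = 100 + (125 - 100)/2 = 112; the sample loads are made up:

/* Worked example of the 'balanced' test in wake_affine(). */
int wake_affine_balanced(unsigned long this_load, unsigned long prev_load)
{
	unsigned int imbalance = 100 + (125 - 100) / 2;         /* 112 */

	/* this_load/prev_load already include the effective_load() adjustments */
	return !this_load || 100 * this_load <= imbalance * prev_load;
	/* e.g. this_load = 1100, prev_load = 1000: 110000 <= 112000 -> pull is allowed */
}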
@@ -1300,65 +1242,215 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, | |||
1300 | return 0; | 1242 | return 0; |
1301 | } | 1243 | } |
1302 | 1244 | ||
1303 | static int select_task_rq_fair(struct task_struct *p, int sync) | 1245 | /* |
1246 | * find_idlest_group finds and returns the least busy CPU group within the | ||
1247 | * domain. | ||
1248 | */ | ||
1249 | static struct sched_group * | ||
1250 | find_idlest_group(struct sched_domain *sd, struct task_struct *p, | ||
1251 | int this_cpu, int load_idx) | ||
1304 | { | 1252 | { |
1305 | struct sched_domain *sd, *this_sd = NULL; | 1253 | struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; |
1306 | int prev_cpu, this_cpu, new_cpu; | 1254 | unsigned long min_load = ULONG_MAX, this_load = 0; |
1307 | unsigned long load, this_load; | 1255 | int imbalance = 100 + (sd->imbalance_pct-100)/2; |
1308 | struct rq *this_rq; | ||
1309 | unsigned int imbalance; | ||
1310 | int idx; | ||
1311 | 1256 | ||
1312 | prev_cpu = task_cpu(p); | 1257 | do { |
1313 | this_cpu = smp_processor_id(); | 1258 | unsigned long load, avg_load; |
1314 | this_rq = cpu_rq(this_cpu); | 1259 | int local_group; |
1315 | new_cpu = prev_cpu; | 1260 | int i; |
1316 | 1261 | ||
1317 | /* | 1262 | /* Skip over this group if it has no CPUs allowed */ |
1318 | * 'this_sd' is the first domain that both | 1263 | if (!cpumask_intersects(sched_group_cpus(group), |
1319 | * this_cpu and prev_cpu are present in: | 1264 | &p->cpus_allowed)) |
1320 | */ | 1265 | continue; |
1321 | for_each_domain(this_cpu, sd) { | 1266 | |
1322 | if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) { | 1267 | local_group = cpumask_test_cpu(this_cpu, |
1323 | this_sd = sd; | 1268 | sched_group_cpus(group)); |
1324 | break; | 1269 | |
1270 | /* Tally up the load of all CPUs in the group */ | ||
1271 | avg_load = 0; | ||
1272 | |||
1273 | for_each_cpu(i, sched_group_cpus(group)) { | ||
1274 | /* Bias balancing toward cpus of our domain */ | ||
1275 | if (local_group) | ||
1276 | load = source_load(i, load_idx); | ||
1277 | else | ||
1278 | load = target_load(i, load_idx); | ||
1279 | |||
1280 | avg_load += load; | ||
1281 | } | ||
1282 | |||
1283 | /* Adjust by relative CPU power of the group */ | ||
1284 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; | ||
1285 | |||
1286 | if (local_group) { | ||
1287 | this_load = avg_load; | ||
1288 | this = group; | ||
1289 | } else if (avg_load < min_load) { | ||
1290 | min_load = avg_load; | ||
1291 | idlest = group; | ||
1292 | } | ||
1293 | } while (group = group->next, group != sd->groups); | ||
1294 | |||
1295 | if (!idlest || 100*this_load < imbalance*min_load) | ||
1296 | return NULL; | ||
1297 | return idlest; | ||
1298 | } | ||
1299 | |||
1300 | /* | ||
1301 | * find_idlest_cpu - find the idlest cpu among the cpus in group. | ||
1302 | */ | ||
1303 | static int | ||
1304 | find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | ||
1305 | { | ||
1306 | unsigned long load, min_load = ULONG_MAX; | ||
1307 | int idlest = -1; | ||
1308 | int i; | ||
1309 | |||
1310 | /* Traverse only the allowed CPUs */ | ||
1311 | for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { | ||
1312 | load = weighted_cpuload(i); | ||
1313 | |||
1314 | if (load < min_load || (load == min_load && i == this_cpu)) { | ||
1315 | min_load = load; | ||
1316 | idlest = i; | ||
1325 | } | 1317 | } |
1326 | } | 1318 | } |
1327 | 1319 | ||
1328 | if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed))) | 1320 | return idlest; |
1329 | goto out; | 1321 | } |
1330 | 1322 | ||
1331 | /* | 1323 | /* |
1332 | * Check for affine wakeup and passive balancing possibilities. | 1324 | * sched_balance_self: balance the current task (running on cpu) in domains |
1333 | */ | 1325 | * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and |
1334 | if (!this_sd) | 1326 | * SD_BALANCE_EXEC. |
1327 | * | ||
1328 | * Balance, ie. select the least loaded group. | ||
1329 | * | ||
1330 | * Returns the target CPU number, or the same CPU if no balancing is needed. | ||
1331 | * | ||
1332 | * preempt must be disabled. | ||
1333 | */ | ||
1334 | static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | ||
1335 | { | ||
1336 | struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; | ||
1337 | int cpu = smp_processor_id(); | ||
1338 | int prev_cpu = task_cpu(p); | ||
1339 | int new_cpu = cpu; | ||
1340 | int want_affine = 0; | ||
1341 | int want_sd = 1; | ||
1342 | int sync = wake_flags & WF_SYNC; | ||
1343 | |||
1344 | if (sd_flag & SD_BALANCE_WAKE) { | ||
1345 | if (sched_feat(AFFINE_WAKEUPS)) | ||
1346 | want_affine = 1; | ||
1347 | new_cpu = prev_cpu; | ||
1348 | } | ||
1349 | |||
1350 | rcu_read_lock(); | ||
1351 | for_each_domain(cpu, tmp) { | ||
1352 | /* | ||
1353 | * If power savings logic is enabled for a domain, see if we | ||
1354 | * are not overloaded, if so, don't balance wider. | ||
1355 | */ | ||
1356 | if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) { | ||
1357 | unsigned long power = 0; | ||
1358 | unsigned long nr_running = 0; | ||
1359 | unsigned long capacity; | ||
1360 | int i; | ||
1361 | |||
1362 | for_each_cpu(i, sched_domain_span(tmp)) { | ||
1363 | power += power_of(i); | ||
1364 | nr_running += cpu_rq(i)->cfs.nr_running; | ||
1365 | } | ||
1366 | |||
1367 | capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); | ||
1368 | |||
1369 | if (tmp->flags & SD_POWERSAVINGS_BALANCE) | ||
1370 | nr_running /= 2; | ||
1371 | |||
1372 | if (nr_running < capacity) | ||
1373 | want_sd = 0; | ||
1374 | } | ||
1375 | |||
1376 | if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && | ||
1377 | cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { | ||
1378 | |||
1379 | affine_sd = tmp; | ||
1380 | want_affine = 0; | ||
1381 | } | ||
1382 | |||
1383 | if (!want_sd && !want_affine) | ||
1384 | break; | ||
1385 | |||
1386 | if (!(tmp->flags & sd_flag)) | ||
1387 | continue; | ||
1388 | |||
1389 | if (want_sd) | ||
1390 | sd = tmp; | ||
1391 | } | ||
1392 | |||
1393 | if (sched_feat(LB_SHARES_UPDATE)) { | ||
1394 | /* | ||
1395 | * Pick the largest domain to update shares over | ||
1396 | */ | ||
1397 | tmp = sd; | ||
1398 | if (affine_sd && (!tmp || | ||
1399 | cpumask_weight(sched_domain_span(affine_sd)) > | ||
1400 | cpumask_weight(sched_domain_span(sd)))) | ||
1401 | tmp = affine_sd; | ||
1402 | |||
1403 | if (tmp) | ||
1404 | update_shares(tmp); | ||
1405 | } | ||
1406 | |||
1407 | if (affine_sd && wake_affine(affine_sd, p, sync)) { | ||
1408 | new_cpu = cpu; | ||
1335 | goto out; | 1409 | goto out; |
1410 | } | ||
1336 | 1411 | ||
1337 | idx = this_sd->wake_idx; | 1412 | while (sd) { |
1413 | int load_idx = sd->forkexec_idx; | ||
1414 | struct sched_group *group; | ||
1415 | int weight; | ||
1338 | 1416 | ||
1339 | imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; | 1417 | if (!(sd->flags & sd_flag)) { |
1418 | sd = sd->child; | ||
1419 | continue; | ||
1420 | } | ||
1340 | 1421 | ||
1341 | load = source_load(prev_cpu, idx); | 1422 | if (sd_flag & SD_BALANCE_WAKE) |
1342 | this_load = target_load(this_cpu, idx); | 1423 | load_idx = sd->wake_idx; |
1343 | 1424 | ||
1344 | if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, | 1425 | group = find_idlest_group(sd, p, cpu, load_idx); |
1345 | load, this_load, imbalance)) | 1426 | if (!group) { |
1346 | return this_cpu; | 1427 | sd = sd->child; |
1428 | continue; | ||
1429 | } | ||
1347 | 1430 | ||
1348 | /* | 1431 | new_cpu = find_idlest_cpu(group, p, cpu); |
1349 | * Start passive balancing when half the imbalance_pct | 1432 | if (new_cpu == -1 || new_cpu == cpu) { |
1350 | * limit is reached. | 1433 | /* Now try balancing at a lower domain level of cpu */ |
1351 | */ | 1434 | sd = sd->child; |
1352 | if (this_sd->flags & SD_WAKE_BALANCE) { | 1435 | continue; |
1353 | if (imbalance*this_load <= 100*load) { | ||
1354 | schedstat_inc(this_sd, ttwu_move_balance); | ||
1355 | schedstat_inc(p, se.nr_wakeups_passive); | ||
1356 | return this_cpu; | ||
1357 | } | 1436 | } |
1437 | |||
1438 | /* Now try balancing at a lower domain level of new_cpu */ | ||
1439 | cpu = new_cpu; | ||
1440 | weight = cpumask_weight(sched_domain_span(sd)); | ||
1441 | sd = NULL; | ||
1442 | for_each_domain(cpu, tmp) { | ||
1443 | if (weight <= cpumask_weight(sched_domain_span(tmp))) | ||
1444 | break; | ||
1445 | if (tmp->flags & sd_flag) | ||
1446 | sd = tmp; | ||
1447 | } | ||
1448 | /* while loop will break here if sd == NULL */ | ||
1358 | } | 1449 | } |
1359 | 1450 | ||
1360 | out: | 1451 | out: |
1361 | return wake_idle(new_cpu, p); | 1452 | rcu_read_unlock(); |
1453 | return new_cpu; | ||
1362 | } | 1454 | } |
1363 | #endif /* CONFIG_SMP */ | 1455 | #endif /* CONFIG_SMP */ |
1364 | 1456 | ||
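Annotation: the rewritten select_task_rq_fair() walks the domain hierarchy once from the waking CPU, remembers the widest domain that still wants the requested balance type, tries the affine fast path for wakeups, and otherwise descends via find_idlest_group()/find_idlest_cpu(). Stripped of the power-savings and shares-update details, the descent has roughly the shape below; this is a simplified sketch using the kernel-internal types from the hunk above, and it deliberately omits the re-walk of the new CPU's domains that the real code performs:

/* Simplified sketch of the find_idlest_* descent in select_task_rq_fair(). */
static int descend_domains(struct sched_domain *sd, struct task_struct *p,
			   int cpu, int sd_flag)
{
	int new_cpu = cpu;

	while (sd) {
		struct sched_group *group;

		if (!(sd->flags & sd_flag)) {   /* domain doesn't balance this way */
			sd = sd->child;
			continue;
		}

		group = find_idlest_group(sd, p, cpu, sd->forkexec_idx);
		if (!group) {                   /* the local group is already fine */
			sd = sd->child;
			continue;
		}

		new_cpu = find_idlest_cpu(group, p, cpu);
		if (new_cpu == -1 || new_cpu == cpu) {
			sd = sd->child;         /* try one level further down */
			continue;
		}

		cpu = new_cpu;                  /* real code restarts the walk from here */
		sd = sd->child;                 /* simplification: just keep descending */
	}

	return new_cpu;
}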
@@ -1471,11 +1563,12 @@ static void set_next_buddy(struct sched_entity *se) | |||
1471 | /* | 1563 | /* |
1472 | * Preempt the current task with a newly woken task if needed: | 1564 | * Preempt the current task with a newly woken task if needed: |
1473 | */ | 1565 | */ |
1474 | static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) | 1566 | static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) |
1475 | { | 1567 | { |
1476 | struct task_struct *curr = rq->curr; | 1568 | struct task_struct *curr = rq->curr; |
1477 | struct sched_entity *se = &curr->se, *pse = &p->se; | 1569 | struct sched_entity *se = &curr->se, *pse = &p->se; |
1478 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | 1570 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); |
1571 | int sync = wake_flags & WF_SYNC; | ||
1479 | 1572 | ||
1480 | update_curr(cfs_rq); | 1573 | update_curr(cfs_rq); |
1481 | 1574 | ||
@@ -1501,7 +1594,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) | |||
1501 | */ | 1594 | */ |
1502 | if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle)) | 1595 | if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle)) |
1503 | set_last_buddy(se); | 1596 | set_last_buddy(se); |
1504 | set_next_buddy(pse); | 1597 | if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) |
1598 | set_next_buddy(pse); | ||
1505 | 1599 | ||
1506 | /* | 1600 | /* |
1507 | * We can come here with TIF_NEED_RESCHED already set from new task | 1601 | * We can come here with TIF_NEED_RESCHED already set from new task |
@@ -1523,16 +1617,25 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) | |||
1523 | return; | 1617 | return; |
1524 | } | 1618 | } |
1525 | 1619 | ||
1526 | if (!sched_feat(WAKEUP_PREEMPT)) | 1620 | if ((sched_feat(WAKEUP_SYNC) && sync) || |
1527 | return; | 1621 | (sched_feat(WAKEUP_OVERLAP) && |
1528 | 1622 | (se->avg_overlap < sysctl_sched_migration_cost && | |
1529 | if (sched_feat(WAKEUP_OVERLAP) && (sync || | 1623 | pse->avg_overlap < sysctl_sched_migration_cost))) { |
1530 | (se->avg_overlap < sysctl_sched_migration_cost && | ||
1531 | pse->avg_overlap < sysctl_sched_migration_cost))) { | ||
1532 | resched_task(curr); | 1624 | resched_task(curr); |
1533 | return; | 1625 | return; |
1534 | } | 1626 | } |
1535 | 1627 | ||
1628 | if (sched_feat(WAKEUP_RUNNING)) { | ||
1629 | if (pse->avg_running < se->avg_running) { | ||
1630 | set_next_buddy(pse); | ||
1631 | resched_task(curr); | ||
1632 | return; | ||
1633 | } | ||
1634 | } | ||
1635 | |||
1636 | if (!sched_feat(WAKEUP_PREEMPT)) | ||
1637 | return; | ||
1638 | |||
1536 | find_matching_se(&se, &pse); | 1639 | find_matching_se(&se, &pse); |
1537 | 1640 | ||
1538 | BUG_ON(!pse); | 1641 | BUG_ON(!pse); |
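Annotation: the preemption decision in check_preempt_wakeup() is now ordered: sync/overlap-based preemption first, then the WAKEUP_RUNNING heuristic (prefer the wakee if its avg_running is shorter than the current task's), and only then the generic WAKEUP_PREEMPT vruntime check; buddy marking is additionally gated on NEXT_BUDDY and on the wakeup not being a fork (WF_FORK). A condensed sketch of that ordering, with the feature gates folded into plain conditions for readability:

/* Condensed sketch of the wakeup-preemption ordering introduced above. */
static int should_preempt(int sync, u64 curr_overlap, u64 wakee_overlap,
			  u64 curr_running, u64 wakee_running, u64 migration_cost)
{
	if (sync)                                       /* WAKEUP_SYNC */
		return 1;

	if (curr_overlap < migration_cost &&
	    wakee_overlap < migration_cost)             /* WAKEUP_OVERLAP */
		return 1;

	if (wakee_running < curr_running)               /* WAKEUP_RUNNING */
		return 1;

	return 0;       /* fall through to the WAKEUP_PREEMPT vruntime check */
}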
@@ -1555,8 +1658,13 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) | |||
1555 | /* | 1658 | /* |
1556 | * If se was a buddy, clear it so that it will have to earn | 1659 | * If se was a buddy, clear it so that it will have to earn |
1557 | * the favour again. | 1660 | * the favour again. |
1661 | * | ||
1662 | * If se was not a buddy, clear the buddies because neither | ||
1663 | * was eligible to run, let them earn it again. | ||
1664 | * | ||
1665 | * IOW. unconditionally clear buddies. | ||
1558 | */ | 1666 | */ |
1559 | __clear_buddies(cfs_rq, se); | 1667 | __clear_buddies(cfs_rq, NULL); |
1560 | set_next_entity(cfs_rq, se); | 1668 | set_next_entity(cfs_rq, se); |
1561 | cfs_rq = group_cfs_rq(se); | 1669 | cfs_rq = group_cfs_rq(se); |
1562 | } while (cfs_rq); | 1670 | } while (cfs_rq); |
diff --git a/kernel/sched_features.h b/kernel/sched_features.h index e2dc63a5815d..0d94083582c7 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h | |||
@@ -1,17 +1,123 @@ | |||
1 | SCHED_FEAT(NEW_FAIR_SLEEPERS, 0) | 1 | /* |
2 | * Disregards a certain amount of sleep time (sched_latency_ns) and | ||
3 | * considers the task to be running during that period. This gives it | ||
4 | * a service deficit on wakeup, allowing it to run sooner. | ||
5 | */ | ||
6 | SCHED_FEAT(FAIR_SLEEPERS, 1) | ||
7 | |||
8 | /* | ||
9 | * Only give sleepers 50% of their service deficit. This allows | ||
10 | * them to run sooner, but does not allow tons of sleepers to | ||
11 | * rip the spread apart. | ||
12 | */ | ||
13 | SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) | ||
14 | |||
15 | /* | ||
16 | * By not normalizing the sleep time, heavy tasks get an effective | ||
17 | * longer period, and lighter task an effective shorter period they | ||
18 | * are considered running. | ||
19 | */ | ||
2 | SCHED_FEAT(NORMALIZED_SLEEPER, 0) | 20 | SCHED_FEAT(NORMALIZED_SLEEPER, 0) |
3 | SCHED_FEAT(ADAPTIVE_GRAN, 1) | 21 | |
4 | SCHED_FEAT(WAKEUP_PREEMPT, 1) | 22 | /* |
23 | * Place new tasks ahead so that they do not starve already running | ||
24 | * tasks | ||
25 | */ | ||
5 | SCHED_FEAT(START_DEBIT, 1) | 26 | SCHED_FEAT(START_DEBIT, 1) |
27 | |||
28 | /* | ||
29 | * Should wakeups try to preempt running tasks. | ||
30 | */ | ||
31 | SCHED_FEAT(WAKEUP_PREEMPT, 1) | ||
32 | |||
33 | /* | ||
34 | * Compute wakeup_gran based on task behaviour, clipped to | ||
35 | * [0, sched_wakeup_gran_ns] | ||
36 | */ | ||
37 | SCHED_FEAT(ADAPTIVE_GRAN, 1) | ||
38 | |||
39 | /* | ||
40 | * When converting the wakeup granularity to virtual time, do it such | ||
41 | * that heavier tasks preempting a lighter task have an edge. | ||
42 | */ | ||
43 | SCHED_FEAT(ASYM_GRAN, 1) | ||
44 | |||
45 | /* | ||
46 | * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS. | ||
47 | */ | ||
48 | SCHED_FEAT(WAKEUP_SYNC, 0) | ||
49 | |||
50 | /* | ||
51 | * Wakeup preempt based on task behaviour. Tasks that do not overlap | ||
52 | * don't get preempted. | ||
53 | */ | ||
54 | SCHED_FEAT(WAKEUP_OVERLAP, 0) | ||
55 | |||
56 | /* | ||
57 | * Wakeup preemption towards tasks that run short | ||
58 | */ | ||
59 | SCHED_FEAT(WAKEUP_RUNNING, 0) | ||
60 | |||
61 | /* | ||
62 | * Use the SYNC wakeup hint, pipes and the likes use this to indicate | ||
63 | * the remote end is likely to consume the data we just wrote, and | ||
64 | * therefore has cache benefit from being placed on the same cpu, see | ||
65 | * also AFFINE_WAKEUPS. | ||
66 | */ | ||
67 | SCHED_FEAT(SYNC_WAKEUPS, 1) | ||
68 | |||
69 | /* | ||
70 | * Based on load and program behaviour, see if it makes sense to place | ||
71 | * a newly woken task on the same cpu as the task that woke it -- | ||
72 | * improve cache locality. Typically used with SYNC wakeups as | ||
73 | * generated by pipes and the like, see also SYNC_WAKEUPS. | ||
74 | */ | ||
6 | SCHED_FEAT(AFFINE_WAKEUPS, 1) | 75 | SCHED_FEAT(AFFINE_WAKEUPS, 1) |
76 | |||
77 | /* | ||
78 | * Weaken SYNC hint based on overlap | ||
79 | */ | ||
80 | SCHED_FEAT(SYNC_LESS, 1) | ||
81 | |||
82 | /* | ||
83 | * Add SYNC hint based on overlap | ||
84 | */ | ||
85 | SCHED_FEAT(SYNC_MORE, 0) | ||
86 | |||
87 | /* | ||
88 | * Prefer to schedule the task we woke last (assuming it failed | ||
92 | * wakeup-preemption), since it's likely going to consume data we | ||
90 | * touched, increases cache locality. | ||
91 | */ | ||
92 | SCHED_FEAT(NEXT_BUDDY, 0) | ||
93 | |||
94 | /* | ||
95 | * Prefer to schedule the task that ran last (when we did | ||
96 | * wake-preempt) as that likely will touch the same data, increases | ||
97 | * cache locality. | ||
98 | */ | ||
99 | SCHED_FEAT(LAST_BUDDY, 1) | ||
100 | |||
101 | /* | ||
102 | * Consider buddies to be cache hot, decreases the likelihood of a | ||
103 | * cache buddy being migrated away, increases cache locality. | ||
104 | */ | ||
7 | SCHED_FEAT(CACHE_HOT_BUDDY, 1) | 105 | SCHED_FEAT(CACHE_HOT_BUDDY, 1) |
8 | SCHED_FEAT(SYNC_WAKEUPS, 1) | 106 | |
107 | /* | ||
108 | * Use arch dependent cpu power functions | ||
109 | */ | ||
110 | SCHED_FEAT(ARCH_POWER, 0) | ||
111 | |||
9 | SCHED_FEAT(HRTICK, 0) | 112 | SCHED_FEAT(HRTICK, 0) |
10 | SCHED_FEAT(DOUBLE_TICK, 0) | 113 | SCHED_FEAT(DOUBLE_TICK, 0) |
11 | SCHED_FEAT(ASYM_GRAN, 1) | ||
12 | SCHED_FEAT(LB_BIAS, 1) | 114 | SCHED_FEAT(LB_BIAS, 1) |
13 | SCHED_FEAT(LB_WAKEUP_UPDATE, 1) | 115 | SCHED_FEAT(LB_SHARES_UPDATE, 1) |
14 | SCHED_FEAT(ASYM_EFF_LOAD, 1) | 116 | SCHED_FEAT(ASYM_EFF_LOAD, 1) |
15 | SCHED_FEAT(WAKEUP_OVERLAP, 0) | 117 | |
16 | SCHED_FEAT(LAST_BUDDY, 1) | 118 | /* |
119 | * Spin-wait on mutex acquisition when the mutex owner is running on | ||
120 | * another cpu -- assumes that when the owner is running, it will soon | ||
121 | * release the lock. Decreases scheduling overhead. | ||
122 | */ | ||
17 | SCHED_FEAT(OWNER_SPIN, 1) | 123 | SCHED_FEAT(OWNER_SPIN, 1) |
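Annotation: sched_features.h is only a list of SCHED_FEAT(name, default) entries; kernel/sched.c includes it several times with different SCHED_FEAT definitions to build an enum of feature bits and the default bitmask. Roughly the following X-macro pattern; the exact variable and macro names follow kernel/sched.c of this era and should be read as assumptions here:

/* Sketch of how kernel/sched.c consumes sched_features.h (X-macro style). */
#define SCHED_FEAT(name, enabled)	__SCHED_FEAT_##name,
enum {
#include "sched_features.h"
};
#undef SCHED_FEAT

#define SCHED_FEAT(name, enabled)	(1UL << __SCHED_FEAT_##name) * enabled |
const_debug unsigned int sysctl_sched_features =
#include "sched_features.h"
	0;
#undef SCHED_FEAT

/* sched_feat(x) then tests the corresponding bit in sysctl_sched_features. */
#define sched_feat(x)	(sysctl_sched_features & (1UL << __SCHED_FEAT_##x))

At run time the bits can typically be toggled through the debugfs file /sys/kernel/debug/sched_features, e.g. writing NO_GENTLE_FAIR_SLEEPERS to clear that feature (assuming CONFIG_SCHED_DEBUG is enabled).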
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 499672c10cbd..a8b448af004b 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c | |||
@@ -6,7 +6,7 @@ | |||
6 | */ | 6 | */ |
7 | 7 | ||
8 | #ifdef CONFIG_SMP | 8 | #ifdef CONFIG_SMP |
9 | static int select_task_rq_idle(struct task_struct *p, int sync) | 9 | static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) |
10 | { | 10 | { |
11 | return task_cpu(p); /* IDLE tasks are never migrated */ | 11 | return task_cpu(p); /* IDLE tasks are never migrated */ |
12 | } | 12 | } |
@@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync) | |||
14 | /* | 14 | /* |
15 | * Idle tasks are unconditionally rescheduled: | 15 | * Idle tasks are unconditionally rescheduled: |
16 | */ | 16 | */ |
17 | static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync) | 17 | static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) |
18 | { | 18 | { |
19 | resched_task(rq->idle); | 19 | resched_task(rq->idle); |
20 | } | 20 | } |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 2eb4bd6a526c..13de7126a6ab 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -938,10 +938,13 @@ static void yield_task_rt(struct rq *rq) | |||
938 | #ifdef CONFIG_SMP | 938 | #ifdef CONFIG_SMP |
939 | static int find_lowest_rq(struct task_struct *task); | 939 | static int find_lowest_rq(struct task_struct *task); |
940 | 940 | ||
941 | static int select_task_rq_rt(struct task_struct *p, int sync) | 941 | static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) |
942 | { | 942 | { |
943 | struct rq *rq = task_rq(p); | 943 | struct rq *rq = task_rq(p); |
944 | 944 | ||
945 | if (sd_flag != SD_BALANCE_WAKE) | ||
946 | return smp_processor_id(); | ||
947 | |||
945 | /* | 948 | /* |
946 | * If the current task is an RT task, then | 949 | * If the current task is an RT task, then |
947 | * try to see if we can wake this RT task up on another | 950 | * try to see if we can wake this RT task up on another |
@@ -999,7 +1002,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) | |||
999 | /* | 1002 | /* |
1000 | * Preempt the current task with a newly woken task if needed: | 1003 | * Preempt the current task with a newly woken task if needed: |
1001 | */ | 1004 | */ |
1002 | static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync) | 1005 | static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) |
1003 | { | 1006 | { |
1004 | if (p->prio < rq->curr->prio) { | 1007 | if (p->prio < rq->curr->prio) { |
1005 | resched_task(rq->curr); | 1008 | resched_task(rq->curr); |
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 687699d365ae..2547d8813cf0 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c | |||
@@ -11,7 +11,6 @@ | |||
11 | #include <linux/ftrace.h> | 11 | #include <linux/ftrace.h> |
12 | #include <linux/string.h> | 12 | #include <linux/string.h> |
13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | #include <linux/marker.h> | ||
15 | #include <linux/mutex.h> | 14 | #include <linux/mutex.h> |
16 | #include <linux/ctype.h> | 15 | #include <linux/ctype.h> |
17 | #include <linux/list.h> | 16 | #include <linux/list.h> |