Merge branch 'linus' into core/rcu

Merge reason: Backmerge latest upstream to queue up dependent fix in the scheduler. Signed-off-by: Ingo Molnar <mingo@elte.hu>
author: Ingo Molnar <mingo@elte.hu> 2010-03-01 03:28:53 -0500
committer: Ingo Molnar <mingo@elte.hu> 2010-03-01 03:28:58 -0500
commit: e2f4699ac15fe36de1288505bc6e6e5a8603ab1b (patch)
tree: 8078d3ff21eaa0a0ed6e446ac94f3681e831cad1 /kernel
parent: 1883c79a57a5fe25309007590cccb1b2782c41b2 (diff)
parent: 30ff056c42c665b9ea535d8515890857ae382540 (diff)
35 files changed, 3685 insertions, 3340 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 864ff75d65f2..6aebdeb2aa34 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -100,6 +100,7 @@ obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
 obj-$(CONFIG_PERF_EVENTS) += perf_event.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
 obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
+obj-$(CONFIG_PADATA) += padata.o
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b7df302a0204..ccec774c716d 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -44,6 +44,7 @@
 #include <linux/debugfs.h>
 #include <linux/kdebug.h>
 #include <linux/memory.h>
+#include <linux/ftrace.h>
 #include <asm-generic/sections.h>
 #include <asm/cacheflush.h>
@@ -93,6 +94,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
        {"native_get_debugreg",},
        {"irq_entries_start",},
        {"common_interrupt",},
+        {"mcount",},    /* mcount can be called from everywhere */
        {NULL}    /* Terminator */
 };
@@ -124,30 +126,6 @@ static LIST_HEAD(kprobe_insn_pages);
 static int kprobe_garbage_slots;
 static int collect_garbage_slots(void);
-static int __kprobes check_safety(void)
-{
-        int ret = 0;
-#if defined(CONFIG_PREEMPT) && defined(CONFIG_FREEZER)
-        ret = freeze_processes();
-        if (ret == 0) {
-                struct task_struct *p, *q;
-                do_each_thread(p, q) {
-                        if (p != current && p->state == TASK_RUNNING &&
-                            p->pid != 0) {
-                                printk("Check failed: %s is running\n",p->comm);
-                                ret = -1;
-                                goto loop_end;
-                        }
-                } while_each_thread(p, q);
-        }
-loop_end:
-        thaw_processes();
-#else
-        synchronize_sched();
-#endif
-        return ret;
-}
 /**
 * __get_insn_slot() - Find a slot on an executable page for an instruction.
 * We allocate an executable page if there's no room on existing ones.
@@ -235,9 +213,8 @@ static int __kprobes collect_garbage_slots(void)
 {
        struct kprobe_insn_page *kip, *next;
-        /* Ensure no-one is preepmted on the garbages */
+        /* Ensure no-one is interrupted on the garbages */
-        if (check_safety())
+        synchronize_sched();
-                return -EAGAIN;
        list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) {
                int i;
@@ -728,7 +705,8 @@ int __kprobes register_kprobe(struct kprobe *p)
        preempt_disable();
        if (!kernel_text_address((unsigned long) p->addr) ||
-            in_kprobes_functions((unsigned long) p->addr)) {
+            in_kprobes_functions((unsigned long) p->addr) ||
+            ftrace_text_reserved(p->addr, p->addr)) {
                preempt_enable();
                return -EINVAL;
        }
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 3feaf5a74514..6b1ccc3f0205 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -197,16 +197,8 @@ static int __init ksysfs_init(void)
                        goto group_exit;
        }
-        /* create the /sys/kernel/uids/ directory */
-        error = uids_sysfs_init();
-        if (error)
-                goto notes_exit;
        return 0;
-notes_exit:
-        if (notes_size > 0)
-                sysfs_remove_bin_file(kernel_kobj, &notes_attr);
 group_exit:
        sysfs_remove_group(kernel_kobj, &kernel_attr_group);
 kset_exit:
diff --git a/kernel/kthread.c b/kernel/kthread.c
index fbb6222fe7e0..82ed0ea15194 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -101,7 +101,7 @@ static void create_kthread(struct kthread_create_info *create)
 *
 * Description: This helper function creates and names a kernel
 * thread.  The thread will be stopped: use wake_up_process() to start
- * it.  See also kthread_run(), kthread_create_on_cpu().
+ * it.  See also kthread_run().
 *
 * When woken, the thread will run @threadfn() with @data as its
 * argument. @threadfn() can either call do_exit() directly if it is a
diff --git a/kernel/padata.c b/kernel/padata.c
new file mode 100644
index 000000000000..6f9bcb8313d6
--- /dev/null
+++ b/kernel/padata.c
@@ -0,0 +1,690 @@
+/*
+ * padata.c - generic interface to process data streams in parallel
+ *
+ * Copyright (C) 2008, 2009 secunet Security Networks AG
+ * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#include <linux/module.h>
+#include <linux/cpumask.h>
+#include <linux/err.h>
+#include <linux/cpu.h>
+#include <linux/padata.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/rcupdate.h>
+/*
+ * Both replacement lists are parenthesized so that they expand as a
+ * single operand.  Without the parentheses "MAX_SEQ_NR / num_cpus"
+ * parses as "INT_MAX - (NR_CPUS / num_cpus)".
+ */
+#define MAX_SEQ_NR (INT_MAX - NR_CPUS)
+#define MAX_OBJ_NUM (10000 * NR_CPUS)
+/*
+ * Translate a position within pd->cpumask into a cpu number: start at
+ * the first cpu set in the mask and advance cpu_index times.
+ */
+static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
+{
+        int target_cpu = cpumask_first(pd->cpumask);
+        int remaining = cpu_index;
+        while (remaining-- > 0)
+                target_cpu = cpumask_next(target_cpu, pd->cpumask);
+        return target_cpu;
+}
+/*
+ * Pick the cpu an object is parallelized on: the sequence number modulo
+ * the number of cpus set in pd->cpumask yields an index into the mask,
+ * which padata_index_to_cpu() turns into a cpu number.
+ */
+static int padata_cpu_hash(struct padata_priv *padata)
+{
+        struct parallel_data *pd = padata->pd;
+        int cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask);
+        return padata_index_to_cpu(pd, cpu_index);
+}
+/*
+ * padata_parallel_worker - work function of the parallel stage
+ *
+ * @work: the pwork member embedded in a struct padata_queue
+ *
+ * Runs with BHs disabled.  The objects pending on the queue's parallel
+ * list are moved to a private list under the list lock, then the
+ * ->parallel() callback of each object is invoked without the lock held.
+ */
+static void padata_parallel_worker(struct work_struct *work)
+{
+        struct padata_queue *queue;
+        LIST_HEAD(local_list);
+        local_bh_disable();
+        queue = container_of(work, struct padata_queue, pwork);
+        /* Detach everything queued so far; later arrivals requeue the work. */
+        spin_lock(&queue->parallel.lock);
+        list_replace_init(&queue->parallel.list, &local_list);
+        spin_unlock(&queue->parallel.lock);
+        while (!list_empty(&local_list)) {
+                struct padata_priv *padata;
+                padata = list_entry(local_list.next,
+                                    struct padata_priv, list);
+                list_del_init(&padata->list);
+                padata->parallel(padata);
+        }
+        local_bh_enable();
+}
+/*
+ * padata_do_parallel - padata parallelization function
+ *
+ * @pinst: padata instance
+ * @padata: object to be parallelized
+ * @cb_cpu: cpu the serialization callback function will run on,
+ *          must be in the cpumask of padata.
+ *
+ * The parallelization callback function will run with BHs off.
+ * Note: Every object which is parallelized by padata_do_parallel
+ * must be seen by padata_do_serial.
+ *
+ * Returns -EINPROGRESS when the object was queued for parallel
+ * processing, 0 when the instance is not started (PADATA_INIT clear;
+ * nothing is queued), -EBUSY while PADATA_RESET is set or when
+ * pd->refcnt has reached MAX_OBJ_NUM, and -EINVAL when cb_cpu is not
+ * set in pd->cpumask.
+ */
+int padata_do_parallel(struct padata_instance *pinst,
+                       struct padata_priv *padata, int cb_cpu)
+{
+        int target_cpu, err;
+        struct padata_queue *queue;
+        struct parallel_data *pd;
+        rcu_read_lock_bh();
+        pd = rcu_dereference(pinst->pd);        /* NOTE(review): read side is the _bh flavour; confirm rcu_dereference() is the intended accessor */
+        err = 0;
+        if (!(pinst->flags & PADATA_INIT))
+                goto out;
+        err =  -EBUSY;
+        if ((pinst->flags & PADATA_RESET))
+                goto out;
+        if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM)
+                goto out;
+        err = -EINVAL;
+        if (!cpumask_test_cpu(cb_cpu, pd->cpumask))
+                goto out;
+        err = -EINPROGRESS;
+        atomic_inc(&pd->refcnt);        /* dropped again in padata_serial_worker() */
+        padata->pd = pd;
+        padata->cb_cpu = cb_cpu;
+        if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr))       /* NOTE(review): read + set is not one atomic step; confirm concurrent submitters cannot race here */
+                atomic_set(&pd->seq_nr, -1);
+        padata->seq_nr = atomic_inc_return(&pd->seq_nr);
+        target_cpu = padata_cpu_hash(padata);   /* seq_nr selects the cpu that runs ->parallel() */
+        queue = per_cpu_ptr(pd->queue, target_cpu);
+        spin_lock(&queue->parallel.lock);
+        list_add_tail(&padata->list, &queue->parallel.list);
+        spin_unlock(&queue->parallel.lock);
+        queue_work_on(target_cpu, pinst->wq, &queue->pwork);
+out:
+        rcu_read_unlock_bh();
+        return err;
+}
+EXPORT_SYMBOL(padata_do_parallel);
+/*
+ * padata_get_next - find the object that has to be serialized next
+ *
+ * @pd: parallel_data whose per-cpu reorder lists are inspected
+ *
+ * For every cpu in pd->cpumask the sequence number expected next on that
+ * cpu's reorder list is computed from the queue's num_obj counter and its
+ * cpu_index.  The queue with the lowest expected number is selected
+ * (a candidate that did not wrap past max_seq_nr replaces one that did).
+ *
+ * Returns the object removed from the selected queue's reorder list,
+ * NULL when every reorder list is empty, ERR_PTR(-ENODATA) when the
+ * selected list is empty and next_nr % num_cpus equals the selected
+ * queue's cpu_index, and ERR_PTR(-EINPROGRESS) otherwise.
+ */
+static struct padata_priv *padata_get_next(struct parallel_data *pd)
+{
+        int cpu, num_cpus, empty, calc_seq_nr;
+        int seq_nr, next_nr, overrun, next_overrun;
+        struct padata_queue *queue, *next_queue;
+        struct padata_priv *padata;
+        struct padata_list *reorder;
+        empty = 0;
+        next_nr = -1;
+        next_overrun = 0;
+        next_queue = NULL;
+        num_cpus = cpumask_weight(pd->cpumask);
+        for_each_cpu(cpu, pd->cpumask) {
+                queue = per_cpu_ptr(pd->queue, cpu);
+                reorder = &queue->reorder;
+                /*
+                 * Calculate the seq_nr of the object that should be
+                 * next in this queue.
+                 */
+                overrun = 0;
+                calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus)
+                               + queue->cpu_index;
+                if (unlikely(calc_seq_nr > pd->max_seq_nr)) {
+                        calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1; /* wrap around like pd->seq_nr does */
+                        overrun = 1;
+                }
+                if (!list_empty(&reorder->list)) {
+                        padata = list_entry(reorder->list.next,
+                                            struct padata_priv, list);
+                        seq_nr  = padata->seq_nr;
+                        BUG_ON(calc_seq_nr != seq_nr);  /* list head must carry the expected number */
+                } else {
+                        seq_nr = calc_seq_nr;
+                        empty++;        /* counted to detect "all lists empty" below */
+                }
+                if (next_nr < 0 || seq_nr < next_nr
+                    || (next_overrun && !overrun)) {
+                        next_nr = seq_nr;
+                        next_overrun = overrun;
+                        next_queue = queue;
+                }
+        }
+        padata = NULL;
+        if (empty == num_cpus)
+                goto out;
+        reorder = &next_queue->reorder;
+        if (!list_empty(&reorder->list)) {
+                padata = list_entry(reorder->list.next,
+                                    struct padata_priv, list);
+                if (unlikely(next_overrun)) {   /* sequence numbers wrapped: restart the per-queue counters */
+                        for_each_cpu(cpu, pd->cpumask) {
+                                queue = per_cpu_ptr(pd->queue, cpu);
+                                atomic_set(&queue->num_obj, 0);
+                        }
+                }
+                spin_lock(&reorder->lock);
+                list_del_init(&padata->list);
+                atomic_dec(&pd->reorder_objects);
+                spin_unlock(&reorder->lock);
+                atomic_inc(&next_queue->num_obj);
+                goto out;
+        }
+        if (next_nr % num_cpus == next_queue->cpu_index) {      /* NOTE(review): next_nr was derived from cpu_index above, so this looks always true; confirm -EINPROGRESS is reachable */
+                padata = ERR_PTR(-ENODATA);
+                goto out;
+        }
+        padata = ERR_PTR(-EINPROGRESS);
+out:
+        return padata;
+}
+/*
+ * padata_reorder - move objects that are next in sequence to the
+ * serial list of their callback cpu
+ *
+ * Only one caller at a time does the reordering; a caller that fails to
+ * take pd->lock returns at once.  The lock holder keeps handing objects
+ * returned by padata_get_next() to the serial workers until no object is
+ * available (NULL or -EINPROGRESS) and retries while reorder_objects is
+ * non-zero.  On -ENODATA it drops the lock and returns without retrying.
+ */
+static void padata_reorder(struct parallel_data *pd)
+{
+        struct padata_instance *pinst = pd->pinst;
+        struct padata_queue *squeue;
+        struct padata_priv *next;
+        do {
+                if (!spin_trylock_bh(&pd->lock))
+                        return;
+                for (;;) {
+                        next = padata_get_next(pd);
+                        if (!next || PTR_ERR(next) == -EINPROGRESS)
+                                break;
+                        if (PTR_ERR(next) == -ENODATA) {
+                                spin_unlock_bh(&pd->lock);
+                                return;
+                        }
+                        squeue = per_cpu_ptr(pd->queue, next->cb_cpu);
+                        spin_lock(&squeue->serial.lock);
+                        list_add_tail(&next->list, &squeue->serial.list);
+                        spin_unlock(&squeue->serial.lock);
+                        queue_work_on(next->cb_cpu, pinst->wq, &squeue->swork);
+                }
+                spin_unlock_bh(&pd->lock);
+        } while (atomic_read(&pd->reorder_objects));
+}
+/*
+ * padata_serial_worker - work function of the serial stage
+ *
+ * With BHs disabled, takes over the queue's serial list, calls the
+ * ->serial() callback of every object on it and drops one pd->refcnt
+ * reference per object.
+ */
+static void padata_serial_worker(struct work_struct *work)
+{
+        struct padata_queue *queue;
+        struct parallel_data *pd;
+        struct padata_priv *obj;
+        LIST_HEAD(local_list);
+        local_bh_disable();
+        queue = container_of(work, struct padata_queue, swork);
+        pd = queue->pd;
+        spin_lock(&queue->serial.lock);
+        list_replace_init(&queue->serial.list, &local_list);
+        spin_unlock(&queue->serial.lock);
+        for (;;) {
+                if (list_empty(&local_list))
+                        break;
+                obj = list_entry(local_list.next, struct padata_priv, list);
+                list_del_init(&obj->list);
+                obj->serial(obj);
+                atomic_dec(&pd->refcnt);
+        }
+        local_bh_enable();
+}
+/*
+ * padata_do_serial - padata serialization function
+ *
+ * @padata: object to be serialized.
+ *
+ * Every parallelized object has to be passed to this function.  The
+ * object is appended to the reorder list of the cpu this function is
+ * called on, then padata_reorder() is run.
+ * The serialization callback function will run with BHs off.
+ */
+void padata_do_serial(struct padata_priv *padata)
+{
+        struct parallel_data *pd = padata->pd;
+        struct padata_queue *rqueue;
+        int this_cpu;
+        this_cpu = get_cpu();
+        rqueue = per_cpu_ptr(pd->queue, this_cpu);
+        spin_lock(&rqueue->reorder.lock);
+        atomic_inc(&pd->reorder_objects);
+        list_add_tail(&padata->list, &rqueue->reorder.list);
+        spin_unlock(&rqueue->reorder.lock);
+        put_cpu();
+        padata_reorder(pd);
+}
+EXPORT_SYMBOL(padata_do_serial);
+/*
+ * padata_alloc_pd - allocate and initialize a parallel_data structure
+ *
+ * @pinst: padata instance the structure will belong to
+ * @cpumask: requested cpus; only those that are also in cpu_active_mask
+ *           are used
+ *
+ * The usable cpus are computed once into pd->cpumask and that snapshot
+ * is used both for numbering the queues (cpu_index) and for sizing the
+ * sequence number space, so the two cannot disagree.
+ *
+ * Returns the new structure, or NULL when an allocation fails or when
+ * none of the requested cpus is active (a pd without cpus would lead to
+ * a division by zero here and in padata_cpu_hash()).
+ */
+static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
+                                             const struct cpumask *cpumask)
+{
+        int cpu, cpu_index, num_cpus;
+        struct padata_queue *queue;
+        struct parallel_data *pd;
+        cpu_index = 0;
+        pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
+        if (!pd)
+                goto err;
+        pd->queue = alloc_percpu(struct padata_queue);
+        if (!pd->queue)
+                goto err_free_pd;
+        if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL))
+                goto err_free_queue;
+        cpumask_and(pd->cpumask, cpumask, cpu_active_mask);
+        num_cpus = cpumask_weight(pd->cpumask);
+        if (!num_cpus)
+                goto err_free_cpumask;
+        for_each_possible_cpu(cpu) {
+                queue = per_cpu_ptr(pd->queue, cpu);
+                queue->pd = pd;
+                /* Number the cpus in use consecutively; -1 marks unused. */
+                if (cpumask_test_cpu(cpu, pd->cpumask)) {
+                        queue->cpu_index = cpu_index;
+                        cpu_index++;
+                } else
+                        queue->cpu_index = -1;
+                INIT_LIST_HEAD(&queue->reorder.list);
+                INIT_LIST_HEAD(&queue->parallel.list);
+                INIT_LIST_HEAD(&queue->serial.list);
+                spin_lock_init(&queue->reorder.lock);
+                spin_lock_init(&queue->parallel.lock);
+                spin_lock_init(&queue->serial.lock);
+                INIT_WORK(&queue->pwork, padata_parallel_worker);
+                INIT_WORK(&queue->swork, padata_serial_worker);
+                atomic_set(&queue->num_obj, 0);
+        }
+        /*
+         * Largest sequence number; max_seq_nr + 1 is a multiple of
+         * num_cpus.  MAX_SEQ_NR is parenthesized here so the division
+         * applies to the whole macro value.
+         */
+        pd->max_seq_nr = ((MAX_SEQ_NR) / num_cpus) * num_cpus - 1;
+        atomic_set(&pd->seq_nr, -1);
+        atomic_set(&pd->reorder_objects, 0);
+        atomic_set(&pd->refcnt, 0);
+        pd->pinst = pinst;
+        spin_lock_init(&pd->lock);
+        return pd;
+err_free_cpumask:
+        free_cpumask_var(pd->cpumask);
+err_free_queue:
+        free_percpu(pd->queue);
+err_free_pd:
+        kfree(pd);
+err:
+        return NULL;
+}
+/* Release a parallel_data structure: its per-cpu queues, cpumask and itself. */
+static void padata_free_pd(struct parallel_data *pd)
+{
+        free_percpu(pd->queue);
+        free_cpumask_var(pd->cpumask);
+        kfree(pd);
+}
+/*
+ * padata_replace - install a new parallel_data and dispose of the old one
+ *
+ * PADATA_RESET is set for the duration of the switch.  After the new pd
+ * is published, the function waits for an RCU grace period and for the
+ * old pd's refcnt to drop to zero, flushes the workqueue and frees the
+ * old pd.
+ * NOTE(review): readers use rcu_read_lock_bh() while this waits with
+ * synchronize_rcu() - confirm the flavours are meant to differ.
+ */
+static void padata_replace(struct padata_instance *pinst,
+                           struct parallel_data *pd_new)
+{
+        struct parallel_data *stale = pinst->pd;
+        pinst->flags |= PADATA_RESET;
+        rcu_assign_pointer(pinst->pd, pd_new);
+        synchronize_rcu();
+        /* Objects still in flight hold a reference on the old pd. */
+        while (atomic_read(&stale->refcnt))
+                yield();
+        flush_workqueue(pinst->wq);
+        padata_free_pd(stale);
+        pinst->flags &= ~PADATA_RESET;
+}
+/*
+ * padata_set_cpumask - set the cpumask that padata should use
+ *
+ * @pinst: padata instance
+ * @cpumask: the cpumask to use
+ *
+ * May sleep.  Returns 0 on success, or -ENOMEM when no new
+ * parallel_data could be allocated; the instance is left unchanged in
+ * that case.
+ */
+int padata_set_cpumask(struct padata_instance *pinst,
+                        cpumask_var_t cpumask)
+{
+        struct parallel_data *new_pd;
+        int ret = 0;
+        might_sleep();
+        mutex_lock(&pinst->lock);
+        new_pd = padata_alloc_pd(pinst, cpumask);
+        if (new_pd) {
+                cpumask_copy(pinst->cpumask, cpumask);
+                padata_replace(pinst, new_pd);
+        } else {
+                ret = -ENOMEM;
+        }
+        mutex_unlock(&pinst->lock);
+        return ret;
+}
+EXPORT_SYMBOL(padata_set_cpumask);
+/*
+ * Rebuild the parallel_data from pinst->cpumask if @cpu is active.
+ * Returns 0, or -ENOMEM when the new parallel_data cannot be allocated.
+ * All callers in this file hold pinst->lock.
+ */
+static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
+{
+        struct parallel_data *new_pd;
+        if (!cpumask_test_cpu(cpu, cpu_active_mask))
+                return 0;
+        new_pd = padata_alloc_pd(pinst, pinst->cpumask);
+        if (!new_pd)
+                return -ENOMEM;
+        padata_replace(pinst, new_pd);
+        return 0;
+}
+/*
+ * padata_add_cpu - add a cpu to the padata cpumask
+ *
+ * @pinst: padata instance
+ * @cpu: cpu to add
+ *
+ * May sleep.  Returns the result of __padata_add_cpu().
+ */
+int padata_add_cpu(struct padata_instance *pinst, int cpu)
+{
+        int ret;
+        might_sleep();
+        mutex_lock(&pinst->lock);
+        cpumask_set_cpu(cpu, pinst->cpumask);
+        ret = __padata_add_cpu(pinst, cpu);
+        mutex_unlock(&pinst->lock);
+        return ret;
+}
+EXPORT_SYMBOL(padata_add_cpu);
+/*
+ * Rebuild the parallel_data from pinst->cpumask if @cpu is online.
+ * Returns 0, or -ENOMEM when the new parallel_data cannot be allocated.
+ * All callers in this file hold pinst->lock.
+ */
+static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
+{
+        struct parallel_data *new_pd;
+        if (!cpumask_test_cpu(cpu, cpu_online_mask))
+                return 0;
+        new_pd = padata_alloc_pd(pinst, pinst->cpumask);
+        if (!new_pd)
+                return -ENOMEM;
+        padata_replace(pinst, new_pd);
+        return 0;
+}
+/*
+ * padata_remove_cpu - remove a cpu from the padata cpumask
+ *
+ * @pinst: padata instance
+ * @cpu: cpu to remove
+ *
+ * May sleep.  Returns the result of __padata_remove_cpu().
+ */
+int padata_remove_cpu(struct padata_instance *pinst, int cpu)
+{
+        int ret;
+        might_sleep();
+        mutex_lock(&pinst->lock);
+        cpumask_clear_cpu(cpu, pinst->cpumask);
+        ret = __padata_remove_cpu(pinst, cpu);
+        mutex_unlock(&pinst->lock);
+        return ret;
+}
+EXPORT_SYMBOL(padata_remove_cpu);
+/*
+ * padata_start - start the parallel processing
+ *
+ * @pinst: padata instance to start
+ *
+ * Sets PADATA_INIT under pinst->lock; padata_do_parallel() queues
+ * objects only while that flag is set.  May sleep.
+ */
+void padata_start(struct padata_instance *pinst)
+{
+        might_sleep();
+        mutex_lock(&pinst->lock);
+        pinst->flags = pinst->flags | PADATA_INIT;
+        mutex_unlock(&pinst->lock);
+}
+EXPORT_SYMBOL(padata_start);
+/*
+ * padata_stop - stop the parallel processing
+ *
+ * @pinst: padata instance to stop
+ *
+ * Clears PADATA_INIT under pinst->lock, after which
+ * padata_do_parallel() no longer queues objects.  May sleep.
+ */
+void padata_stop(struct padata_instance *pinst)
+{
+        might_sleep();
+        mutex_lock(&pinst->lock);
+        pinst->flags = pinst->flags & ~PADATA_INIT;
+        mutex_unlock(&pinst->lock);
+}
+EXPORT_SYMBOL(padata_stop);
+/*
+ * padata_cpu_callback - cpu hotplug notifier of a padata instance
+ *
+ * @nfb: the cpu_notifier member embedded in the padata instance
+ * @action: hotplug event
+ * @hcpu: number of the affected cpu, passed as a pointer-sized value
+ *
+ * Events for cpus that are not in pinst->cpumask are ignored.  For the
+ * others the parallel_data is rebuilt under pinst->lock: online and
+ * down-failed events add the cpu, down-prepare and up-canceled events
+ * remove it.
+ *
+ * Returns NOTIFY_BAD when rebuilding fails for an online or
+ * down-prepare event, NOTIFY_OK otherwise.
+ */
+static int __cpuinit padata_cpu_callback(struct notifier_block *nfb,
+                                         unsigned long action, void *hcpu)
+{
+        int err;
+        struct padata_instance *pinst;
+        int cpu = (unsigned long)hcpu;
+        pinst = container_of(nfb, struct padata_instance, cpu_notifier);
+        switch (action) {
+        case CPU_ONLINE:
+        case CPU_ONLINE_FROZEN:
+                if (!cpumask_test_cpu(cpu, pinst->cpumask))
+                        break;
+                mutex_lock(&pinst->lock);
+                err = __padata_add_cpu(pinst, cpu);
+                mutex_unlock(&pinst->lock);
+                if (err)
+                        return NOTIFY_BAD;
+                break;
+        case CPU_DOWN_PREPARE:
+        case CPU_DOWN_PREPARE_FROZEN:
+                if (!cpumask_test_cpu(cpu, pinst->cpumask))
+                        break;
+                mutex_lock(&pinst->lock);
+                err = __padata_remove_cpu(pinst, cpu);
+                mutex_unlock(&pinst->lock);
+                if (err)
+                        return NOTIFY_BAD;
+                break;
+        case CPU_UP_CANCELED:
+        case CPU_UP_CANCELED_FROZEN:
+                if (!cpumask_test_cpu(cpu, pinst->cpumask))
+                        break;
+                mutex_lock(&pinst->lock);
+                __padata_remove_cpu(pinst, cpu);
+                mutex_unlock(&pinst->lock);
+                /* Must not fall through into the down-failed handling. */
+                break;
+        case CPU_DOWN_FAILED:
+        case CPU_DOWN_FAILED_FROZEN:
+                if (!cpumask_test_cpu(cpu, pinst->cpumask))
+                        break;
+                mutex_lock(&pinst->lock);
+                __padata_add_cpu(pinst, cpu);
+                mutex_unlock(&pinst->lock);
+                break;
+        default:
+                /* Other hotplug events need no action. */
+                break;
+        }
+        return NOTIFY_OK;
+}
+/*
+ * padata_alloc - allocate and initialize a padata instance
+ *
+ * @cpumask: cpumask that padata uses for parallelization
+ * @wq: workqueue to use for the allocated padata instance
+ *
+ * The instance's own cpumask is allocated before @cpumask is copied
+ * into it (assumes pinst->cpumask is a cpumask_var_t, like pd->cpumask
+ * - confirm against struct padata_instance), and pinst->lock is
+ * initialized before the hotplug notifier is registered because the
+ * notifier callback takes that lock.
+ *
+ * Returns the new instance, or NULL on failure.  The instance does not
+ * process objects until padata_start() is called.
+ */
+struct padata_instance *padata_alloc(const struct cpumask *cpumask,
+                                     struct workqueue_struct *wq)
+{
+        int err;
+        struct padata_instance *pinst;
+        struct parallel_data *pd;
+        pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL);
+        if (!pinst)
+                goto err;
+        if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL))
+                goto err_free_inst;
+        pd = padata_alloc_pd(pinst, cpumask);
+        if (!pd)
+                goto err_free_cpumask;
+        rcu_assign_pointer(pinst->pd, pd);
+        pinst->wq = wq;
+        cpumask_copy(pinst->cpumask, cpumask);
+        pinst->flags = 0;
+        /* The notifier may fire as soon as it is registered. */
+        mutex_init(&pinst->lock);
+        pinst->cpu_notifier.notifier_call = padata_cpu_callback;
+        pinst->cpu_notifier.priority = 0;
+        err = register_hotcpu_notifier(&pinst->cpu_notifier);
+        if (err)
+                goto err_free_pd;
+        return pinst;
+err_free_pd:
+        padata_free_pd(pd);
+err_free_cpumask:
+        free_cpumask_var(pinst->cpumask);
+err_free_inst:
+        kfree(pinst);
+err:
+        return NULL;
+}
+EXPORT_SYMBOL(padata_alloc);
+/*
+ * padata_free - free a padata instance
+ *
+ * @pinst: padata instance to free
+ *
+ * Stops the instance, waits for an RCU grace period and for all objects
+ * still referencing the parallel_data to be serialized, then
+ * unregisters the hotplug notifier and releases the parallel_data, the
+ * instance's cpumask and the instance itself.  May sleep.
+ */
+void padata_free(struct padata_instance *pinst)
+{
+        padata_stop(pinst);
+        synchronize_rcu();
+        while (atomic_read(&pinst->pd->refcnt) != 0)
+                yield();
+        unregister_hotcpu_notifier(&pinst->cpu_notifier);
+        padata_free_pd(pinst->pd);
+        free_cpumask_var(pinst->cpumask);
+        kfree(pinst);
+}
+EXPORT_SYMBOL(padata_free);
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 2ae7409bf38f..a661e7991865 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -98,11 +98,12 @@ void __weak hw_perf_enable(void)		{ barrier(); }
 void __weak hw_perf_event_setup(int cpu)        { barrier(); }
 void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
+void __weak hw_perf_event_setup_offline(int cpu)        { barrier(); }
 int __weak
 hw_perf_group_sched_in(struct perf_event *group_leader,
               struct perf_cpu_context *cpuctx,
-               struct perf_event_context *ctx, int cpu)
+               struct perf_event_context *ctx)
 {
        return 0;
 }
@@ -248,7 +249,7 @@ static void perf_unpin_context(struct perf_event_context *ctx)
 static inline u64 perf_clock(void)
 {
-        return cpu_clock(smp_processor_id());
+        return cpu_clock(raw_smp_processor_id());
 }
 /*
@@ -289,6 +290,15 @@ static void update_event_times(struct perf_event *event)
        event->total_time_running = run_end - event->tstamp_running;
 }
+/*
+ * Select the context list a group belongs on: pinned_groups when the
+ * event's attr.pinned is set, flexible_groups otherwise.
+ */
+static struct list_head *
+ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
+{
+        return event->attr.pinned ? &ctx->pinned_groups
+                                  : &ctx->flexible_groups;
+}
 /*
 * Add a event from the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
@@ -303,9 +313,19 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
         * add it straight to the context's event list, or to the group
         * leader's sibling list:
         */
-        if (group_leader == event)
+        if (group_leader == event) {
-                list_add_tail(&event->group_entry, &ctx->group_list);
+                struct list_head *list;
-        else {
+                if (is_software_event(event))
+                        event->group_flags |= PERF_GROUP_SOFTWARE;
+                list = ctx_group_list(event, ctx);
+                list_add_tail(&event->group_entry, list);
+        } else {
+                if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
+                    !is_software_event(event))
+                        group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
                list_add_tail(&event->group_entry, &group_leader->sibling_list);
                group_leader->nr_siblings++;
        }
@@ -355,9 +375,14 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
         * to the context list directly:
         */
        list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
+                struct list_head *list;
-                list_move_tail(&sibling->group_entry, &ctx->group_list);
+                list = ctx_group_list(event, ctx);
+                list_move_tail(&sibling->group_entry, list);
                sibling->group_leader = sibling;
+                /* Inherit group flags from the previous leader */
+                sibling->group_flags = event->group_flags;
        }
 }
@@ -608,14 +633,13 @@ void perf_event_disable(struct perf_event *event)
 static int
 event_sched_in(struct perf_event *event,
                 struct perf_cpu_context *cpuctx,
-                 struct perf_event_context *ctx,
+                 struct perf_event_context *ctx)
-                 int cpu)
 {
        if (event->state <= PERF_EVENT_STATE_OFF)
                return 0;
        event->state = PERF_EVENT_STATE_ACTIVE;
-        event->oncpu = cpu;     /* TODO: put 'cpu' into cpuctx->cpu */
+        event->oncpu = smp_processor_id();
        /*
         * The new state must be visible before we turn it on in the hardware:
         */
@@ -642,8 +666,7 @@ event_sched_in(struct perf_event *event,
 static int
 group_sched_in(struct perf_event *group_event,
               struct perf_cpu_context *cpuctx,
-               struct perf_event_context *ctx,
+               struct perf_event_context *ctx)
-               int cpu)
 {
        struct perf_event *event, *partial_group;
        int ret;
@@ -651,18 +674,18 @@ group_sched_in(struct perf_event *group_event,
        if (group_event->state == PERF_EVENT_STATE_OFF)
                return 0;
-        ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu);
+        ret = hw_perf_group_sched_in(group_event, cpuctx, ctx);
        if (ret)
                return ret < 0 ? ret : 0;
-        if (event_sched_in(group_event, cpuctx, ctx, cpu))
+        if (event_sched_in(group_event, cpuctx, ctx))
                return -EAGAIN;
        /*
         * Schedule in siblings as one group (if any):
         */
        list_for_each_entry(event, &group_event->sibling_list, group_entry) {
-                if (event_sched_in(event, cpuctx, ctx, cpu)) {
+                if (event_sched_in(event, cpuctx, ctx)) {
                        partial_group = event;
                        goto group_error;
                }
@@ -686,24 +709,6 @@ group_error:
 }
 /*
- * Return 1 for a group consisting entirely of software events,
- * 0 if the group contains any hardware events.
- */
-static int is_software_only_group(struct perf_event *leader)
-{
-        struct perf_event *event;
-        if (!is_software_event(leader))
-                return 0;
-        list_for_each_entry(event, &leader->sibling_list, group_entry)
-                if (!is_software_event(event))
-                        return 0;
-        return 1;
-}
-/*
 * Work out whether we can put this event group on the CPU now.
 */
 static int group_can_go_on(struct perf_event *event,
@@ -713,7 +718,7 @@ static int group_can_go_on(struct perf_event *event,
        /*
         * Groups consisting entirely of software events can always go on.
         */
-        if (is_software_only_group(event))
+        if (event->group_flags & PERF_GROUP_SOFTWARE)
                return 1;
        /*
         * If an exclusive group is already on, no other hardware
@@ -754,7 +759,6 @@ static void __perf_install_in_context(void *info)
        struct perf_event *event = info;
        struct perf_event_context *ctx = event->ctx;
        struct perf_event *leader = event->group_leader;
-        int cpu = smp_processor_id();
        int err;
        /*
@@ -801,7 +805,7 @@ static void __perf_install_in_context(void *info)
        if (!group_can_go_on(event, cpuctx, 1))
                err = -EEXIST;
        else
-                err = event_sched_in(event, cpuctx, ctx, cpu);
+                err = event_sched_in(event, cpuctx, ctx);
        if (err) {
                /*
@@ -943,11 +947,9 @@ static void __perf_event_enable(void *info)
        } else {
                perf_disable();
                if (event == leader)
-                        err = group_sched_in(event, cpuctx, ctx,
+                        err = group_sched_in(event, cpuctx, ctx);
-                                             smp_processor_id());
                else
-                        err = event_sched_in(event, cpuctx, ctx,
+                        err = event_sched_in(event, cpuctx, ctx);
-                                               smp_processor_id());
                perf_enable();
        }
@@ -1043,8 +1045,15 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
        return 0;
 }
-void __perf_event_sched_out(struct perf_event_context *ctx,
+enum event_type_t {
-                              struct perf_cpu_context *cpuctx)
+        EVENT_FLEXIBLE = 0x1,
+        EVENT_PINNED = 0x2,
+        EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
+};
+static void ctx_sched_out(struct perf_event_context *ctx,
+                          struct perf_cpu_context *cpuctx,
+                          enum event_type_t event_type)
 {
        struct perf_event *event;
@@ -1055,10 +1064,18 @@ void __perf_event_sched_out(struct perf_event_context *ctx,
        update_context_time(ctx);
        perf_disable();
-        if (ctx->nr_active) {
+        if (!ctx->nr_active)
-                list_for_each_entry(event, &ctx->group_list, group_entry)
+                goto out_enable;
+        if (event_type & EVENT_PINNED)
+                list_for_each_entry(event, &ctx->pinned_groups, group_entry)
                        group_sched_out(event, cpuctx, ctx);
-        }
+        if (event_type & EVENT_FLEXIBLE)
+                list_for_each_entry(event, &ctx->flexible_groups, group_entry)
+                        group_sched_out(event, cpuctx, ctx);
+ out_enable:
        perf_enable();
 out:
        raw_spin_unlock(&ctx->lock);
@@ -1170,9 +1187,9 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
 * not restart the event.
 */
 void perf_event_task_sched_out(struct task_struct *task,
-                                 struct task_struct *next, int cpu)
+                                 struct task_struct *next)
 {
-        struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
        struct perf_event_context *ctx = task->perf_event_ctxp;
        struct perf_event_context *next_ctx;
        struct perf_event_context *parent;
@@ -1220,15 +1237,13 @@ void perf_event_task_sched_out(struct task_struct *task,
        rcu_read_unlock();
        if (do_switch) {
-                __perf_event_sched_out(ctx, cpuctx);
+                ctx_sched_out(ctx, cpuctx, EVENT_ALL);
                cpuctx->task_ctx = NULL;
        }
 }
-/*
+static void task_ctx_sched_out(struct perf_event_context *ctx,
- * Called with IRQs disabled
+                               enum event_type_t event_type)
- */
-static void __perf_event_task_sched_out(struct perf_event_context *ctx)
 {
        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
@@ -1238,47 +1253,41 @@ static void __perf_event_task_sched_out(struct perf_event_context *ctx)
        if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
                return;
-        __perf_event_sched_out(ctx, cpuctx);
+        ctx_sched_out(ctx, cpuctx, event_type);
        cpuctx->task_ctx = NULL;
 }
 /*
 * Called with IRQs disabled
 */
-static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx)
+static void __perf_event_task_sched_out(struct perf_event_context *ctx)
+{
+        task_ctx_sched_out(ctx, EVENT_ALL);
+}
+/*
+ * Called with IRQs disabled
+ */
+static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
+                              enum event_type_t event_type)
 {
-        __perf_event_sched_out(&cpuctx->ctx, cpuctx);
+        ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
 }
 static void
-__perf_event_sched_in(struct perf_event_context *ctx,
+ctx_pinned_sched_in(struct perf_event_context *ctx,
-                        struct perf_cpu_context *cpuctx, int cpu)
+                    struct perf_cpu_context *cpuctx)
 {
        struct perf_event *event;
-        int can_add_hw = 1;
-        raw_spin_lock(&ctx->lock);
-        ctx->is_active = 1;
-        if (likely(!ctx->nr_events))
-                goto out;
-        ctx->timestamp = perf_clock();
+        list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
+                if (event->state <= PERF_EVENT_STATE_OFF)
-        perf_disable();
-        /*
-         * First go through the list and put on any pinned groups
-         * in order to give them the best chance of going on.
-         */
-        list_for_each_entry(event, &ctx->group_list, group_entry) {
-                if (event->state <= PERF_EVENT_STATE_OFF ||
-                    !event->attr.pinned)
                        continue;
-                if (event->cpu != -1 && event->cpu != cpu)
+                if (event->cpu != -1 && event->cpu != smp_processor_id())
                        continue;
                if (group_can_go_on(event, cpuctx, 1))
-                        group_sched_in(event, cpuctx, ctx, cpu);
+                        group_sched_in(event, cpuctx, ctx);
                /*
                 * If this pinned group hasn't been scheduled,
@@ -1289,32 +1298,83 @@ __perf_event_sched_in(struct perf_event_context *ctx,
                        event->state = PERF_EVENT_STATE_ERROR;
                }
        }
+}
-        list_for_each_entry(event, &ctx->group_list, group_entry) {
+static void
-                /*
+ctx_flexible_sched_in(struct perf_event_context *ctx,
-                 * Ignore events in OFF or ERROR state, and
+                      struct perf_cpu_context *cpuctx)
-                 * ignore pinned events since we did them already.
+{
-                 */
+        struct perf_event *event;
-                if (event->state <= PERF_EVENT_STATE_OFF ||
+        int can_add_hw = 1;
-                    event->attr.pinned)
-                        continue;
+        list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
+                /* Ignore events in OFF or ERROR state */
+                if (event->state <= PERF_EVENT_STATE_OFF)
+                        continue;
                /*
                 * Listen to the 'cpu' scheduling filter constraint
                 * of events:
                 */
-                if (event->cpu != -1 && event->cpu != cpu)
+                if (event->cpu != -1 && event->cpu != smp_processor_id())
                        continue;
                if (group_can_go_on(event, cpuctx, can_add_hw))
-                        if (group_sched_in(event, cpuctx, ctx, cpu))
+                        if (group_sched_in(event, cpuctx, ctx))
                                can_add_hw = 0;
        }
+}
+static void
+ctx_sched_in(struct perf_event_context *ctx,
+             struct perf_cpu_context *cpuctx,
+             enum event_type_t event_type)
+{
+        raw_spin_lock(&ctx->lock);
+        ctx->is_active = 1;
+        if (likely(!ctx->nr_events))
+                goto out;
+        ctx->timestamp = perf_clock();
+        perf_disable();
+        /*
+         * First go through the list and put on any pinned groups
+         * in order to give them the best chance of going on.
+         */
+        if (event_type & EVENT_PINNED)
+                ctx_pinned_sched_in(ctx, cpuctx);
+        /* Then walk through the lower prio flexible groups */
+        if (event_type & EVENT_FLEXIBLE)
+                ctx_flexible_sched_in(ctx, cpuctx);
        perf_enable();
 out:
        raw_spin_unlock(&ctx->lock);
 }
+static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
+                             enum event_type_t event_type)
+{
+        struct perf_event_context *ctx = &cpuctx->ctx;
+        ctx_sched_in(ctx, cpuctx, event_type);
+}
+static void task_ctx_sched_in(struct task_struct *task,
+                              enum event_type_t event_type)
+{
+        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+        struct perf_event_context *ctx = task->perf_event_ctxp;
+        if (likely(!ctx))
+                return;
+        if (cpuctx->task_ctx == ctx)
+                return;
+        ctx_sched_in(ctx, cpuctx, event_type);
+        cpuctx->task_ctx = ctx;
+}
 /*
 * Called from scheduler to add the events of the current task
 * with interrupts disabled.
@@ -1326,38 +1386,128 @@ __perf_event_sched_in(struct perf_event_context *ctx,
 * accessing the event control register. If a NMI hits, then it will
 * keep the event running.
 */
-void perf_event_task_sched_in(struct task_struct *task, int cpu)
+void perf_event_task_sched_in(struct task_struct *task)
 {
-        struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
        struct perf_event_context *ctx = task->perf_event_ctxp;
        if (likely(!ctx))
                return;
        if (cpuctx->task_ctx == ctx)
                return;
-        __perf_event_sched_in(ctx, cpuctx, cpu);
+        /*
+         * We want to keep the following priority order:
+         * cpu pinned (that don't need to move), task pinned,
+         * cpu flexible, task flexible.
+         */
+        cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+        ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
+        cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
+        ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
        cpuctx->task_ctx = ctx;
 }
-static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
+#define MAX_INTERRUPTS (~0ULL)
+static void perf_log_throttle(struct perf_event *event, int enable);
+static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
 {
-        struct perf_event_context *ctx = &cpuctx->ctx;
+        u64 frequency = event->attr.sample_freq;
+        u64 sec = NSEC_PER_SEC;
+        u64 divisor, dividend;
+        int count_fls, nsec_fls, frequency_fls, sec_fls;
+        count_fls = fls64(count);
+        nsec_fls = fls64(nsec);
+        frequency_fls = fls64(frequency);
+        sec_fls = 30;
-        __perf_event_sched_in(ctx, cpuctx, cpu);
+        /*
+         * We got @count in @nsec, with a target of sample_freq HZ
+         * the target period becomes:
+         *
+         *             @count * 10^9
+         * period = -------------------
+         *          @nsec * sample_freq
+         *
+         */
+        /*
+         * Reduce accuracy by one bit such that @a and @b converge
+         * to a similar magnitude.
+         */
+#define REDUCE_FLS(a, b)                \
+do {                                    \
+        if (a##_fls > b##_fls) {        \
+                a >>= 1;                \
+                a##_fls--;              \
+        } else {                        \
+                b >>= 1;                \
+                b##_fls--;              \
+        }                               \
+} while (0)
+        /*
+         * Reduce accuracy until either term fits in a u64, then proceed with
+         * the other, so that finally we can do a u64/u64 division.
+         */
+        while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
+                REDUCE_FLS(nsec, frequency);
+                REDUCE_FLS(sec, count);
+        }
+        if (count_fls + sec_fls > 64) {
+                divisor = nsec * frequency;
+                while (count_fls + sec_fls > 64) {
+                        REDUCE_FLS(count, sec);
+                        divisor >>= 1;
+                }
+                dividend = count * sec;
+        } else {
+                dividend = count * sec;
+                while (nsec_fls + frequency_fls > 64) {
+                        REDUCE_FLS(nsec, frequency);
+                        dividend >>= 1;
+                }
+                divisor = nsec * frequency;
+        }
+        return div64_u64(dividend, divisor);
 }
-#define MAX_INTERRUPTS (~0ULL)
+static void perf_event_stop(struct perf_event *event)
+{
+        if (!event->pmu->stop)
+                return event->pmu->disable(event);
-static void perf_log_throttle(struct perf_event *event, int enable);
+        return event->pmu->stop(event);
+}
+static int perf_event_start(struct perf_event *event)
+{
+        if (!event->pmu->start)
+                return event->pmu->enable(event);
-static void perf_adjust_period(struct perf_event *event, u64 events)
+        return event->pmu->start(event);
+}
+static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
 {
        struct hw_perf_event *hwc = &event->hw;
        u64 period, sample_period;
        s64 delta;
-        events *= hwc->sample_period;
+        period = perf_calculate_period(event, nsec, count);
-        period = div64_u64(events, event->attr.sample_freq);
        delta = (s64)(period - hwc->sample_period);
        delta = (delta + 7) / 8; /* low pass filter */
@@ -1368,13 +1518,22 @@ static void perf_adjust_period(struct perf_event *event, u64 events)
                sample_period = 1;
        hwc->sample_period = sample_period;
+        if (atomic64_read(&hwc->period_left) > 8*sample_period) {
+                perf_disable();
+                perf_event_stop(event);
+                atomic64_set(&hwc->period_left, 0);
+                perf_event_start(event);
+                perf_enable();
+        }
 }
 static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
 {
        struct perf_event *event;
        struct hw_perf_event *hwc;
-        u64 interrupts, freq;
+        u64 interrupts, now;
+        s64 delta;
        raw_spin_lock(&ctx->lock);
        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
@@ -1395,44 +1554,18 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
                if (interrupts == MAX_INTERRUPTS) {
                        perf_log_throttle(event, 1);
                        event->pmu->unthrottle(event);
-                        interrupts = 2*sysctl_perf_event_sample_rate/HZ;
                }
                if (!event->attr.freq || !event->attr.sample_freq)
                        continue;
-                /*
+                event->pmu->read(event);
-                 * if the specified freq < HZ then we need to skip ticks
+                now = atomic64_read(&event->count);
-                 */
+                delta = now - hwc->freq_count_stamp;
-                if (event->attr.sample_freq < HZ) {
+                hwc->freq_count_stamp = now;
-                        freq = event->attr.sample_freq;
-                        hwc->freq_count += freq;
-                        hwc->freq_interrupts += interrupts;
-                        if (hwc->freq_count < HZ)
-                                continue;
-                        interrupts = hwc->freq_interrupts;
-                        hwc->freq_interrupts = 0;
-                        hwc->freq_count -= HZ;
-                } else
-                        freq = HZ;
-                perf_adjust_period(event, freq * interrupts);
-                /*
+                if (delta > 0)
-                 * In order to avoid being stalled by an (accidental) huge
+                        perf_adjust_period(event, TICK_NSEC, delta);
-                 * sample period, force reset the sample period if we didn't
-                 * get any events in this freq period.
-                 */
-                if (!interrupts) {
-                        perf_disable();
-                        event->pmu->disable(event);
-                        atomic64_set(&hwc->period_left, 0);
-                        event->pmu->enable(event);
-                        perf_enable();
-                }
        }
        raw_spin_unlock(&ctx->lock);
 }
@@ -1442,26 +1575,18 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
 */
 static void rotate_ctx(struct perf_event_context *ctx)
 {
-        struct perf_event *event;
        if (!ctx->nr_events)
                return;
        raw_spin_lock(&ctx->lock);
-        /*
-         * Rotate the first entry last (works just fine for group events too):
+        /* Rotate the first entry last of non-pinned groups */
-         */
+        list_rotate_left(&ctx->flexible_groups);
-        perf_disable();
-        list_for_each_entry(event, &ctx->group_list, group_entry) {
-                list_move_tail(&event->group_entry, &ctx->group_list);
-                break;
-        }
-        perf_enable();
        raw_spin_unlock(&ctx->lock);
 }
-void perf_event_task_tick(struct task_struct *curr, int cpu)
+void perf_event_task_tick(struct task_struct *curr)
 {
        struct perf_cpu_context *cpuctx;
        struct perf_event_context *ctx;
@@ -1469,24 +1594,43 @@ void perf_event_task_tick(struct task_struct *curr, int cpu)
        if (!atomic_read(&nr_events))
                return;
-        cpuctx = &per_cpu(perf_cpu_context, cpu);
+        cpuctx = &__get_cpu_var(perf_cpu_context);
        ctx = curr->perf_event_ctxp;
+        perf_disable();
        perf_ctx_adjust_freq(&cpuctx->ctx);
        if (ctx)
                perf_ctx_adjust_freq(ctx);
-        perf_event_cpu_sched_out(cpuctx);
+        cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
        if (ctx)
-                __perf_event_task_sched_out(ctx);
+                task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
        rotate_ctx(&cpuctx->ctx);
        if (ctx)
                rotate_ctx(ctx);
-        perf_event_cpu_sched_in(cpuctx, cpu);
+        cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
        if (ctx)
-                perf_event_task_sched_in(curr, cpu);
+                task_ctx_sched_in(curr, EVENT_FLEXIBLE);
+        perf_enable();
+}
+static int event_enable_on_exec(struct perf_event *event,
+                                struct perf_event_context *ctx)
+{
+        if (!event->attr.enable_on_exec)
+                return 0;
+        event->attr.enable_on_exec = 0;
+        if (event->state >= PERF_EVENT_STATE_INACTIVE)
+                return 0;
+        __perf_event_mark_enabled(event, ctx);
+        return 1;
 }
 /*
@@ -1499,6 +1643,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
        struct perf_event *event;
        unsigned long flags;
        int enabled = 0;
+        int ret;
        local_irq_save(flags);
        ctx = task->perf_event_ctxp;
@@ -1509,14 +1654,16 @@ static void perf_event_enable_on_exec(struct task_struct *task)
        raw_spin_lock(&ctx->lock);
-        list_for_each_entry(event, &ctx->group_list, group_entry) {
+        list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
-                if (!event->attr.enable_on_exec)
+                ret = event_enable_on_exec(event, ctx);
-                        continue;
+                if (ret)
-                event->attr.enable_on_exec = 0;
+                        enabled = 1;
-                if (event->state >= PERF_EVENT_STATE_INACTIVE)
+        }
-                        continue;
-                __perf_event_mark_enabled(event, ctx);
+        list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
-                enabled = 1;
+                ret = event_enable_on_exec(event, ctx);
+                if (ret)
+                        enabled = 1;
        }
        /*
@@ -1527,7 +1674,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
        raw_spin_unlock(&ctx->lock);
-        perf_event_task_sched_in(task, smp_processor_id());
+        perf_event_task_sched_in(task);
 out:
        local_irq_restore(flags);
 }
@@ -1590,7 +1737,8 @@ __perf_event_init_context(struct perf_event_context *ctx,
 {
        raw_spin_lock_init(&ctx->lock);
        mutex_init(&ctx->mutex);
-        INIT_LIST_HEAD(&ctx->group_list);
+        INIT_LIST_HEAD(&ctx->pinned_groups);
+        INIT_LIST_HEAD(&ctx->flexible_groups);
        INIT_LIST_HEAD(&ctx->event_list);
        atomic_set(&ctx->refcount, 1);
        ctx->task = task;
@@ -3608,7 +3756,7 @@ void __perf_event_mmap(struct vm_area_struct *vma)
                        /* .tid */
                        .start  = vma->vm_start,
                        .len    = vma->vm_end - vma->vm_start,
-                        .pgoff  = vma->vm_pgoff,
+                        .pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
                },
        };
@@ -3688,12 +3836,12 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
        if (event->attr.freq) {
                u64 now = perf_clock();
-                s64 delta = now - hwc->freq_stamp;
+                s64 delta = now - hwc->freq_time_stamp;
-                hwc->freq_stamp = now;
+                hwc->freq_time_stamp = now;
-                if (delta > 0 && delta < TICK_NSEC)
+                if (delta > 0 && delta < 2*TICK_NSEC)
-                        perf_adjust_period(event, NSEC_PER_SEC / (int)delta);
+                        perf_adjust_period(event, delta, hwc->last_period);
        }
        /*
@@ -4184,7 +4332,7 @@ static const struct pmu perf_ops_task_clock = {
        .read           = task_clock_perf_event_read,
 };
-#ifdef CONFIG_EVENT_PROFILE
+#ifdef CONFIG_EVENT_TRACING
 void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
                          int entry_size)
@@ -4289,7 +4437,7 @@ static void perf_event_free_filter(struct perf_event *event)
 {
 }
-#endif /* CONFIG_EVENT_PROFILE */
+#endif /* CONFIG_EVENT_TRACING */
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 static void bp_perf_event_destroy(struct perf_event *event)
@@ -4870,8 +5018,15 @@ inherit_event(struct perf_event *parent_event,
        else
                child_event->state = PERF_EVENT_STATE_OFF;
-        if (parent_event->attr.freq)
+        if (parent_event->attr.freq) {
-                child_event->hw.sample_period = parent_event->hw.sample_period;
+                u64 sample_period = parent_event->hw.sample_period;
+                struct hw_perf_event *hwc = &child_event->hw;
+                hwc->sample_period = sample_period;
+                hwc->last_period   = sample_period;
+                atomic64_set(&hwc->period_left, sample_period);
+        }
        child_event->overflow_handler = parent_event->overflow_handler;
@@ -5039,7 +5194,11 @@ void perf_event_exit_task(struct task_struct *child)
        mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
 again:
-        list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list,
+        list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
+                                 group_entry)
+                __perf_event_exit_task(child_event, child_ctx, child);
+        list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
                                 group_entry)
                __perf_event_exit_task(child_event, child_ctx, child);
@@ -5048,7 +5207,8 @@ again:
         * its siblings to the list, but we obtained 'tmp' before that which
         * will still point to the list head terminating the iteration.
         */
-        if (!list_empty(&child_ctx->group_list))
+        if (!list_empty(&child_ctx->pinned_groups) ||
+            !list_empty(&child_ctx->flexible_groups))
                goto again;
        mutex_unlock(&child_ctx->mutex);
@@ -5056,6 +5216,24 @@ again:
        put_ctx(child_ctx);
 }
+static void perf_free_event(struct perf_event *event,
+                            struct perf_event_context *ctx)
+{
+        struct perf_event *parent = event->parent;
+        if (WARN_ON_ONCE(!parent))
+                return;
+        mutex_lock(&parent->child_mutex);
+        list_del_init(&event->child_list);
+        mutex_unlock(&parent->child_mutex);
+        fput(parent->filp);
+        list_del_event(event, ctx);
+        free_event(event);
+}
 /*
 * free an unexposed, unused context as created by inheritance by
 * init_task below, used by fork() in case of fail.
@@ -5070,36 +5248,70 @@ void perf_event_free_task(struct task_struct *task)
        mutex_lock(&ctx->mutex);
 again:
-        list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) {
+        list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
-                struct perf_event *parent = event->parent;
+                perf_free_event(event, ctx);
-                if (WARN_ON_ONCE(!parent))
+        list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
-                        continue;
+                                 group_entry)
+                perf_free_event(event, ctx);
-                mutex_lock(&parent->child_mutex);
+        if (!list_empty(&ctx->pinned_groups) ||
-                list_del_init(&event->child_list);
+            !list_empty(&ctx->flexible_groups))
-                mutex_unlock(&parent->child_mutex);
+                goto again;
-                fput(parent->filp);
+        mutex_unlock(&ctx->mutex);
-                list_del_event(event, ctx);
+        put_ctx(ctx);
-                free_event(event);
+}
+static int
+inherit_task_group(struct perf_event *event, struct task_struct *parent,
+                   struct perf_event_context *parent_ctx,
+                   struct task_struct *child,
+                   int *inherited_all)
+{
+        int ret;
+        struct perf_event_context *child_ctx = child->perf_event_ctxp;
+        if (!event->attr.inherit) {
+                *inherited_all = 0;
+                return 0;
        }
-        if (!list_empty(&ctx->group_list))
+        if (!child_ctx) {
-                goto again;
+                /*
+                 * This is executed from the parent task context, so
+                 * inherit events that have been marked for cloning.
+                 * First allocate and initialize a context for the
+                 * child.
+                 */
-        mutex_unlock(&ctx->mutex);
+                child_ctx = kzalloc(sizeof(struct perf_event_context),
+                                    GFP_KERNEL);
+                if (!child_ctx)
+                        return -ENOMEM;
-        put_ctx(ctx);
+                __perf_event_init_context(child_ctx, child);
+                child->perf_event_ctxp = child_ctx;
+                get_task_struct(child);
+        }
+        ret = inherit_group(event, parent, parent_ctx,
+                            child, child_ctx);
+        if (ret)
+                *inherited_all = 0;
+        return ret;
 }
 /*
 * Initialize the perf_event context in task_struct
 */
 int perf_event_init_task(struct task_struct *child)
 {
-        struct perf_event_context *child_ctx = NULL, *parent_ctx;
+        struct perf_event_context *child_ctx, *parent_ctx;
        struct perf_event_context *cloned_ctx;
        struct perf_event *event;
        struct task_struct *parent = current;
@@ -5137,41 +5349,22 @@ int perf_event_init_task(struct task_struct *child)
         * We dont have to disable NMIs - we are only looking at
         * the list, not manipulating it:
         */
-        list_for_each_entry(event, &parent_ctx->group_list, group_entry) {
+        list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
+                ret = inherit_task_group(event, parent, parent_ctx, child,
-                if (!event->attr.inherit) {
+                                         &inherited_all);
-                        inherited_all = 0;
+                if (ret)
-                        continue;
+                        break;
-                }
+        }
-                if (!child->perf_event_ctxp) {
-                        /*
-                         * This is executed from the parent task context, so
-                         * inherit events that have been marked for cloning.
-                         * First allocate and initialize a context for the
-                         * child.
-                         */
-                        child_ctx = kzalloc(sizeof(struct perf_event_context),
-                                            GFP_KERNEL);
-                        if (!child_ctx) {
-                                ret = -ENOMEM;
-                                break;
-                        }
-                        __perf_event_init_context(child_ctx, child);
-                        child->perf_event_ctxp = child_ctx;
-                        get_task_struct(child);
-                }
-                ret = inherit_group(event, parent, parent_ctx,
+        list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
-                                             child, child_ctx);
+                ret = inherit_task_group(event, parent, parent_ctx, child,
-                if (ret) {
+                                         &inherited_all);
-                        inherited_all = 0;
+                if (ret)
                        break;
-                }
        }
+        child_ctx = child->perf_event_ctxp;
        if (child_ctx && inherited_all) {
                /*
                 * Mark the child context as a clone of the parent
@@ -5220,7 +5413,9 @@ static void __perf_event_exit_cpu(void *info)
        struct perf_event_context *ctx = &cpuctx->ctx;
        struct perf_event *event, *tmp;
-        list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry)
+        list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
+                __perf_event_remove_from_context(event);
+        list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
                __perf_event_remove_from_context(event);
 }
 static void perf_event_exit_cpu(int cpu)
@@ -5258,6 +5453,10 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
                perf_event_exit_cpu(cpu);
                break;
+        case CPU_DEAD:
+                hw_perf_event_setup_offline(cpu);
+                break;
        default:
                break;
        }
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 91e09d3b2eb2..5c36ea9d55d2 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -27,6 +27,15 @@ config PM_DEBUG
        code. This is helpful when debugging and reporting PM bugs, like
        suspend support.
+config PM_ADVANCED_DEBUG
+        bool "Extra PM attributes in sysfs for low-level debugging/testing"
+        depends on PM_DEBUG
+        default n
+        ---help---
+        Add extra sysfs attributes allowing one to access some Power Management
+        fields of device objects from user space.  If you are not a kernel
+        developer interested in debugging/testing Power Management, say "no".
 config PM_VERBOSE
        bool "Verbose Power Management debugging"
        depends on PM_DEBUG
@@ -85,6 +94,11 @@ config PM_SLEEP
        depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
        default y
+config PM_SLEEP_ADVANCED_DEBUG
+        bool
+        depends on PM_ADVANCED_DEBUG
+        default n
 config SUSPEND
        bool "Suspend to RAM and standby"
        depends on PM && ARCH_SUSPEND_POSSIBLE
@@ -222,3 +236,8 @@ config PM_RUNTIME
          and the bus type drivers of the buses the devices are on are
          responsible for the actual handling of the autosuspend requests and
          wake-up events.
+config PM_OPS
+        bool
+        depends on PM_SLEEP || PM_RUNTIME
+        default y
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 0998c7139053..b58800b21fc0 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -44,6 +44,32 @@ int pm_notifier_call_chain(unsigned long val)
                        == NOTIFY_BAD) ? -EINVAL : 0;
 }
+/* If set, devices may be suspended and resumed asynchronously. */
+int pm_async_enabled = 1;
+static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr,
+                             char *buf)
+{
+        return sprintf(buf, "%d\n", pm_async_enabled);
+}
+static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr,
+                              const char *buf, size_t n)
+{
+        unsigned long val;
+        if (strict_strtoul(buf, 10, &val))
+                return -EINVAL;
+        if (val > 1)
+                return -EINVAL;
+        pm_async_enabled = val;
+        return n;
+}
+power_attr(pm_async);
 #ifdef CONFIG_PM_DEBUG
 int pm_test_level = TEST_NONE;
@@ -208,9 +234,12 @@ static struct attribute * g[] = {
 #ifdef CONFIG_PM_TRACE
        &pm_trace_attr.attr,
 #endif
-#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PM_DEBUG)
+#ifdef CONFIG_PM_SLEEP
+        &pm_async_attr.attr,
+#ifdef CONFIG_PM_DEBUG
        &pm_test_attr.attr,
 #endif
+#endif
        NULL,
 };
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 36cb168e4330..830cadecbdfc 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1181,7 +1181,7 @@ static void free_unnecessary_pages(void)
        memory_bm_position_reset(&copy_bm);
-        while (to_free_normal > 0 && to_free_highmem > 0) {
+        while (to_free_normal > 0 || to_free_highmem > 0) {
                unsigned long pfn = memory_bm_next_pfn(&copy_bm);
                struct page *page = pfn_to_page(pfn);
@@ -1500,7 +1500,7 @@ asmlinkage int swsusp_save(void)
 {
        unsigned int nr_pages, nr_highmem;
-        printk(KERN_INFO "PM: Creating hibernation image: \n");
+        printk(KERN_INFO "PM: Creating hibernation image:\n");
        drain_local_pages(NULL);
        nr_pages = count_data_pages();
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 09b2b0ae9e9d..1d575733d4e1 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -657,10 +657,6 @@ int swsusp_read(unsigned int *flags_p)
        struct swsusp_info *header;
        *flags_p = swsusp_header->flags;
-        if (IS_ERR(resume_bdev)) {
-                pr_debug("PM: Image device not initialised\n");
-                return PTR_ERR(resume_bdev);
-        }
        memset(&snapshot, 0, sizeof(struct snapshot_handle));
        error = snapshot_write_next(&snapshot, PAGE_SIZE);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
deleted file mode 100644
index 5b3601bd1893..000000000000
--- a/kernel/power/swsusp.c
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * linux/kernel/power/swsusp.c
- *
- * This file provides code to write suspend image to swap and read it back.
- *
- * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
- * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
- *
- * This file is released under the GPLv2.
- *
- * I'd like to thank the following people for their work:
- *
- * Pavel Machek <pavel@ucw.cz>:
- * Modifications, defectiveness pointing, being with me at the very beginning,
- * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
- *
- * Steve Doddi <dirk@loth.demon.co.uk>:
- * Support the possibility of hardware state restoring.
- *
- * Raph <grey.havens@earthling.net>:
- * Support for preserving states of network devices and virtual console
- * (including X and svgatextmode)
- *
- * Kurt Garloff <garloff@suse.de>:
- * Straightened the critical function in order to prevent compilers from
- * playing tricks with local variables.
- *
- * Andreas Mohr <a.mohr@mailto.de>
- *
- * Alex Badea <vampire@go.ro>:
- * Fixed runaway init
- *
- * Rafael J. Wysocki <rjw@sisk.pl>
- * Reworked the freeing of memory and the handling of swap
- *
- * More state savers are welcome. Especially for the scsi layer...
- *
- * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
- */
-#include <linux/mm.h>
-#include <linux/suspend.h>
-#include <linux/spinlock.h>
-#include <linux/kernel.h>
-#include <linux/major.h>
-#include <linux/swap.h>
-#include <linux/pm.h>
-#include <linux/swapops.h>
-#include <linux/bootmem.h>
-#include <linux/syscalls.h>
-#include <linux/highmem.h>
-#include <linux/time.h>
-#include <linux/rbtree.h>
-#include <linux/io.h>
-#include "power.h"
-int in_suspend __nosavedata = 0;
diff --git a/kernel/power/user.c b/kernel/power/user.c
index bf0014d6a5f0..4d2289626a84 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -195,6 +195,15 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
        return res;
 }
+/*
+ * Log a notice that ioctl @cmd is deprecated.  Output is gated by
+ * printk_ratelimit(); the message is prefixed with this function's
+ * return address, formatted with %pf.
+ */
+static void snapshot_deprecated_ioctl(unsigned int cmd)
+{
+        if (!printk_ratelimit())
+                return;
+        printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will "
+                        "be removed soon, update your suspend-to-disk "
+                        "utilities\n",
+                        __builtin_return_address(0), cmd);
+}
 static long snapshot_ioctl(struct file *filp, unsigned int cmd,
                                                        unsigned long arg)
 {
@@ -246,8 +255,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
                data->frozen = 0;
                break;
-        case SNAPSHOT_CREATE_IMAGE:
        case SNAPSHOT_ATOMIC_SNAPSHOT:
+                snapshot_deprecated_ioctl(cmd);
+        case SNAPSHOT_CREATE_IMAGE:
                if (data->mode != O_RDONLY || !data->frozen  || data->ready) {
                        error = -EPERM;
                        break;
@@ -275,8 +285,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
                data->ready = 0;
                break;
-        case SNAPSHOT_PREF_IMAGE_SIZE:
        case SNAPSHOT_SET_IMAGE_SIZE:
+                snapshot_deprecated_ioctl(cmd);
+        case SNAPSHOT_PREF_IMAGE_SIZE:
                image_size = arg;
                break;
@@ -290,15 +301,17 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
                error = put_user(size, (loff_t __user *)arg);
                break;
-        case SNAPSHOT_AVAIL_SWAP_SIZE:
        case SNAPSHOT_AVAIL_SWAP:
+                snapshot_deprecated_ioctl(cmd);
+        case SNAPSHOT_AVAIL_SWAP_SIZE:
                size = count_swap_pages(data->swap, 1);
                size <<= PAGE_SHIFT;
                error = put_user(size, (loff_t __user *)arg);
                break;
-        case SNAPSHOT_ALLOC_SWAP_PAGE:
        case SNAPSHOT_GET_SWAP_PAGE:
+                snapshot_deprecated_ioctl(cmd);
+        case SNAPSHOT_ALLOC_SWAP_PAGE:
                if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
                        error = -ENODEV;
                        break;
@@ -321,6 +334,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
                break;
        case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */
+                snapshot_deprecated_ioctl(cmd);
                if (!swsusp_swap_in_use()) {
                        /*
                         * User space encodes device types as two-byte values,
@@ -362,6 +376,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
                break;
        case SNAPSHOT_PMOPS: /* This ioctl is deprecated */
+                snapshot_deprecated_ioctl(cmd);
                error = -EINVAL;
                switch (arg) {
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 23bd09cd042e..42ad8ae729a0 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -22,6 +22,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/syscalls.h>
 #include <linux/uaccess.h>
+#include <linux/regset.h>
 /*
@@ -511,6 +512,47 @@ static int ptrace_resume(struct task_struct *child, long request, long data)
        return 0;
 }
+#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
+/*
+ * Search the view->n entries of @view for the regset whose
+ * core_note_type equals @type.  Returns the first match, or NULL when
+ * no entry matches.
+ */
+static const struct user_regset *
+find_regset(const struct user_regset_view *view, unsigned int type)
+{
+        int i;
+        for (i = 0; i < view->n; i++) {
+                const struct user_regset *candidate = &view->regsets[i];
+                if (candidate->core_note_type == type)
+                        return candidate;
+        }
+        return NULL;
+}
+/*
+ * Copy one register set of @task to or from the user buffer in @kiov.
+ *
+ * @req:  PTRACE_GETREGSET copies the regset out to user memory; any
+ *        other value copies it in from user memory.
+ * @type: matched against core_note_type to select the regset.
+ * @kiov: user buffer descriptor; iov_len is clamped in place to the
+ *        regset's total size (n * size) before the copy.
+ *
+ * Returns -EINVAL when no regset matches @type or when iov_len is not a
+ * multiple of the regset's element size; otherwise the return value of
+ * the copy helper.
+ */
+static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
+                         struct iovec *kiov)
+{
+        const struct user_regset_view *view = task_user_regset_view(task);
+        const struct user_regset *regset = find_regset(view, type);
+        __kernel_size_t max_len;
+        int index;
+        if (!regset)
+                return -EINVAL;
+        if (kiov->iov_len % regset->size)
+                return -EINVAL;
+        index = regset - view->regsets;
+        max_len = regset->n * regset->size;
+        kiov->iov_len = min(kiov->iov_len, max_len);
+        if (req != PTRACE_GETREGSET)
+                return copy_regset_from_user(task, view, index, 0,
+                                             kiov->iov_len, kiov->iov_base);
+        return copy_regset_to_user(task, view, index, 0,
+                                   kiov->iov_len, kiov->iov_base);
+}
+#endif
 int ptrace_request(struct task_struct *child, long request,
                   long addr, long data)
 {
@@ -573,6 +615,26 @@ int ptrace_request(struct task_struct *child, long request,
                        return 0;
                return ptrace_resume(child, request, SIGKILL);
+#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
+        case PTRACE_GETREGSET:
+        case PTRACE_SETREGSET:
+        {
+                struct iovec kiov;
+                struct iovec __user *uiov = (struct iovec __user *) data;
+                if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
+                        return -EFAULT;
+                if (__get_user(kiov.iov_base, &uiov->iov_base) ||
+                    __get_user(kiov.iov_len, &uiov->iov_len))
+                        return -EFAULT;
+                ret = ptrace_regset(child, request, addr, &kiov);
+                if (!ret)
+                        ret = __put_user(kiov.iov_len, &uiov->iov_len);
+                break;
+        }
+#endif
        default:
                break;
        }
@@ -711,6 +773,32 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
                else
                        ret = ptrace_setsiginfo(child, &siginfo);
                break;
+#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
+        case PTRACE_GETREGSET:
+        case PTRACE_SETREGSET:
+        {
+                struct iovec kiov;
+                struct compat_iovec __user *uiov =
+                        (struct compat_iovec __user *) datap;
+                compat_uptr_t ptr;
+                compat_size_t len;
+                if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
+                        return -EFAULT;
+                if (__get_user(ptr, &uiov->iov_base) ||
+                    __get_user(len, &uiov->iov_len))
+                        return -EFAULT;
+                kiov.iov_base = compat_ptr(ptr);
+                kiov.iov_len = len;
+                ret = ptrace_regset(child, request, addr, &kiov);
+                if (!ret)
+                        ret = __put_user(kiov.iov_len, &uiov->iov_len);
+                break;
+        }
+#endif
        default:
                ret = ptrace_request(child, request, addr, data);
diff --git a/kernel/resource.c b/kernel/resource.c
index af96c1e4b54b..4e9d87fd7bc5 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -188,6 +188,36 @@ static int __release_resource(struct resource *old)
        return -EINVAL;
 }
+/*
+ * Detach every descendant of @r from the resource tree.  Each child is
+ * unlinked, its own children are released recursively, and then it is
+ * rebased to start at 0 while keeping its size and flags.
+ * release_child_resources() calls this with resource_lock write-held.
+ */
+static void __release_child_resources(struct resource *r)
+{
+        struct resource *child, *next;
+        resource_size_t size;
+        child = r->child;
+        r->child = NULL;
+        for (; child; child = next) {
+                /* remember the sibling before the link is cleared */
+                next = child->sibling;
+                child->parent = NULL;
+                child->sibling = NULL;
+                __release_child_resources(child);
+                printk(KERN_DEBUG "release child resource %pR\n", child);
+                /* need to restore size, and keep flags */
+                size = resource_size(child);
+                child->start = 0;
+                child->end = size - 1;
+        }
+}
+/*
+ * release_child_resources - release all child resources of @r
+ *
+ * Takes resource_lock for writing around __release_child_resources().
+ */
+void release_child_resources(struct resource *r)
+{
+        write_lock(&resource_lock);
+        __release_child_resources(r);
+        write_unlock(&resource_lock);
+}
 /**
 * request_resource - request and reserve an I/O or memory resource
 * @root: root resource descriptor
@@ -297,14 +327,29 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
 #endif
+/*
+ * Callback handed to walk_system_ram_range() by page_is_ram(); it ignores
+ * its arguments and always returns 1.
+ */
+static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
+{
+        return 1;
+}
+/*
+ * Generic (weak) page_is_ram(): true when the page frame @pfn is
+ * registered as "System RAM" in the iomem_resource list.  Implemented by
+ * walking the single-page range at @pfn with the __is_ram callback.
+ */
+int __weak page_is_ram(unsigned long pfn)
+{
+        int ret = walk_system_ram_range(pfn, 1, NULL, __is_ram);
+        return ret == 1;
+}
 /*
 * Find empty slot in the resource tree given range and alignment.
 */
 static int find_resource(struct resource *root, struct resource *new,
                         resource_size_t size, resource_size_t min,
                         resource_size_t max, resource_size_t align,
-                         void (*alignf)(void *, struct resource *,
+                         resource_size_t (*alignf)(void *,
-                                        resource_size_t, resource_size_t),
+                                                   const struct resource *,
+                                                   resource_size_t,
+                                                   resource_size_t),
                         void *alignf_data)
 {
        struct resource *this = root->child;
@@ -330,7 +375,7 @@ static int find_resource(struct resource *root, struct resource *new,
                        tmp.end = max;
                tmp.start = ALIGN(tmp.start, align);
                if (alignf)
-                        alignf(alignf_data, &tmp, size, align);
+                        tmp.start = alignf(alignf_data, &tmp, size, align);
                if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) {
                        new->start = tmp.start;
                        new->end = tmp.start + size - 1;
@@ -358,8 +403,10 @@ static int find_resource(struct resource *root, struct resource *new,
 int allocate_resource(struct resource *root, struct resource *new,
                      resource_size_t size, resource_size_t min,
                      resource_size_t max, resource_size_t align,
-                      void (*alignf)(void *, struct resource *,
+                      resource_size_t (*alignf)(void *,
-                                     resource_size_t, resource_size_t),
+                                                const struct resource *,
+                                                resource_size_t,
+                                                resource_size_t),
                      void *alignf_data)
 {
        int err;
diff --git a/kernel/sched.c b/kernel/sched.c
index 3218f5213717..6a212c97f523 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -233,7 +233,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
 */
 static DEFINE_MUTEX(sched_domains_mutex);
-#ifdef CONFIG_GROUP_SCHED
+#ifdef CONFIG_CGROUP_SCHED
 #include <linux/cgroup.h>
@@ -243,13 +243,7 @@ static LIST_HEAD(task_groups);
 /* task group related information */
 struct task_group {
-#ifdef CONFIG_CGROUP_SCHED
        struct cgroup_subsys_state css;
-#endif
-#ifdef CONFIG_USER_SCHED
-        uid_t uid;
-#endif
 #ifdef CONFIG_FAIR_GROUP_SCHED
        /* schedulable entities of this group on each cpu */
@@ -274,35 +268,7 @@ struct task_group {
        struct list_head children;
 };
-#ifdef CONFIG_USER_SCHED
-/* Helper function to pass uid information to create_sched_user() */
-void set_tg_uid(struct user_struct *user)
-{
-        user->tg->uid = user->uid;
-}
-/*
- * Root task group.
- *      Every UID task group (including init_task_group aka UID-0) will
- *      be a child to this group.
- */
-struct task_group root_task_group;
-#ifdef CONFIG_FAIR_GROUP_SCHED
-/* Default task group's sched entity on each cpu */
-static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
-/* Default task group's cfs_rq on each cpu */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-#ifdef CONFIG_RT_GROUP_SCHED
-static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var);
-#endif /* CONFIG_RT_GROUP_SCHED */
-#else /* !CONFIG_USER_SCHED */
 #define root_task_group init_task_group
-#endif /* CONFIG_USER_SCHED */
 /* task_group_lock serializes add/remove of task groups and also changes to
 * a task group's cpu shares.
@@ -318,11 +284,7 @@ static int root_task_group_empty(void)
 }
 #endif
-#ifdef CONFIG_USER_SCHED
-# define INIT_TASK_GROUP_LOAD   (2*NICE_0_LOAD)
-#else /* !CONFIG_USER_SCHED */
 # define INIT_TASK_GROUP_LOAD   NICE_0_LOAD
-#endif /* CONFIG_USER_SCHED */
 /*
 * A weight of 0 or 1 can cause arithmetics problems.
@@ -348,11 +310,7 @@ static inline struct task_group *task_group(struct task_struct *p)
 {
        struct task_group *tg;
-#ifdef CONFIG_USER_SCHED
+#ifdef CONFIG_CGROUP_SCHED
-        rcu_read_lock();
-        tg = __task_cred(p)->user->tg;
-        rcu_read_unlock();
-#elif defined(CONFIG_CGROUP_SCHED)
        tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
                                struct task_group, css);
 #else
@@ -383,7 +341,7 @@ static inline struct task_group *task_group(struct task_struct *p)
        return NULL;
 }
-#endif  /* CONFIG_GROUP_SCHED */
+#endif  /* CONFIG_CGROUP_SCHED */
 /* CFS-related fields in a runqueue */
 struct cfs_rq {
@@ -478,7 +436,6 @@ struct rt_rq {
        struct rq *rq;
        struct list_head leaf_rt_rq_list;
        struct task_group *tg;
-        struct sched_rt_entity *rt_se;
 #endif
 };
@@ -946,16 +903,33 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 /*
+ * Check whether the task is waking, we use this to synchronize against
+ * ttwu() so that task_cpu() reports a stable number.
+ *
+ * We need to make an exception for PF_STARTING tasks because the fork
+ * path might require task_rq_lock() to work, eg. it can call
+ * set_cpus_allowed_ptr() from the cpuset clone_ns code.
+ *
+ * Returns nonzero only when p->state is TASK_WAKING and PF_STARTING is
+ * clear in p->flags; the result is hinted as unlikely.
+ */
+static inline int task_is_waking(struct task_struct *p)
+{
+        return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING));
+}
+/*
 * __task_rq_lock - lock the runqueue a given task resides on.
 * Must be called interrupts disabled.
 */
 static inline struct rq *__task_rq_lock(struct task_struct *p)
        __acquires(rq->lock)
 {
+        struct rq *rq;
        for (;;) {
-                struct rq *rq = task_rq(p);
+                while (task_is_waking(p))
+                        cpu_relax();
+                rq = task_rq(p);
                raw_spin_lock(&rq->lock);
-                if (likely(rq == task_rq(p)))
+                if (likely(rq == task_rq(p) && !task_is_waking(p)))
                        return rq;
                raw_spin_unlock(&rq->lock);
        }
@@ -972,10 +946,12 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
        struct rq *rq;
        for (;;) {
+                while (task_is_waking(p))
+                        cpu_relax();
                local_irq_save(*flags);
                rq = task_rq(p);
                raw_spin_lock(&rq->lock);
-                if (likely(rq == task_rq(p)))
+                if (likely(rq == task_rq(p) && !task_is_waking(p)))
                        return rq;
                raw_spin_unlock_irqrestore(&rq->lock, *flags);
        }
@@ -1395,32 +1371,6 @@ static const u32 prio_to_wmult[40] = {
 /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
 };
-static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
-/*
- * runqueue iterator, to support SMP load-balancing between different
- * scheduling classes, without having to expose their internal data
- * structures to the load-balancing proper:
- */
-struct rq_iterator {
-        void *arg;
-        struct task_struct *(*start)(void *);
-        struct task_struct *(*next)(void *);
-};
-#ifdef CONFIG_SMP
-static unsigned long
-balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
-              unsigned long max_load_move, struct sched_domain *sd,
-              enum cpu_idle_type idle, int *all_pinned,
-              int *this_best_prio, struct rq_iterator *iterator);
-static int
-iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
-                   struct sched_domain *sd, enum cpu_idle_type idle,
-                   struct rq_iterator *iterator);
-#endif
 /* Time spent by the tasks of the cpu accounting group executing in ... */
 enum cpuacct_stat_index {
        CPUACCT_STAT_USER,      /* ... user mode */
@@ -1706,16 +1656,6 @@ static void update_shares(struct sched_domain *sd)
        }
 }
-static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
-{
-        if (root_task_group_empty())
-                return;
-        raw_spin_unlock(&rq->lock);
-        update_shares(sd);
-        raw_spin_lock(&rq->lock);
-}
 static void update_h_load(long cpu)
 {
        if (root_task_group_empty())
@@ -1730,10 +1670,6 @@ static inline void update_shares(struct sched_domain *sd)
 {
 }
-static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
-{
-}
 #endif
 #ifdef CONFIG_PREEMPT
@@ -1810,6 +1746,51 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
        raw_spin_unlock(&busiest->lock);
        lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
 }
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ *
+ * When the two runqueues differ, the lower-addressed one is locked
+ * first, so the lock order does not depend on argument order.  When
+ * they are the same runqueue only one real lock is taken.  Both rq
+ * clocks are updated before returning.
+ */
+static void double_rq_lock(struct rq *rq1, struct rq *rq2)
+        __acquires(rq1->lock)
+        __acquires(rq2->lock)
+{
+        /* callers must already have interrupts disabled */
+        BUG_ON(!irqs_disabled());
+        if (rq1 == rq2) {
+                raw_spin_lock(&rq1->lock);
+                __acquire(rq2->lock);   /* Fake it out ;) */
+        } else {
+                if (rq1 < rq2) {
+                        raw_spin_lock(&rq1->lock);
+                        raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
+                } else {
+                        raw_spin_lock(&rq2->lock);
+                        raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
+                }
+        }
+        update_rq_clock(rq1);
+        update_rq_clock(rq2);
+}
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ *
+ * If rq1 and rq2 are the same runqueue only one raw_spin_unlock() is
+ * issued; the second release is expressed via __release() only
+ * (presumably a static-checker annotation -- confirm).
+ */
+static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+        __releases(rq1->lock)
+        __releases(rq2->lock)
+{
+        raw_spin_unlock(&rq1->lock);
+        if (rq1 != rq2)
+                raw_spin_unlock(&rq2->lock);
+        else
+                __release(rq2->lock);
+}
 #endif
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1839,18 +1820,14 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 #endif
 }
-#include "sched_stats.h"
+static const struct sched_class rt_sched_class;
-#include "sched_idletask.c"
-#include "sched_fair.c"
-#include "sched_rt.c"
-#ifdef CONFIG_SCHED_DEBUG
-# include "sched_debug.c"
-#endif
 #define sched_class_highest (&rt_sched_class)
 #define for_each_class(class) \
   for (class = sched_class_highest; class; class = class->next)
+#include "sched_stats.h"
 static void inc_nr_running(struct rq *rq)
 {
        rq->nr_running++;
@@ -1888,13 +1865,14 @@ static void update_avg(u64 *avg, u64 sample)
        *avg += diff >> 3;
 }
-static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
+static void
+enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
 {
        if (wakeup)
                p->se.start_runtime = p->se.sum_exec_runtime;
        sched_info_queued(p);
-        p->sched_class->enqueue_task(rq, p, wakeup);
+        p->sched_class->enqueue_task(rq, p, wakeup, head);
        p->se.on_rq = 1;
 }
@@ -1917,6 +1895,37 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
 }
 /*
+ * activate_task - move a task to the runqueue.
+ *
+ * Decrements rq->nr_uninterruptible if the task contributes to load,
+ * enqueues the task (passing @wakeup through, with head == false) and
+ * bumps the runqueue's running count.
+ */
+static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
+{
+        if (task_contributes_to_load(p))
+                rq->nr_uninterruptible--;
+        enqueue_task(rq, p, wakeup, false);
+        inc_nr_running(rq);
+}
+/*
+ * deactivate_task - remove a task from the runqueue.
+ *
+ * Increments rq->nr_uninterruptible if the task contributes to load,
+ * dequeues the task (passing @sleep through) and drops the runqueue's
+ * running count.
+ */
+static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
+{
+        if (task_contributes_to_load(p))
+                rq->nr_uninterruptible++;
+        dequeue_task(rq, p, sleep);
+        dec_nr_running(rq);
+}
+#include "sched_idletask.c"
+#include "sched_fair.c"
+#include "sched_rt.c"
+#ifdef CONFIG_SCHED_DEBUG
+# include "sched_debug.c"
+#endif
+/*
 * __normal_prio - return the priority that is based on the static prio
 */
 static inline int __normal_prio(struct task_struct *p)
@@ -1962,30 +1971,6 @@ static int effective_prio(struct task_struct *p)
        return p->prio;
 }
-/*
- * activate_task - move a task to the runqueue.
- */
-static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
-{
-        if (task_contributes_to_load(p))
-                rq->nr_uninterruptible--;
-        enqueue_task(rq, p, wakeup);
-        inc_nr_running(rq);
-}
-/*
- * deactivate_task - remove a task from the runqueue.
- */
-static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
-{
-        if (task_contributes_to_load(p))
-                rq->nr_uninterruptible++;
-        dequeue_task(rq, p, sleep);
-        dec_nr_running(rq);
-}
 /**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
@@ -2413,14 +2398,27 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
        __task_rq_unlock(rq);
        cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
-        if (cpu != orig_cpu)
+        if (cpu != orig_cpu) {
+                /*
+                 * Since we migrate the task without holding any rq->lock,
+                 * we need to be careful with task_rq_lock(), since that
+                 * might end up locking an invalid rq.
+                 */
                set_task_cpu(p, cpu);
+        }
-        rq = __task_rq_lock(p);
+        rq = cpu_rq(cpu);
+        raw_spin_lock(&rq->lock);
        update_rq_clock(rq);
+        /*
+         * We migrated the task without holding either rq->lock, however
+         * since the task is not on the task list itself, nobody else
+         * will try and migrate the task, hence the rq should match the
+         * cpu we just moved it to.
+         */
+        WARN_ON(task_cpu(p) != cpu);
        WARN_ON(p->state != TASK_WAKING);
-        cpu = task_cpu(p);
 #ifdef CONFIG_SCHEDSTATS
        schedstat_inc(rq, ttwu_count);
@@ -2668,7 +2666,13 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
        set_task_cpu(p, cpu);
 #endif
-        rq = task_rq_lock(p, &flags);
+        /*
+         * Since the task is not on the rq and we still have TASK_WAKING set
+         * nobody else will migrate this task.
+         */
+        rq = cpu_rq(cpu);
+        raw_spin_lock_irqsave(&rq->lock, flags);
        BUG_ON(p->state != TASK_WAKING);
        p->state = TASK_RUNNING;
        update_rq_clock(rq);
@@ -2799,7 +2803,13 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
         */
        prev_state = prev->state;
        finish_arch_switch(prev);
-        perf_event_task_sched_in(current, cpu_of(rq));
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+        local_irq_disable();
+#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
+        perf_event_task_sched_in(current);
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+        local_irq_enable();
+#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
        finish_lock_switch(rq, prev);
        fire_sched_in_preempt_notifiers(current);
@@ -3104,50 +3114,6 @@ static void update_cpu_load(struct rq *this_rq)
 #ifdef CONFIG_SMP
 /*
- * double_rq_lock - safely lock two runqueues
- *
- * Note this does not disable interrupts like task_rq_lock,
- * you need to do so manually before calling.
- */
-static void double_rq_lock(struct rq *rq1, struct rq *rq2)
-        __acquires(rq1->lock)
-        __acquires(rq2->lock)
-{
-        BUG_ON(!irqs_disabled());
-        if (rq1 == rq2) {
-                raw_spin_lock(&rq1->lock);
-                __acquire(rq2->lock);   /* Fake it out ;) */
-        } else {
-                if (rq1 < rq2) {
-                        raw_spin_lock(&rq1->lock);
-                        raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
-                } else {
-                        raw_spin_lock(&rq2->lock);
-                        raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
-                }
-        }
-        update_rq_clock(rq1);
-        update_rq_clock(rq2);
-}
-/*
- * double_rq_unlock - safely unlock two runqueues
- *
- * Note this does not restore interrupts like task_rq_unlock,
- * you need to do so manually after calling.
- */
-static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
-        __releases(rq1->lock)
-        __releases(rq2->lock)
-{
-        raw_spin_unlock(&rq1->lock);
-        if (rq1 != rq2)
-                raw_spin_unlock(&rq2->lock);
-        else
-                __release(rq2->lock);
-}
-/*
 * sched_exec - execve() is a valuable balancing opportunity, because at
 * this point the task has the smallest effective memory and cache footprint.
 */
@@ -3195,1771 +3161,6 @@ again:
        task_rq_unlock(rq, &flags);
 }
-/*
- * pull_task - move a task from a remote runqueue to the local runqueue.
- * Both runqueues must be locked.
- */
-static void pull_task(struct rq *src_rq, struct task_struct *p,
-                      struct rq *this_rq, int this_cpu)
-{
-        deactivate_task(src_rq, p, 0);
-        set_task_cpu(p, this_cpu);
-        activate_task(this_rq, p, 0);
-        check_preempt_curr(this_rq, p, 0);
-}
-/*
- * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
- */
-static
-int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
-                     struct sched_domain *sd, enum cpu_idle_type idle,
-                     int *all_pinned)
-{
-        int tsk_cache_hot = 0;
-        /*
-         * We do not migrate tasks that are:
-         * 1) running (obviously), or
-         * 2) cannot be migrated to this CPU due to cpus_allowed, or
-         * 3) are cache-hot on their current CPU.
-         */
-        if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
-                schedstat_inc(p, se.nr_failed_migrations_affine);
-                return 0;
-        }
-        *all_pinned = 0;
-        if (task_running(rq, p)) {
-                schedstat_inc(p, se.nr_failed_migrations_running);
-                return 0;
-        }
-        /*
-         * Aggressive migration if:
-         * 1) task is cache cold, or
-         * 2) too many balance attempts have failed.
-         */
-        tsk_cache_hot = task_hot(p, rq->clock, sd);
-        if (!tsk_cache_hot ||
-                sd->nr_balance_failed > sd->cache_nice_tries) {
-#ifdef CONFIG_SCHEDSTATS
-                if (tsk_cache_hot) {
-                        schedstat_inc(sd, lb_hot_gained[idle]);
-                        schedstat_inc(p, se.nr_forced_migrations);
-                }
-#endif
-                return 1;
-        }
-        if (tsk_cache_hot) {
-                schedstat_inc(p, se.nr_failed_migrations_hot);
-                return 0;
-        }
-        return 1;
-}
-static unsigned long
-balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
-              unsigned long max_load_move, struct sched_domain *sd,
-              enum cpu_idle_type idle, int *all_pinned,
-              int *this_best_prio, struct rq_iterator *iterator)
-{
-        int loops = 0, pulled = 0, pinned = 0;
-        struct task_struct *p;
-        long rem_load_move = max_load_move;
-        if (max_load_move == 0)
-                goto out;
-        pinned = 1;
-        /*
-         * Start the load-balancing iterator:
-         */
-        p = iterator->start(iterator->arg);
-next:
-        if (!p || loops++ > sysctl_sched_nr_migrate)
-                goto out;
-        if ((p->se.load.weight >> 1) > rem_load_move ||
-            !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
-                p = iterator->next(iterator->arg);
-                goto next;
-        }
-        pull_task(busiest, p, this_rq, this_cpu);
-        pulled++;
-        rem_load_move -= p->se.load.weight;
-#ifdef CONFIG_PREEMPT
-        /*
-         * NEWIDLE balancing is a source of latency, so preemptible kernels
-         * will stop after the first task is pulled to minimize the critical
-         * section.
-         */
-        if (idle == CPU_NEWLY_IDLE)
-                goto out;
-#endif
-        /*
-         * We only want to steal up to the prescribed amount of weighted load.
-         */
-        if (rem_load_move > 0) {
-                if (p->prio < *this_best_prio)
-                        *this_best_prio = p->prio;
-                p = iterator->next(iterator->arg);
-                goto next;
-        }
-out:
-        /*
-         * Right now, this is one of only two places pull_task() is called,
-         * so we can safely collect pull_task() stats here rather than
-         * inside pull_task().
-         */
-        schedstat_add(sd, lb_gained[idle], pulled);
-        if (all_pinned)
-                *all_pinned = pinned;
-        return max_load_move - rem_load_move;
-}
-/*
- * move_tasks tries to move up to max_load_move weighted load from busiest to
- * this_rq, as part of a balancing operation within domain "sd".
- * Returns 1 if successful and 0 otherwise.
- *
- * Called with both runqueues locked.
- */
-static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
-                      unsigned long max_load_move,
-                      struct sched_domain *sd, enum cpu_idle_type idle,
-                      int *all_pinned)
-{
-        const struct sched_class *class = sched_class_highest;
-        unsigned long total_load_moved = 0;
-        int this_best_prio = this_rq->curr->prio;
-        do {
-                total_load_moved +=
-                        class->load_balance(this_rq, this_cpu, busiest,
-                                max_load_move - total_load_moved,
-                                sd, idle, all_pinned, &this_best_prio);
-                class = class->next;
-#ifdef CONFIG_PREEMPT
-                /*
-                 * NEWIDLE balancing is a source of latency, so preemptible
-                 * kernels will stop after the first task is pulled to minimize
-                 * the critical section.
-                 */
-                if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
-                        break;
-#endif
-        } while (class && max_load_move > total_load_moved);
-        return total_load_moved > 0;
-}
-static int
-iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
-                   struct sched_domain *sd, enum cpu_idle_type idle,
-                   struct rq_iterator *iterator)
-{
-        struct task_struct *p = iterator->start(iterator->arg);
-        int pinned = 0;
-        while (p) {
-                if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
-                        pull_task(busiest, p, this_rq, this_cpu);
-                        /*
-                         * Right now, this is only the second place pull_task()
-                         * is called, so we can safely collect pull_task()
-                         * stats here rather than inside pull_task().
-                         */
-                        schedstat_inc(sd, lb_gained[idle]);
-                        return 1;
-                }
-                p = iterator->next(iterator->arg);
-        }
-        return 0;
-}
-/*
- * move_one_task tries to move exactly one task from busiest to this_rq, as
- * part of active balancing operations within "domain".
- * Returns 1 if successful and 0 otherwise.
- *
- * Called with both runqueues locked.
- */
-static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
-                         struct sched_domain *sd, enum cpu_idle_type idle)
-{
-        const struct sched_class *class;
-        for_each_class(class) {
-                if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
-                        return 1;
-        }
-        return 0;
-}
-/********** Helpers for find_busiest_group ************************/
-/*
- * sd_lb_stats - Structure to store the statistics of a sched_domain
- *              during load balancing.
- */
-struct sd_lb_stats {
-        struct sched_group *busiest; /* Busiest group in this sd */
-        struct sched_group *this;  /* Local group in this sd */
-        unsigned long total_load;  /* Total load of all groups in sd */
-        unsigned long total_pwr;   /*   Total power of all groups in sd */
-        unsigned long avg_load;    /* Average load across all groups in sd */
-        /** Statistics of this group */
-        unsigned long this_load;
-        unsigned long this_load_per_task;
-        unsigned long this_nr_running;
-        /* Statistics of the busiest group */
-        unsigned long max_load;
-        unsigned long busiest_load_per_task;
-        unsigned long busiest_nr_running;
-        int group_imb; /* Is there imbalance in this sd */
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-        int power_savings_balance; /* Is powersave balance needed for this sd */
-        struct sched_group *group_min; /* Least loaded group in sd */
-        struct sched_group *group_leader; /* Group which relieves group_min */
-        unsigned long min_load_per_task; /* load_per_task in group_min */
-        unsigned long leader_nr_running; /* Nr running of group_leader */
-        unsigned long min_nr_running; /* Nr running of group_min */
-#endif
-};
-/*
- * sg_lb_stats - stats of a sched_group required for load_balancing
- */
-struct sg_lb_stats {
-        unsigned long avg_load; /*Avg load across the CPUs of the group */
-        unsigned long group_load; /* Total load over the CPUs of the group */
-        unsigned long sum_nr_running; /* Nr tasks running in the group */
-        unsigned long sum_weighted_load; /* Weighted load of group's tasks */
-        unsigned long group_capacity;
-        int group_imb; /* Is there an imbalance in the group ? */
-};
-/**
- * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
- * @group: The group whose first cpu is to be returned.
- */
-static inline unsigned int group_first_cpu(struct sched_group *group)
-{
-        return cpumask_first(sched_group_cpus(group));
-}
-/**
- * get_sd_load_idx - Obtain the load index for a given sched domain.
- * @sd: The sched_domain whose load_idx is to be obtained.
- * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
- */
-static inline int get_sd_load_idx(struct sched_domain *sd,
-                                        enum cpu_idle_type idle)
-{
-        int load_idx;
-        switch (idle) {
-        case CPU_NOT_IDLE:
-                load_idx = sd->busy_idx;
-                break;
-        case CPU_NEWLY_IDLE:
-                load_idx = sd->newidle_idx;
-                break;
-        default:
-                load_idx = sd->idle_idx;
-                break;
-        }
-        return load_idx;
-}
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-/**
- * init_sd_power_savings_stats - Initialize power savings statistics for
- * the given sched_domain, during load balancing.
- *
- * @sd: Sched domain whose power-savings statistics are to be initialized.
- * @sds: Variable containing the statistics for sd.
- * @idle: Idle status of the CPU at which we're performing load-balancing.
- */
-static inline void init_sd_power_savings_stats(struct sched_domain *sd,
-        struct sd_lb_stats *sds, enum cpu_idle_type idle)
-{
-        /*
-         * Busy processors will not participate in power savings
-         * balance.
-         */
-        if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
-                sds->power_savings_balance = 0;
-        else {
-                sds->power_savings_balance = 1;
-                sds->min_nr_running = ULONG_MAX;
-                sds->leader_nr_running = 0;
-        }
-}
-/**
- * update_sd_power_savings_stats - Update the power saving stats for a
- * sched_domain while performing load balancing.
- *
- * @group: sched_group belonging to the sched_domain under consideration.
- * @sds: Variable containing the statistics of the sched_domain
- * @local_group: Does group contain the CPU for which we're performing
- *              load balancing ?
- * @sgs: Variable containing the statistics of the group.
- */
-static inline void update_sd_power_savings_stats(struct sched_group *group,
-        struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
-{
-        if (!sds->power_savings_balance)
-                return;
-        /*
-         * If the local group is idle or completely loaded
-         * no need to do power savings balance at this domain
-         */
-        if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
-                                !sds->this_nr_running))
-                sds->power_savings_balance = 0;
-        /*
-         * If a group is already running at full capacity or idle,
-         * don't include that group in power savings calculations
-         */
-        if (!sds->power_savings_balance ||
-                sgs->sum_nr_running >= sgs->group_capacity ||
-                !sgs->sum_nr_running)
-                return;
-        /*
-         * Calculate the group which has the least non-idle load.
-         * This is the group from where we need to pick up the load
-         * for saving power
-         */
-        if ((sgs->sum_nr_running < sds->min_nr_running) ||
-            (sgs->sum_nr_running == sds->min_nr_running &&
-             group_first_cpu(group) > group_first_cpu(sds->group_min))) {
-                sds->group_min = group;
-                sds->min_nr_running = sgs->sum_nr_running;
-                sds->min_load_per_task = sgs->sum_weighted_load /
-                                                sgs->sum_nr_running;
-        }
-        /*
-         * Calculate the group which is almost near its
-         * capacity but still has some space to pick up some load
-         * from other group and save more power
-         */
-        if (sgs->sum_nr_running + 1 > sgs->group_capacity)
-                return;
-        if (sgs->sum_nr_running > sds->leader_nr_running ||
-            (sgs->sum_nr_running == sds->leader_nr_running &&
-             group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
-                sds->group_leader = group;
-                sds->leader_nr_running = sgs->sum_nr_running;
-        }
-}
-/**
- * check_power_save_busiest_group - see if there is potential for some power-savings balance
- * @sds: Variable containing the statistics of the sched_domain
- *      under consideration.
- * @this_cpu: Cpu at which we're currently performing load-balancing.
- * @imbalance: Variable to store the imbalance.
- *
- * Description:
- * Check if we have potential to perform some power-savings balance.
- * If yes, set the busiest group to be the least loaded group in the
- * sched_domain, so that it's CPUs can be put to idle.
- *
- * Returns 1 if there is potential to perform power-savings balance.
- * Else returns 0.
- */
-static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
-                                        int this_cpu, unsigned long *imbalance)
-{
-        if (!sds->power_savings_balance)
-                return 0;
-        if (sds->this != sds->group_leader ||
-                        sds->group_leader == sds->group_min)
-                return 0;
-        *imbalance = sds->min_load_per_task;
-        sds->busiest = sds->group_min;
-        return 1;
-}
-#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-static inline void init_sd_power_savings_stats(struct sched_domain *sd,
-        struct sd_lb_stats *sds, enum cpu_idle_type idle)
-{
-        return;
-}
-static inline void update_sd_power_savings_stats(struct sched_group *group,
-        struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
-{
-        return;
-}
-static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
-                                        int this_cpu, unsigned long *imbalance)
-{
-        return 0;
-}
-#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
-{
-        return SCHED_LOAD_SCALE;
-}
-unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
-{
-        return default_scale_freq_power(sd, cpu);
-}
-unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
-{
-        unsigned long weight = cpumask_weight(sched_domain_span(sd));
-        unsigned long smt_gain = sd->smt_gain;
-        smt_gain /= weight;
-        return smt_gain;
-}
-unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
-{
-        return default_scale_smt_power(sd, cpu);
-}
-unsigned long scale_rt_power(int cpu)
-{
-        struct rq *rq = cpu_rq(cpu);
-        u64 total, available;
-        sched_avg_update(rq);
-        total = sched_avg_period() + (rq->clock - rq->age_stamp);
-        available = total - rq->rt_avg;
-        if (unlikely((s64)total < SCHED_LOAD_SCALE))
-                total = SCHED_LOAD_SCALE;
-        total >>= SCHED_LOAD_SHIFT;
-        return div_u64(available, total);
-}
-static void update_cpu_power(struct sched_domain *sd, int cpu)
-{
-        unsigned long weight = cpumask_weight(sched_domain_span(sd));
-        unsigned long power = SCHED_LOAD_SCALE;
-        struct sched_group *sdg = sd->groups;
-        if (sched_feat(ARCH_POWER))
-                power *= arch_scale_freq_power(sd, cpu);
-        else
-                power *= default_scale_freq_power(sd, cpu);
-        power >>= SCHED_LOAD_SHIFT;
-        if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
-                if (sched_feat(ARCH_POWER))
-                        power *= arch_scale_smt_power(sd, cpu);
-                else
-                        power *= default_scale_smt_power(sd, cpu);
-                power >>= SCHED_LOAD_SHIFT;
-        }
-        power *= scale_rt_power(cpu);
-        power >>= SCHED_LOAD_SHIFT;
-        if (!power)
-                power = 1;
-        sdg->cpu_power = power;
-}
-static void update_group_power(struct sched_domain *sd, int cpu)
-{
-        struct sched_domain *child = sd->child;
-        struct sched_group *group, *sdg = sd->groups;
-        unsigned long power;
-        if (!child) {
-                update_cpu_power(sd, cpu);
-                return;
-        }
-        power = 0;
-        group = child->groups;
-        do {
-                power += group->cpu_power;
-                group = group->next;
-        } while (group != child->groups);
-        sdg->cpu_power = power;
-}
-/**
- * update_sg_lb_stats - Update sched_group's statistics for load balancing.
- * @sd: The sched_domain whose statistics are to be updated.
- * @group: sched_group whose statistics are to be updated.
- * @this_cpu: Cpu for which load balance is currently performed.
- * @idle: Idle status of this_cpu
- * @load_idx: Load index of sched_domain of this_cpu for load calc.
- * @sd_idle: Idle status of the sched_domain containing group.
- * @local_group: Does group contain this_cpu.
- * @cpus: Set of cpus considered for load balancing.
- * @balance: Should we balance.
- * @sgs: variable to hold the statistics for this group.
- */
-static inline void update_sg_lb_stats(struct sched_domain *sd,
-                        struct sched_group *group, int this_cpu,
-                        enum cpu_idle_type idle, int load_idx, int *sd_idle,
-                        int local_group, const struct cpumask *cpus,
-                        int *balance, struct sg_lb_stats *sgs)
-{
-        unsigned long load, max_cpu_load, min_cpu_load;
-        int i;
-        unsigned int balance_cpu = -1, first_idle_cpu = 0;
-        unsigned long sum_avg_load_per_task;
-        unsigned long avg_load_per_task;
-        if (local_group) {
-                balance_cpu = group_first_cpu(group);
-                if (balance_cpu == this_cpu)
-                        update_group_power(sd, this_cpu);
-        }
-        /* Tally up the load of all CPUs in the group */
-        sum_avg_load_per_task = avg_load_per_task = 0;
-        max_cpu_load = 0;
-        min_cpu_load = ~0UL;
-        for_each_cpu_and(i, sched_group_cpus(group), cpus) {
-                struct rq *rq = cpu_rq(i);
-                if (*sd_idle && rq->nr_running)
-                        *sd_idle = 0;
-                /* Bias balancing toward cpus of our domain */
-                if (local_group) {
-                        if (idle_cpu(i) && !first_idle_cpu) {
-                                first_idle_cpu = 1;
-                                balance_cpu = i;
-                        }
-                        load = target_load(i, load_idx);
-                } else {
-                        load = source_load(i, load_idx);
-                        if (load > max_cpu_load)
-                                max_cpu_load = load;
-                        if (min_cpu_load > load)
-                                min_cpu_load = load;
-                }
-                sgs->group_load += load;
-                sgs->sum_nr_running += rq->nr_running;
-                sgs->sum_weighted_load += weighted_cpuload(i);
-                sum_avg_load_per_task += cpu_avg_load_per_task(i);
-        }
-        /*
-         * First idle cpu or the first cpu(busiest) in this sched group
-         * is eligible for doing load balancing at this and above
-         * domains. In the newly idle case, we will allow all the cpu's
-         * to do the newly idle load balance.
-         */
-        if (idle != CPU_NEWLY_IDLE && local_group &&
-            balance_cpu != this_cpu && balance) {
-                *balance = 0;
-                return;
-        }
-        /* Adjust by relative CPU power of the group */
-        sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
-        /*
-         * Consider the group unbalanced when the imbalance is larger
-         * than the average weight of two tasks.
-         *
-         * APZ: with cgroup the avg task weight can vary wildly and
-         *      might not be a suitable number - should we keep a
-         *      normalized nr_running number somewhere that negates
-         *      the hierarchy?
-         */
-        avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
-                group->cpu_power;
-        if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
-                sgs->group_imb = 1;
-        sgs->group_capacity =
-                DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
-}
-/**
- * update_sd_lb_stats - Update sched_group's statistics for load balancing.
- * @sd: sched_domain whose statistics are to be updated.
- * @this_cpu: Cpu for which load balance is currently performed.
- * @idle: Idle status of this_cpu
- * @sd_idle: Idle status of the sched_domain containing group.
- * @cpus: Set of cpus considered for load balancing.
- * @balance: Should we balance.
- * @sds: variable to hold the statistics for this sched_domain.
- */
-static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
-                        enum cpu_idle_type idle, int *sd_idle,
-                        const struct cpumask *cpus, int *balance,
-                        struct sd_lb_stats *sds)
-{
-        struct sched_domain *child = sd->child;
-        struct sched_group *group = sd->groups;
-        struct sg_lb_stats sgs;
-        int load_idx, prefer_sibling = 0;
-        if (child && child->flags & SD_PREFER_SIBLING)
-                prefer_sibling = 1;
-        init_sd_power_savings_stats(sd, sds, idle);
-        load_idx = get_sd_load_idx(sd, idle);
-        do {
-                int local_group;
-                local_group = cpumask_test_cpu(this_cpu,
-                                               sched_group_cpus(group));
-                memset(&sgs, 0, sizeof(sgs));
-                update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
-                                local_group, cpus, balance, &sgs);
-                if (local_group && balance && !(*balance))
-                        return;
-                sds->total_load += sgs.group_load;
-                sds->total_pwr += group->cpu_power;
-                /*
-                 * In case the child domain prefers tasks go to siblings
-                 * first, lower the group capacity to one so that we'll try
-                 * and move all the excess tasks away.
-                 */
-                if (prefer_sibling)
-                        sgs.group_capacity = min(sgs.group_capacity, 1UL);
-                if (local_group) {
-                        sds->this_load = sgs.avg_load;
-                        sds->this = group;
-                        sds->this_nr_running = sgs.sum_nr_running;
-                        sds->this_load_per_task = sgs.sum_weighted_load;
-                } else if (sgs.avg_load > sds->max_load &&
-                           (sgs.sum_nr_running > sgs.group_capacity ||
-                                sgs.group_imb)) {
-                        sds->max_load = sgs.avg_load;
-                        sds->busiest = group;
-                        sds->busiest_nr_running = sgs.sum_nr_running;
-                        sds->busiest_load_per_task = sgs.sum_weighted_load;
-                        sds->group_imb = sgs.group_imb;
-                }
-                update_sd_power_savings_stats(group, sds, local_group, &sgs);
-                group = group->next;
-        } while (group != sd->groups);
-}
-/**
- * fix_small_imbalance - Calculate the minor imbalance that exists
- *                      amongst the groups of a sched_domain, during
- *                      load balancing.
- * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
- * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
- * @imbalance: Variable to store the imbalance.
- */
-static inline void fix_small_imbalance(struct sd_lb_stats *sds,
-                                int this_cpu, unsigned long *imbalance)
-{
-        unsigned long tmp, pwr_now = 0, pwr_move = 0;
-        unsigned int imbn = 2;
-        if (sds->this_nr_running) {
-                sds->this_load_per_task /= sds->this_nr_running;
-                if (sds->busiest_load_per_task >
-                                sds->this_load_per_task)
-                        imbn = 1;
-        } else
-                sds->this_load_per_task =
-                        cpu_avg_load_per_task(this_cpu);
-        if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
-                        sds->busiest_load_per_task * imbn) {
-                *imbalance = sds->busiest_load_per_task;
-                return;
-        }
-        /*
-         * OK, we don't have enough imbalance to justify moving tasks,
-         * however we may be able to increase total CPU power used by
-         * moving them.
-         */
-        pwr_now += sds->busiest->cpu_power *
-                        min(sds->busiest_load_per_task, sds->max_load);
-        pwr_now += sds->this->cpu_power *
-                        min(sds->this_load_per_task, sds->this_load);
-        pwr_now /= SCHED_LOAD_SCALE;
-        /* Amount of load we'd subtract */
-        tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
-                sds->busiest->cpu_power;
-        if (sds->max_load > tmp)
-                pwr_move += sds->busiest->cpu_power *
-                        min(sds->busiest_load_per_task, sds->max_load - tmp);
-        /* Amount of load we'd add */
-        if (sds->max_load * sds->busiest->cpu_power <
-                sds->busiest_load_per_task * SCHED_LOAD_SCALE)
-                tmp = (sds->max_load * sds->busiest->cpu_power) /
-                        sds->this->cpu_power;
-        else
-                tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
-                        sds->this->cpu_power;
-        pwr_move += sds->this->cpu_power *
-                        min(sds->this_load_per_task, sds->this_load + tmp);
-        pwr_move /= SCHED_LOAD_SCALE;
-        /* Move if we gain throughput */
-        if (pwr_move > pwr_now)
-                *imbalance = sds->busiest_load_per_task;
-}
-/**
- * calculate_imbalance - Calculate the amount of imbalance present within the
- *                       groups of a given sched_domain during load balance.
- * @sds: statistics of the sched_domain whose imbalance is to be calculated.
- * @this_cpu: Cpu for which currently load balance is being performed.
- * @imbalance: The variable to store the imbalance.
- */
-static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
-                unsigned long *imbalance)
-{
-        unsigned long max_pull;
-        /*
-         * In the presence of smp nice balancing, certain scenarios can have
-         * max load less than avg load(as we skip the groups at or below
-         * its cpu_power, while calculating max_load..)
-         */
-        if (sds->max_load < sds->avg_load) {
-                *imbalance = 0;
-                return fix_small_imbalance(sds, this_cpu, imbalance);
-        }
-        /* Don't want to pull so many tasks that a group would go idle */
-        max_pull = min(sds->max_load - sds->avg_load,
-                        sds->max_load - sds->busiest_load_per_task);
-        /* How much load to actually move to equalise the imbalance */
-        *imbalance = min(max_pull * sds->busiest->cpu_power,
-                (sds->avg_load - sds->this_load) * sds->this->cpu_power)
-                        / SCHED_LOAD_SCALE;
-        /*
-         * if *imbalance is less than the average load per runnable task
-         * there is no gaurantee that any tasks will be moved so we'll have
-         * a think about bumping its value to force at least one task to be
-         * moved
-         */
-        if (*imbalance < sds->busiest_load_per_task)
-                return fix_small_imbalance(sds, this_cpu, imbalance);
-}
-/******* find_busiest_group() helpers end here *********************/
-/**
- * find_busiest_group - Returns the busiest group within the sched_domain
- * if there is an imbalance. If there isn't an imbalance, and
- * the user has opted for power-savings, it returns a group whose
- * CPUs can be put to idle by rebalancing those tasks elsewhere, if
- * such a group exists.
- *
- * Also calculates the amount of weighted load which should be moved
- * to restore balance.
- *
- * @sd: The sched_domain whose busiest group is to be returned.
- * @this_cpu: The cpu for which load balancing is currently being performed.
- * @imbalance: Variable which stores amount of weighted load which should
- *              be moved to restore balance/put a group to idle.
- * @idle: The idle status of this_cpu.
- * @sd_idle: The idleness of sd
- * @cpus: The set of CPUs under consideration for load-balancing.
- * @balance: Pointer to a variable indicating if this_cpu
- *      is the appropriate cpu to perform load balancing at this_level.
- *
- * Returns:     - the busiest group if imbalance exists.
- *              - If no imbalance and user has opted for power-savings balance,
- *                 return the least loaded group whose CPUs can be
- *                 put to idle by rebalancing its tasks onto our group.
- */
-static struct sched_group *
-find_busiest_group(struct sched_domain *sd, int this_cpu,
-                   unsigned long *imbalance, enum cpu_idle_type idle,
-                   int *sd_idle, const struct cpumask *cpus, int *balance)
-{
-        struct sd_lb_stats sds;
-        memset(&sds, 0, sizeof(sds));
-        /*
-         * Compute the various statistics relavent for load balancing at
-         * this level.
-         */
-        update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
-                                        balance, &sds);
-        /* Cases where imbalance does not exist from POV of this_cpu */
-        /* 1) this_cpu is not the appropriate cpu to perform load balancing
-         *    at this level.
-         * 2) There is no busy sibling group to pull from.
-         * 3) This group is the busiest group.
-         * 4) This group is more busy than the avg busieness at this
-         *    sched_domain.
-         * 5) The imbalance is within the specified limit.
-         * 6) Any rebalance would lead to ping-pong
-         */
-        if (balance && !(*balance))
-                goto ret;
-        if (!sds.busiest || sds.busiest_nr_running == 0)
-                goto out_balanced;
-        if (sds.this_load >= sds.max_load)
-                goto out_balanced;
-        sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
-        if (sds.this_load >= sds.avg_load)
-                goto out_balanced;
-        if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
-                goto out_balanced;
-        sds.busiest_load_per_task /= sds.busiest_nr_running;
-        if (sds.group_imb)
-                sds.busiest_load_per_task =
-                        min(sds.busiest_load_per_task, sds.avg_load);
-        /*
-         * We're trying to get all the cpus to the average_load, so we don't
-         * want to push ourselves above the average load, nor do we wish to
-         * reduce the max loaded cpu below the average load, as either of these
-         * actions would just result in more rebalancing later, and ping-pong
-         * tasks around. Thus we look for the minimum possible imbalance.
-         * Negative imbalances (*we* are more loaded than anyone else) will
-         * be counted as no imbalance for these purposes -- we can't fix that
-         * by pulling tasks to us. Be careful of negative numbers as they'll
-         * appear as very large values with unsigned longs.
-         */
-        if (sds.max_load <= sds.busiest_load_per_task)
-                goto out_balanced;
-        /* Looks like there is an imbalance. Compute it */
-        calculate_imbalance(&sds, this_cpu, imbalance);
-        return sds.busiest;
-out_balanced:
-        /*
-         * There is no obvious imbalance. But check if we can do some balancing
-         * to save power.
-         */
-        if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
-                return sds.busiest;
-ret:
-        *imbalance = 0;
-        return NULL;
-}
-/*
- * find_busiest_queue - find the busiest runqueue among the cpus in group.
- */
-static struct rq *
-find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
-                   unsigned long imbalance, const struct cpumask *cpus)
-{
-        struct rq *busiest = NULL, *rq;
-        unsigned long max_load = 0;
-        int i;
-        for_each_cpu(i, sched_group_cpus(group)) {
-                unsigned long power = power_of(i);
-                unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
-                unsigned long wl;
-                if (!cpumask_test_cpu(i, cpus))
-                        continue;
-                rq = cpu_rq(i);
-                wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
-                wl /= power;
-                if (capacity && rq->nr_running == 1 && wl > imbalance)
-                        continue;
-                if (wl > max_load) {
-                        max_load = wl;
-                        busiest = rq;
-                }
-        }
-        return busiest;
-}
-/*
- * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
- * so long as it is large enough.
- */
-#define MAX_PINNED_INTERVAL     512
-/* Working cpumask for load_balance and load_balance_newidle. */
-static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
-/*
- * Check this_cpu to ensure it is balanced within domain. Attempt to move
- * tasks if there is an imbalance.
- */
-static int load_balance(int this_cpu, struct rq *this_rq,
-                        struct sched_domain *sd, enum cpu_idle_type idle,
-                        int *balance)
-{
-        int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
-        struct sched_group *group;
-        unsigned long imbalance;
-        struct rq *busiest;
-        unsigned long flags;
-        struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
-        cpumask_copy(cpus, cpu_active_mask);
-        /*
-         * When power savings policy is enabled for the parent domain, idle
-         * sibling can pick up load irrespective of busy siblings. In this case,
-         * let the state of idle sibling percolate up as CPU_IDLE, instead of
-         * portraying it as CPU_NOT_IDLE.
-         */
-        if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
-            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-                sd_idle = 1;
-        schedstat_inc(sd, lb_count[idle]);
-redo:
-        update_shares(sd);
-        group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
-                                   cpus, balance);
-        if (*balance == 0)
-                goto out_balanced;
-        if (!group) {
-                schedstat_inc(sd, lb_nobusyg[idle]);
-                goto out_balanced;
-        }
-        busiest = find_busiest_queue(group, idle, imbalance, cpus);
-        if (!busiest) {
-                schedstat_inc(sd, lb_nobusyq[idle]);
-                goto out_balanced;
-        }
-        BUG_ON(busiest == this_rq);
-        schedstat_add(sd, lb_imbalance[idle], imbalance);
-        ld_moved = 0;
-        if (busiest->nr_running > 1) {
-                /*
-                 * Attempt to move tasks. If find_busiest_group has found
-                 * an imbalance but busiest->nr_running <= 1, the group is
-                 * still unbalanced. ld_moved simply stays zero, so it is
-                 * correctly treated as an imbalance.
-                 */
-                local_irq_save(flags);
-                double_rq_lock(this_rq, busiest);
-                ld_moved = move_tasks(this_rq, this_cpu, busiest,
-                                      imbalance, sd, idle, &all_pinned);
-                double_rq_unlock(this_rq, busiest);
-                local_irq_restore(flags);
-                /*
-                 * some other cpu did the load balance for us.
-                 */
-                if (ld_moved && this_cpu != smp_processor_id())
-                        resched_cpu(this_cpu);
-                /* All tasks on this runqueue were pinned by CPU affinity */
-                if (unlikely(all_pinned)) {
-                        cpumask_clear_cpu(cpu_of(busiest), cpus);
-                        if (!cpumask_empty(cpus))
-                                goto redo;
-                        goto out_balanced;
-                }
-        }
-        if (!ld_moved) {
-                schedstat_inc(sd, lb_failed[idle]);
-                sd->nr_balance_failed++;
-                if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
-                        raw_spin_lock_irqsave(&busiest->lock, flags);
-                        /* don't kick the migration_thread, if the curr
-                         * task on busiest cpu can't be moved to this_cpu
-                         */
-                        if (!cpumask_test_cpu(this_cpu,
-                                              &busiest->curr->cpus_allowed)) {
-                                raw_spin_unlock_irqrestore(&busiest->lock,
-                                                            flags);
-                                all_pinned = 1;
-                                goto out_one_pinned;
-                        }
-                        if (!busiest->active_balance) {
-                                busiest->active_balance = 1;
-                                busiest->push_cpu = this_cpu;
-                                active_balance = 1;
-                        }
-                        raw_spin_unlock_irqrestore(&busiest->lock, flags);
-                        if (active_balance)
-                                wake_up_process(busiest->migration_thread);
-                        /*
-                         * We've kicked active balancing, reset the failure
-                         * counter.
-                         */
-                        sd->nr_balance_failed = sd->cache_nice_tries+1;
-                }
-        } else
-                sd->nr_balance_failed = 0;
-        if (likely(!active_balance)) {
-                /* We were unbalanced, so reset the balancing interval */
-                sd->balance_interval = sd->min_interval;
-        } else {
-                /*
-                 * If we've begun active balancing, start to back off. This
-                 * case may not be covered by the all_pinned logic if there
-                 * is only 1 task on the busy runqueue (because we don't call
-                 * move_tasks).
-                 */
-                if (sd->balance_interval < sd->max_interval)
-                        sd->balance_interval *= 2;
-        }
-        if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-                ld_moved = -1;
-        goto out;
-out_balanced:
-        schedstat_inc(sd, lb_balanced[idle]);
-        sd->nr_balance_failed = 0;
-out_one_pinned:
-        /* tune up the balancing interval */
-        if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
-                        (sd->balance_interval < sd->max_interval))
-                sd->balance_interval *= 2;
-        if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-                ld_moved = -1;
-        else
-                ld_moved = 0;
-out:
-        if (ld_moved)
-                update_shares(sd);
-        return ld_moved;
-}
-/*
- * Check this_cpu to ensure it is balanced within domain. Attempt to move
- * tasks if there is an imbalance.
- *
- * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
- * this_rq is locked.
- */
-static int
-load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
-{
-        struct sched_group *group;
-        struct rq *busiest = NULL;
-        unsigned long imbalance;
-        int ld_moved = 0;
-        int sd_idle = 0;
-        int all_pinned = 0;
-        struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
-        cpumask_copy(cpus, cpu_active_mask);
-        /*
-         * When power savings policy is enabled for the parent domain, idle
-         * sibling can pick up load irrespective of busy siblings. In this case,
-         * let the state of idle sibling percolate up as IDLE, instead of
-         * portraying it as CPU_NOT_IDLE.
-         */
-        if (sd->flags & SD_SHARE_CPUPOWER &&
-            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-                sd_idle = 1;
-        schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
-redo:
-        update_shares_locked(this_rq, sd);
-        group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
-                                   &sd_idle, cpus, NULL);
-        if (!group) {
-                schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
-                goto out_balanced;
-        }
-        busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
-        if (!busiest) {
-                schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
-                goto out_balanced;
-        }
-        BUG_ON(busiest == this_rq);
-        schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
-        ld_moved = 0;
-        if (busiest->nr_running > 1) {
-                /* Attempt to move tasks */
-                double_lock_balance(this_rq, busiest);
-                /* this_rq->clock is already updated */
-                update_rq_clock(busiest);
-                ld_moved = move_tasks(this_rq, this_cpu, busiest,
-                                        imbalance, sd, CPU_NEWLY_IDLE,
-                                        &all_pinned);
-                double_unlock_balance(this_rq, busiest);
-                if (unlikely(all_pinned)) {
-                        cpumask_clear_cpu(cpu_of(busiest), cpus);
-                        if (!cpumask_empty(cpus))
-                                goto redo;
-                }
-        }
-        if (!ld_moved) {
-                int active_balance = 0;
-                schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
-                if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-                    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-                        return -1;
-                if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
-                        return -1;
-                if (sd->nr_balance_failed++ < 2)
-                        return -1;
-                /*
-                 * The only task running in a non-idle cpu can be moved to this
-                 * cpu in an attempt to completely freeup the other CPU
-                 * package. The same method used to move task in load_balance()
-                 * have been extended for load_balance_newidle() to speedup
-                 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)
-                 *
-                 * The package power saving logic comes from
-                 * find_busiest_group().  If there are no imbalance, then
-                 * f_b_g() will return NULL.  However when sched_mc={1,2} then
-                 * f_b_g() will select a group from which a running task may be
-                 * pulled to this cpu in order to make the other package idle.
-                 * If there is no opportunity to make a package idle and if
-                 * there are no imbalance, then f_b_g() will return NULL and no
-                 * action will be taken in load_balance_newidle().
-                 *
-                 * Under normal task pull operation due to imbalance, there
-                 * will be more than one task in the source run queue and
-                 * move_tasks() will succeed.  ld_moved will be true and this
-                 * active balance code will not be triggered.
-                 */
-                /* Lock busiest in correct order while this_rq is held */
-                double_lock_balance(this_rq, busiest);
-                /*
-                 * don't kick the migration_thread, if the curr
-                 * task on busiest cpu can't be moved to this_cpu
-                 */
-                if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
-                        double_unlock_balance(this_rq, busiest);
-                        all_pinned = 1;
-                        return ld_moved;
-                }
-                if (!busiest->active_balance) {
-                        busiest->active_balance = 1;
-                        busiest->push_cpu = this_cpu;
-                        active_balance = 1;
-                }
-                double_unlock_balance(this_rq, busiest);
-                /*
-                 * Should not call ttwu while holding a rq->lock
-                 */
-                raw_spin_unlock(&this_rq->lock);
-                if (active_balance)
-                        wake_up_process(busiest->migration_thread);
-                raw_spin_lock(&this_rq->lock);
-        } else
-                sd->nr_balance_failed = 0;
-        update_shares_locked(this_rq, sd);
-        return ld_moved;
-out_balanced:
-        schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
-        if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-                return -1;
-        sd->nr_balance_failed = 0;
-        return 0;
-}
-/*
- * idle_balance is called by schedule() if this_cpu is about to become
- * idle. Attempts to pull tasks from other CPUs.
- */
-static void idle_balance(int this_cpu, struct rq *this_rq)
-{
-        struct sched_domain *sd;
-        int pulled_task = 0;
-        unsigned long next_balance = jiffies + HZ;
-        this_rq->idle_stamp = this_rq->clock;
-        if (this_rq->avg_idle < sysctl_sched_migration_cost)
-                return;
-        for_each_domain(this_cpu, sd) {
-                unsigned long interval;
-                if (!(sd->flags & SD_LOAD_BALANCE))
-                        continue;
-                if (sd->flags & SD_BALANCE_NEWIDLE)
-                        /* If we've pulled tasks over stop searching: */
-                        pulled_task = load_balance_newidle(this_cpu, this_rq,
-                                                           sd);
-                interval = msecs_to_jiffies(sd->balance_interval);
-                if (time_after(next_balance, sd->last_balance + interval))
-                        next_balance = sd->last_balance + interval;
-                if (pulled_task) {
-                        this_rq->idle_stamp = 0;
-                        break;
-                }
-        }
-        if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
-                /*
-                 * We are going idle. next_balance may be set based on
-                 * a busy processor. So reset next_balance.
-                 */
-                this_rq->next_balance = next_balance;
-        }
-}
-/*
- * active_load_balance is run by migration threads. It pushes running tasks
- * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
- * running on each physical CPU where possible, and avoids physical /
- * logical imbalances.
- *
- * Called with busiest_rq locked.
- */
-static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
-{
-        int target_cpu = busiest_rq->push_cpu;
-        struct sched_domain *sd;
-        struct rq *target_rq;
-        /* Is there any task to move? */
-        if (busiest_rq->nr_running <= 1)
-                return;
-        target_rq = cpu_rq(target_cpu);
-        /*
-         * This condition is "impossible", if it occurs
-         * we need to fix it. Originally reported by
-         * Bjorn Helgaas on a 128-cpu setup.
-         */
-        BUG_ON(busiest_rq == target_rq);
-        /* move a task from busiest_rq to target_rq */
-        double_lock_balance(busiest_rq, target_rq);
-        update_rq_clock(busiest_rq);
-        update_rq_clock(target_rq);
-        /* Search for an sd spanning us and the target CPU. */
-        for_each_domain(target_cpu, sd) {
-                if ((sd->flags & SD_LOAD_BALANCE) &&
-                    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
-                                break;
-        }
-        if (likely(sd)) {
-                schedstat_inc(sd, alb_count);
-                if (move_one_task(target_rq, target_cpu, busiest_rq,
-                                  sd, CPU_IDLE))
-                        schedstat_inc(sd, alb_pushed);
-                else
-                        schedstat_inc(sd, alb_failed);
-        }
-        double_unlock_balance(busiest_rq, target_rq);
-}
-#ifdef CONFIG_NO_HZ
-static struct {
-        atomic_t load_balancer;
-        cpumask_var_t cpu_mask;
-        cpumask_var_t ilb_grp_nohz_mask;
-} nohz ____cacheline_aligned = {
-        .load_balancer = ATOMIC_INIT(-1),
-};
-int get_nohz_load_balancer(void)
-{
-        return atomic_read(&nohz.load_balancer);
-}
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-/**
- * lowest_flag_domain - Return lowest sched_domain containing flag.
- * @cpu:        The cpu whose lowest level of sched domain is to
- *              be returned.
- * @flag:       The flag to check for the lowest sched_domain
- *              for the given cpu.
- *
- * Returns the lowest sched_domain of a cpu which contains the given flag.
- */
-static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
-{
-        struct sched_domain *sd;
-        for_each_domain(cpu, sd)
-                if (sd && (sd->flags & flag))
-                        break;
-        return sd;
-}
-/**
- * for_each_flag_domain - Iterates over sched_domains containing the flag.
- * @cpu:        The cpu whose domains we're iterating over.
- * @sd:         variable holding the value of the power_savings_sd
- *              for cpu.
- * @flag:       The flag to filter the sched_domains to be iterated.
- *
- * Iterates over all the scheduler domains for a given cpu that has the 'flag'
- * set, starting from the lowest sched_domain to the highest.
- */
-#define for_each_flag_domain(cpu, sd, flag) \
-        for (sd = lowest_flag_domain(cpu, flag); \
-                (sd && (sd->flags & flag)); sd = sd->parent)
-/**
- * is_semi_idle_group - Checks if the given sched_group is semi-idle.
- * @ilb_group:  group to be checked for semi-idleness
- *
- * Returns:     1 if the group is semi-idle. 0 otherwise.
- *
- * We define a sched_group to be semi idle if it has atleast one idle-CPU
- * and atleast one non-idle CPU. This helper function checks if the given
- * sched_group is semi-idle or not.
- */
-static inline int is_semi_idle_group(struct sched_group *ilb_group)
-{
-        cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
-                                        sched_group_cpus(ilb_group));
-        /*
-         * A sched_group is semi-idle when it has atleast one busy cpu
-         * and atleast one idle cpu.
-         */
-        if (cpumask_empty(nohz.ilb_grp_nohz_mask))
-                return 0;
-        if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
-                return 0;
-        return 1;
-}
-/**
- * find_new_ilb - Finds the optimum idle load balancer for nomination.
- * @cpu:        The cpu which is nominating a new idle_load_balancer.
- *
- * Returns:     Returns the id of the idle load balancer if it exists,
- *              Else, returns >= nr_cpu_ids.
- *
- * This algorithm picks the idle load balancer such that it belongs to a
- * semi-idle powersavings sched_domain. The idea is to try and avoid
- * completely idle packages/cores just for the purpose of idle load balancing
- * when there are other idle cpu's which are better suited for that job.
- */
-static int find_new_ilb(int cpu)
-{
-        struct sched_domain *sd;
-        struct sched_group *ilb_group;
-        /*
-         * Have idle load balancer selection from semi-idle packages only
-         * when power-aware load balancing is enabled
-         */
-        if (!(sched_smt_power_savings || sched_mc_power_savings))
-                goto out_done;
-        /*
-         * Optimize for the case when we have no idle CPUs or only one
-         * idle CPU. Don't walk the sched_domain hierarchy in such cases
-         */
-        if (cpumask_weight(nohz.cpu_mask) < 2)
-                goto out_done;
-        for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
-                ilb_group = sd->groups;
-                do {
-                        if (is_semi_idle_group(ilb_group))
-                                return cpumask_first(nohz.ilb_grp_nohz_mask);
-                        ilb_group = ilb_group->next;
-                } while (ilb_group != sd->groups);
-        }
-out_done:
-        return cpumask_first(nohz.cpu_mask);
-}
-#else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
-static inline int find_new_ilb(int call_cpu)
-{
-        return cpumask_first(nohz.cpu_mask);
-}
-#endif
-/*
- * This routine will try to nominate the ilb (idle load balancing)
- * owner among the cpus whose ticks are stopped. ilb owner will do the idle
- * load balancing on behalf of all those cpus. If all the cpus in the system
- * go into this tickless mode, then there will be no ilb owner (as there is
- * no need for one) and all the cpus will sleep till the next wakeup event
- * arrives...
- *
- * For the ilb owner, tick is not stopped. And this tick will be used
- * for idle load balancing. ilb owner will still be part of
- * nohz.cpu_mask..
- *
- * While stopping the tick, this cpu will become the ilb owner if there
- * is no other owner. And will be the owner till that cpu becomes busy
- * or if all cpus in the system stop their ticks at which point
- * there is no need for ilb owner.
- *
- * When the ilb owner becomes busy, it nominates another owner, during the
- * next busy scheduler_tick()
- */
-int select_nohz_load_balancer(int stop_tick)
-{
-        int cpu = smp_processor_id();
-        if (stop_tick) {
-                cpu_rq(cpu)->in_nohz_recently = 1;
-                if (!cpu_active(cpu)) {
-                        if (atomic_read(&nohz.load_balancer) != cpu)
-                                return 0;
-                        /*
-                         * If we are going offline and still the leader,
-                         * give up!
-                         */
-                        if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
-                                BUG();
-                        return 0;
-                }
-                cpumask_set_cpu(cpu, nohz.cpu_mask);
-                /* time for ilb owner also to sleep */
-                if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
-                        if (atomic_read(&nohz.load_balancer) == cpu)
-                                atomic_set(&nohz.load_balancer, -1);
-                        return 0;
-                }
-                if (atomic_read(&nohz.load_balancer) == -1) {
-                        /* make me the ilb owner */
-                        if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
-                                return 1;
-                } else if (atomic_read(&nohz.load_balancer) == cpu) {
-                        int new_ilb;
-                        if (!(sched_smt_power_savings ||
-                                                sched_mc_power_savings))
-                                return 1;
-                        /*
-                         * Check to see if there is a more power-efficient
-                         * ilb.
-                         */
-                        new_ilb = find_new_ilb(cpu);
-                        if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
-                                atomic_set(&nohz.load_balancer, -1);
-                                resched_cpu(new_ilb);
-                                return 0;
-                        }
-                        return 1;
-                }
-        } else {
-                if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
-                        return 0;
-                cpumask_clear_cpu(cpu, nohz.cpu_mask);
-                if (atomic_read(&nohz.load_balancer) == cpu)
-                        if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
-                                BUG();
-        }
-        return 0;
-}
-#endif
-static DEFINE_SPINLOCK(balancing);
-/*
- * It checks each scheduling domain to see if it is due to be balanced,
- * and initiates a balancing operation if so.
- *
- * Balancing parameters are set up in arch_init_sched_domains.
- */
-static void rebalance_domains(int cpu, enum cpu_idle_type idle)
-{
-        int balance = 1;
-        struct rq *rq = cpu_rq(cpu);
-        unsigned long interval;
-        struct sched_domain *sd;
-        /* Earliest time when we have to do rebalance again */
-        unsigned long next_balance = jiffies + 60*HZ;
-        int update_next_balance = 0;
-        int need_serialize;
-        for_each_domain(cpu, sd) {
-                if (!(sd->flags & SD_LOAD_BALANCE))
-                        continue;
-                interval = sd->balance_interval;
-                if (idle != CPU_IDLE)
-                        interval *= sd->busy_factor;
-                /* scale ms to jiffies */
-                interval = msecs_to_jiffies(interval);
-                if (unlikely(!interval))
-                        interval = 1;
-                if (interval > HZ*NR_CPUS/10)
-                        interval = HZ*NR_CPUS/10;
-                need_serialize = sd->flags & SD_SERIALIZE;
-                if (need_serialize) {
-                        if (!spin_trylock(&balancing))
-                                goto out;
-                }
-                if (time_after_eq(jiffies, sd->last_balance + interval)) {
-                        if (load_balance(cpu, rq, sd, idle, &balance)) {
-                                /*
-                                 * We've pulled tasks over so either we're no
-                                 * longer idle, or one of our SMT siblings is
-                                 * not idle.
-                                 */
-                                idle = CPU_NOT_IDLE;
-                        }
-                        sd->last_balance = jiffies;
-                }
-                if (need_serialize)
-                        spin_unlock(&balancing);
-out:
-                if (time_after(next_balance, sd->last_balance + interval)) {
-                        next_balance = sd->last_balance + interval;
-                        update_next_balance = 1;
-                }
-                /*
-                 * Stop the load balance at this level. There is another
-                 * CPU in our sched group which is doing load balancing more
-                 * actively.
-                 */
-                if (!balance)
-                        break;
-        }
-        /*
-         * next_balance will be updated only when there is a need.
-         * When the cpu is attached to null domain for ex, it will not be
-         * updated.
-         */
-        if (likely(update_next_balance))
-                rq->next_balance = next_balance;
-}
-/*
- * run_rebalance_domains is triggered when needed from the scheduler tick.
- * In CONFIG_NO_HZ case, the idle load balance owner will do the
- * rebalancing for all the cpus for whom scheduler ticks are stopped.
- */
-static void run_rebalance_domains(struct softirq_action *h)
-{
-        int this_cpu = smp_processor_id();
-        struct rq *this_rq = cpu_rq(this_cpu);
-        enum cpu_idle_type idle = this_rq->idle_at_tick ?
-                                                CPU_IDLE : CPU_NOT_IDLE;
-        rebalance_domains(this_cpu, idle);
-#ifdef CONFIG_NO_HZ
-        /*
-         * If this cpu is the owner for idle load balancing, then do the
-         * balancing on behalf of the other idle cpus whose ticks are
-         * stopped.
-         */
-        if (this_rq->idle_at_tick &&
-            atomic_read(&nohz.load_balancer) == this_cpu) {
-                struct rq *rq;
-                int balance_cpu;
-                for_each_cpu(balance_cpu, nohz.cpu_mask) {
-                        if (balance_cpu == this_cpu)
-                                continue;
-                        /*
-                         * If this cpu gets work to do, stop the load balancing
-                         * work being done for other cpus. Next load
-                         * balancing owner will pick it up.
-                         */
-                        if (need_resched())
-                                break;
-                        rebalance_domains(balance_cpu, CPU_IDLE);
-                        rq = cpu_rq(balance_cpu);
-                        if (time_after(this_rq->next_balance, rq->next_balance))
-                                this_rq->next_balance = rq->next_balance;
-                }
-        }
-#endif
-}
-static inline int on_null_domain(int cpu)
-{
-        return !rcu_dereference_sched(cpu_rq(cpu)->sd);
-}
-/*
- * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
- *
- * In case of CONFIG_NO_HZ, this is the place where we nominate a new
- * idle load balancing owner or decide to stop the periodic load balancing,
- * if the whole system is idle.
- */
-static inline void trigger_load_balance(struct rq *rq, int cpu)
-{
-#ifdef CONFIG_NO_HZ
-        /*
-         * If we were in the nohz mode recently and busy at the current
-         * scheduler tick, then check if we need to nominate new idle
-         * load balancer.
-         */
-        if (rq->in_nohz_recently && !rq->idle_at_tick) {
-                rq->in_nohz_recently = 0;
-                if (atomic_read(&nohz.load_balancer) == cpu) {
-                        cpumask_clear_cpu(cpu, nohz.cpu_mask);
-                        atomic_set(&nohz.load_balancer, -1);
-                }
-                if (atomic_read(&nohz.load_balancer) == -1) {
-                        int ilb = find_new_ilb(cpu);
-                        if (ilb < nr_cpu_ids)
-                                resched_cpu(ilb);
-                }
-        }
-        /*
-         * If this cpu is idle and doing idle load balancing for all the
-         * cpus with ticks stopped, is it time for that to stop?
-         */
-        if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
-            cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
-                resched_cpu(cpu);
-                return;
-        }
-        /*
-         * If this cpu is idle and the idle load balancing is done by
-         * someone else, then no need raise the SCHED_SOFTIRQ
-         */
-        if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
-            cpumask_test_cpu(cpu, nohz.cpu_mask))
-                return;
-#endif
-        /* Don't need to rebalance while attached to NULL domain */
-        if (time_after_eq(jiffies, rq->next_balance) &&
-            likely(!on_null_domain(cpu)))
-                raise_softirq(SCHED_SOFTIRQ);
-}
-#else   /* CONFIG_SMP */
-/*
- * on UP we do not need to balance between CPUs:
- */
-static inline void idle_balance(int cpu, struct rq *rq)
-{
-}
 #endif
 DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -5314,7 +3515,7 @@ void scheduler_tick(void)
        curr->sched_class->task_tick(rq, curr, 0);
        raw_spin_unlock(&rq->lock);
-        perf_event_task_tick(curr, cpu);
+        perf_event_task_tick(curr);
 #ifdef CONFIG_SMP
        rq->idle_at_tick = idle_cpu(cpu);
@@ -5528,7 +3729,7 @@ need_resched_nonpreemptible:
        if (likely(prev != next)) {
                sched_info_switch(prev, next);
-                perf_event_task_sched_out(prev, next, cpu);
+                perf_event_task_sched_out(prev, next);
                rq->nr_switches++;
                rq->curr = next;
@@ -6059,7 +4260,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
        unsigned long flags;
        int oldprio, on_rq, running;
        struct rq *rq;
-        const struct sched_class *prev_class = p->sched_class;
+        const struct sched_class *prev_class;
        BUG_ON(prio < 0 || prio > MAX_PRIO);
@@ -6067,6 +4268,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
        update_rq_clock(rq);
        oldprio = p->prio;
+        prev_class = p->sched_class;
        on_rq = p->se.on_rq;
        running = task_current(rq, p);
        if (on_rq)
@@ -6084,7 +4286,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
        if (running)
                p->sched_class->set_curr_task(rq);
        if (on_rq) {
-                enqueue_task(rq, p, 0);
+                enqueue_task(rq, p, 0, oldprio < prio);
                check_class_changed(rq, p, prev_class, oldprio, running);
        }
@@ -6128,7 +4330,7 @@ void set_user_nice(struct task_struct *p, long nice)
        delta = p->prio - old_prio;
        if (on_rq) {
-                enqueue_task(rq, p, 0);
+                enqueue_task(rq, p, 0, false);
                /*
                 * If the task increased its priority or is running and
                 * lowered its priority, then reschedule its CPU:
@@ -6286,7 +4488,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
 {
        int retval, oldprio, oldpolicy = -1, on_rq, running;
        unsigned long flags;
-        const struct sched_class *prev_class = p->sched_class;
+        const struct sched_class *prev_class;
        struct rq *rq;
        int reset_on_fork;
@@ -6400,6 +4602,7 @@ recheck:
        p->sched_reset_on_fork = reset_on_fork;
        oldprio = p->prio;
+        prev_class = p->sched_class;
        __setscheduler(rq, p, policy, param->sched_priority);
        if (running)
@@ -7150,27 +5353,8 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
        struct rq *rq;
        int ret = 0;
-        /*
-         * Since we rely on wake-ups to migrate sleeping tasks, don't change
-         * the ->cpus_allowed mask from under waking tasks, which would be
-         * possible when we change rq->lock in ttwu(), so synchronize against
-         * TASK_WAKING to avoid that.
-         *
-         * Make an exception for freshly cloned tasks, since cpuset namespaces
-         * might move the task about, we have to validate the target in
-         * wake_up_new_task() anyway since the cpu might have gone away.
-         */
-again:
-        while (p->state == TASK_WAKING && !(p->flags & PF_STARTING))
-                cpu_relax();
        rq = task_rq_lock(p, &flags);
-        if (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) {
-                task_rq_unlock(rq, &flags);
-                goto again;
-        }
        if (!cpumask_intersects(new_mask, cpu_active_mask)) {
                ret = -EINVAL;
                goto out;
@@ -9457,7 +7641,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
        tg->rt_rq[cpu] = rt_rq;
        init_rt_rq(rt_rq, rq);
        rt_rq->tg = tg;
-        rt_rq->rt_se = rt_se;
        rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
        if (add)
                list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
@@ -9488,9 +7671,6 @@ void __init sched_init(void)
 #ifdef CONFIG_RT_GROUP_SCHED
        alloc_size += 2 * nr_cpu_ids * sizeof(void **);
 #endif
-#ifdef CONFIG_USER_SCHED
-        alloc_size *= 2;
-#endif
 #ifdef CONFIG_CPUMASK_OFFSTACK
        alloc_size += num_possible_cpus() * cpumask_size();
 #endif
@@ -9504,13 +7684,6 @@ void __init sched_init(void)
                init_task_group.cfs_rq = (struct cfs_rq **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);
-#ifdef CONFIG_USER_SCHED
-                root_task_group.se = (struct sched_entity **)ptr;
-                ptr += nr_cpu_ids * sizeof(void **);
-                root_task_group.cfs_rq = (struct cfs_rq **)ptr;
-                ptr += nr_cpu_ids * sizeof(void **);
-#endif /* CONFIG_USER_SCHED */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 #ifdef CONFIG_RT_GROUP_SCHED
                init_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -9519,13 +7692,6 @@ void __init sched_init(void)
                init_task_group.rt_rq = (struct rt_rq **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);
-#ifdef CONFIG_USER_SCHED
-                root_task_group.rt_se = (struct sched_rt_entity **)ptr;
-                ptr += nr_cpu_ids * sizeof(void **);
-                root_task_group.rt_rq = (struct rt_rq **)ptr;
-                ptr += nr_cpu_ids * sizeof(void **);
-#endif /* CONFIG_USER_SCHED */
 #endif /* CONFIG_RT_GROUP_SCHED */
 #ifdef CONFIG_CPUMASK_OFFSTACK
                for_each_possible_cpu(i) {
@@ -9545,22 +7711,13 @@ void __init sched_init(void)
 #ifdef CONFIG_RT_GROUP_SCHED
        init_rt_bandwidth(&init_task_group.rt_bandwidth,
                        global_rt_period(), global_rt_runtime());
-#ifdef CONFIG_USER_SCHED
-        init_rt_bandwidth(&root_task_group.rt_bandwidth,
-                        global_rt_period(), RUNTIME_INF);
-#endif /* CONFIG_USER_SCHED */
 #endif /* CONFIG_RT_GROUP_SCHED */
-#ifdef CONFIG_GROUP_SCHED
+#ifdef CONFIG_CGROUP_SCHED
        list_add(&init_task_group.list, &task_groups);
        INIT_LIST_HEAD(&init_task_group.children);
-#ifdef CONFIG_USER_SCHED
+#endif /* CONFIG_CGROUP_SCHED */
-        INIT_LIST_HEAD(&root_task_group.children);
-        init_task_group.parent = &root_task_group;
-        list_add(&init_task_group.siblings, &root_task_group.children);
-#endif /* CONFIG_USER_SCHED */
-#endif /* CONFIG_GROUP_SCHED */
 #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
        update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
@@ -9600,25 +7757,6 @@ void __init sched_init(void)
                 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
                 */
                init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
-#elif defined CONFIG_USER_SCHED
-                root_task_group.shares = NICE_0_LOAD;
-                init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
-                /*
-                 * In case of task-groups formed thr' the user id of tasks,
-                 * init_task_group represents tasks belonging to root user.
-                 * Hence it forms a sibling of all subsequent groups formed.
-                 * In this case, init_task_group gets only a fraction of overall
-                 * system cpu resource, based on the weight assigned to root
-                 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
-                 * by letting tasks of init_task_group sit in a separate cfs_rq
-                 * (init_tg_cfs_rq) and having one entity represent this group of
-                 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
-                 */
-                init_tg_cfs_entry(&init_task_group,
-                                &per_cpu(init_tg_cfs_rq, i),
-                                &per_cpu(init_sched_entity, i), i, 1,
-                                root_task_group.se[i]);
 #endif
 #endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -9627,12 +7765,6 @@ void __init sched_init(void)
                INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
 #ifdef CONFIG_CGROUP_SCHED
                init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
-#elif defined CONFIG_USER_SCHED
-                init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
-                init_tg_rt_entry(&init_task_group,
-                                &per_cpu(init_rt_rq_var, i),
-                                &per_cpu(init_sched_rt_entity, i), i, 1,
-                                root_task_group.rt_se[i]);
 #endif
 #endif
@@ -9717,7 +7849,7 @@ static inline int preempt_count_equals(int preempt_offset)
        return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
 }
-void __might_sleep(char *file, int line, int preempt_offset)
+void __might_sleep(const char *file, int line, int preempt_offset)
 {
 #ifdef in_atomic
        static unsigned long prev_jiffy;        /* ratelimiting */
@@ -10028,7 +8160,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
-#ifdef CONFIG_GROUP_SCHED
+#ifdef CONFIG_CGROUP_SCHED
 static void free_sched_group(struct task_group *tg)
 {
        free_fair_sched_group(tg);
@@ -10133,11 +8265,11 @@ void sched_move_task(struct task_struct *tsk)
        if (unlikely(running))
                tsk->sched_class->set_curr_task(rq);
        if (on_rq)
-                enqueue_task(rq, tsk, 0);
+                enqueue_task(rq, tsk, 0, false);
        task_rq_unlock(rq, &flags);
 }
-#endif /* CONFIG_GROUP_SCHED */
+#endif /* CONFIG_CGROUP_SCHED */
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static void __set_se_shares(struct sched_entity *se, unsigned long shares)
@@ -10279,13 +8411,6 @@ static int tg_schedulable(struct task_group *tg, void *data)
                runtime = d->rt_runtime;
        }
-#ifdef CONFIG_USER_SCHED
-        if (tg == &root_task_group) {
-                period = global_rt_period();
-                runtime = global_rt_runtime();
-        }
-#endif
        /*
         * Cannot have more runtime than the period.
         */
@@ -10905,12 +9030,30 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 }
 /*
+ * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
+ * in cputime_t units. As a result, cpuacct_update_stats calls
+ * percpu_counter_add with values large enough to always overflow the
+ * per cpu batch limit causing bad SMP scalability.
+ *
+ * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
+ * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
+ * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
+ */
+#ifdef CONFIG_SMP
+#define CPUACCT_BATCH   \
+        min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
+#else
+#define CPUACCT_BATCH   0
+#endif
+/*
 * Charge the system/user time to the task's accounting group.
 */
 static void cpuacct_update_stats(struct task_struct *tsk,
                enum cpuacct_stat_index idx, cputime_t val)
 {
        struct cpuacct *ca;
+        int batch = CPUACCT_BATCH;
        if (unlikely(!cpuacct_subsys.active))
                return;
@@ -10919,7 +9062,7 @@ static void cpuacct_update_stats(struct task_struct *tsk,
        ca = task_ca(tsk);
        do {
-                percpu_counter_add(&ca->cpustat[idx], val);
+                __percpu_counter_add(&ca->cpustat[idx], val, batch);
                ca = ca->parent;
        } while (ca);
        rcu_read_unlock();
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 597b33099dfa..eeb3506c4834 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -47,9 +47,7 @@ static int convert_prio(int prio)
 }
 #define for_each_cpupri_active(array, idx)                    \
-  for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES);     \
+        for_each_bit(idx, array, CPUPRI_NR_PRIORITIES)
-       idx < CPUPRI_NR_PRIORITIES;                            \
-       idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1))
 /**
 * cpupri_find - find the best (lowest-pri) CPU in the system
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 8fe7ee81c552..3e1fd96c6cf9 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1053,7 +1053,8 @@ static inline void hrtick_update(struct rq *rq)
 * increased. Here we update the fair scheduling stats and
 * then put the task into the rbtree:
 */
-static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
+static void
+enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
 {
        struct cfs_rq *cfs_rq;
        struct sched_entity *se = &p->se;
@@ -1815,57 +1816,164 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
 */
 /*
- * Load-balancing iterator. Note: while the runqueue stays locked
+ * pull_task - move a task from a remote runqueue to the local runqueue.
- * during the whole iteration, the current task might be
+ * Both runqueues must be locked.
- * dequeued so the iterator has to be dequeue-safe. Here we
- * achieve that by always pre-iterating before returning
- * the current task:
 */
-static struct task_struct *
+static void pull_task(struct rq *src_rq, struct task_struct *p,
-__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
+                      struct rq *this_rq, int this_cpu)
 {
-        struct task_struct *p = NULL;
+        deactivate_task(src_rq, p, 0);
-        struct sched_entity *se;
+        set_task_cpu(p, this_cpu);
+        activate_task(this_rq, p, 0);
+        check_preempt_curr(this_rq, p, 0);
+}
-        if (next == &cfs_rq->tasks)
+/*
-                return NULL;
+ * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
+ */
+static
+int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
+                     struct sched_domain *sd, enum cpu_idle_type idle,
+                     int *all_pinned)
+{
+        int tsk_cache_hot = 0;
+        /*
+         * We do not migrate tasks that are:
+         * 1) running (obviously), or
+         * 2) cannot be migrated to this CPU due to cpus_allowed, or
+         * 3) are cache-hot on their current CPU.
+         */
+        if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
+                schedstat_inc(p, se.nr_failed_migrations_affine);
+                return 0;
+        }
+        *all_pinned = 0;
-        se = list_entry(next, struct sched_entity, group_node);
+        if (task_running(rq, p)) {
-        p = task_of(se);
+                schedstat_inc(p, se.nr_failed_migrations_running);
-        cfs_rq->balance_iterator = next->next;
+                return 0;
+        }
-        return p;
+        /*
-}
+         * Aggressive migration if:
+         * 1) task is cache cold, or
+         * 2) too many balance attempts have failed.
+         */
-static struct task_struct *load_balance_start_fair(void *arg)
+        tsk_cache_hot = task_hot(p, rq->clock, sd);
-{
+        if (!tsk_cache_hot ||
-        struct cfs_rq *cfs_rq = arg;
+                sd->nr_balance_failed > sd->cache_nice_tries) {
+#ifdef CONFIG_SCHEDSTATS
+                if (tsk_cache_hot) {
+                        schedstat_inc(sd, lb_hot_gained[idle]);
+                        schedstat_inc(p, se.nr_forced_migrations);
+                }
+#endif
+                return 1;
+        }
-        return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next);
+        if (tsk_cache_hot) {
+                schedstat_inc(p, se.nr_failed_migrations_hot);
+                return 0;
+        }
+        return 1;
 }
-static struct task_struct *load_balance_next_fair(void *arg)
+/*
+ * move_one_task tries to move exactly one task from busiest to this_rq, as
+ * part of active balancing operations within "domain".
+ * Returns 1 if successful and 0 otherwise.
+ *
+ * Called with both runqueues locked.
+ */
+static int
+move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
+              struct sched_domain *sd, enum cpu_idle_type idle)
 {
-        struct cfs_rq *cfs_rq = arg;
+        struct task_struct *p, *n;
+        struct cfs_rq *cfs_rq;
+        int pinned = 0;
+        for_each_leaf_cfs_rq(busiest, cfs_rq) {
+                list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
+                        if (!can_migrate_task(p, busiest, this_cpu,
+                                                sd, idle, &pinned))
+                                continue;
-        return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
+                        pull_task(busiest, p, this_rq, this_cpu);
+                        /*
+                         * Right now, this is only the second place pull_task()
+                         * is called, so we can safely collect pull_task()
+                         * stats here rather than inside pull_task().
+                         */
+                        schedstat_inc(sd, lb_gained[idle]);
+                        return 1;
+                }
+        }
+        return 0;
 }
 static unsigned long
-__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
-                unsigned long max_load_move, struct sched_domain *sd,
+              unsigned long max_load_move, struct sched_domain *sd,
-                enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
+              enum cpu_idle_type idle, int *all_pinned,
-                struct cfs_rq *cfs_rq)
+              int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
 {
-        struct rq_iterator cfs_rq_iterator;
+        int loops = 0, pulled = 0, pinned = 0;
+        long rem_load_move = max_load_move;
+        struct task_struct *p, *n;
-        cfs_rq_iterator.start = load_balance_start_fair;
+        if (max_load_move == 0)
-        cfs_rq_iterator.next = load_balance_next_fair;
+                goto out;
-        cfs_rq_iterator.arg = cfs_rq;
-        return balance_tasks(this_rq, this_cpu, busiest,
+        pinned = 1;
-                        max_load_move, sd, idle, all_pinned,
-                        this_best_prio, &cfs_rq_iterator);
+        list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
+                if (loops++ > sysctl_sched_nr_migrate)
+                        break;
+                if ((p->se.load.weight >> 1) > rem_load_move ||
+                    !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned))
+                        continue;
+                pull_task(busiest, p, this_rq, this_cpu);
+                pulled++;
+                rem_load_move -= p->se.load.weight;
+#ifdef CONFIG_PREEMPT
+                /*
+                 * NEWIDLE balancing is a source of latency, so preemptible
+                 * kernels will stop after the first task is pulled to minimize
+                 * the critical section.
+                 */
+                if (idle == CPU_NEWLY_IDLE)
+                        break;
+#endif
+                /*
+                 * We only want to steal up to the prescribed amount of
+                 * weighted load.
+                 */
+                if (rem_load_move <= 0)
+                        break;
+                if (p->prio < *this_best_prio)
+                        *this_best_prio = p->prio;
+        }
+out:
+        /*
+         * Right now, this is one of only two places pull_task() is called,
+         * so we can safely collect pull_task() stats here rather than
+         * inside pull_task().
+         */
+        schedstat_add(sd, lb_gained[idle], pulled);
+        if (all_pinned)
+                *all_pinned = pinned;
+        return max_load_move - rem_load_move;
 }
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1897,9 +2005,9 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                rem_load = (u64)rem_load_move * busiest_weight;
                rem_load = div_u64(rem_load, busiest_h_load + 1);
-                moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
+                moved_load = balance_tasks(this_rq, this_cpu, busiest,
                                rem_load, sd, idle, all_pinned, this_best_prio,
-                                tg->cfs_rq[busiest_cpu]);
+                                busiest_cfs_rq);
                if (!moved_load)
                        continue;
@@ -1922,35 +2030,1509 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                  struct sched_domain *sd, enum cpu_idle_type idle,
                  int *all_pinned, int *this_best_prio)
 {
-        return __load_balance_fair(this_rq, this_cpu, busiest,
+        return balance_tasks(this_rq, this_cpu, busiest,
                        max_load_move, sd, idle, all_pinned,
                        this_best_prio, &busiest->cfs);
 }
 #endif
-static int
+/*
-move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ * move_tasks tries to move up to max_load_move weighted load from busiest to
-                   struct sched_domain *sd, enum cpu_idle_type idle)
+ * this_rq, as part of a balancing operation within domain "sd".
+ * Returns 1 if successful and 0 otherwise.
+ *
+ * Called with both runqueues locked.
+ */
+static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+                      unsigned long max_load_move,
+                      struct sched_domain *sd, enum cpu_idle_type idle,
+                      int *all_pinned)
 {
-        struct cfs_rq *busy_cfs_rq;
+        unsigned long total_load_moved = 0, load_moved;
-        struct rq_iterator cfs_rq_iterator;
+        int this_best_prio = this_rq->curr->prio;
-        cfs_rq_iterator.start = load_balance_start_fair;
+        do {
-        cfs_rq_iterator.next = load_balance_next_fair;
+                load_moved = load_balance_fair(this_rq, this_cpu, busiest,
+                                max_load_move - total_load_moved,
+                                sd, idle, all_pinned, &this_best_prio);
-        for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
+                total_load_moved += load_moved;
+#ifdef CONFIG_PREEMPT
                /*
-                 * pass busy_cfs_rq argument into
+                 * NEWIDLE balancing is a source of latency, so preemptible
-                 * load_balance_[start|next]_fair iterators
+                 * kernels will stop after the first task is pulled to minimize
+                 * the critical section.
                 */
-                cfs_rq_iterator.arg = busy_cfs_rq;
+                if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
-                if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle,
+                        break;
-                                       &cfs_rq_iterator))
-                    return 1;
+                if (raw_spin_is_contended(&this_rq->lock) ||
+                                raw_spin_is_contended(&busiest->lock))
+                        break;
+#endif
+        } while (load_moved && max_load_move > total_load_moved);
+        return total_load_moved > 0;
+}
+/********** Helpers for find_busiest_group ************************/
+/*
+ * sd_lb_stats - Structure to store the statistics of a sched_domain
+ *              during load balancing.
+ */
+struct sd_lb_stats {
+        struct sched_group *busiest; /* Busiest group in this sd */
+        struct sched_group *this;  /* Local group in this sd */
+        unsigned long total_load;  /* Total load of all groups in sd */
+        unsigned long total_pwr;   /*   Total power of all groups in sd */
+        unsigned long avg_load;    /* Average load across all groups in sd */
+        /** Statistics of this group */
+        unsigned long this_load;
+        unsigned long this_load_per_task;
+        unsigned long this_nr_running;
+        /* Statistics of the busiest group */
+        unsigned long max_load;
+        unsigned long busiest_load_per_task;
+        unsigned long busiest_nr_running;
+        unsigned long busiest_group_capacity;
+        int group_imb; /* Is there imbalance in this sd */
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+        int power_savings_balance; /* Is powersave balance needed for this sd */
+        struct sched_group *group_min; /* Least loaded group in sd */
+        struct sched_group *group_leader; /* Group which relieves group_min */
+        unsigned long min_load_per_task; /* load_per_task in group_min */
+        unsigned long leader_nr_running; /* Nr running of group_leader */
+        unsigned long min_nr_running; /* Nr running of group_min */
+#endif
+};
+/*
+ * sg_lb_stats - stats of a sched_group required for load_balancing
+ */
+struct sg_lb_stats {
+        unsigned long avg_load; /*Avg load across the CPUs of the group */
+        unsigned long group_load; /* Total load over the CPUs of the group */
+        unsigned long sum_nr_running; /* Nr tasks running in the group */
+        unsigned long sum_weighted_load; /* Weighted load of group's tasks */
+        unsigned long group_capacity;
+        int group_imb; /* Is there an imbalance in the group ? */
+};
+/**
+ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
+ * @group: The group whose first cpu is to be returned.
+ */
+static inline unsigned int group_first_cpu(struct sched_group *group)
+{
+        return cpumask_first(sched_group_cpus(group));
+}
+/**
+ * get_sd_load_idx - Obtain the load index for a given sched domain.
+ * @sd: The sched_domain whose load_idx is to be obtained.
+ * @idle: The idle status of the CPU for which the sd's load_idx is obtained.
+ */
+static inline int get_sd_load_idx(struct sched_domain *sd,
+                                        enum cpu_idle_type idle)
+{
+        int load_idx;
+        switch (idle) {
+        case CPU_NOT_IDLE:
+                load_idx = sd->busy_idx;
+                break;
+        case CPU_NEWLY_IDLE:
+                load_idx = sd->newidle_idx;
+                break;
+        default:
+                load_idx = sd->idle_idx;
+                break;
        }
+        return load_idx;
+}
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * init_sd_power_savings_stats - Initialize power savings statistics for
+ * the given sched_domain, during load balancing.
+ *
+ * @sd: Sched domain whose power-savings statistics are to be initialized.
+ * @sds: Variable containing the statistics for sd.
+ * @idle: Idle status of the CPU at which we're performing load-balancing.
+ */
+static inline void init_sd_power_savings_stats(struct sched_domain *sd,
+        struct sd_lb_stats *sds, enum cpu_idle_type idle)
+{
+        /*
+         * Busy processors will not participate in power savings
+         * balance.
+         */
+        if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+                sds->power_savings_balance = 0;
+        else {
+                sds->power_savings_balance = 1;
+                sds->min_nr_running = ULONG_MAX;
+                sds->leader_nr_running = 0;
+        }
+}
+/**
+ * update_sd_power_savings_stats - Update the power saving stats for a
+ * sched_domain while performing load balancing.
+ *
+ * @group: sched_group belonging to the sched_domain under consideration.
+ * @sds: Variable containing the statistics of the sched_domain
+ * @local_group: Does group contain the CPU for which we're performing
+ *              load balancing ?
+ * @sgs: Variable containing the statistics of the group.
+ */
+static inline void update_sd_power_savings_stats(struct sched_group *group,
+        struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
+{
+        if (!sds->power_savings_balance)
+                return;
+        /*
+         * If the local group is idle or completely loaded
+         * no need to do power savings balance at this domain
+         */
+        if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
+                                !sds->this_nr_running))
+                sds->power_savings_balance = 0;
+        /*
+         * If a group is already running at full capacity or idle,
+         * don't include that group in power savings calculations
+         */
+        if (!sds->power_savings_balance ||
+                sgs->sum_nr_running >= sgs->group_capacity ||
+                !sgs->sum_nr_running)
+                return;
+        /*
+         * Calculate the group which has the least non-idle load.
+         * This is the group from where we need to pick up the load
+         * for saving power
+         */
+        if ((sgs->sum_nr_running < sds->min_nr_running) ||
+            (sgs->sum_nr_running == sds->min_nr_running &&
+             group_first_cpu(group) > group_first_cpu(sds->group_min))) {
+                sds->group_min = group;
+                sds->min_nr_running = sgs->sum_nr_running;
+                sds->min_load_per_task = sgs->sum_weighted_load /
+                                                sgs->sum_nr_running;
+        }
+        /*
+         * Find the group which is close to its capacity but
+         * still has some room to pick up load from another
+         * group, so that more power can be saved
+         */
+        if (sgs->sum_nr_running + 1 > sgs->group_capacity)
+                return;
+        if (sgs->sum_nr_running > sds->leader_nr_running ||
+            (sgs->sum_nr_running == sds->leader_nr_running &&
+             group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
+                sds->group_leader = group;
+                sds->leader_nr_running = sgs->sum_nr_running;
+        }
+}
+/**
+ * check_power_save_busiest_group - see if there is potential for some power-savings balance
+ * @sds: Variable containing the statistics of the sched_domain
+ *      under consideration.
+ * @this_cpu: Cpu at which we're currently performing load-balancing.
+ * @imbalance: Variable to store the imbalance.
+ *
+ * Description:
+ * Check if we have potential to perform some power-savings balance.
+ * If yes, set the busiest group to be the least loaded group in the
+ * sched_domain, so that its CPUs can be put to idle.
+ *
+ * Returns 1 if there is potential to perform power-savings balance.
+ * Else returns 0.
+ */
+static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
+                                        int this_cpu, unsigned long *imbalance)
+{
+        if (!sds->power_savings_balance)
+                return 0;
+        if (sds->this != sds->group_leader ||
+                        sds->group_leader == sds->group_min)
+                return 0;
+        *imbalance = sds->min_load_per_task;
+        sds->busiest = sds->group_min;
+        return 1;
+}
+#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+static inline void init_sd_power_savings_stats(struct sched_domain *sd,
+        struct sd_lb_stats *sds, enum cpu_idle_type idle)
+{
+        return;
+}
+static inline void update_sd_power_savings_stats(struct sched_group *group,
+        struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
+{
+        return;
+}
+static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
+                                        int this_cpu, unsigned long *imbalance)
+{
        return 0;
 }
+#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
+{
+        return SCHED_LOAD_SCALE;
+}
+unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
+{
+        return default_scale_freq_power(sd, cpu);
+}
+unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
+{
+        unsigned long weight = cpumask_weight(sched_domain_span(sd));
+        unsigned long smt_gain = sd->smt_gain;
+        smt_gain /= weight;
+        return smt_gain;
+}
+unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
+{
+        return default_scale_smt_power(sd, cpu);
+}
+/*
+ * scale_rt_power - scaling factor for the time on @cpu not covered by
+ * rq->rt_avg.
+ *
+ * total is the averaging period plus the time elapsed since rq->age_stamp;
+ * available is the part of total that is not accounted in rq->rt_avg.
+ * The result is available divided by (total >> SCHED_LOAD_SHIFT), i.e. the
+ * available fraction expressed relative to SCHED_LOAD_SCALE.
+ */
+unsigned long scale_rt_power(int cpu)
+{
+        struct rq *rq = cpu_rq(cpu);
+        u64 total, available;
+        sched_avg_update(rq);
+        total = sched_avg_period() + (rq->clock - rq->age_stamp);
+        if (unlikely(total < rq->rt_avg)) {
+                /*
+                 * rt_avg can exceed total; an unsigned subtraction would
+                 * wrap around to a huge value, so treat it as "nothing
+                 * available" instead.
+                 */
+                available = 0;
+        } else {
+                available = total - rq->rt_avg;
+        }
+        /* Keep the divisor below from becoming zero after the shift. */
+        if (unlikely((s64)total < SCHED_LOAD_SCALE))
+                total = SCHED_LOAD_SCALE;
+        total >>= SCHED_LOAD_SHIFT;
+        return div_u64(available, total);
+}
+/*
+ * update_cpu_power - recompute cpu_power of @sd's first group for @cpu.
+ *
+ * Starts from SCHED_LOAD_SCALE and applies, in order, the frequency factor,
+ * the SMT factor (only for SD_SHARE_CPUPOWER domains spanning more than one
+ * cpu) and the scale_rt_power() factor. Each factor is relative to
+ * SCHED_LOAD_SCALE, hence the shift after every multiplication. The result
+ * is never stored as zero.
+ */
+static void update_cpu_power(struct sched_domain *sd, int cpu)
+{
+        unsigned long nr_span = cpumask_weight(sched_domain_span(sd));
+        unsigned long scaled = SCHED_LOAD_SCALE;
+        struct sched_group *first_group = sd->groups;
+        unsigned long factor;
+        /* Frequency scaling: arch hook or the generic default. */
+        if (sched_feat(ARCH_POWER))
+                factor = arch_scale_freq_power(sd, cpu);
+        else
+                factor = default_scale_freq_power(sd, cpu);
+        scaled = (scaled * factor) >> SCHED_LOAD_SHIFT;
+        /* SMT scaling: only when siblings share cpu power. */
+        if ((sd->flags & SD_SHARE_CPUPOWER) && nr_span > 1) {
+                if (sched_feat(ARCH_POWER))
+                        factor = arch_scale_smt_power(sd, cpu);
+                else
+                        factor = default_scale_smt_power(sd, cpu);
+                scaled = (scaled * factor) >> SCHED_LOAD_SHIFT;
+        }
+        scaled = (scaled * scale_rt_power(cpu)) >> SCHED_LOAD_SHIFT;
+        /* cpu_power is used as a divisor elsewhere; never let it be 0. */
+        first_group->cpu_power = scaled ? scaled : 1;
+}
+/*
+ * update_group_power - refresh cpu_power of @sd's first group.
+ *
+ * Without a child domain the value is computed by update_cpu_power().
+ * Otherwise it is the sum of cpu_power over all groups of the child domain.
+ */
+static void update_group_power(struct sched_domain *sd, int cpu)
+{
+        struct sched_domain *child = sd->child;
+        struct sched_group *target = sd->groups;
+        struct sched_group *iter;
+        unsigned long sum = 0;
+        if (!child) {
+                update_cpu_power(sd, cpu);
+                return;
+        }
+        /* The child's groups form a circular list; walk it exactly once. */
+        iter = child->groups;
+        do {
+                sum += iter->cpu_power;
+                iter = iter->next;
+        } while (iter != child->groups);
+        target->cpu_power = sum;
+}
+/**
+ * update_sg_lb_stats - Update sched_group's statistics for load balancing.
+ * @sd: The sched_domain whose statistics are to be updated.
+ * @group: sched_group whose statistics are to be updated.
+ * @this_cpu: Cpu for which load balance is currently performed.
+ * @idle: Idle status of this_cpu
+ * @load_idx: Load index of sched_domain of this_cpu for load calc.
+ * @sd_idle: Idle status of the sched_domain containing group.
+ * @local_group: Does group contain this_cpu.
+ * @cpus: Set of cpus considered for load balancing.
+ * @balance: Should we balance.
+ * @sgs: variable to hold the statistics for this group.
+ *
+ * The fields of @sgs are only accumulated or set here, never reset, so the
+ * caller must pass in a zeroed structure.
+ *
+ * *@sd_idle is cleared as soon as one of the considered cpus has running
+ * tasks. If this is the local group, @idle is not CPU_NEWLY_IDLE and
+ * @this_cpu is not the chosen balance cpu, *@balance is set to 0 and the
+ * function returns before avg_load, group_imb and group_capacity are
+ * computed.
+ */
+static inline void update_sg_lb_stats(struct sched_domain *sd,
+                        struct sched_group *group, int this_cpu,
+                        enum cpu_idle_type idle, int load_idx, int *sd_idle,
+                        int local_group, const struct cpumask *cpus,
+                        int *balance, struct sg_lb_stats *sgs)
+{
+        unsigned long load, max_cpu_load, min_cpu_load;
+        int i;
+        /* balance_cpu starts as -1 (wraps to UINT_MAX): nothing chosen yet */
+        unsigned int balance_cpu = -1, first_idle_cpu = 0;
+        unsigned long avg_load_per_task = 0;
+        /* Default balance cpu for the local group is its first cpu */
+        if (local_group)
+                balance_cpu = group_first_cpu(group);
+        /* Tally up the load of all CPUs in the group */
+        max_cpu_load = 0;
+        min_cpu_load = ~0UL;
+        for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+                struct rq *rq = cpu_rq(i);
+                if (*sd_idle && rq->nr_running)
+                        *sd_idle = 0;
+                /* Bias balancing toward cpus of our domain */
+                if (local_group) {
+                        /* The first idle cpu found overrides the default */
+                        if (idle_cpu(i) && !first_idle_cpu) {
+                                first_idle_cpu = 1;
+                                balance_cpu = i;
+                        }
+                        load = target_load(i, load_idx);
+                } else {
+                        /* max/min per-cpu load is tracked for remote groups only */
+                        load = source_load(i, load_idx);
+                        if (load > max_cpu_load)
+                                max_cpu_load = load;
+                        if (min_cpu_load > load)
+                                min_cpu_load = load;
+                }
+                sgs->group_load += load;
+                sgs->sum_nr_running += rq->nr_running;
+                sgs->sum_weighted_load += weighted_cpuload(i);
+        }
+        /*
+         * First idle cpu or the first cpu(busiest) in this sched group
+         * is eligible for doing load balancing at this and above
+         * domains. In the newly idle case, we will allow all the cpu's
+         * to do the newly idle load balance.
+         */
+        if (idle != CPU_NEWLY_IDLE && local_group &&
+            balance_cpu != this_cpu) {
+                *balance = 0;
+                return;
+        }
+        /* Refreshes sd->groups->cpu_power before group->cpu_power is read */
+        update_group_power(sd, this_cpu);
+        /* Adjust by relative CPU power of the group */
+        sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
+        /*
+         * Consider the group unbalanced when the imbalance is larger
+         * than the average weight of two tasks.
+         *
+         * APZ: with cgroup the avg task weight can vary wildly and
+         *      might not be a suitable number - should we keep a
+         *      normalized nr_running number somewhere that negates
+         *      the hierarchy?
+         */
+        if (sgs->sum_nr_running)
+                avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
+        /*
+         * NOTE(review): when max/min were never updated (local group, or no
+         * cpu of the group in @cpus) the unsigned difference below wraps
+         * to 1, which sets group_imb if avg_load_per_task is 0.
+         */
+        if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
+                sgs->group_imb = 1;
+        /* Capacity is cpu_power expressed in units of SCHED_LOAD_SCALE */
+        sgs->group_capacity =
+                DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+}
+/**
+ * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
+ * @sd: sched_domain whose statistics are to be updated.
+ * @this_cpu: Cpu for which load balance is currently performed.
+ * @idle: Idle status of this_cpu
+ * @sd_idle: Idle status of the sched_domain containing group.
+ * @cpus: Set of cpus considered for load balancing.
+ * @balance: Should we balance.
+ * @sds: variable to hold the statistics for this sched_domain.
+ *
+ * Walks every group of @sd once. Returns early, leaving @sds only partially
+ * filled, as soon as update_sg_lb_stats() clears *@balance for the local
+ * group.
+ *
+ * Note that this_load_per_task and busiest_load_per_task are filled with the
+ * group's summed weighted load here; the division by the number of running
+ * tasks is done later by fix_small_imbalance() and calculate_imbalance().
+ */
+static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
+                        enum cpu_idle_type idle, int *sd_idle,
+                        const struct cpumask *cpus, int *balance,
+                        struct sd_lb_stats *sds)
+{
+        struct sched_domain *child = sd->child;
+        struct sched_group *group = sd->groups;
+        struct sg_lb_stats sgs;
+        int load_idx, prefer_sibling = 0;
+        if (child && child->flags & SD_PREFER_SIBLING)
+                prefer_sibling = 1;
+        init_sd_power_savings_stats(sd, sds, idle);
+        load_idx = get_sd_load_idx(sd, idle);
+        do {
+                int local_group;
+                local_group = cpumask_test_cpu(this_cpu,
+                                               sched_group_cpus(group));
+                /* update_sg_lb_stats() only accumulates; start from zero */
+                memset(&sgs, 0, sizeof(sgs));
+                update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
+                                local_group, cpus, balance, &sgs);
+                /* this_cpu is not the one that should balance at this level */
+                if (local_group && !(*balance))
+                        return;
+                sds->total_load += sgs.group_load;
+                sds->total_pwr += group->cpu_power;
+                /*
+                 * In case the child domain prefers tasks go to siblings
+                 * first, lower the group capacity to one so that we'll try
+                 * and move all the excess tasks away.
+                 */
+                if (prefer_sibling)
+                        sgs.group_capacity = min(sgs.group_capacity, 1UL);
+                if (local_group) {
+                        sds->this_load = sgs.avg_load;
+                        sds->this = group;
+                        sds->this_nr_running = sgs.sum_nr_running;
+                        sds->this_load_per_task = sgs.sum_weighted_load;
+                } else if (sgs.avg_load > sds->max_load &&
+                           (sgs.sum_nr_running > sgs.group_capacity ||
+                                sgs.group_imb)) {
+                        /*
+                         * New busiest candidate: highest avg_load so far among
+                         * remote groups that are over capacity or imbalanced.
+                         */
+                        sds->max_load = sgs.avg_load;
+                        sds->busiest = group;
+                        sds->busiest_nr_running = sgs.sum_nr_running;
+                        sds->busiest_group_capacity = sgs.group_capacity;
+                        sds->busiest_load_per_task = sgs.sum_weighted_load;
+                        sds->group_imb = sgs.group_imb;
+                }
+                update_sd_power_savings_stats(group, sds, local_group, &sgs);
+                group = group->next;
+        } while (group != sd->groups);
+}
+/**
+ * fix_small_imbalance - Calculate the minor imbalance that exists
+ *                      amongst the groups of a sched_domain, during
+ *                      load balancing.
+ * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
+ * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
+ * @imbalance: Variable to store the imbalance.
+ *
+ * *@imbalance is set to the busiest group's load per task when moving one
+ * task is judged worthwhile; otherwise it is left unchanged. As a side
+ * effect sds->this_load_per_task is converted from a sum to a per-task
+ * value.
+ */
+static inline void fix_small_imbalance(struct sd_lb_stats *sds,
+                                int this_cpu, unsigned long *imbalance)
+{
+        unsigned long tmp, pwr_now = 0, pwr_move = 0;
+        unsigned int imbn = 2;
+        unsigned long scaled_busy_load_per_task;
+        if (sds->this_nr_running) {
+                sds->this_load_per_task /= sds->this_nr_running;
+                if (sds->busiest_load_per_task >
+                                sds->this_load_per_task)
+                        imbn = 1;
+        } else
+                sds->this_load_per_task =
+                        cpu_avg_load_per_task(this_cpu);
+        /* Busiest group's per-task load, normalised by that group's power */
+        scaled_busy_load_per_task = sds->busiest_load_per_task
+                                                 * SCHED_LOAD_SCALE;
+        scaled_busy_load_per_task /= sds->busiest->cpu_power;
+        /*
+         * The load gap plus one scaled task is at least imbn scaled tasks:
+         * moving a single task is justified.
+         */
+        if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
+                        (scaled_busy_load_per_task * imbn)) {
+                *imbalance = sds->busiest_load_per_task;
+                return;
+        }
+        /*
+         * OK, we don't have enough imbalance to justify moving tasks,
+         * however we may be able to increase total CPU power used by
+         * moving them.
+         */
+        /* pwr_now: power-weighted load as things stand now */
+        pwr_now += sds->busiest->cpu_power *
+                        min(sds->busiest_load_per_task, sds->max_load);
+        pwr_now += sds->this->cpu_power *
+                        min(sds->this_load_per_task, sds->this_load);
+        pwr_now /= SCHED_LOAD_SCALE;
+        /* Amount of load we'd subtract */
+        tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
+                sds->busiest->cpu_power;
+        /* Guard against unsigned underflow of max_load - tmp */
+        if (sds->max_load > tmp)
+                pwr_move += sds->busiest->cpu_power *
+                        min(sds->busiest_load_per_task, sds->max_load - tmp);
+        /* Amount of load we'd add */
+        if (sds->max_load * sds->busiest->cpu_power <
+                sds->busiest_load_per_task * SCHED_LOAD_SCALE)
+                tmp = (sds->max_load * sds->busiest->cpu_power) /
+                        sds->this->cpu_power;
+        else
+                tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
+                        sds->this->cpu_power;
+        /* pwr_move: the same measure after hypothetically moving one task */
+        pwr_move += sds->this->cpu_power *
+                        min(sds->this_load_per_task, sds->this_load + tmp);
+        pwr_move /= SCHED_LOAD_SCALE;
+        /* Move if we gain throughput */
+        if (pwr_move > pwr_now)
+                *imbalance = sds->busiest_load_per_task;
+}
+/**
+ * calculate_imbalance - Calculate the amount of imbalance present within the
+ *                       groups of a given sched_domain during load balance.
+ * @sds: statistics of the sched_domain whose imbalance is to be calculated.
+ * @this_cpu: Cpu for which currently load balance is being performed.
+ * @imbalance: The variable to store the imbalance.
+ *
+ * Divides by sds->busiest_nr_running and dereferences sds->busiest, so the
+ * caller must have checked that a busiest group with running tasks exists.
+ * As a side effect sds->busiest_load_per_task is converted from a sum to a
+ * per-task value.
+ */
+static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
+                unsigned long *imbalance)
+{
+        /* ~0UL means "no capacity limit" in the min() further down */
+        unsigned long max_pull, load_above_capacity = ~0UL;
+        sds->busiest_load_per_task /= sds->busiest_nr_running;
+        if (sds->group_imb) {
+                sds->busiest_load_per_task =
+                        min(sds->busiest_load_per_task, sds->avg_load);
+        }
+        /*
+         * In the presence of smp nice balancing, certain scenarios can have
+         * max load less than avg load(as we skip the groups at or below
+         * its cpu_power, while calculating max_load..)
+         */
+        if (sds->max_load < sds->avg_load) {
+                *imbalance = 0;
+                return fix_small_imbalance(sds, this_cpu, imbalance);
+        }
+        if (!sds->group_imb) {
+                /*
+                 * Don't want to pull so many tasks that a group would go idle.
+                 */
+                load_above_capacity = (sds->busiest_nr_running -
+                                                sds->busiest_group_capacity);
+                load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE);
+                load_above_capacity /= sds->busiest->cpu_power;
+        }
+        /*
+         * We're trying to get all the cpus to the average_load, so we don't
+         * want to push ourselves above the average load, nor do we wish to
+         * reduce the max loaded cpu below the average load. At the same time,
+         * we also don't want to reduce the group load below the group capacity
+         * (so that we can implement power-savings policies etc). Thus we look
+         * for the minimum possible imbalance.
+         * Be careful of negative numbers as they'll appear as very large values
+         * with unsigned longs.
+         */
+        max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
+        /* How much load to actually move to equalise the imbalance */
+        *imbalance = min(max_pull * sds->busiest->cpu_power,
+                (sds->avg_load - sds->this_load) * sds->this->cpu_power)
+                        / SCHED_LOAD_SCALE;
+        /*
+         * if *imbalance is less than the average load per runnable task
+         * there is no guarantee that any tasks will be moved so we'll have
+         * a think about bumping its value to force at least one task to be
+         * moved
+         */
+        if (*imbalance < sds->busiest_load_per_task)
+                return fix_small_imbalance(sds, this_cpu, imbalance);
+}
+/******* find_busiest_group() helpers end here *********************/
+/**
+ * find_busiest_group - Returns the busiest group within the sched_domain
+ * if there is an imbalance. If there isn't an imbalance, and
+ * the user has opted for power-savings, it returns a group whose
+ * CPUs can be put to idle by rebalancing those tasks elsewhere, if
+ * such a group exists.
+ *
+ * Also calculates the amount of weighted load which should be moved
+ * to restore balance.
+ *
+ * @sd: The sched_domain whose busiest group is to be returned.
+ * @this_cpu: The cpu for which load balancing is currently being performed.
+ * @imbalance: Variable which stores amount of weighted load which should
+ *              be moved to restore balance/put a group to idle.
+ * @idle: The idle status of this_cpu.
+ * @sd_idle: The idleness of sd
+ * @cpus: The set of CPUs under consideration for load-balancing.
+ * @balance: Pointer to a variable indicating if this_cpu
+ *      is the appropriate cpu to perform load balancing at this_level.
+ *
+ * Returns:     - the busiest group if imbalance exists.
+ *              - If no imbalance and user has opted for power-savings balance,
+ *                 return the least loaded group whose CPUs can be
+ *                 put to idle by rebalancing its tasks onto our group.
+ *              - NULL, with *@imbalance set to 0, in every other case.
+ */
+static struct sched_group *
+find_busiest_group(struct sched_domain *sd, int this_cpu,
+                   unsigned long *imbalance, enum cpu_idle_type idle,
+                   int *sd_idle, const struct cpumask *cpus, int *balance)
+{
+        struct sd_lb_stats sds;
+        memset(&sds, 0, sizeof(sds));
+        /*
+         * Compute the various statistics relevant for load balancing at
+         * this level.
+         */
+        update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
+                                        balance, &sds);
+        /* Cases where imbalance does not exist from POV of this_cpu */
+        /* 1) this_cpu is not the appropriate cpu to perform load balancing
+         *    at this level.
+         * 2) There is no busy sibling group to pull from.
+         * 3) This group is the busiest group.
+         * 4) This group is more busy than the avg busyness at this
+         *    sched_domain.
+         * 5) The imbalance is within the specified limit.
+         */
+        /* Case 1: skips the power-savings check as well */
+        if (!(*balance))
+                goto ret;
+        /* Case 2 */
+        if (!sds.busiest || sds.busiest_nr_running == 0)
+                goto out_balanced;
+        /* Case 3 */
+        if (sds.this_load >= sds.max_load)
+                goto out_balanced;
+        sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
+        /* Case 4 */
+        if (sds.this_load >= sds.avg_load)
+                goto out_balanced;
+        /* Case 5: imbalance_pct is a percentage threshold */
+        if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+                goto out_balanced;
+        /* Looks like there is an imbalance. Compute it */
+        calculate_imbalance(&sds, this_cpu, imbalance);
+        return sds.busiest;
+out_balanced:
+        /*
+         * There is no obvious imbalance. But check if we can do some balancing
+         * to save power.
+         */
+        if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
+                return sds.busiest;
+ret:
+        *imbalance = 0;
+        return NULL;
+}
+/*
+ * find_busiest_queue - find the busiest runqueue among the cpus in group.
+ *
+ * Only cpus that are also set in @cpus are considered. "Busiest" means the
+ * highest weighted load after scaling by the cpu's power. Returns NULL when
+ * no cpu qualifies. @idle is not used by this function.
+ */
+static struct rq *
+find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
+                   unsigned long imbalance, const struct cpumask *cpus)
+{
+        struct rq *busiest = NULL, *rq;
+        unsigned long max_load = 0;
+        int i;
+        for_each_cpu(i, sched_group_cpus(group)) {
+                unsigned long power = power_of(i);
+                unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
+                unsigned long wl;
+                if (!cpumask_test_cpu(i, cpus))
+                        continue;
+                rq = cpu_rq(i);
+                wl = weighted_cpuload(i);
+                /*
+                 * When comparing with imbalance, use weighted_cpuload()
+                 * which is not scaled with the cpu power.
+                 */
+                /*
+                 * Skip a cpu whose single task alone exceeds the imbalance,
+                 * unless the cpu's capacity rounds down to zero.
+                 */
+                if (capacity && rq->nr_running == 1 && wl > imbalance)
+                        continue;
+                /*
+                 * For the load comparisons with the other cpu's, consider
+                 * the weighted_cpuload() scaled with the cpu power, so that
+                 * the load can be moved away from the cpu that is potentially
+                 * running at a lower capacity.
+                 */
+                wl = (wl * SCHED_LOAD_SCALE) / power;
+                /* Strict '>': a cpu with zero scaled load is never picked */
+                if (wl > max_load) {
+                        max_load = wl;
+                        busiest = rq;
+                }
+        }
+        return busiest;
+}
+/*
+ * Max backoff if we encounter pinned tasks. The value is fairly arbitrary;
+ * it only needs to be large enough.
+ */
+#define MAX_PINNED_INTERVAL     512
+/* Working cpumask for load_balance and load_balance_newidle. */
+static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
+/*
+ * need_active_balance - should load_balance() kick active balancing?
+ *
+ * In general the answer is yes once the domain has failed to balance more
+ * than cache_nice_tries + 2 times. The newly idle case has two extra
+ * conditions under which the answer is always no.
+ *
+ * Background for the newly idle case: the only task running in a non-idle
+ * cpu can be moved to this cpu in an attempt to completely freeup the other
+ * CPU package.
+ *
+ * The package power saving logic comes from find_busiest_group(). If there
+ * are no imbalance, then f_b_g() will return NULL. However when
+ * sched_mc={1,2} then f_b_g() will select a group from which a running task
+ * may be pulled to this cpu in order to make the other package idle. If
+ * there is no opportunity to make a package idle and if there are no
+ * imbalance, then f_b_g() will return NULL and no action will be taken in
+ * load_balance_newidle().
+ *
+ * Under normal task pull operation due to imbalance, there will be more
+ * than one task in the source run queue and move_tasks() will succeed.
+ * ld_moved will be true and this active balance code will not be triggered.
+ */
+static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
+{
+        if (idle == CPU_NEWLY_IDLE) {
+                if ((!sd_idle && (sd->flags & SD_SHARE_CPUPOWER) &&
+                     !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) ||
+                    sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
+                        return 0;
+        }
+        return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
+}
+/*
+ * Check this_cpu to ensure it is balanced within domain. Attempt to move
+ * tasks if there is an imbalance.
+ *
+ * @balance is an output of find_busiest_group(): it is cleared when
+ * this_cpu is not the cpu that should balance at this level.
+ *
+ * Returns the result of move_tasks() when tasks were moved, 0 when nothing
+ * was moved, or -1 when nothing was moved (or the domain was found
+ * balanced) on an SD_SHARE_CPUPOWER domain that is not sd_idle and whose
+ * parent does not have SD_POWERSAVINGS_BALANCE set.
+ */
+static int load_balance(int this_cpu, struct rq *this_rq,
+                        struct sched_domain *sd, enum cpu_idle_type idle,
+                        int *balance)
+{
+        int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
+        struct sched_group *group;
+        unsigned long imbalance;
+        struct rq *busiest;
+        unsigned long flags;
+        struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
+        /* Start from all active cpus; pinned-only cpus are removed on redo */
+        cpumask_copy(cpus, cpu_active_mask);
+        /*
+         * When power savings policy is enabled for the parent domain, idle
+         * sibling can pick up load irrespective of busy siblings. In this case,
+         * let the state of idle sibling percolate up as CPU_IDLE, instead of
+         * portraying it as CPU_NOT_IDLE.
+         */
+        if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
+            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
+                sd_idle = 1;
+        schedstat_inc(sd, lb_count[idle]);
+redo:
+        update_shares(sd);
+        group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
+                                   cpus, balance);
+        if (*balance == 0)
+                goto out_balanced;
+        if (!group) {
+                schedstat_inc(sd, lb_nobusyg[idle]);
+                goto out_balanced;
+        }
+        busiest = find_busiest_queue(group, idle, imbalance, cpus);
+        if (!busiest) {
+                schedstat_inc(sd, lb_nobusyq[idle]);
+                goto out_balanced;
+        }
+        BUG_ON(busiest == this_rq);
+        schedstat_add(sd, lb_imbalance[idle], imbalance);
+        ld_moved = 0;
+        if (busiest->nr_running > 1) {
+                /*
+                 * Attempt to move tasks. If find_busiest_group has found
+                 * an imbalance but busiest->nr_running <= 1, the group is
+                 * still unbalanced. ld_moved simply stays zero, so it is
+                 * correctly treated as an imbalance.
+                 */
+                local_irq_save(flags);
+                double_rq_lock(this_rq, busiest);
+                ld_moved = move_tasks(this_rq, this_cpu, busiest,
+                                      imbalance, sd, idle, &all_pinned);
+                double_rq_unlock(this_rq, busiest);
+                local_irq_restore(flags);
+                /*
+                 * some other cpu did the load balance for us.
+                 */
+                if (ld_moved && this_cpu != smp_processor_id())
+                        resched_cpu(this_cpu);
+                /* All tasks on this runqueue were pinned by CPU affinity */
+                if (unlikely(all_pinned)) {
+                        /* Retry without this cpu while any candidates remain */
+                        cpumask_clear_cpu(cpu_of(busiest), cpus);
+                        if (!cpumask_empty(cpus))
+                                goto redo;
+                        goto out_balanced;
+                }
+        }
+        if (!ld_moved) {
+                schedstat_inc(sd, lb_failed[idle]);
+                sd->nr_balance_failed++;
+                if (need_active_balance(sd, sd_idle, idle)) {
+                        raw_spin_lock_irqsave(&busiest->lock, flags);
+                        /* don't kick the migration_thread, if the curr
+                         * task on busiest cpu can't be moved to this_cpu
+                         */
+                        if (!cpumask_test_cpu(this_cpu,
+                                              &busiest->curr->cpus_allowed)) {
+                                raw_spin_unlock_irqrestore(&busiest->lock,
+                                                            flags);
+                                all_pinned = 1;
+                                goto out_one_pinned;
+                        }
+                        /* Claim the active-balance slot unless already taken */
+                        if (!busiest->active_balance) {
+                                busiest->active_balance = 1;
+                                busiest->push_cpu = this_cpu;
+                                active_balance = 1;
+                        }
+                        raw_spin_unlock_irqrestore(&busiest->lock, flags);
+                        /* The wakeup is done after busiest->lock is released */
+                        if (active_balance)
+                                wake_up_process(busiest->migration_thread);
+                        /*
+                         * We've kicked active balancing, reset the failure
+                         * counter.
+                         */
+                        sd->nr_balance_failed = sd->cache_nice_tries+1;
+                }
+        } else
+                sd->nr_balance_failed = 0;
+        if (likely(!active_balance)) {
+                /* We were unbalanced, so reset the balancing interval */
+                sd->balance_interval = sd->min_interval;
+        } else {
+                /*
+                 * If we've begun active balancing, start to back off. This
+                 * case may not be covered by the all_pinned logic if there
+                 * is only 1 task on the busy runqueue (because we don't call
+                 * move_tasks).
+                 */
+                if (sd->balance_interval < sd->max_interval)
+                        sd->balance_interval *= 2;
+        }
+        if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
+                ld_moved = -1;
+        goto out;
+out_balanced:
+        schedstat_inc(sd, lb_balanced[idle]);
+        sd->nr_balance_failed = 0;
+out_one_pinned:
+        /* tune up the balancing interval */
+        if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
+                        (sd->balance_interval < sd->max_interval))
+                sd->balance_interval *= 2;
+        if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+            !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
+                ld_moved = -1;
+        else
+                ld_moved = 0;
+out:
+        /* Non-zero here also covers the -1 case set above */
+        if (ld_moved)
+                update_shares(sd);
+        return ld_moved;
+}
+/*
+ * idle_balance is called by schedule() if this_cpu is about to become
+ * idle. Attempts to pull tasks from other CPUs.
+ *
+ * this_rq->lock is released while the domains are walked and re-taken
+ * before returning. No balancing is attempted (and the lock is not
+ * dropped) when the rq's avg_idle is below sysctl_sched_migration_cost.
+ */
+static void idle_balance(int this_cpu, struct rq *this_rq)
+{
+        struct sched_domain *sd;
+        int pulled_task = 0;
+        /* Upper bound for the next balance: one second from now */
+        unsigned long next_balance = jiffies + HZ;
+        this_rq->idle_stamp = this_rq->clock;
+        if (this_rq->avg_idle < sysctl_sched_migration_cost)
+                return;
+        /*
+         * Drop the rq->lock, but keep IRQ/preempt disabled.
+         */
+        raw_spin_unlock(&this_rq->lock);
+        for_each_domain(this_cpu, sd) {
+                unsigned long interval;
+                int balance = 1;
+                if (!(sd->flags & SD_LOAD_BALANCE))
+                        continue;
+                if (sd->flags & SD_BALANCE_NEWIDLE) {
+                        /* If we've pulled tasks over stop searching: */
+                        pulled_task = load_balance(this_cpu, this_rq,
+                                                   sd, CPU_NEWLY_IDLE, &balance);
+                }
+                /* Track the earliest time at which any domain is due again */
+                interval = msecs_to_jiffies(sd->balance_interval);
+                if (time_after(next_balance, sd->last_balance + interval))
+                        next_balance = sd->last_balance + interval;
+                if (pulled_task) {
+                        /* Not going idle after all: clear the idle stamp */
+                        this_rq->idle_stamp = 0;
+                        break;
+                }
+        }
+        raw_spin_lock(&this_rq->lock);
+        if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
+                /*
+                 * We are going idle. next_balance may be set based on
+                 * a busy processor. So reset next_balance.
+                 */
+                this_rq->next_balance = next_balance;
+        }
+}
+/*
+ * active_load_balance is run by migration threads. It pushes running tasks
+ * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
+ * running on each physical CPU where possible, and avoids physical /
+ * logical imbalances.
+ *
+ * Called with busiest_rq locked.
+ *
+ * The target cpu is taken from busiest_rq->push_cpu. At most one task is
+ * moved, and only if a load-balancing domain of the target cpu spans
+ * @busiest_cpu.
+ */
+static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
+{
+        int target_cpu = busiest_rq->push_cpu;
+        struct sched_domain *sd;
+        struct rq *target_rq;
+        /* Is there any task to move? */
+        if (busiest_rq->nr_running <= 1)
+                return;
+        target_rq = cpu_rq(target_cpu);
+        /*
+         * This condition is "impossible", if it occurs
+         * we need to fix it. Originally reported by
+         * Bjorn Helgaas on a 128-cpu setup.
+         */
+        BUG_ON(busiest_rq == target_rq);
+        /* move a task from busiest_rq to target_rq */
+        double_lock_balance(busiest_rq, target_rq);
+        update_rq_clock(busiest_rq);
+        update_rq_clock(target_rq);
+        /* Search for an sd spanning us and the target CPU. */
+        for_each_domain(target_cpu, sd) {
+                if ((sd->flags & SD_LOAD_BALANCE) &&
+                    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
+                                break;
+        }
+        /* sd is NULL here when no domain matched */
+        if (likely(sd)) {
+                schedstat_inc(sd, alb_count);
+                if (move_one_task(target_rq, target_cpu, busiest_rq,
+                                  sd, CPU_IDLE))
+                        schedstat_inc(sd, alb_pushed);
+                else
+                        schedstat_inc(sd, alb_failed);
+        }
+        double_unlock_balance(busiest_rq, target_rq);
+}
+#ifdef CONFIG_NO_HZ
+/*
+ * State used for idle load balancing when CONFIG_NO_HZ is enabled.
+ *
+ * load_balancer:     initialised to -1; read via get_nohz_load_balancer().
+ *                    Presumably the id of the cpu currently acting as idle
+ *                    load balancer, with -1 meaning none - TODO confirm.
+ * cpu_mask:          is_semi_idle_group() treats the cpus set here as the
+ *                    idle ones.
+ * ilb_grp_nohz_mask: scratch mask written by is_semi_idle_group().
+ */
+static struct {
+        atomic_t load_balancer;
+        cpumask_var_t cpu_mask;
+        cpumask_var_t ilb_grp_nohz_mask;
+} nohz ____cacheline_aligned = {
+        .load_balancer = ATOMIC_INIT(-1),
+};
+/* Return the current value of nohz.load_balancer (initially -1). */
+int get_nohz_load_balancer(void)
+{
+        int ilb = atomic_read(&nohz.load_balancer);
+        return ilb;
+}
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * lowest_flag_domain - Find the lowest sched_domain of a cpu with a flag set.
+ * @cpu:        The cpu whose sched_domain hierarchy is searched.
+ * @flag:       The flag(s) to look for in sd->flags.
+ *
+ * Walks the domains of @cpu in for_each_domain() order and returns the first
+ * one that has any bit of @flag set. The result is whatever the iteration
+ * cursor holds when the walk finishes without a match.
+ */
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+        struct sched_domain *sd;
+        for_each_domain(cpu, sd) {
+                if (!sd)
+                        continue;
+                if (sd->flags & flag)
+                        break;
+        }
+        return sd;
+}
+/**
+ * for_each_flag_domain - Iterates over sched_domains containing the flag.
+ * @cpu:        The cpu whose domains we're iterating over.
+ * @sd:         Iteration cursor (a struct sched_domain pointer lvalue).
+ * @flag:       The flag to filter the sched_domains to be iterated.
+ *
+ * Starts at lowest_flag_domain(@cpu, @flag) and follows ->parent upwards.
+ * The walk stops at the first domain that does not have @flag set, or when
+ * there is no parent left.
+ *
+ * @sd and @flag are expanded more than once, so they must be free of side
+ * effects. Both are parenthesised in the expansion so that compound
+ * arguments (e.g. a flag expression such as A | B) are evaluated as a unit.
+ */
+#define for_each_flag_domain(cpu, sd, flag) \
+        for ((sd) = lowest_flag_domain(cpu, flag); \
+                ((sd) && ((sd)->flags & (flag))); (sd) = (sd)->parent)
+/**
+ * is_semi_idle_group - Checks if the given sched_group is semi-idle.
+ * @ilb_group:  group to be checked for semi-idleness
+ *
+ * Returns:     1 if the group is semi-idle. 0 otherwise.
+ *
+ * A sched_group is semi-idle if it has at least one idle CPU and at least
+ * one non-idle CPU, where "idle" means being set in nohz.cpu_mask.
+ *
+ * As a side effect nohz.ilb_grp_nohz_mask is overwritten with the group's
+ * cpus that are set in nohz.cpu_mask.
+ */
+static inline int is_semi_idle_group(struct sched_group *ilb_group)
+{
+        /* Idle cpus of this group = nohz cpus AND group cpus. */
+        cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+                    sched_group_cpus(ilb_group));
+        /*
+         * Not semi-idle if no cpu of the group is idle, nor if all of
+         * them are.
+         */
+        return !cpumask_empty(nohz.ilb_grp_nohz_mask) &&
+               !cpumask_equal(nohz.ilb_grp_nohz_mask,
+                              sched_group_cpus(ilb_group));
+}
+/**
+ * find_new_ilb - Finds the optimum idle load balancer for nomination.
+ * @cpu:        The cpu which is nominating a new idle_load_balancer.
+ *
+ * Returns:     The id of the chosen idle load balancer if one exists,
+ *              otherwise a value >= nr_cpu_ids.
+ *
+ * Prefers an idle cpu from a semi-idle group of a powersavings sched_domain
+ * of @cpu, so that completely idle packages/cores are not used just for
+ * idle load balancing while better suited idle cpus exist. Falls back to
+ * the first cpu in nohz.cpu_mask.
+ */
+static int find_new_ilb(int cpu)
+{
+        struct sched_domain *sd;
+        struct sched_group *grp;
+        /*
+         * Only look for semi-idle packages when power-aware load
+         * balancing is enabled.
+         */
+        if (!sched_smt_power_savings && !sched_mc_power_savings)
+                return cpumask_first(nohz.cpu_mask);
+        /*
+         * With zero or one idle CPU there is nothing to choose from, so
+         * skip the walk over the sched_domain hierarchy.
+         */
+        if (cpumask_weight(nohz.cpu_mask) < 2)
+                return cpumask_first(nohz.cpu_mask);
+        for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
+                /* The groups of a domain form a circular list. */
+                grp = sd->groups;
+                do {
+                        if (is_semi_idle_group(grp))
+                                return cpumask_first(nohz.ilb_grp_nohz_mask);
+                        grp = grp->next;
+                } while (grp != sd->groups);
+        }
+        return cpumask_first(nohz.cpu_mask);
+}
+#else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
+/*
+ * Variant used without CONFIG_SCHED_MC and CONFIG_SCHED_SMT: simply pick
+ * the first cpu set in nohz.cpu_mask. @call_cpu is not used.
+ */
+static inline int find_new_ilb(int call_cpu)
+{
+        int first_nohz_cpu = cpumask_first(nohz.cpu_mask);
+        return first_nohz_cpu;
+}
+#endif
+/*
+ * This routine will try to nominate the ilb (idle load balancing)
+ * owner among the cpus whose ticks are stopped. The ilb owner will do the
+ * idle load balancing on behalf of all those cpus. If all the cpus in the
+ * system go into this tickless mode, then there will be no ilb owner (as
+ * there is no need for one) and all the cpus will sleep till the next
+ * wakeup event arrives.
+ *
+ * For the ilb owner, the tick is not stopped. And this tick will be used
+ * for idle load balancing. The ilb owner will still be part of
+ * nohz.cpu_mask.
+ *
+ * While stopping the tick, this cpu will become the ilb owner if there
+ * is no other owner. And will be the owner till that cpu becomes busy
+ * or if all cpus in the system stop their ticks at which point
+ * there is no need for an ilb owner.
+ *
+ * When the ilb owner becomes busy, it nominates another owner, during the
+ * next busy scheduler_tick().
+ *
+ * @stop_tick: non-zero when the calling cpu wants to stop its tick; zero to
+ *             take the calling cpu out of nohz.cpu_mask again.
+ *
+ * Returns 1 when the calling cpu is (or has just become) the ilb owner,
+ * 0 in every other case.
+ */
+int select_nohz_load_balancer(int stop_tick)
+{
+        int cpu = smp_processor_id();
+        if (stop_tick) {
+                /* Consumed and cleared again by trigger_load_balance(). */
+                cpu_rq(cpu)->in_nohz_recently = 1;
+                /* An inactive cpu is never added to nohz.cpu_mask. */
+                if (!cpu_active(cpu)) {
+                        if (atomic_read(&nohz.load_balancer) != cpu)
+                                return 0;
+                        /*
+                         * If we are going offline and still the leader,
+                         * give up!
+                         */
+                        if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+                                BUG();
+                        return 0;
+                }
+                cpumask_set_cpu(cpu, nohz.cpu_mask);
+                /* time for ilb owner also to sleep */
+                if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
+                        if (atomic_read(&nohz.load_balancer) == cpu)
+                                atomic_set(&nohz.load_balancer, -1);
+                        return 0;
+                }
+                if (atomic_read(&nohz.load_balancer) == -1) {
+                        /* make me the ilb owner */
+                        /* If the cmpxchg loses, fall through to return 0. */
+                        if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
+                                return 1;
+                } else if (atomic_read(&nohz.load_balancer) == cpu) {
+                        /* We already are the owner. */
+                        int new_ilb;
+                        if (!(sched_smt_power_savings ||
+                                                sched_mc_power_savings))
+                                return 1;
+                        /*
+                         * Check to see if there is a more power-efficient
+                         * ilb.
+                         */
+                        new_ilb = find_new_ilb(cpu);
+                        if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
+                                /*
+                                 * Give up ownership and kick the better
+                                 * candidate; presumably it then claims
+                                 * ownership through this function -
+                                 * TODO confirm against resched_cpu() users.
+                                 */
+                                atomic_set(&nohz.load_balancer, -1);
+                                resched_cpu(new_ilb);
+                                return 0;
+                        }
+                        return 1;
+                }
+        } else {
+                /* Nothing to undo if we were never marked tickless. */
+                if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
+                        return 0;
+                cpumask_clear_cpu(cpu, nohz.cpu_mask);
+                /* Drop ownership if we hold it. */
+                if (atomic_read(&nohz.load_balancer) == cpu)
+                        if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+                                BUG();
+        }
+        return 0;
+}
+#endif
+/* Taken with spin_trylock() around balancing of SD_SERIALIZE domains. */
+static DEFINE_SPINLOCK(balancing);
+/*
+ * It checks each scheduling domain to see if it is due to be balanced,
+ * and initiates a balancing operation if so.
+ *
+ * Balancing parameters are set up in arch_init_sched_domains.
+ *
+ * @cpu:  the cpu whose sched_domains are walked.
+ * @idle: idle state passed on to load_balance(); switched to CPU_NOT_IDLE
+ *        for the remaining domains once load_balance() returns non-zero.
+ *
+ * On return, the runqueue's next_balance holds the earliest time at which
+ * one of the visited domains is due again (unless no domain was visited).
+ */
+static void rebalance_domains(int cpu, enum cpu_idle_type idle)
+{
+        /* Passed to load_balance(); the walk stops once it reads back as 0. */
+        int balance = 1;
+        struct rq *rq = cpu_rq(cpu);
+        unsigned long interval;
+        struct sched_domain *sd;
+        /* Earliest time when we have to do rebalance again */
+        unsigned long next_balance = jiffies + 60*HZ;
+        int update_next_balance = 0;
+        int need_serialize;
+        for_each_domain(cpu, sd) {
+                if (!(sd->flags & SD_LOAD_BALANCE))
+                        continue;
+                /* A non-idle cpu balances less often, by busy_factor. */
+                interval = sd->balance_interval;
+                if (idle != CPU_IDLE)
+                        interval *= sd->busy_factor;
+                /* scale ms to jiffies */
+                interval = msecs_to_jiffies(interval);
+                /* Clamp to [1, HZ*NR_CPUS/10] jiffies. */
+                if (unlikely(!interval))
+                        interval = 1;
+                if (interval > HZ*NR_CPUS/10)
+                        interval = HZ*NR_CPUS/10;
+                need_serialize = sd->flags & SD_SERIALIZE;
+                if (need_serialize) {
+                        /*
+                         * Lock is busy: skip balancing this domain for now,
+                         * but still account it in next_balance below.
+                         */
+                        if (!spin_trylock(&balancing))
+                                goto out;
+                }
+                if (time_after_eq(jiffies, sd->last_balance + interval)) {
+                        if (load_balance(cpu, rq, sd, idle, &balance)) {
+                                /*
+                                 * We've pulled tasks over so either we're no
+                                 * longer idle, or one of our SMT siblings is
+                                 * not idle.
+                                 */
+                                idle = CPU_NOT_IDLE;
+                        }
+                        sd->last_balance = jiffies;
+                }
+                if (need_serialize)
+                        spin_unlock(&balancing);
+out:
+                /* Track the soonest "due again" time over all domains. */
+                if (time_after(next_balance, sd->last_balance + interval)) {
+                        next_balance = sd->last_balance + interval;
+                        update_next_balance = 1;
+                }
+                /*
+                 * Stop the load balance at this level. There is another
+                 * CPU in our sched group which is doing load balancing more
+                 * actively.
+                 */
+                if (!balance)
+                        break;
+        }
+        /*
+         * next_balance will be updated only when there is a need.
+         * When the cpu is attached to null domain for ex, it will not be
+         * updated.
+         */
+        if (likely(update_next_balance))
+                rq->next_balance = next_balance;
+}
+/*
+ * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * In CONFIG_NO_HZ case, the idle load balance owner will do the
+ * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ *
+ * @h: softirq action descriptor; not used by this handler.
+ */
+static void run_rebalance_domains(struct softirq_action *h)
+{
+        int this_cpu = smp_processor_id();
+        struct rq *this_rq = cpu_rq(this_cpu);
+        enum cpu_idle_type idle = this_rq->idle_at_tick ?
+                                                CPU_IDLE : CPU_NOT_IDLE;
+        /* Always balance our own domains first. */
+        rebalance_domains(this_cpu, idle);
+#ifdef CONFIG_NO_HZ
+        /*
+         * If this cpu is the owner for idle load balancing, then do the
+         * balancing on behalf of the other idle cpus whose ticks are
+         * stopped.
+         */
+        if (this_rq->idle_at_tick &&
+            atomic_read(&nohz.load_balancer) == this_cpu) {
+                struct rq *rq;
+                int balance_cpu;
+                for_each_cpu(balance_cpu, nohz.cpu_mask) {
+                        /* Our own domains were handled above. */
+                        if (balance_cpu == this_cpu)
+                                continue;
+                        /*
+                         * If this cpu gets work to do, stop the load balancing
+                         * work being done for other cpus. Next load
+                         * balancing owner will pick it up.
+                         */
+                        if (need_resched())
+                                break;
+                        rebalance_domains(balance_cpu, CPU_IDLE);
+                        rq = cpu_rq(balance_cpu);
+                        /*
+                         * Pull our own next_balance forward to the earliest
+                         * one among the cpus we balance for.
+                         */
+                        if (time_after(this_rq->next_balance, rq->next_balance))
+                                this_rq->next_balance = rq->next_balance;
+                }
+        }
+#endif
+}
+/* Returns 1 when no sched_domain is attached to @cpu's runqueue, else 0. */
+static inline int on_null_domain(int cpu)
+{
+        return rcu_dereference(cpu_rq(cpu)->sd) == NULL;
+}
+/*
+ * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
+ *
+ * In case of CONFIG_NO_HZ, this is the place where we nominate a new
+ * idle load balancing owner or decide to stop the periodic load balancing,
+ * if the whole system is idle.
+ *
+ * @rq:  runqueue whose tick state and next_balance are examined.
+ * @cpu: cpu id compared against the ilb owner and nohz.cpu_mask;
+ *       presumably the cpu that @rq belongs to - confirm against the caller.
+ */
+static inline void trigger_load_balance(struct rq *rq, int cpu)
+{
+#ifdef CONFIG_NO_HZ
+        /*
+         * If we were in the nohz mode recently and busy at the current
+         * scheduler tick, then check if we need to nominate new idle
+         * load balancer.
+         */
+        if (rq->in_nohz_recently && !rq->idle_at_tick) {
+                rq->in_nohz_recently = 0;
+                /* A busy cpu must not stay ilb owner: resign first. */
+                if (atomic_read(&nohz.load_balancer) == cpu) {
+                        cpumask_clear_cpu(cpu, nohz.cpu_mask);
+                        atomic_set(&nohz.load_balancer, -1);
+                }
+                /* No owner (any more): kick a candidate, if there is one. */
+                if (atomic_read(&nohz.load_balancer) == -1) {
+                        int ilb = find_new_ilb(cpu);
+                        if (ilb < nr_cpu_ids)
+                                resched_cpu(ilb);
+                }
+        }
+        /*
+         * If this cpu is idle and doing idle load balancing for all the
+         * cpus with ticks stopped, is it time for that to stop?
+         */
+        if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
+            cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
+                resched_cpu(cpu);
+                return;
+        }
+        /*
+         * If this cpu is idle and the idle load balancing is done by
+         * someone else, then no need raise the SCHED_SOFTIRQ
+         */
+        if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
+            cpumask_test_cpu(cpu, nohz.cpu_mask))
+                return;
+#endif
+        /* Don't need to rebalance while attached to NULL domain */
+        if (time_after_eq(jiffies, rq->next_balance) &&
+            likely(!on_null_domain(cpu)))
+                raise_softirq(SCHED_SOFTIRQ);
+}
 static void rq_online_fair(struct rq *rq)
 {
@@ -1962,6 +3544,15 @@ static void rq_offline_fair(struct rq *rq)
        update_sysctl();
 }
+#else   /* CONFIG_SMP */
+/*
+ * On UP we do not need to balance between CPUs: this is the empty
+ * !CONFIG_SMP version of idle_balance(); both arguments are ignored.
+ */
+static inline void idle_balance(int cpu, struct rq *rq)
+{
+}
 #endif /* CONFIG_SMP */
 /*
@@ -2076,7 +3667,7 @@ static void moved_group_fair(struct task_struct *p, int on_rq)
 }
 #endif
-unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
+static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
 {
        struct sched_entity *se = &task->se;
        unsigned int rr_interval = 0;
@@ -2108,8 +3699,6 @@ static const struct sched_class fair_sched_class = {
 #ifdef CONFIG_SMP
        .select_task_rq         = select_task_rq_fair,
-        .load_balance           = load_balance_fair,
-        .move_one_task          = move_one_task_fair,
        .rq_online              = rq_online_fair,
        .rq_offline             = rq_offline_fair,
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 5f93b570d383..a8a6d8a50947 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -44,24 +44,6 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
 {
 }
-#ifdef CONFIG_SMP
-static unsigned long
-load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
-                  unsigned long max_load_move,
-                  struct sched_domain *sd, enum cpu_idle_type idle,
-                  int *all_pinned, int *this_best_prio)
-{
-        return 0;
-}
-static int
-move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
-                   struct sched_domain *sd, enum cpu_idle_type idle)
-{
-        return 0;
-}
-#endif
 static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
 {
 }
@@ -97,7 +79,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
                check_preempt_curr(rq, p, 0);
 }
-unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
+static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
 {
        return 0;
 }
@@ -119,9 +101,6 @@ static const struct sched_class idle_sched_class = {
 #ifdef CONFIG_SMP
        .select_task_rq         = select_task_rq_idle,
-        .load_balance           = load_balance_idle,
-        .move_one_task          = move_one_task_idle,
 #endif
        .set_curr_task          = set_curr_task_idle,
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index f48328ac216f..bf3e38fdbe6d 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -194,17 +194,20 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
        return rt_se->my_q;
 }
-static void enqueue_rt_entity(struct sched_rt_entity *rt_se);
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
 static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
 static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 {
+        int this_cpu = smp_processor_id();
        struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
-        struct sched_rt_entity *rt_se = rt_rq->rt_se;
+        struct sched_rt_entity *rt_se;
+        rt_se = rt_rq->tg->rt_se[this_cpu];
        if (rt_rq->rt_nr_running) {
                if (rt_se && !on_rt_rq(rt_se))
-                        enqueue_rt_entity(rt_se);
+                        enqueue_rt_entity(rt_se, false);
                if (rt_rq->highest_prio.curr < curr->prio)
                        resched_task(curr);
        }
@@ -212,7 +215,10 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 {
-        struct sched_rt_entity *rt_se = rt_rq->rt_se;
+        int this_cpu = smp_processor_id();
+        struct sched_rt_entity *rt_se;
+        rt_se = rt_rq->tg->rt_se[this_cpu];
        if (rt_se && on_rt_rq(rt_se))
                dequeue_rt_entity(rt_se);
@@ -803,7 +809,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
        dec_rt_group(rt_se, rt_rq);
 }
-static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
+static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
 {
        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
        struct rt_prio_array *array = &rt_rq->active;
@@ -819,7 +825,10 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
        if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
                return;
-        list_add_tail(&rt_se->run_list, queue);
+        if (head)
+                list_add(&rt_se->run_list, queue);
+        else
+                list_add_tail(&rt_se->run_list, queue);
        __set_bit(rt_se_prio(rt_se), array->bitmap);
        inc_rt_tasks(rt_se, rt_rq);
@@ -856,11 +865,11 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
        }
 }
-static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
 {
        dequeue_rt_stack(rt_se);
        for_each_sched_rt_entity(rt_se)
-                __enqueue_rt_entity(rt_se);
+                __enqueue_rt_entity(rt_se, head);
 }
 static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
@@ -871,21 +880,22 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
                struct rt_rq *rt_rq = group_rt_rq(rt_se);
                if (rt_rq && rt_rq->rt_nr_running)
-                        __enqueue_rt_entity(rt_se);
+                        __enqueue_rt_entity(rt_se, false);
        }
 }
 /*
 * Adding/removing a task to/from a priority array:
 */
-static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
+static void
+enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head)
 {
        struct sched_rt_entity *rt_se = &p->rt;
        if (wakeup)
                rt_se->timeout = 0;
-        enqueue_rt_entity(rt_se);
+        enqueue_rt_entity(rt_se, head);
        if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
                enqueue_pushable_task(rq, p);
@@ -1481,24 +1491,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
                push_rt_tasks(rq);
 }
-static unsigned long
-load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
-                unsigned long max_load_move,
-                struct sched_domain *sd, enum cpu_idle_type idle,
-                int *all_pinned, int *this_best_prio)
-{
-        /* don't touch RT tasks */
-        return 0;
-}
-static int
-move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
-                 struct sched_domain *sd, enum cpu_idle_type idle)
-{
-        /* don't touch RT tasks */
-        return 0;
-}
 static void set_cpus_allowed_rt(struct task_struct *p,
                                const struct cpumask *new_mask)
 {
@@ -1721,7 +1713,7 @@ static void set_curr_task_rt(struct rq *rq)
        dequeue_pushable_task(rq, p);
 }
-unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
+static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
 {
        /*
         * Time slice is 0 for SCHED_FIFO tasks
@@ -1746,8 +1738,6 @@ static const struct sched_class rt_sched_class = {
 #ifdef CONFIG_SMP
        .select_task_rq         = select_task_rq_rt,
-        .load_balance           = load_balance_rt,
-        .move_one_task          = move_one_task_rt,
        .set_cpus_allowed       = set_cpus_allowed_rt,
        .rq_online              = rq_online_rt,
        .rq_offline             = rq_offline_rt,
diff --git a/kernel/smp.c b/kernel/smp.c
index f10408422444..9867b6bfefce 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -12,8 +12,6 @@
 #include <linux/smp.h>
 #include <linux/cpu.h>
-static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
 static struct {
        struct list_head        queue;
        raw_spinlock_t          lock;
@@ -33,12 +31,14 @@ struct call_function_data {
        cpumask_var_t           cpumask;
 };
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
 struct call_single_queue {
        struct list_head        list;
        raw_spinlock_t          lock;
 };
-static DEFINE_PER_CPU(struct call_function_data, cfd_data);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_queue, call_single_queue);
 static int
 hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
@@ -256,7 +256,7 @@ void generic_smp_call_function_single_interrupt(void)
        }
 }
-static DEFINE_PER_CPU(struct call_single_data, csd_data);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
 /*
 * smp_call_function_single - Run a function on a specific CPU
diff --git a/kernel/sys.c b/kernel/sys.c
index 18bde979f346..877fe4f8e05e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -571,11 +571,6 @@ static int set_user(struct cred *new)
        if (!new_user)
                return -EAGAIN;
-        if (!task_can_switch_user(new_user, current)) {
-                free_uid(new_user);
-                return -EINVAL;
-        }
        if (atomic_read(&new_user->processes) >=
                                current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
                        new_user != INIT_USER) {
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 60e2ce0181ee..13e13d428cd3 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -328,15 +328,6 @@ config BRANCH_TRACER
          Say N if unsure.
-config POWER_TRACER
-        bool "Trace power consumption behavior"
-        depends on X86
-        select GENERIC_TRACER
-        help
-          This tracer helps developers to analyze and optimize the kernel's
-          power management decisions, specifically the C-state and P-state
-          behavior.
 config KSYM_TRACER
        bool "Trace read and write access on kernel memory locations"
        depends on HAVE_HW_BREAKPOINT
@@ -449,7 +440,7 @@ config BLK_DEV_IO_TRACE
 config KPROBE_EVENT
        depends on KPROBES
-        depends on X86
+        depends on HAVE_REGS_AND_STACK_ACCESS_API
        bool "Enable kprobes-based dynamic events"
        select TRACING
        default y
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index cd9ecd89ec77..d00c6fe23f54 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -51,7 +51,9 @@ endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events.o
 obj-$(CONFIG_EVENT_TRACING) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
-obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
+ifeq ($(CONFIG_PERF_EVENTS),y)
+obj-$(CONFIG_EVENT_TRACING) += trace_event_profile.o
+endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
 obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
 obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1e6640f80454..83783579378f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -22,7 +22,6 @@
 #include <linux/hardirq.h>
 #include <linux/kthread.h>
 #include <linux/uaccess.h>
-#include <linux/kprobes.h>
 #include <linux/ftrace.h>
 #include <linux/sysctl.h>
 #include <linux/ctype.h>
@@ -898,36 +897,6 @@ static struct dyn_ftrace *ftrace_free_records;
                }                               \
        }
-#ifdef CONFIG_KPROBES
-static int frozen_record_count;
-static inline void freeze_record(struct dyn_ftrace *rec)
-{
-        if (!(rec->flags & FTRACE_FL_FROZEN)) {
-                rec->flags |= FTRACE_FL_FROZEN;
-                frozen_record_count++;
-        }
-}
-static inline void unfreeze_record(struct dyn_ftrace *rec)
-{
-        if (rec->flags & FTRACE_FL_FROZEN) {
-                rec->flags &= ~FTRACE_FL_FROZEN;
-                frozen_record_count--;
-        }
-}
-static inline int record_frozen(struct dyn_ftrace *rec)
-{
-        return rec->flags & FTRACE_FL_FROZEN;
-}
-#else
-# define freeze_record(rec)                     ({ 0; })
-# define unfreeze_record(rec)                   ({ 0; })
-# define record_frozen(rec)                     ({ 0; })
-#endif /* CONFIG_KPROBES */
 static void ftrace_free_rec(struct dyn_ftrace *rec)
 {
        rec->freelist = ftrace_free_records;
@@ -1025,6 +994,21 @@ static void ftrace_bug(int failed, unsigned long ip)
 }
+/*
+ * Return 1 if any ftrace record's mcount site, i.e. the bytes
+ * [rec->ip, rec->ip + MCOUNT_INSN_SIZE), overlaps the address range
+ * from @start to @end (@end inclusive). Return 0 otherwise.
+ */
+int ftrace_text_reserved(void *start, void *end)
+{
+        unsigned long range_start = (unsigned long)start;
+        unsigned long range_end = (unsigned long)end;
+        struct dyn_ftrace *rec;
+        struct ftrace_page *pg;
+        do_for_each_ftrace_rec(pg, rec) {
+                if (range_start < rec->ip + MCOUNT_INSN_SIZE &&
+                    range_end >= rec->ip)
+                        return 1;
+        } while_for_each_ftrace_rec();
+        return 0;
+}
 static int
 __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 {
@@ -1076,14 +1060,6 @@ static void ftrace_replace_code(int enable)
                    !(rec->flags & FTRACE_FL_CONVERTED))
                        continue;
-                /* ignore updates to this record's mcount site */
-                if (get_kprobe((void *)rec->ip)) {
-                        freeze_record(rec);
-                        continue;
-                } else {
-                        unfreeze_record(rec);
-                }
                failed = __ftrace_replace_code(rec, enable);
                if (failed) {
                        rec->flags |= FTRACE_FL_FAILED;
@@ -2426,6 +2402,7 @@ static const struct file_operations ftrace_notrace_fops = {
 static DEFINE_MUTEX(graph_lock);
 int ftrace_graph_count;
+int ftrace_graph_filter_enabled;
 unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
 static void *
@@ -2448,7 +2425,7 @@ static void *g_start(struct seq_file *m, loff_t *pos)
        mutex_lock(&graph_lock);
        /* Nothing, tell g_show to print all functions are enabled */
-        if (!ftrace_graph_count && !*pos)
+        if (!ftrace_graph_filter_enabled && !*pos)
                return (void *)1;
        return __g_next(m, pos);
@@ -2494,6 +2471,7 @@ ftrace_graph_open(struct inode *inode, struct file *file)
        mutex_lock(&graph_lock);
        if ((file->f_mode & FMODE_WRITE) &&
            (file->f_flags & O_TRUNC)) {
+                ftrace_graph_filter_enabled = 0;
                ftrace_graph_count = 0;
                memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
        }
@@ -2519,7 +2497,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
        struct dyn_ftrace *rec;
        struct ftrace_page *pg;
        int search_len;
-        int found = 0;
+        int fail = 1;
        int type, not;
        char *search;
        bool exists;
@@ -2530,37 +2508,51 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
        /* decode regex */
        type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
-        if (not)
+        if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS)
-                return -EINVAL;
+                return -EBUSY;
        search_len = strlen(search);
        mutex_lock(&ftrace_lock);
        do_for_each_ftrace_rec(pg, rec) {
-                if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
-                        break;
                if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
                        continue;
                if (ftrace_match_record(rec, search, search_len, type)) {
-                        /* ensure it is not already in the array */
+                        /* if it is in the array */
                        exists = false;
-                        for (i = 0; i < *idx; i++)
+                        for (i = 0; i < *idx; i++) {
                                if (array[i] == rec->ip) {
                                        exists = true;
                                        break;
                                }
-                        if (!exists)
+                        }
-                                array[(*idx)++] = rec->ip;
-                        found = 1;
+                        if (!not) {
+                                fail = 0;
+                                if (!exists) {
+                                        array[(*idx)++] = rec->ip;
+                                        if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
+                                                goto out;
+                                }
+                        } else {
+                                if (exists) {
+                                        array[i] = array[--(*idx)];
+                                        array[*idx] = 0;
+                                        fail = 0;
+                                }
+                        }
                }
        } while_for_each_ftrace_rec();
+out:
        mutex_unlock(&ftrace_lock);
-        return found ? 0 : -EINVAL;
+        if (fail)
+                return -EINVAL;
+        ftrace_graph_filter_enabled = 1;
+        return 0;
 }
 static ssize_t
@@ -2570,16 +2562,11 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
        struct trace_parser parser;
        ssize_t read, ret;
-        if (!cnt || cnt < 0)
+        if (!cnt)
                return 0;
        mutex_lock(&graph_lock);
-        if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) {
-                ret = -EBUSY;
-                goto out_unlock;
-        }
        if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
                ret = -ENOMEM;
                goto out_unlock;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index eac6875cb990..032c57ca6502 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -32,6 +32,7 @@
 #include <linux/splice.h>
 #include <linux/kdebug.h>
 #include <linux/string.h>
+#include <linux/rwsem.h>
 #include <linux/ctype.h>
 #include <linux/init.h>
 #include <linux/poll.h>
@@ -102,9 +103,6 @@ static inline void ftrace_enable_cpu(void)
 static cpumask_var_t __read_mostly      tracing_buffer_mask;
-/* Define which cpu buffers are currently read in trace_pipe */
-static cpumask_var_t                    tracing_reader_cpumask;
 #define for_each_tracing_cpu(cpu)       \
        for_each_cpu(cpu, tracing_buffer_mask)
@@ -243,12 +241,91 @@ static struct tracer		*current_trace __read_mostly;
 /*
 * trace_types_lock is used to protect the trace_types list.
- * This lock is also used to keep user access serialized.
- * Accesses from userspace will grab this lock while userspace
- * activities happen inside the kernel.
 */
 static DEFINE_MUTEX(trace_types_lock);
+/*
+ * serialize the access of the ring buffer
+ *
+ * The ring buffer serializes readers, but that is only low-level protection.
+ * The validity of the events (as returned by ring_buffer_peek() etc.)
+ * is not protected by the ring buffer.
+ *
+ * The content of events may become garbage if we allow another process to
+ * consume these events concurrently:
+ *   A) The page of the consumed events may become a normal page
+ *      (not a reader page) in the ring buffer, and this page will be
+ *      rewritten by the event producer.
+ *   B) The page of the consumed events may become a page for splice_read,
+ *      and this page will be returned to the system.
+ *
+ * These primitives allow multiple processes to access different cpu ring
+ * buffers concurrently.
+ *
+ * These primitives don't distinguish read-only and read-consume access.
+ * Multiple read-only accesses are also serialized.
+ */
+#ifdef CONFIG_SMP
+static DECLARE_RWSEM(all_cpu_access_lock);
+static DEFINE_PER_CPU(struct mutex, cpu_access_lock);
+static inline void trace_access_lock(int cpu)
+{
+        if (cpu == TRACE_PIPE_ALL_CPU) {
+                /* gain it for accessing the whole ring buffer. */
+                down_write(&all_cpu_access_lock);
+        } else {
+                /* gain it for accessing a cpu ring buffer. */
+                /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */
+                down_read(&all_cpu_access_lock);
+                /* Secondly block other access to this @cpu ring buffer. */
+                mutex_lock(&per_cpu(cpu_access_lock, cpu));
+        }
+}
+static inline void trace_access_unlock(int cpu)
+{
+        if (cpu == TRACE_PIPE_ALL_CPU) {
+                up_write(&all_cpu_access_lock);
+        } else {
+                mutex_unlock(&per_cpu(cpu_access_lock, cpu));
+                up_read(&all_cpu_access_lock);
+        }
+}
+static inline void trace_access_lock_init(void)
+{
+        int cpu;
+        for_each_possible_cpu(cpu)
+                mutex_init(&per_cpu(cpu_access_lock, cpu));
+}
+#else
+static DEFINE_MUTEX(access_lock);
+static inline void trace_access_lock(int cpu)
+{
+        (void)cpu;
+        mutex_lock(&access_lock);
+}
+static inline void trace_access_unlock(int cpu)
+{
+        (void)cpu;
+        mutex_unlock(&access_lock);
+}
+static inline void trace_access_lock_init(void)
+{
+}
+#endif
 /* trace_wait is a waitqueue for tasks blocked on trace_poll */
 static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
@@ -1320,8 +1397,10 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
        entry->fmt                      = fmt;
        memcpy(entry->buf, trace_buf, sizeof(u32) * len);
-        if (!filter_check_discard(call, entry, buffer, event))
+        if (!filter_check_discard(call, entry, buffer, event)) {
                ring_buffer_unlock_commit(buffer, event);
+                ftrace_trace_stack(buffer, flags, 6, pc);
+        }
 out_unlock:
        arch_spin_unlock(&trace_buf_lock);
@@ -1394,8 +1473,10 @@ int trace_array_vprintk(struct trace_array *tr,
        memcpy(&entry->buf, trace_buf, len);
        entry->buf[len] = '\0';
-        if (!filter_check_discard(call, entry, buffer, event))
+        if (!filter_check_discard(call, entry, buffer, event)) {
                ring_buffer_unlock_commit(buffer, event);
+                ftrace_trace_stack(buffer, irq_flags, 6, pc);
+        }
 out_unlock:
        arch_spin_unlock(&trace_buf_lock);
@@ -1585,12 +1666,6 @@ static void tracing_iter_reset(struct trace_iterator *iter, int cpu)
 }
 /*
- * No necessary locking here. The worst thing which can
- * happen is loosing events consumed at the same time
- * by a trace_pipe reader.
- * Other than that, we don't risk to crash the ring buffer
- * because it serializes the readers.
- *
 * The current tracer is copied to avoid a global locking
 * all around.
 */
@@ -1645,12 +1720,16 @@ static void *s_start(struct seq_file *m, loff_t *pos)
        }
        trace_event_read_lock();
+        trace_access_lock(cpu_file);
        return p;
 }
 static void s_stop(struct seq_file *m, void *p)
 {
+        struct trace_iterator *iter = m->private;
        atomic_dec(&trace_record_cmdline_disabled);
+        trace_access_unlock(iter->cpu_file);
        trace_event_read_unlock();
 }
@@ -2841,22 +2920,6 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
        mutex_lock(&trace_types_lock);
-        /* We only allow one reader per cpu */
-        if (cpu_file == TRACE_PIPE_ALL_CPU) {
-                if (!cpumask_empty(tracing_reader_cpumask)) {
-                        ret = -EBUSY;
-                        goto out;
-                }
-                cpumask_setall(tracing_reader_cpumask);
-        } else {
-                if (!cpumask_test_cpu(cpu_file, tracing_reader_cpumask))
-                        cpumask_set_cpu(cpu_file, tracing_reader_cpumask);
-                else {
-                        ret = -EBUSY;
-                        goto out;
-                }
-        }
        /* create a buffer to store the information to pass to userspace */
        iter = kzalloc(sizeof(*iter), GFP_KERNEL);
        if (!iter) {
@@ -2912,12 +2975,6 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
        mutex_lock(&trace_types_lock);
-        if (iter->cpu_file == TRACE_PIPE_ALL_CPU)
-                cpumask_clear(tracing_reader_cpumask);
-        else
-                cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask);
        if (iter->trace->pipe_close)
                iter->trace->pipe_close(iter);
@@ -3079,6 +3136,7 @@ waitagain:
        iter->pos = -1;
        trace_event_read_lock();
+        trace_access_lock(iter->cpu_file);
        while (find_next_entry_inc(iter) != NULL) {
                enum print_line_t ret;
                int len = iter->seq.len;
@@ -3095,6 +3153,7 @@ waitagain:
                if (iter->seq.len >= cnt)
                        break;
        }
+        trace_access_unlock(iter->cpu_file);
        trace_event_read_unlock();
        /* Now copy what we have to the user */
@@ -3220,6 +3279,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
        }
        trace_event_read_lock();
+        trace_access_lock(iter->cpu_file);
        /* Fill as many pages as possible. */
        for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
@@ -3243,6 +3303,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
                trace_seq_init(&iter->seq);
        }
+        trace_access_unlock(iter->cpu_file);
        trace_event_read_unlock();
        mutex_unlock(&iter->mutex);
@@ -3544,10 +3605,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
        info->read = 0;
+        trace_access_lock(info->cpu);
        ret = ring_buffer_read_page(info->tr->buffer,
                                    &info->spare,
                                    count,
                                    info->cpu, 0);
+        trace_access_unlock(info->cpu);
        if (ret < 0)
                return 0;
@@ -3675,6 +3738,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
                len &= PAGE_MASK;
        }
+        trace_access_lock(info->cpu);
        entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
        for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) {
@@ -3722,6 +3786,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
                entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
        }
+        trace_access_unlock(info->cpu);
        spd.nr_pages = i;
        /* did we read anything? */
@@ -4158,6 +4223,8 @@ static __init int tracer_init_debugfs(void)
        struct dentry *d_tracer;
        int cpu;
+        trace_access_lock_init();
        d_tracer = tracing_init_dentry();
        trace_create_file("tracing_enabled", 0644, d_tracer,
@@ -4392,9 +4459,6 @@ __init static int tracer_alloc_buffers(void)
        if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
                goto out_free_buffer_mask;
-        if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
-                goto out_free_tracing_cpumask;
        /* To save memory, keep the ring buffer size to its minimum */
        if (ring_buffer_expanded)
                ring_buf_size = trace_buf_size;
@@ -4452,8 +4516,6 @@ __init static int tracer_alloc_buffers(void)
        return 0;
 out_free_cpumask:
-        free_cpumask_var(tracing_reader_cpumask);
-out_free_tracing_cpumask:
        free_cpumask_var(tracing_cpumask);
 out_free_buffer_mask:
        free_cpumask_var(tracing_buffer_mask);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 4df6a77eb196..fd05bcaf91b0 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -497,6 +497,7 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
 #ifdef CONFIG_DYNAMIC_FTRACE
 /* TODO: make this variable */
 #define FTRACE_GRAPH_MAX_FUNCS          32
+extern int ftrace_graph_filter_enabled;
 extern int ftrace_graph_count;
 extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS];
@@ -504,7 +505,7 @@ static inline int ftrace_graph_addr(unsigned long addr)
 {
        int i;
-        if (!ftrace_graph_count || test_tsk_trace_graph(current))
+        if (!ftrace_graph_filter_enabled)
                return 1;
        for (i = 0; i < ftrace_graph_count; i++) {
@@ -791,7 +792,8 @@ extern const char *__stop___trace_bprintk_fmt[];
 #undef FTRACE_ENTRY
 #define FTRACE_ENTRY(call, struct_name, id, tstruct, print)             \
-        extern struct ftrace_event_call event_##call;
+        extern struct ftrace_event_call                                 \
+        __attribute__((__aligned__(4))) event_##call;
 #undef FTRACE_ENTRY_DUP
 #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print)         \
        FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 4a194f08f88c..b9bc4d470177 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -307,8 +307,23 @@ static int annotated_branch_stat_cmp(void *p1, void *p2)
                return -1;
        if (percent_a > percent_b)
                return 1;
-        else
-                return 0;
+        if (a->incorrect < b->incorrect)
+                return -1;
+        if (a->incorrect > b->incorrect)
+                return 1;
+        /*
+         * Since the above shows worse (incorrect) cases
+         * first, we continue that by showing best (correct)
+         * cases last.
+         */
+        if (a->correct > b->correct)
+                return -1;
+        if (a->correct < b->correct)
+                return 1;
+        return 0;
 }
 static struct tracer_stat annotated_branch_stats = {
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 9e25573242cf..f0d693005075 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -6,14 +6,12 @@
 */
 #include <linux/module.h>
+#include <linux/kprobes.h>
 #include "trace.h"
-char *perf_trace_buf;
+static char *perf_trace_buf;
-EXPORT_SYMBOL_GPL(perf_trace_buf);
+static char *perf_trace_buf_nmi;
-char *perf_trace_buf_nmi;
-EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
 typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ;
@@ -120,3 +118,47 @@ void ftrace_profile_disable(int event_id)
        }
        mutex_unlock(&event_mutex);
 }
+__kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type,
+                                        int *rctxp, unsigned long *irq_flags)
+{
+        struct trace_entry *entry;
+        char *trace_buf, *raw_data;
+        int pc, cpu;
+        pc = preempt_count();
+        /* Protect the per cpu buffer, begin the rcu read side */
+        local_irq_save(*irq_flags);
+        *rctxp = perf_swevent_get_recursion_context();
+        if (*rctxp < 0)
+                goto err_recursion;
+        cpu = smp_processor_id();
+        if (in_nmi())
+                trace_buf = rcu_dereference(perf_trace_buf_nmi);
+        else
+                trace_buf = rcu_dereference(perf_trace_buf);
+        if (!trace_buf)
+                goto err;
+        raw_data = per_cpu_ptr(trace_buf, cpu);
+        /* zero the dead bytes from align to not leak stack to user */
+        *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+        entry = (struct trace_entry *)raw_data;
+        tracing_generic_entry_update(entry, *irq_flags, pc);
+        entry->type = type;
+        return raw_data;
+err:
+        perf_swevent_put_recursion_context(*rctxp);
+err_recursion:
+        local_irq_restore(*irq_flags);
+        return NULL;
+}
+EXPORT_SYMBOL_GPL(ftrace_perf_buf_prepare);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 189b09baf4fb..3f972ad98d04 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -60,10 +60,8 @@ int trace_define_field(struct ftrace_event_call *call, const char *type,
        return 0;
 err:
-        if (field) {
+        if (field)
                kfree(field->name);
-                kfree(field->type);
-        }
        kfree(field);
        return -ENOMEM;
@@ -520,41 +518,16 @@ out:
        return ret;
 }
-extern char *__bad_type_size(void);
-#undef FIELD
-#define FIELD(type, name)                                               \
-        sizeof(type) != sizeof(field.name) ? __bad_type_size() :        \
-        #type, "common_" #name, offsetof(typeof(field), name),          \
-                sizeof(field.name), is_signed_type(type)
-static int trace_write_header(struct trace_seq *s)
-{
-        struct trace_entry field;
-        /* struct trace_entry */
-        return trace_seq_printf(s,
-                        "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
-                        "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
-                        "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
-                        "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
-                        "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
-                        "\n",
-                        FIELD(unsigned short, type),
-                        FIELD(unsigned char, flags),
-                        FIELD(unsigned char, preempt_count),
-                        FIELD(int, pid),
-                        FIELD(int, lock_depth));
-}
 static ssize_t
 event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
                  loff_t *ppos)
 {
        struct ftrace_event_call *call = filp->private_data;
+        struct ftrace_event_field *field;
        struct trace_seq *s;
+        int common_field_count = 5;
        char *buf;
-        int r;
+        int r = 0;
        if (*ppos)
                return 0;
@@ -565,14 +538,48 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
        trace_seq_init(s);
-        /* If any of the first writes fail, so will the show_format. */
        trace_seq_printf(s, "name: %s\n", call->name);
        trace_seq_printf(s, "ID: %d\n", call->id);
        trace_seq_printf(s, "format:\n");
-        trace_write_header(s);
-        r = call->show_format(call, s);
+        list_for_each_entry_reverse(field, &call->fields, link) {
+                /*
+                 * Smartly show the array type (except dynamic arrays).
+                 * Normal:
+                 *      field:TYPE VAR
+                 * If TYPE := TYPE[LEN], it is shown:
+                 *      field:TYPE VAR[LEN]
+                 */
+                const char *array_descriptor = strchr(field->type, '[');
+                if (!strncmp(field->type, "__data_loc", 10))
+                        array_descriptor = NULL;
+                if (!array_descriptor) {
+                        r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;"
+                                        "\tsize:%u;\tsigned:%d;\n",
+                                        field->type, field->name, field->offset,
+                                        field->size, !!field->is_signed);
+                } else {
+                        r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;"
+                                        "\tsize:%u;\tsigned:%d;\n",
+                                        (int)(array_descriptor - field->type),
+                                        field->type, field->name,
+                                        array_descriptor, field->offset,
+                                        field->size, !!field->is_signed);
+                }
+                if (--common_field_count == 0)
+                        r = trace_seq_printf(s, "\n");
+                if (!r)
+                        break;
+        }
+        if (r)
+                r = trace_seq_printf(s, "\nprint fmt: %s\n",
+                                call->print_fmt);
        if (!r) {
                /*
                 * ug!  The format output is bigger than a PAGE!!
@@ -948,10 +955,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
                                  filter);
        }
-        /* A trace may not want to export its format */
-        if (!call->show_format)
-                return 0;
        trace_create_file("format", 0444, call->dir, call,
                          format);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e42af9aad69f..4615f62a04f1 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1371,7 +1371,7 @@ out_unlock:
        return err;
 }
-#ifdef CONFIG_EVENT_PROFILE
+#ifdef CONFIG_PERF_EVENTS
 void ftrace_profile_free_filter(struct perf_event *event)
 {
@@ -1439,5 +1439,5 @@ out_unlock:
        return err;
 }
-#endif /* CONFIG_EVENT_PROFILE */
+#endif /* CONFIG_PERF_EVENTS */
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index d4fa5dc1ee4e..e091f64ba6ce 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -62,78 +62,6 @@ static void __always_unused ____ftrace_check_##name(void)	\
 #include "trace_entries.h"
-#undef __field
-#define __field(type, item)                                             \
-        ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"      \
-                               "offset:%zu;\tsize:%zu;\tsigned:%u;\n",  \
-                               offsetof(typeof(field), item),           \
-                               sizeof(field.item), is_signed_type(type)); \
-        if (!ret)                                                       \
-                return 0;
-#undef __field_desc
-#define __field_desc(type, container, item)                             \
-        ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"      \
-                               "offset:%zu;\tsize:%zu;\tsigned:%u;\n",  \
-                               offsetof(typeof(field), container.item), \
-                               sizeof(field.container.item),            \
-                               is_signed_type(type));                   \
-        if (!ret)                                                       \
-                return 0;
-#undef __array
-#define __array(type, item, len)                                        \
-        ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
-                               "offset:%zu;\tsize:%zu;\tsigned:%u;\n",  \
-                               offsetof(typeof(field), item),           \
-                               sizeof(field.item), is_signed_type(type)); \
-        if (!ret)                                                       \
-                return 0;
-#undef __array_desc
-#define __array_desc(type, container, item, len)                        \
-        ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
-                               "offset:%zu;\tsize:%zu;\tsigned:%u;\n",  \
-                               offsetof(typeof(field), container.item), \
-                               sizeof(field.container.item),            \
-                               is_signed_type(type));                   \
-        if (!ret)                                                       \
-                return 0;
-#undef __dynamic_array
-#define __dynamic_array(type, item)                                     \
-        ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"      \
-                               "offset:%zu;\tsize:0;\tsigned:%u;\n",    \
-                               offsetof(typeof(field), item),           \
-                               is_signed_type(type));                   \
-        if (!ret)                                                       \
-                return 0;
-#undef F_printk
-#define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
-#undef __entry
-#define __entry REC
-#undef FTRACE_ENTRY
-#define FTRACE_ENTRY(name, struct_name, id, tstruct, print)             \
-static int                                                              \
-ftrace_format_##name(struct ftrace_event_call *unused,                  \
-                     struct trace_seq *s)                               \
-{                                                                       \
-        struct struct_name field __attribute__((unused));               \
-        int ret = 0;                                                    \
-                                                                        \
-        tstruct;                                                        \
-                                                                        \
-        trace_seq_printf(s, "\nprint fmt: " print);                     \
-                                                                        \
-        return ret;                                                     \
-}
-#include "trace_entries.h"
 #undef __field
 #define __field(type, item)                                             \
        ret = trace_define_field(event_call, #type, #item,              \
@@ -175,7 +103,12 @@ ftrace_format_##name(struct ftrace_event_call *unused,			\
                return ret;
 #undef __dynamic_array
-#define __dynamic_array(type, item)
+#define __dynamic_array(type, item)                                     \
+        ret = trace_define_field(event_call, #type, #item,              \
+                                 offsetof(typeof(field), item),         \
+                                 0, is_signed_type(type), FILTER_OTHER);\
+        if (ret)                                                        \
+                return ret;
 #undef FTRACE_ENTRY
 #define FTRACE_ENTRY(name, struct_name, id, tstruct, print)             \
@@ -198,6 +131,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
        return 0;
 }
+#undef __entry
+#define __entry REC
 #undef __field
 #define __field(type, item)
@@ -213,6 +149,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
 #undef __dynamic_array
 #define __dynamic_array(type, item)
+#undef F_printk
+#define F_printk(fmt, args...) #fmt ", "  __stringify(args)
 #undef FTRACE_ENTRY
 #define FTRACE_ENTRY(call, struct_name, type, tstruct, print)           \
                                                                        \
@@ -223,7 +162,7 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
        .id                     = type,                                 \
        .system                 = __stringify(TRACE_SYSTEM),            \
        .raw_init               = ftrace_raw_init_event,                \
-        .show_format            = ftrace_format_##call,                 \
+        .print_fmt              = print,                                \
        .define_fields          = ftrace_define_fields_##call,          \
 };                                                                      \
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index b1342c5d37cf..e998a824e9db 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -18,6 +18,7 @@ struct fgraph_cpu_data {
        pid_t           last_pid;
        int             depth;
        int             ignore;
+        unsigned long   enter_funcs[FTRACE_RETFUNC_DEPTH];
 };
 struct fgraph_data {
@@ -212,13 +213,11 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
        int cpu;
        int pc;
-        if (unlikely(!tr))
-                return 0;
        if (!ftrace_trace_task(current))
                return 0;
-        if (!ftrace_graph_addr(trace->func))
+        /* Trace it when it is nested in, or is itself, an enabled function. */
+        if (!(trace->depth || ftrace_graph_addr(trace->func)))
                return 0;
        local_irq_save(flags);
@@ -231,9 +230,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
        } else {
                ret = 0;
        }
-        /* Only do the atomic if it is not already set */
-        if (!test_tsk_trace_graph(current))
-                set_tsk_trace_graph(current);
        atomic_dec(&data->disabled);
        local_irq_restore(flags);
@@ -281,17 +277,24 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
                pc = preempt_count();
                __trace_graph_return(tr, trace, flags, pc);
        }
-        if (!trace->depth)
-                clear_tsk_trace_graph(current);
        atomic_dec(&data->disabled);
        local_irq_restore(flags);
 }
+void set_graph_array(struct trace_array *tr)
+{
+        graph_array = tr;
+        /* Make graph_array visible before we start tracing */
+        smp_mb();
+}
 static int graph_trace_init(struct trace_array *tr)
 {
        int ret;
-        graph_array = tr;
+        set_graph_array(tr);
        ret = register_ftrace_graph(&trace_graph_return,
                                    &trace_graph_entry);
        if (ret)
@@ -301,11 +304,6 @@ static int graph_trace_init(struct trace_array *tr)
        return 0;
 }
-void set_graph_array(struct trace_array *tr)
-{
-        graph_array = tr;
-}
 static void graph_trace_reset(struct trace_array *tr)
 {
        tracing_stop_cmdline_record();
@@ -673,15 +671,21 @@ print_graph_entry_leaf(struct trace_iterator *iter,
        duration = graph_ret->rettime - graph_ret->calltime;
        if (data) {
+                struct fgraph_cpu_data *cpu_data;
                int cpu = iter->cpu;
-                int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
+                cpu_data = per_cpu_ptr(data->cpu_data, cpu);
                /*
                 * Comments display at + 1 to depth. Since
                 * this is a leaf function, keep the comments
                 * equal to this depth.
                 */
-                *depth = call->depth - 1;
+                cpu_data->depth = call->depth - 1;
+                /* No need to keep this function around for this depth */
+                if (call->depth < FTRACE_RETFUNC_DEPTH)
+                        cpu_data->enter_funcs[call->depth] = 0;
        }
        /* Overhead */
@@ -721,10 +725,15 @@ print_graph_entry_nested(struct trace_iterator *iter,
        int i;
        if (data) {
+                struct fgraph_cpu_data *cpu_data;
                int cpu = iter->cpu;
-                int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
-                *depth = call->depth;
+                cpu_data = per_cpu_ptr(data->cpu_data, cpu);
+                cpu_data->depth = call->depth;
+                /* Save this function pointer to see if the exit matches */
+                if (call->depth < FTRACE_RETFUNC_DEPTH)
+                        cpu_data->enter_funcs[call->depth] = call->func;
        }
        /* No overhead */
@@ -854,19 +863,28 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
        struct fgraph_data *data = iter->private;
        pid_t pid = ent->pid;
        int cpu = iter->cpu;
+        int func_match = 1;
        int ret;
        int i;
        if (data) {
+                struct fgraph_cpu_data *cpu_data;
                int cpu = iter->cpu;
-                int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
+                cpu_data = per_cpu_ptr(data->cpu_data, cpu);
                /*
                 * Comments display at + 1 to depth. This is the
                 * return from a function, we now want the comments
                 * to display at the same level of the bracket.
                 */
-                *depth = trace->depth - 1;
+                cpu_data->depth = trace->depth - 1;
+                if (trace->depth < FTRACE_RETFUNC_DEPTH) {
+                        if (cpu_data->enter_funcs[trace->depth] != trace->func)
+                                func_match = 0;
+                        cpu_data->enter_funcs[trace->depth] = 0;
+                }
        }
        if (print_graph_prologue(iter, s, 0, 0))
@@ -891,9 +909,21 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
                        return TRACE_TYPE_PARTIAL_LINE;
        }
-        ret = trace_seq_printf(s, "}\n");
+        /*
-        if (!ret)
+         * If the return function does not have a matching entry,
-                return TRACE_TYPE_PARTIAL_LINE;
+         * then the entry was lost. Instead of just printing
+         * the '}' and letting the user guess what function this
+         * belongs to, write out the function name.
+         */
+        if (func_match) {
+                ret = trace_seq_printf(s, "}\n");
+                if (!ret)
+                        return TRACE_TYPE_PARTIAL_LINE;
+        } else {
+                ret = trace_seq_printf(s, "} (%ps)\n", (void *)trace->func);
+                if (!ret)
+                        return TRACE_TYPE_PARTIAL_LINE;
+        }
        /* Overrun */
        if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) {
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 50b1b8239806..505c92273b1a 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -91,11 +91,6 @@ static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr)
        return retval;
 }
-static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num)
-{
-        return regs_get_argument_nth(regs, (unsigned int)((unsigned long)num));
-}
 static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs,
                                              void *dummy)
 {
@@ -231,9 +226,7 @@ static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
 {
        int ret = -EINVAL;
-        if (ff->func == fetch_argument)
+        if (ff->func == fetch_register) {
-                ret = snprintf(buf, n, "$arg%lu", (unsigned long)ff->data);
-        else if (ff->func == fetch_register) {
                const char *name;
                name = regs_query_register_name((unsigned int)((long)ff->data));
                ret = snprintf(buf, n, "%%%s", name);
@@ -489,14 +482,6 @@ static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
                        }
                } else
                        ret = -EINVAL;
-        } else if (strncmp(arg, "arg", 3) == 0 && isdigit(arg[3])) {
-                ret = strict_strtoul(arg + 3, 10, &param);
-                if (ret || param > PARAM_MAX_ARGS)
-                        ret = -EINVAL;
-                else {
-                        ff->func = fetch_argument;
-                        ff->data = (void *)param;
-                }
        } else
                ret = -EINVAL;
        return ret;
@@ -611,7 +596,6 @@ static int create_trace_probe(int argc, char **argv)
         *  - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS]
         *  - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS]
         * Fetch args:
-         *  $argN       : fetch Nth of function argument. (N:0-)
         *  $retval     : fetch return value
         *  $stack      : fetch stack address
         *  $stackN     : fetch Nth of stack (N:0-)
@@ -651,12 +635,12 @@ static int create_trace_probe(int argc, char **argv)
                        event = strchr(group, '/') + 1;
                        event[-1] = '\0';
                        if (strlen(group) == 0) {
-                                pr_info("Group name is not specifiled\n");
+                                pr_info("Group name is not specified\n");
                                return -EINVAL;
                        }
                }
                if (strlen(event) == 0) {
-                        pr_info("Event name is not specifiled\n");
+                        pr_info("Event name is not specified\n");
                        return -EINVAL;
                }
        }
@@ -958,7 +942,7 @@ static const struct file_operations kprobe_profile_ops = {
 };
 /* Kprobe handler */
-static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
+static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
 {
        struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
        struct kprobe_trace_entry *entry;
@@ -978,7 +962,7 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
        event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
                                                  irq_flags, pc);
        if (!event)
-                return 0;
+                return;
        entry = ring_buffer_event_data(event);
        entry->nargs = tp->nr_args;
@@ -988,11 +972,10 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
        if (!filter_current_check_discard(buffer, call, entry, event))
                trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
-        return 0;
 }
 /* Kretprobe handler */
-static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
+static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
                                          struct pt_regs *regs)
 {
        struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
@@ -1011,7 +994,7 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
        event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
                                                  irq_flags, pc);
        if (!event)
-                return 0;
+                return;
        entry = ring_buffer_event_data(event);
        entry->nargs = tp->nr_args;
@@ -1022,8 +1005,6 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
        if (!filter_current_check_discard(buffer, call, entry, event))
                trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
-        return 0;
 }
 /* Event entry printers */
@@ -1174,213 +1155,123 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
        return 0;
 }
-static int __probe_event_show_format(struct trace_seq *s,
+static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
-                                     struct trace_probe *tp, const char *fmt,
-                                     const char *arg)
 {
        int i;
+        int pos = 0;
-        /* Show format */
+        const char *fmt, *arg;
-        if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt))
-                return 0;
-        for (i = 0; i < tp->nr_args; i++)
+        if (!probe_is_return(tp)) {
-                if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name))
+                fmt = "(%lx)";
-                        return 0;
+                arg = "REC->" FIELD_STRING_IP;
+        } else {
+                fmt = "(%lx <- %lx)";
+                arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
+        }
-        if (!trace_seq_printf(s, "\", %s", arg))
+        /* When len=0, we just calculate the needed length */
-                return 0;
+#define LEN_OR_ZERO (len ? len - pos : 0)
-        for (i = 0; i < tp->nr_args; i++)
+        pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
-                if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name))
-                        return 0;
-        return trace_seq_puts(s, "\n");
-}
-#undef SHOW_FIELD
+        for (i = 0; i < tp->nr_args; i++) {
-#define SHOW_FIELD(type, item, name)                                    \
+                pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%%lx",
-        do {                                                            \
+                                tp->args[i].name);
-                ret = trace_seq_printf(s, "\tfield:" #type " %s;\t"     \
+        }
-                                "offset:%u;\tsize:%u;\tsigned:%d;\n", name,\
-                                (unsigned int)offsetof(typeof(field), item),\
-                                (unsigned int)sizeof(type),             \
-                                is_signed_type(type));                  \
-                if (!ret)                                               \
-                        return 0;                                       \
-        } while (0)
-static int kprobe_event_show_format(struct ftrace_event_call *call,
+        pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
-                                    struct trace_seq *s)
-{
-        struct kprobe_trace_entry field __attribute__((unused));
-        int ret, i;
-        struct trace_probe *tp = (struct trace_probe *)call->data;
-        SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP);
+        for (i = 0; i < tp->nr_args; i++) {
-        SHOW_FIELD(int, nargs, FIELD_STRING_NARGS);
+                pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
+                                tp->args[i].name);
+        }
-        /* Show fields */
+#undef LEN_OR_ZERO
-        for (i = 0; i < tp->nr_args; i++)
-                SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
-        trace_seq_puts(s, "\n");
-        return __probe_event_show_format(s, tp, "(%lx)",
+        /* return the length of print_fmt */
-                                         "REC->" FIELD_STRING_IP);
+        return pos;
 }
-static int kretprobe_event_show_format(struct ftrace_event_call *call,
+static int set_print_fmt(struct trace_probe *tp)
-                                       struct trace_seq *s)
 {
-        struct kretprobe_trace_entry field __attribute__((unused));
+        int len;
-        int ret, i;
+        char *print_fmt;
-        struct trace_probe *tp = (struct trace_probe *)call->data;
-        SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC);
+        /* First: called with 0 length to calculate the needed length */
-        SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP);
+        len = __set_print_fmt(tp, NULL, 0);
-        SHOW_FIELD(int, nargs, FIELD_STRING_NARGS);
+        print_fmt = kmalloc(len + 1, GFP_KERNEL);
+        if (!print_fmt)
+                return -ENOMEM;
-        /* Show fields */
+        /* Second: actually write the @print_fmt */
-        for (i = 0; i < tp->nr_args; i++)
+        __set_print_fmt(tp, print_fmt, len + 1);
-                SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
+        tp->call.print_fmt = print_fmt;
-        trace_seq_puts(s, "\n");
-        return __probe_event_show_format(s, tp, "(%lx <- %lx)",
+        return 0;
-                                         "REC->" FIELD_STRING_FUNC
-                                         ", REC->" FIELD_STRING_RETIP);
 }
-#ifdef CONFIG_EVENT_PROFILE
+#ifdef CONFIG_PERF_EVENTS
 /* Kprobe profile handler */
-static __kprobes int kprobe_profile_func(struct kprobe *kp,
+static __kprobes void kprobe_profile_func(struct kprobe *kp,
                                         struct pt_regs *regs)
 {
        struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
        struct ftrace_event_call *call = &tp->call;
        struct kprobe_trace_entry *entry;
-        struct trace_entry *ent;
+        int size, __size, i;
-        int size, __size, i, pc, __cpu;
        unsigned long irq_flags;
-        char *trace_buf;
-        char *raw_data;
        int rctx;
-        pc = preempt_count();
        __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
        size = ALIGN(__size + sizeof(u32), sizeof(u64));
        size -= sizeof(u32);
        if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
                     "profile buffer not large enough"))
-                return 0;
+                return;
-        /*
-         * Protect the non nmi buffer
-         * This also protects the rcu read side
-         */
-        local_irq_save(irq_flags);
-        rctx = perf_swevent_get_recursion_context();
-        if (rctx < 0)
-                goto end_recursion;
-        __cpu = smp_processor_id();
-        if (in_nmi())
-                trace_buf = rcu_dereference(perf_trace_buf_nmi);
-        else
-                trace_buf = rcu_dereference(perf_trace_buf);
-        if (!trace_buf)
+        entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
-                goto end;
+        if (!entry)
+                return;
-        raw_data = per_cpu_ptr(trace_buf, __cpu);
-        /* Zero dead bytes from alignment to avoid buffer leak to userspace */
-        *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
-        entry = (struct kprobe_trace_entry *)raw_data;
-        ent = &entry->ent;
-        tracing_generic_entry_update(ent, irq_flags, pc);
-        ent->type = call->id;
        entry->nargs = tp->nr_args;
        entry->ip = (unsigned long)kp->addr;
        for (i = 0; i < tp->nr_args; i++)
                entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
-        perf_tp_event(call->id, entry->ip, 1, entry, size);
-end:
-        perf_swevent_put_recursion_context(rctx);
-end_recursion:
-        local_irq_restore(irq_flags);
-        return 0;
+        ftrace_perf_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags);
 }
 /* Kretprobe profile handler */
-static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
+static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri,
                                            struct pt_regs *regs)
 {
        struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
        struct ftrace_event_call *call = &tp->call;
        struct kretprobe_trace_entry *entry;
-        struct trace_entry *ent;
+        int size, __size, i;
-        int size, __size, i, pc, __cpu;
        unsigned long irq_flags;
-        char *trace_buf;
-        char *raw_data;
        int rctx;
-        pc = preempt_count();
        __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
        size = ALIGN(__size + sizeof(u32), sizeof(u64));
        size -= sizeof(u32);
        if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
                     "profile buffer not large enough"))
-                return 0;
+                return;
-        /*
-         * Protect the non nmi buffer
-         * This also protects the rcu read side
-         */
-        local_irq_save(irq_flags);
-        rctx = perf_swevent_get_recursion_context();
-        if (rctx < 0)
-                goto end_recursion;
-        __cpu = smp_processor_id();
-        if (in_nmi())
+        entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
-                trace_buf = rcu_dereference(perf_trace_buf_nmi);
+        if (!entry)
-        else
+                return;
-                trace_buf = rcu_dereference(perf_trace_buf);
-        if (!trace_buf)
-                goto end;
-        raw_data = per_cpu_ptr(trace_buf, __cpu);
-        /* Zero dead bytes from alignment to avoid buffer leak to userspace */
-        *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
-        entry = (struct kretprobe_trace_entry *)raw_data;
-        ent = &entry->ent;
-        tracing_generic_entry_update(ent, irq_flags, pc);
-        ent->type = call->id;
        entry->nargs = tp->nr_args;
        entry->func = (unsigned long)tp->rp.kp.addr;
        entry->ret_ip = (unsigned long)ri->ret_addr;
        for (i = 0; i < tp->nr_args; i++)
                entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
-        perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
-end:
-        perf_swevent_put_recursion_context(rctx);
-end_recursion:
-        local_irq_restore(irq_flags);
-        return 0;
+        ftrace_perf_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags);
 }
 static int probe_profile_enable(struct ftrace_event_call *call)
@@ -1408,7 +1299,7 @@ static void probe_profile_disable(struct ftrace_event_call *call)
                        disable_kprobe(&tp->rp.kp);
        }
 }
-#endif  /* CONFIG_EVENT_PROFILE */
+#endif  /* CONFIG_PERF_EVENTS */
 static __kprobes
@@ -1418,10 +1309,10 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
        if (tp->flags & TP_FLAG_TRACE)
                kprobe_trace_func(kp, regs);
-#ifdef CONFIG_EVENT_PROFILE
+#ifdef CONFIG_PERF_EVENTS
        if (tp->flags & TP_FLAG_PROFILE)
                kprobe_profile_func(kp, regs);
-#endif  /* CONFIG_EVENT_PROFILE */
+#endif
        return 0;       /* We don't tweak kernel, so just return 0 */
 }
@@ -1432,10 +1323,10 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
        if (tp->flags & TP_FLAG_TRACE)
                kretprobe_trace_func(ri, regs);
-#ifdef CONFIG_EVENT_PROFILE
+#ifdef CONFIG_PERF_EVENTS
        if (tp->flags & TP_FLAG_PROFILE)
                kretprobe_profile_func(ri, regs);
-#endif  /* CONFIG_EVENT_PROFILE */
+#endif
        return 0;       /* We don't tweak kernel, so just return 0 */
 }
@@ -1448,23 +1339,25 @@ static int register_probe_event(struct trace_probe *tp)
        if (probe_is_return(tp)) {
                tp->event.trace = print_kretprobe_event;
                call->raw_init = probe_event_raw_init;
-                call->show_format = kretprobe_event_show_format;
                call->define_fields = kretprobe_event_define_fields;
        } else {
                tp->event.trace = print_kprobe_event;
                call->raw_init = probe_event_raw_init;
-                call->show_format = kprobe_event_show_format;
                call->define_fields = kprobe_event_define_fields;
        }
+        if (set_print_fmt(tp) < 0)
+                return -ENOMEM;
        call->event = &tp->event;
        call->id = register_ftrace_event(&tp->event);
-        if (!call->id)
+        if (!call->id) {
+                kfree(call->print_fmt);
                return -ENODEV;
+        }
        call->enabled = 0;
        call->regfunc = probe_event_enable;
        call->unregfunc = probe_event_disable;
-#ifdef CONFIG_EVENT_PROFILE
+#ifdef CONFIG_PERF_EVENTS
        call->profile_enable = probe_profile_enable;
        call->profile_disable = probe_profile_disable;
 #endif
@@ -1472,6 +1365,7 @@ static int register_probe_event(struct trace_probe *tp)
        ret = trace_add_event_call(call);
        if (ret) {
                pr_info("Failed to register kprobe event: %s\n", call->name);
+                kfree(call->print_fmt);
                unregister_ftrace_event(&tp->event);
        }
        return ret;
@@ -1481,6 +1375,7 @@ static void unregister_probe_event(struct trace_probe *tp)
 {
        /* tp->event is unregistered in trace_remove_event_call() */
        trace_remove_event_call(&tp->call);
+        kfree(tp->call.print_fmt);
 }
 /* Make a debugfs interface for controlling probe points */
@@ -1523,28 +1418,67 @@ static int kprobe_trace_selftest_target(int a1, int a2, int a3,
 static __init int kprobe_trace_self_tests_init(void)
 {
-        int ret;
+        int ret, warn = 0;
        int (*target)(int, int, int, int, int, int);
+        struct trace_probe *tp;
        target = kprobe_trace_selftest_target;
        pr_info("Testing kprobe tracing: ");
        ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target "
-                                  "$arg1 $arg2 $arg3 $arg4 $stack $stack0");
+                                  "$stack $stack0 +0($stack)");
-        if (WARN_ON_ONCE(ret))
+        if (WARN_ON_ONCE(ret)) {
-                pr_warning("error enabling function entry\n");
+                pr_warning("error on probing function entry.\n");
+                warn++;
+        } else {
+                /* Enable trace point */
+                tp = find_probe_event("testprobe", KPROBE_EVENT_SYSTEM);
+                if (WARN_ON_ONCE(tp == NULL)) {
+                        pr_warning("error on getting new probe.\n");
+                        warn++;
+                } else
+                        probe_event_enable(&tp->call);
+        }
        ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
                                  "$retval");
-        if (WARN_ON_ONCE(ret))
+        if (WARN_ON_ONCE(ret)) {
-                pr_warning("error enabling function return\n");
+                pr_warning("error on probing function return.\n");
+                warn++;
+        } else {
+                /* Enable trace point */
+                tp = find_probe_event("testprobe2", KPROBE_EVENT_SYSTEM);
+                if (WARN_ON_ONCE(tp == NULL)) {
+                        pr_warning("error on getting new probe.\n");
+                        warn++;
+                } else
+                        probe_event_enable(&tp->call);
+        }
+        if (warn)
+                goto end;
        ret = target(1, 2, 3, 4, 5, 6);
-        cleanup_all_probes();
+        ret = command_trace_probe("-:testprobe");
+        if (WARN_ON_ONCE(ret)) {
+                pr_warning("error on deleting a probe.\n");
+                warn++;
+        }
-        pr_cont("OK\n");
+        ret = command_trace_probe("-:testprobe2");
+        if (WARN_ON_ONCE(ret)) {
+                pr_warning("error on deleting a probe.\n");
+                warn++;
+        }
+end:
+        cleanup_all_probes();
+        if (warn)
+                pr_cont("NG: Some tests are failed. Please check them.\n");
+        else
+                pr_cont("OK\n");
        return 0;
 }
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 75289f372dd2..cba47d7935cc 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -143,70 +143,65 @@ extern char *__bad_type_size(void);
                #type, #name, offsetof(typeof(trace), name),            \
                sizeof(trace.name), is_signed_type(type)
-int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
+static
+int  __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
 {
        int i;
-        int ret;
+        int pos = 0;
-        struct syscall_metadata *entry = call->data;
-        struct syscall_trace_enter trace;
-        int offset = offsetof(struct syscall_trace_enter, args);
-        ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
+        /* When len=0, we just calculate the needed length */
-                               "\tsigned:%u;\n",
+#define LEN_OR_ZERO (len ? len - pos : 0)
-                               SYSCALL_FIELD(int, nr));
-        if (!ret)
-                return 0;
+        pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
        for (i = 0; i < entry->nb_args; i++) {
-                ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i],
+                pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
-                                        entry->args[i]);
+                                entry->args[i], sizeof(unsigned long),
-                if (!ret)
+                                i == entry->nb_args - 1 ? "" : ", ");
-                        return 0;
-                ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;"
-                                       "\tsigned:%u;\n", offset,
-                                       sizeof(unsigned long),
-                                       is_signed_type(unsigned long));
-                if (!ret)
-                        return 0;
-                offset += sizeof(unsigned long);
        }
+        pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
-        trace_seq_puts(s, "\nprint fmt: \"");
        for (i = 0; i < entry->nb_args; i++) {
-                ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i],
+                pos += snprintf(buf + pos, LEN_OR_ZERO,
-                                        sizeof(unsigned long),
+                                ", ((unsigned long)(REC->%s))", entry->args[i]);
-                                        i == entry->nb_args - 1 ? "" : ", ");
-                if (!ret)
-                        return 0;
        }
-        trace_seq_putc(s, '"');
-        for (i = 0; i < entry->nb_args; i++) {
+#undef LEN_OR_ZERO
-                ret = trace_seq_printf(s, ", ((unsigned long)(REC->%s))",
-                                       entry->args[i]);
-                if (!ret)
-                        return 0;
-        }
-        return trace_seq_putc(s, '\n');
+        /* return the length of print_fmt */
+        return pos;
 }
-int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
+static int set_syscall_print_fmt(struct ftrace_event_call *call)
 {
-        int ret;
+        char *print_fmt;
-        struct syscall_trace_exit trace;
+        int len;
+        struct syscall_metadata *entry = call->data;
-        ret = trace_seq_printf(s,
+        if (entry->enter_event != call) {
-                               "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
+                call->print_fmt = "\"0x%lx\", REC->ret";
-                               "\tsigned:%u;\n"
-                               "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
-                               "\tsigned:%u;\n",
-                               SYSCALL_FIELD(int, nr),
-                               SYSCALL_FIELD(long, ret));
-        if (!ret)
                return 0;
+        }
+        /* First: called with 0 length to calculate the needed length */
+        len = __set_enter_print_fmt(entry, NULL, 0);
+        print_fmt = kmalloc(len + 1, GFP_KERNEL);
+        if (!print_fmt)
+                return -ENOMEM;
+        /* Second: actually write the @print_fmt */
+        __set_enter_print_fmt(entry, print_fmt, len + 1);
+        call->print_fmt = print_fmt;
-        return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n");
+        return 0;
+}
+static void free_syscall_print_fmt(struct ftrace_event_call *call)
+{
+        struct syscall_metadata *entry = call->data;
+        if (entry->enter_event == call)
+                kfree(call->print_fmt);
 }
 int syscall_enter_define_fields(struct ftrace_event_call *call)
@@ -386,12 +381,22 @@ int init_syscall_trace(struct ftrace_event_call *call)
 {
        int id;
-        id = register_ftrace_event(call->event);
+        if (set_syscall_print_fmt(call) < 0)
-        if (!id)
+                return -ENOMEM;
-                return -ENODEV;
-        call->id = id;
+        id = trace_event_raw_init(call);
-        INIT_LIST_HEAD(&call->fields);
-        return 0;
+        if (id < 0) {
+                free_syscall_print_fmt(call);
+                return id;
+        }
+        return id;
+}
+unsigned long __init arch_syscall_addr(int nr)
+{
+        return (unsigned long)sys_call_table[nr];
 }
 int __init init_ftrace_syscalls(void)
@@ -421,7 +426,7 @@ int __init init_ftrace_syscalls(void)
 }
 core_initcall(init_ftrace_syscalls);
-#ifdef CONFIG_EVENT_PROFILE
+#ifdef CONFIG_PERF_EVENTS
 static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls);
 static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls);
@@ -433,12 +438,9 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
        struct syscall_metadata *sys_data;
        struct syscall_trace_enter *rec;
        unsigned long flags;
-        char *trace_buf;
-        char *raw_data;
        int syscall_nr;
        int rctx;
        int size;
-        int cpu;
        syscall_nr = syscall_get_nr(current, regs);
        if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
@@ -457,37 +459,15 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
                      "profile buffer not large enough"))
                return;
-        /* Protect the per cpu buffer, begin the rcu read side */
+        rec = (struct syscall_trace_enter *)ftrace_perf_buf_prepare(size,
-        local_irq_save(flags);
+                                sys_data->enter_event->id, &rctx, &flags);
+        if (!rec)
-        rctx = perf_swevent_get_recursion_context();
+                return;
-        if (rctx < 0)
-                goto end_recursion;
-        cpu = smp_processor_id();
-        trace_buf = rcu_dereference(perf_trace_buf);
-        if (!trace_buf)
-                goto end;
-        raw_data = per_cpu_ptr(trace_buf, cpu);
-        /* zero the dead bytes from align to not leak stack to user */
-        *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
-        rec = (struct syscall_trace_enter *) raw_data;
-        tracing_generic_entry_update(&rec->ent, 0, 0);
-        rec->ent.type = sys_data->enter_event->id;
        rec->nr = syscall_nr;
        syscall_get_arguments(current, regs, 0, sys_data->nb_args,
                               (unsigned long *)&rec->args);
-        perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size);
+        ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags);
-end:
-        perf_swevent_put_recursion_context(rctx);
-end_recursion:
-        local_irq_restore(flags);
 }
 int prof_sysenter_enable(struct ftrace_event_call *call)
@@ -531,11 +511,8 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
        struct syscall_trace_exit *rec;
        unsigned long flags;
        int syscall_nr;
-        char *trace_buf;
-        char *raw_data;
        int rctx;
        int size;
-        int cpu;
        syscall_nr = syscall_get_nr(current, regs);
        if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
@@ -557,38 +534,15 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
                "exit event has grown above profile buffer size"))
                return;
-        /* Protect the per cpu buffer, begin the rcu read side */
+        rec = (struct syscall_trace_exit *)ftrace_perf_buf_prepare(size,
-        local_irq_save(flags);
+                                sys_data->exit_event->id, &rctx, &flags);
+        if (!rec)
-        rctx = perf_swevent_get_recursion_context();
+                return;
-        if (rctx < 0)
-                goto end_recursion;
-        cpu = smp_processor_id();
-        trace_buf = rcu_dereference(perf_trace_buf);
-        if (!trace_buf)
-                goto end;
-        raw_data = per_cpu_ptr(trace_buf, cpu);
-        /* zero the dead bytes from align to not leak stack to user */
-        *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
-        rec = (struct syscall_trace_exit *)raw_data;
-        tracing_generic_entry_update(&rec->ent, 0, 0);
-        rec->ent.type = sys_data->exit_event->id;
        rec->nr = syscall_nr;
        rec->ret = syscall_get_return_value(current, regs);
-        perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size);
+        ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags);
-end:
-        perf_swevent_put_recursion_context(rctx);
-end_recursion:
-        local_irq_restore(flags);
 }
 int prof_sysexit_enable(struct ftrace_event_call *call)
@@ -603,7 +557,7 @@ int prof_sysexit_enable(struct ftrace_event_call *call)
                ret = register_trace_sys_exit(prof_syscall_exit);
        if (ret) {
                pr_info("event trace: Could not activate"
-                                "syscall entry trace point");
+                                "syscall exit trace point");
        } else {
                set_bit(num, enabled_prof_exit_syscalls);
                sys_prof_refcount_exit++;
@@ -626,6 +580,5 @@ void prof_sysexit_disable(struct ftrace_event_call *call)
        mutex_unlock(&syscall_trace_lock);
 }
-#endif
+#endif /* CONFIG_PERF_EVENTS */
diff --git a/kernel/user.c b/kernel/user.c
index 46d0165ca70c..766467b3bcb7 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -56,9 +56,6 @@ struct user_struct root_user = {
        .sigpending     = ATOMIC_INIT(0),
        .locked_shm     = 0,
        .user_ns        = &init_user_ns,
-#ifdef CONFIG_USER_SCHED
-        .tg             = &init_task_group,
-#endif
 };
 /*
@@ -75,268 +72,6 @@ static void uid_hash_remove(struct user_struct *up)
        put_user_ns(up->user_ns);
 }
-#ifdef CONFIG_USER_SCHED
-static void sched_destroy_user(struct user_struct *up)
-{
-        sched_destroy_group(up->tg);
-}
-static int sched_create_user(struct user_struct *up)
-{
-        int rc = 0;
-        up->tg = sched_create_group(&root_task_group);
-        if (IS_ERR(up->tg))
-                rc = -ENOMEM;
-        set_tg_uid(up);
-        return rc;
-}
-#else   /* CONFIG_USER_SCHED */
-static void sched_destroy_user(struct user_struct *up) { }
-static int sched_create_user(struct user_struct *up) { return 0; }
-#endif  /* CONFIG_USER_SCHED */
-#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
-static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
-{
-        struct user_struct *user;
-        struct hlist_node *h;
-        hlist_for_each_entry(user, h, hashent, uidhash_node) {
-                if (user->uid == uid) {
-                        /* possibly resurrect an "almost deleted" object */
-                        if (atomic_inc_return(&user->__count) == 1)
-                                cancel_delayed_work(&user->work);
-                        return user;
-                }
-        }
-        return NULL;
-}
-static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
-static DEFINE_MUTEX(uids_mutex);
-static inline void uids_mutex_lock(void)
-{
-        mutex_lock(&uids_mutex);
-}
-static inline void uids_mutex_unlock(void)
-{
-        mutex_unlock(&uids_mutex);
-}
-/* uid directory attributes */
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static ssize_t cpu_shares_show(struct kobject *kobj,
-                               struct kobj_attribute *attr,
-                               char *buf)
-{
-        struct user_struct *up = container_of(kobj, struct user_struct, kobj);
-        return sprintf(buf, "%lu\n", sched_group_shares(up->tg));
-}
-static ssize_t cpu_shares_store(struct kobject *kobj,
-                                struct kobj_attribute *attr,
-                                const char *buf, size_t size)
-{
-        struct user_struct *up = container_of(kobj, struct user_struct, kobj);
-        unsigned long shares;
-        int rc;
-        sscanf(buf, "%lu", &shares);
-        rc = sched_group_set_shares(up->tg, shares);
-        return (rc ? rc : size);
-}
-static struct kobj_attribute cpu_share_attr =
-        __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store);
-#endif
-#ifdef CONFIG_RT_GROUP_SCHED
-static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
-                                   struct kobj_attribute *attr,
-                                   char *buf)
-{
-        struct user_struct *up = container_of(kobj, struct user_struct, kobj);
-        return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg));
-}
-static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
-                                    struct kobj_attribute *attr,
-                                    const char *buf, size_t size)
-{
-        struct user_struct *up = container_of(kobj, struct user_struct, kobj);
-        unsigned long rt_runtime;
-        int rc;
-        sscanf(buf, "%ld", &rt_runtime);
-        rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
-        return (rc ? rc : size);
-}
-static struct kobj_attribute cpu_rt_runtime_attr =
-        __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store);
-static ssize_t cpu_rt_period_show(struct kobject *kobj,
-                                   struct kobj_attribute *attr,
-                                   char *buf)
-{
-        struct user_struct *up = container_of(kobj, struct user_struct, kobj);
-        return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg));
-}
-static ssize_t cpu_rt_period_store(struct kobject *kobj,
-                                    struct kobj_attribute *attr,
-                                    const char *buf, size_t size)
-{
-        struct user_struct *up = container_of(kobj, struct user_struct, kobj);
-        unsigned long rt_period;
-        int rc;
-        sscanf(buf, "%lu", &rt_period);
-        rc = sched_group_set_rt_period(up->tg, rt_period);
-        return (rc ? rc : size);
-}
-static struct kobj_attribute cpu_rt_period_attr =
-        __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store);
-#endif
-/* default attributes per uid directory */
-static struct attribute *uids_attributes[] = {
-#ifdef CONFIG_FAIR_GROUP_SCHED
-        &cpu_share_attr.attr,
-#endif
-#ifdef CONFIG_RT_GROUP_SCHED
-        &cpu_rt_runtime_attr.attr,
-        &cpu_rt_period_attr.attr,
-#endif
-        NULL
-};
-/* the lifetime of user_struct is not managed by the core (now) */
-static void uids_release(struct kobject *kobj)
-{
-        return;
-}
-static struct kobj_type uids_ktype = {
-        .sysfs_ops = &kobj_sysfs_ops,
-        .default_attrs = uids_attributes,
-        .release = uids_release,
-};
-/*
- * Create /sys/kernel/uids/<uid>/cpu_share file for this user
- * We do not create this file for users in a user namespace (until
- * sysfs tagging is implemented).
- *
- * See Documentation/scheduler/sched-design-CFS.txt for ramifications.
- */
-static int uids_user_create(struct user_struct *up)
-{
-        struct kobject *kobj = &up->kobj;
-        int error;
-        memset(kobj, 0, sizeof(struct kobject));
-        if (up->user_ns != &init_user_ns)
-                return 0;
-        kobj->kset = uids_kset;
-        error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid);
-        if (error) {
-                kobject_put(kobj);
-                goto done;
-        }
-        kobject_uevent(kobj, KOBJ_ADD);
-done:
-        return error;
-}
-/* create these entries in sysfs:
- *      "/sys/kernel/uids" directory
- *      "/sys/kernel/uids/0" directory (for root user)
- *      "/sys/kernel/uids/0/cpu_share" file (for root user)
- */
-int __init uids_sysfs_init(void)
-{
-        uids_kset = kset_create_and_add("uids", NULL, kernel_kobj);
-        if (!uids_kset)
-                return -ENOMEM;
-        return uids_user_create(&root_user);
-}
-/* delayed work function to remove sysfs directory for a user and free up
- * corresponding structures.
- */
-static void cleanup_user_struct(struct work_struct *w)
-{
-        struct user_struct *up = container_of(w, struct user_struct, work.work);
-        unsigned long flags;
-        int remove_user = 0;
-        /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
-         * atomic.
-         */
-        uids_mutex_lock();
-        spin_lock_irqsave(&uidhash_lock, flags);
-        if (atomic_read(&up->__count) == 0) {
-                uid_hash_remove(up);
-                remove_user = 1;
-        }
-        spin_unlock_irqrestore(&uidhash_lock, flags);
-        if (!remove_user)
-                goto done;
-        if (up->user_ns == &init_user_ns) {
-                kobject_uevent(&up->kobj, KOBJ_REMOVE);
-                kobject_del(&up->kobj);
-                kobject_put(&up->kobj);
-        }
-        sched_destroy_user(up);
-        key_put(up->uid_keyring);
-        key_put(up->session_keyring);
-        kmem_cache_free(uid_cachep, up);
-done:
-        uids_mutex_unlock();
-}
-/* IRQs are disabled and uidhash_lock is held upon function entry.
- * IRQ state (as stored in flags) is restored and uidhash_lock released
- * upon function exit.
- */
-static void free_user(struct user_struct *up, unsigned long flags)
-{
-        INIT_DELAYED_WORK(&up->work, cleanup_user_struct);
-        schedule_delayed_work(&up->work, msecs_to_jiffies(1000));
-        spin_unlock_irqrestore(&uidhash_lock, flags);
-}
-#else   /* CONFIG_USER_SCHED && CONFIG_SYSFS */
 static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
 {
        struct user_struct *user;
@@ -352,11 +87,6 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
        return NULL;
 }
-int uids_sysfs_init(void) { return 0; }
-static inline int uids_user_create(struct user_struct *up) { return 0; }
-static inline void uids_mutex_lock(void) { }
-static inline void uids_mutex_unlock(void) { }
 /* IRQs are disabled and uidhash_lock is held upon function entry.
 * IRQ state (as stored in flags) is restored and uidhash_lock released
 * upon function exit.
@@ -365,32 +95,11 @@ static void free_user(struct user_struct *up, unsigned long flags)
 {
        uid_hash_remove(up);
        spin_unlock_irqrestore(&uidhash_lock, flags);
-        sched_destroy_user(up);
        key_put(up->uid_keyring);
        key_put(up->session_keyring);
        kmem_cache_free(uid_cachep, up);
 }
-#endif
-#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED)
-/*
- * We need to check if a setuid can take place. This function should be called
- * before successfully completing the setuid.
- */
-int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
-{
-        return sched_rt_can_attach(up->tg, tsk);
-}
-#else
-int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
-{
-        return 1;
-}
-#endif
 /*
 * Locate the user_struct for the passed UID.  If found, take a ref on it.  The
 * caller must undo that ref with free_uid().
@@ -431,8 +140,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
        /* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
         * atomic.
         */
-        uids_mutex_lock();
        spin_lock_irq(&uidhash_lock);
        up = uid_hash_find(uid, hashent);
        spin_unlock_irq(&uidhash_lock);
@@ -445,14 +152,8 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
                new->uid = uid;
                atomic_set(&new->__count, 1);
-                if (sched_create_user(new) < 0)
-                        goto out_free_user;
                new->user_ns = get_user_ns(ns);
-                if (uids_user_create(new))
-                        goto out_destoy_sched;
                /*
                 * Before adding this, check whether we raced
                 * on adding the same user already..
@@ -475,17 +176,11 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
                spin_unlock_irq(&uidhash_lock);
        }
-        uids_mutex_unlock();
        return up;
-out_destoy_sched:
-        sched_destroy_user(new);
        put_user_ns(new->user_ns);
-out_free_user:
        kmem_cache_free(uid_cachep, new);
 out_unlock:
-        uids_mutex_unlock();
        return NULL;
 }
author	Ingo Molnar <mingo@elte.hu>	2010-03-01 03:28:53 -0500
committer	Ingo Molnar <mingo@elte.hu>	2010-03-01 03:28:58 -0500
commit	e2f4699ac15fe36de1288505bc6e6e5a8603ab1b (patch)
tree	8078d3ff21eaa0a0ed6e446ac94f3681e831cad1 /kernel
parent	1883c79a57a5fe25309007590cccb1b2782c41b2 (diff)
parent	30ff056c42c665b9ea535d8515890857ae382540 (diff)