31 files changed, 1025 insertions, 684 deletions
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 7c4e2713df0a..291ac586f37f 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -212,6 +212,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
        err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
                                        hcpu, -1, &nr_calls);
        if (err == NOTIFY_BAD) {
+                set_cpu_active(cpu, true);
                nr_calls--;
                __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
                                          hcpu, nr_calls, NULL);
@@ -223,11 +225,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
        /* Ensure that we are not runnable on dying cpu */
        cpumask_copy(old_allowed, &current->cpus_allowed);
-        set_cpus_allowed_ptr(current,
+        set_cpus_allowed_ptr(current, cpu_active_mask);
-                             cpumask_of(cpumask_any_but(cpu_online_mask, cpu)));
        err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
        if (err) {
+                set_cpu_active(cpu, true);
                /* CPU didn't die: tell everyone.  Can't complain. */
                if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
                                            hcpu) == NOTIFY_BAD)
@@ -292,9 +294,6 @@ int __ref cpu_down(unsigned int cpu)
        err = _cpu_down(cpu, 0);
-        if (cpu_online(cpu))
-                set_cpu_active(cpu, true);
 out:
        cpu_maps_update_done();
        stop_machine_destroy();
@@ -387,6 +386,15 @@ int disable_nonboot_cpus(void)
         * with the userspace trying to use the CPU hotplug at the same time
         */
        cpumask_clear(frozen_cpus);
+        for_each_online_cpu(cpu) {
+                if (cpu == first_cpu)
+                        continue;
+                set_cpu_active(cpu, false);
+        }
+        synchronize_sched();
        printk("Disabling non-boot CPUs ...\n");
        for_each_online_cpu(cpu) {
                if (cpu == first_cpu)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3cf2183b472d..ba401fab459f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -737,7 +737,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused)
 {
 }
-static int generate_sched_domains(struct cpumask **domains,
+static int generate_sched_domains(cpumask_var_t **domains,
                        struct sched_domain_attr **attributes)
 {
        *domains = NULL;
@@ -872,7 +872,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
                if (retval < 0)
                        return retval;
-                if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask))
+                if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
                        return -EINVAL;
        }
        retval = validate_change(cs, trialcs);
@@ -2010,7 +2010,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
                }
                /* Continue past cpusets with all cpus, mems online */
-                if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) &&
+                if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
                    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
                        continue;
@@ -2019,7 +2019,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
                /* Remove offline cpus and mems from this cpuset. */
                mutex_lock(&callback_mutex);
                cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
-                            cpu_online_mask);
+                            cpu_active_mask);
                nodes_and(cp->mems_allowed, cp->mems_allowed,
                                                node_states[N_HIGH_MEMORY]);
                mutex_unlock(&callback_mutex);
@@ -2057,8 +2057,10 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
        switch (phase) {
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
-        case CPU_DEAD:
+        case CPU_DOWN_PREPARE:
-        case CPU_DEAD_FROZEN:
+        case CPU_DOWN_PREPARE_FROZEN:
+        case CPU_DOWN_FAILED:
+        case CPU_DOWN_FAILED_FROZEN:
                break;
        default:
@@ -2067,7 +2069,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
        cgroup_lock();
        mutex_lock(&callback_mutex);
-        cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
+        cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
        mutex_unlock(&callback_mutex);
        scan_for_empty_cpusets(&top_cpuset);
        ndoms = generate_sched_domains(&doms, &attr);
@@ -2114,7 +2116,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
 void __init cpuset_init_smp(void)
 {
-        cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
+        cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
        top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
        hotcpu_notifier(cpuset_track_online_cpus, 0);
diff --git a/kernel/exit.c b/kernel/exit.c
index 1143012951e9..6f50ef55a6f3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -971,7 +971,7 @@ NORET_TYPE void do_exit(long code)
        exit_thread();
        cgroup_exit(tsk, 1);
-        if (group_dead && tsk->signal->leader)
+        if (group_dead)
                disassociate_ctty(1);
        module_put(task_thread_info(tsk)->exec_domain->module);
diff --git a/kernel/futex.c b/kernel/futex.c
index fb65e822fc41..d73ef1f3e55d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -304,8 +304,14 @@ void put_futex_key(int fshared, union futex_key *key)
 */
 static int fault_in_user_writeable(u32 __user *uaddr)
 {
-        int ret = get_user_pages(current, current->mm, (unsigned long)uaddr,
+        struct mm_struct *mm = current->mm;
-                                 1, 1, 0, NULL, NULL);
+        int ret;
+        down_read(&mm->mmap_sem);
+        ret = get_user_pages(current, mm, (unsigned long)uaddr,
+                             1, 1, 0, NULL, NULL);
+        up_read(&mm->mmap_sem);
        return ret < 0 ? ret : 0;
 }
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index ede527708123..d2f9239dc6ba 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -557,7 +557,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
 static int hrtimer_reprogram(struct hrtimer *timer,
                             struct hrtimer_clock_base *base)
 {
-        ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next;
+        struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
        ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
        int res;
@@ -582,7 +582,16 @@ static int hrtimer_reprogram(struct hrtimer *timer,
        if (expires.tv64 < 0)
                return -ETIME;
-        if (expires.tv64 >= expires_next->tv64)
+        if (expires.tv64 >= cpu_base->expires_next.tv64)
+                return 0;
+        /*
+         * If a hang was detected in the last timer interrupt then we
+         * do not schedule a timer which is earlier than the expiry
+         * which we enforced in the hang detection. We want the system
+         * to make progress.
+         */
+        if (cpu_base->hang_detected)
                return 0;
        /*
@@ -590,7 +599,7 @@ static int hrtimer_reprogram(struct hrtimer *timer,
         */
        res = tick_program_event(expires, 0);
        if (!IS_ERR_VALUE(res))
-                *expires_next = expires;
+                cpu_base->expires_next = expires;
        return res;
 }
@@ -747,17 +756,33 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
 #endif /* CONFIG_HIGH_RES_TIMERS */
-#ifdef CONFIG_TIMER_STATS
+static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
-void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr)
 {
+#ifdef CONFIG_TIMER_STATS
        if (timer->start_site)
                return;
+        timer->start_site = __builtin_return_address(0);
-        timer->start_site = addr;
        memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
        timer->start_pid = current->pid;
+#endif
 }
+static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer)
+{
+#ifdef CONFIG_TIMER_STATS
+        timer->start_site = NULL;
+#endif
+}
+static inline void timer_stats_account_hrtimer(struct hrtimer *timer)
+{
+#ifdef CONFIG_TIMER_STATS
+        if (likely(!timer_stats_active))
+                return;
+        timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
+                                 timer->function, timer->start_comm, 0);
 #endif
+}
 /*
 * Counterpart to lock_hrtimer_base above:
@@ -1217,30 +1242,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
 #ifdef CONFIG_HIGH_RES_TIMERS
-static int force_clock_reprogram;
-/*
- * After 5 iteration's attempts, we consider that hrtimer_interrupt()
- * is hanging, which could happen with something that slows the interrupt
- * such as the tracing. Then we force the clock reprogramming for each future
- * hrtimer interrupts to avoid infinite loops and use the min_delta_ns
- * threshold that we will overwrite.
- * The next tick event will be scheduled to 3 times we currently spend on
- * hrtimer_interrupt(). This gives a good compromise, the cpus will spend
- * 1/4 of their time to process the hrtimer interrupts. This is enough to
- * let it running without serious starvation.
- */
-static inline void
-hrtimer_interrupt_hanging(struct clock_event_device *dev,
-                        ktime_t try_time)
-{
-        force_clock_reprogram = 1;
-        dev->min_delta_ns = (unsigned long)try_time.tv64 * 3;
-        printk(KERN_WARNING "hrtimer: interrupt too slow, "
-               "forcing clock min delta to %llu ns\n",
-               (unsigned long long) dev->min_delta_ns);
-}
 /*
 * High resolution timer interrupt
 * Called with interrupts disabled
@@ -1249,21 +1250,15 @@ void hrtimer_interrupt(struct clock_event_device *dev)
 {
        struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
        struct hrtimer_clock_base *base;
-        ktime_t expires_next, now;
+        ktime_t expires_next, now, entry_time, delta;
-        int nr_retries = 0;
+        int i, retries = 0;
-        int i;
        BUG_ON(!cpu_base->hres_active);
        cpu_base->nr_events++;
        dev->next_event.tv64 = KTIME_MAX;
- retry:
+        entry_time = now = ktime_get();
-        /* 5 retries is enough to notice a hang */
+retry:
-        if (!(++nr_retries % 5))
-                hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now));
-        now = ktime_get();
        expires_next.tv64 = KTIME_MAX;
        spin_lock(&cpu_base->lock);
@@ -1325,10 +1320,48 @@ void hrtimer_interrupt(struct clock_event_device *dev)
        spin_unlock(&cpu_base->lock);
        /* Reprogramming necessary ? */
-        if (expires_next.tv64 != KTIME_MAX) {
+        if (expires_next.tv64 == KTIME_MAX ||
-                if (tick_program_event(expires_next, force_clock_reprogram))
+            !tick_program_event(expires_next, 0)) {
-                        goto retry;
+                cpu_base->hang_detected = 0;
+                return;
        }
+        /*
+         * The next timer was already expired due to:
+         * - tracing
+         * - long lasting callbacks
+         * - being scheduled away when running in a VM
+         *
+         * We need to prevent that we loop forever in the hrtimer
+         * interrupt routine. We give it 3 attempts to avoid
+         * overreacting on some spurious event.
+         */
+        now = ktime_get();
+        cpu_base->nr_retries++;
+        if (++retries < 3)
+                goto retry;
+        /*
+         * Give the system a chance to do something else than looping
+         * here. We stored the entry time, so we know exactly how long
+         * we spent here. We schedule the next event this amount of
+         * time away.
+         */
+        cpu_base->nr_hangs++;
+        cpu_base->hang_detected = 1;
+        delta = ktime_sub(now, entry_time);
+        if (delta.tv64 > cpu_base->max_hang_time.tv64)
+                cpu_base->max_hang_time = delta;
+        /*
+         * Limit it to a sensible value as we enforce a longer
+         * delay: push the next event at most 100ms into the future.
+         */
+        if (delta.tv64 > 100 * NSEC_PER_MSEC)
+                expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
+        else
+                expires_next = ktime_add(now, delta);
+        tick_program_event(expires_next, 1);
+        printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n",
+                    ktime_to_ns(delta));
 }
 /*
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index cf5ee1628411..366eedf949c0 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -52,7 +52,7 @@
 static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned);
 /* Number of pinned task breakpoints in a cpu */
-static DEFINE_PER_CPU(unsigned int, task_bp_pinned[HBP_NUM]);
+static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[HBP_NUM]);
 /* Number of non-pinned cpu/task breakpoints in a cpu */
 static DEFINE_PER_CPU(unsigned int, nr_bp_flexible);
@@ -73,7 +73,7 @@ static DEFINE_MUTEX(nr_bp_mutex);
 static unsigned int max_task_bp_pinned(int cpu)
 {
        int i;
-        unsigned int *tsk_pinned = per_cpu(task_bp_pinned, cpu);
+        unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned, cpu);
        for (i = HBP_NUM -1; i >= 0; i--) {
                if (tsk_pinned[i] > 0)
@@ -83,15 +83,51 @@ static unsigned int max_task_bp_pinned(int cpu)
        return 0;
 }
+static int task_bp_pinned(struct task_struct *tsk)
+{
+        struct perf_event_context *ctx = tsk->perf_event_ctxp;
+        struct list_head *list;
+        struct perf_event *bp;
+        unsigned long flags;
+        int count = 0;
+        if (WARN_ONCE(!ctx, "No perf context for this task"))
+                return 0;
+        list = &ctx->event_list;
+        spin_lock_irqsave(&ctx->lock, flags);
+        /*
+         * The current breakpoint counter is not included in the list
+         * at the open() callback time
+         */
+        list_for_each_entry(bp, list, event_entry) {
+                if (bp->attr.type == PERF_TYPE_BREAKPOINT)
+                        count++;
+        }
+        spin_unlock_irqrestore(&ctx->lock, flags);
+        return count;
+}
 /*
 * Report the number of pinned/un-pinned breakpoints we have in
 * a given cpu (cpu > -1) or in all of them (cpu = -1).
 */
-static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu)
+static void
+fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
 {
+        int cpu = bp->cpu;
+        struct task_struct *tsk = bp->ctx->task;
        if (cpu >= 0) {
                slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu);
-                slots->pinned += max_task_bp_pinned(cpu);
+                if (!tsk)
+                        slots->pinned += max_task_bp_pinned(cpu);
+                else
+                        slots->pinned += task_bp_pinned(tsk);
                slots->flexible = per_cpu(nr_bp_flexible, cpu);
                return;
@@ -101,7 +137,10 @@ static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu)
                unsigned int nr;
                nr = per_cpu(nr_cpu_bp_pinned, cpu);
-                nr += max_task_bp_pinned(cpu);
+                if (!tsk)
+                        nr += max_task_bp_pinned(cpu);
+                else
+                        nr += task_bp_pinned(tsk);
                if (nr > slots->pinned)
                        slots->pinned = nr;
@@ -118,35 +157,12 @@ static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu)
 */
 static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable)
 {
-        int count = 0;
-        struct perf_event *bp;
-        struct perf_event_context *ctx = tsk->perf_event_ctxp;
        unsigned int *tsk_pinned;
-        struct list_head *list;
+        int count = 0;
-        unsigned long flags;
-        if (WARN_ONCE(!ctx, "No perf context for this task"))
-                return;
-        list = &ctx->event_list;
-        spin_lock_irqsave(&ctx->lock, flags);
-        /*
-         * The current breakpoint counter is not included in the list
-         * at the open() callback time
-         */
-        list_for_each_entry(bp, list, event_entry) {
-                if (bp->attr.type == PERF_TYPE_BREAKPOINT)
-                        count++;
-        }
-        spin_unlock_irqrestore(&ctx->lock, flags);
+        count = task_bp_pinned(tsk);
-        if (WARN_ONCE(count < 0, "No breakpoint counter found in the counter list"))
+        tsk_pinned = per_cpu(nr_task_bp_pinned, cpu);
-                return;
-        tsk_pinned = per_cpu(task_bp_pinned, cpu);
        if (enable) {
                tsk_pinned[count]++;
                if (count > 0)
@@ -193,7 +209,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
 *   - If attached to a single cpu, check:
 *
 *       (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu)
- *           + max(per_cpu(task_bp_pinned, cpu)))) < HBP_NUM
+ *           + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM
 *
 *       -> If there are already non-pinned counters in this cpu, it means
 *          there is already a free slot for them.
@@ -204,7 +220,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
 *   - If attached to every cpus, check:
 *
 *       (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *))
- *           + max(per_cpu(task_bp_pinned, *)))) < HBP_NUM
+ *           + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM
 *
 *       -> This is roughly the same, except we check the number of per cpu
 *          bp for every cpu and we keep the max one. Same for the per tasks
@@ -216,7 +232,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
 *   - If attached to a single cpu, check:
 *
 *       ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu)
- *            + max(per_cpu(task_bp_pinned, cpu))) < HBP_NUM
+ *            + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM
 *
 *       -> Same checks as before. But now the nr_bp_flexible, if any, must keep
 *          one register at least (or they will never be fed).
@@ -224,7 +240,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
 *   - If attached to every cpus, check:
 *
 *       ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *))
- *            + max(per_cpu(task_bp_pinned, *))) < HBP_NUM
+ *            + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM
 */
 int reserve_bp_slot(struct perf_event *bp)
 {
@@ -233,7 +249,7 @@ int reserve_bp_slot(struct perf_event *bp)
        mutex_lock(&nr_bp_mutex);
-        fetch_bp_busy_slots(&slots, bp->cpu);
+        fetch_bp_busy_slots(&slots, bp);
        /* Flexible counters need to keep at least one slot */
        if (slots.pinned + (!!slots.flexible) == HBP_NUM) {
@@ -259,7 +275,7 @@ void release_bp_slot(struct perf_event *bp)
 }
-int __register_perf_hw_breakpoint(struct perf_event *bp)
+int register_perf_hw_breakpoint(struct perf_event *bp)
 {
        int ret;
@@ -276,19 +292,12 @@ int __register_perf_hw_breakpoint(struct perf_event *bp)
         * This is a quick hack that will be removed soon, once we remove
         * the tmp breakpoints from ptrace
         */
-        if (!bp->attr.disabled || bp->callback == perf_bp_event)
+        if (!bp->attr.disabled || !bp->overflow_handler)
                ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
        return ret;
 }
-int register_perf_hw_breakpoint(struct perf_event *bp)
-{
-        bp->callback = perf_bp_event;
-        return __register_perf_hw_breakpoint(bp);
-}
 /**
 * register_user_hw_breakpoint - register a hardware breakpoint for user space
 * @attr: breakpoint attributes
@@ -297,7 +306,7 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
 */
 struct perf_event *
 register_user_hw_breakpoint(struct perf_event_attr *attr,
-                            perf_callback_t triggered,
+                            perf_overflow_handler_t triggered,
                            struct task_struct *tsk)
 {
        return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered);
@@ -311,19 +320,40 @@ EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
 * @triggered: callback to trigger when we hit the breakpoint
 * @tsk: pointer to 'task_struct' of the process to which the address belongs
 */
-struct perf_event *
+int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr)
-modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr,
-                          perf_callback_t triggered,
-                          struct task_struct *tsk)
 {
-        /*
+        u64 old_addr = bp->attr.bp_addr;
-         * FIXME: do it without unregistering
+        int old_type = bp->attr.bp_type;
-         * - We don't want to lose our slot
+        int old_len = bp->attr.bp_len;
-         * - If the new bp is incorrect, don't lose the older one
+        int err = 0;
-         */
-        unregister_hw_breakpoint(bp);
-        return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered);
+        perf_event_disable(bp);
+        bp->attr.bp_addr = attr->bp_addr;
+        bp->attr.bp_type = attr->bp_type;
+        bp->attr.bp_len = attr->bp_len;
+        if (attr->disabled)
+                goto end;
+        err = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
+        if (!err)
+                perf_event_enable(bp);
+        if (err) {
+                bp->attr.bp_addr = old_addr;
+                bp->attr.bp_type = old_type;
+                bp->attr.bp_len = old_len;
+                if (!bp->attr.disabled)
+                        perf_event_enable(bp);
+                return err;
+        }
+end:
+        bp->attr.disabled = attr->disabled;
+        return 0;
 }
 EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
@@ -348,7 +378,7 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
 */
 struct perf_event **
 register_wide_hw_breakpoint(struct perf_event_attr *attr,
-                            perf_callback_t triggered)
+                            perf_overflow_handler_t triggered)
 {
        struct perf_event **cpu_events, **pevent, *bp;
        long err;
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 7d7014634022..2eb517e23514 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -129,6 +129,7 @@ struct task_struct		*kgdb_usethread;
 struct task_struct              *kgdb_contthread;
 int                             kgdb_single_step;
+pid_t                           kgdb_sstep_pid;
 /* Our I/O buffers. */
 static char                     remcom_in_buffer[BUFMAX];
@@ -541,12 +542,17 @@ static struct task_struct *getthread(struct pt_regs *regs, int tid)
         */
        if (tid == 0 || tid == -1)
                tid = -atomic_read(&kgdb_active) - 2;
-        if (tid < 0) {
+        if (tid < -1 && tid > -NR_CPUS - 2) {
                if (kgdb_info[-tid - 2].task)
                        return kgdb_info[-tid - 2].task;
                else
                        return idle_task(-tid - 2);
        }
+        if (tid <= 0) {
+                printk(KERN_ERR "KGDB: Internal thread select error\n");
+                dump_stack();
+                return NULL;
+        }
        /*
         * find_task_by_pid_ns() does not take the tasklist lock anymore
@@ -619,7 +625,8 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
 static int kgdb_activate_sw_breakpoints(void)
 {
        unsigned long addr;
-        int error = 0;
+        int error;
+        int ret = 0;
        int i;
        for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
@@ -629,13 +636,16 @@ static int kgdb_activate_sw_breakpoints(void)
                addr = kgdb_break[i].bpt_addr;
                error = kgdb_arch_set_breakpoint(addr,
                                kgdb_break[i].saved_instr);
-                if (error)
+                if (error) {
-                        return error;
+                        ret = error;
+                        printk(KERN_INFO "KGDB: BP install failed: %lx", addr);
+                        continue;
+                }
                kgdb_flush_swbreak_addr(addr);
                kgdb_break[i].state = BP_ACTIVE;
        }
-        return 0;
+        return ret;
 }
 static int kgdb_set_sw_break(unsigned long addr)
@@ -682,7 +692,8 @@ static int kgdb_set_sw_break(unsigned long addr)
 static int kgdb_deactivate_sw_breakpoints(void)
 {
        unsigned long addr;
-        int error = 0;
+        int error;
+        int ret = 0;
        int i;
        for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
@@ -691,13 +702,15 @@ static int kgdb_deactivate_sw_breakpoints(void)
                addr = kgdb_break[i].bpt_addr;
                error = kgdb_arch_remove_breakpoint(addr,
                                        kgdb_break[i].saved_instr);
-                if (error)
+                if (error) {
-                        return error;
+                        printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr);
+                        ret = error;
+                }
                kgdb_flush_swbreak_addr(addr);
                kgdb_break[i].state = BP_SET;
        }
-        return 0;
+        return ret;
 }
 static int kgdb_remove_sw_break(unsigned long addr)
@@ -1204,8 +1217,10 @@ static int gdb_cmd_exception_pass(struct kgdb_state *ks)
                return 1;
        } else {
-                error_packet(remcom_out_buffer, -EINVAL);
+                kgdb_msg_write("KGDB only knows signal 9 (pass)"
-                return 0;
+                        " and 15 (pass and disconnect)\n"
+                        "Executing a continue without signal passing\n", 0);
+                remcom_in_buffer[0] = 'c';
        }
        /* Indicate fall through */
@@ -1395,6 +1410,7 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
        struct kgdb_state kgdb_var;
        struct kgdb_state *ks = &kgdb_var;
        unsigned long flags;
+        int sstep_tries = 100;
        int error = 0;
        int i, cpu;
@@ -1425,13 +1441,14 @@ acquirelock:
                cpu_relax();
        /*
-         * Do not start the debugger connection on this CPU if the last
+         * For single stepping, try to only enter on the processor
-         * instance of the exception handler wanted to come into the
+         * that was single stepping.  To guard against a deadlock, the
-         * debugger on a different CPU via a single step
+         * kernel will only try for the value of sstep_tries before
+         * giving up and continuing on.
         */
        if (atomic_read(&kgdb_cpu_doing_single_step) != -1 &&
-            atomic_read(&kgdb_cpu_doing_single_step) != cpu) {
+            (kgdb_info[cpu].task &&
+             kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
                atomic_set(&kgdb_active, -1);
                touch_softlockup_watchdog();
                clocksource_touch_watchdog();
@@ -1524,6 +1541,13 @@ acquirelock:
        }
 kgdb_restore:
+        if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
+                int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step);
+                if (kgdb_info[sstep_cpu].task)
+                        kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid;
+                else
+                        kgdb_sstep_pid = 0;
+        }
        /* Free kgdb_active */
        atomic_set(&kgdb_active, -1);
        touch_softlockup_watchdog();
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index f5dcd36d3151..429540c70d3f 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -140,7 +140,8 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock)
 }
 #ifdef CONFIG_LOCK_STAT
-static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
+static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS],
+                      cpu_lock_stats);
 static inline u64 lockstat_clock(void)
 {
@@ -168,7 +169,7 @@ static void lock_time_inc(struct lock_time *lt, u64 time)
        if (time > lt->max)
                lt->max = time;
-        if (time < lt->min || !lt->min)
+        if (time < lt->min || !lt->nr)
                lt->min = time;
        lt->total += time;
@@ -177,8 +178,15 @@ static void lock_time_inc(struct lock_time *lt, u64 time)
 static inline void lock_time_add(struct lock_time *src, struct lock_time *dst)
 {
-        dst->min += src->min;
+        if (!src->nr)
-        dst->max += src->max;
+                return;
+        if (src->max > dst->max)
+                dst->max = src->max;
+        if (src->min < dst->min || !dst->nr)
+                dst->min = src->min;
        dst->total += src->total;
        dst->nr += src->nr;
 }
@@ -191,7 +199,7 @@ struct lock_class_stats lock_stats(struct lock_class *class)
        memset(&stats, 0, sizeof(struct lock_class_stats));
        for_each_possible_cpu(cpu) {
                struct lock_class_stats *pcs =
-                        &per_cpu(lock_stats, cpu)[class - lock_classes];
+                        &per_cpu(cpu_lock_stats, cpu)[class - lock_classes];
                for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
                        stats.contention_point[i] += pcs->contention_point[i];
@@ -218,7 +226,7 @@ void clear_lock_stats(struct lock_class *class)
        for_each_possible_cpu(cpu) {
                struct lock_class_stats *cpu_stats =
-                        &per_cpu(lock_stats, cpu)[class - lock_classes];
+                        &per_cpu(cpu_lock_stats, cpu)[class - lock_classes];
                memset(cpu_stats, 0, sizeof(struct lock_class_stats));
        }
@@ -228,12 +236,12 @@ void clear_lock_stats(struct lock_class *class)
 static struct lock_class_stats *get_lock_stats(struct lock_class *class)
 {
-        return &get_cpu_var(lock_stats)[class - lock_classes];
+        return &get_cpu_var(cpu_lock_stats)[class - lock_classes];
 }
 static void put_lock_stats(struct lock_class_stats *stats)
 {
-        put_cpu_var(lock_stats);
+        put_cpu_var(cpu_lock_stats);
 }
 static void lock_release_holdtime(struct held_lock *hlock)
@@ -379,7 +387,8 @@ static int save_trace(struct stack_trace *trace)
         * complete trace that maxes out the entries provided will be reported
         * as incomplete, friggin useless </rant>
         */
-        if (trace->entries[trace->nr_entries-1] == ULONG_MAX)
+        if (trace->nr_entries != 0 &&
+            trace->entries[trace->nr_entries-1] == ULONG_MAX)
                trace->nr_entries--;
        trace->max_entries = trace->nr_entries;
diff --git a/kernel/module.c b/kernel/module.c
index 5842a71cf052..12afc5a3ddd3 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -370,8 +370,6 @@ EXPORT_SYMBOL_GPL(find_module);
 #ifdef CONFIG_SMP
-#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
 static void *percpu_modalloc(unsigned long size, unsigned long align,
                             const char *name)
 {
@@ -395,154 +393,6 @@ static void percpu_modfree(void *freeme)
        free_percpu(freeme);
 }
-#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */
-/* Number of blocks used and allocated. */
-static unsigned int pcpu_num_used, pcpu_num_allocated;
-/* Size of each block.  -ve means used. */
-static int *pcpu_size;
-static int split_block(unsigned int i, unsigned short size)
-{
-        /* Reallocation required? */
-        if (pcpu_num_used + 1 > pcpu_num_allocated) {
-                int *new;
-                new = krealloc(pcpu_size, sizeof(new[0])*pcpu_num_allocated*2,
-                               GFP_KERNEL);
-                if (!new)
-                        return 0;
-                pcpu_num_allocated *= 2;
-                pcpu_size = new;
-        }
-        /* Insert a new subblock */
-        memmove(&pcpu_size[i+1], &pcpu_size[i],
-                sizeof(pcpu_size[0]) * (pcpu_num_used - i));
-        pcpu_num_used++;
-        pcpu_size[i+1] -= size;
-        pcpu_size[i] = size;
-        return 1;
-}
-static inline unsigned int block_size(int val)
-{
-        if (val < 0)
-                return -val;
-        return val;
-}
-static void *percpu_modalloc(unsigned long size, unsigned long align,
-                             const char *name)
-{
-        unsigned long extra;
-        unsigned int i;
-        void *ptr;
-        int cpu;
-        if (align > PAGE_SIZE) {
-                printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
-                       name, align, PAGE_SIZE);
-                align = PAGE_SIZE;
-        }
-        ptr = __per_cpu_start;
-        for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
-                /* Extra for alignment requirement. */
-                extra = ALIGN((unsigned long)ptr, align) - (unsigned long)ptr;
-                BUG_ON(i == 0 && extra != 0);
-                if (pcpu_size[i] < 0 || pcpu_size[i] < extra + size)
-                        continue;
-                /* Transfer extra to previous block. */
-                if (pcpu_size[i-1] < 0)
-                        pcpu_size[i-1] -= extra;
-                else
-                        pcpu_size[i-1] += extra;
-                pcpu_size[i] -= extra;
-                ptr += extra;
-                /* Split block if warranted */
-                if (pcpu_size[i] - size > sizeof(unsigned long))
-                        if (!split_block(i, size))
-                                return NULL;
-                /* add the per-cpu scanning areas */
-                for_each_possible_cpu(cpu)
-                        kmemleak_alloc(ptr + per_cpu_offset(cpu), size, 0,
-                                       GFP_KERNEL);
-                /* Mark allocated */
-                pcpu_size[i] = -pcpu_size[i];
-                return ptr;
-        }
-        printk(KERN_WARNING "Could not allocate %lu bytes percpu data\n",
-               size);
-        return NULL;
-}
-static void percpu_modfree(void *freeme)
-{
-        unsigned int i;
-        void *ptr = __per_cpu_start + block_size(pcpu_size[0]);
-        int cpu;
-        /* First entry is core kernel percpu data. */
-        for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
-                if (ptr == freeme) {
-                        pcpu_size[i] = -pcpu_size[i];
-                        goto free;
-                }
-        }
-        BUG();
- free:
-        /* remove the per-cpu scanning areas */
-        for_each_possible_cpu(cpu)
-                kmemleak_free(freeme + per_cpu_offset(cpu));
-        /* Merge with previous? */
-        if (pcpu_size[i-1] >= 0) {
-                pcpu_size[i-1] += pcpu_size[i];
-                pcpu_num_used--;
-                memmove(&pcpu_size[i], &pcpu_size[i+1],
-                        (pcpu_num_used - i) * sizeof(pcpu_size[0]));
-                i--;
-        }
-        /* Merge with next? */
-        if (i+1 < pcpu_num_used && pcpu_size[i+1] >= 0) {
-                pcpu_size[i] += pcpu_size[i+1];
-                pcpu_num_used--;
-                memmove(&pcpu_size[i+1], &pcpu_size[i+2],
-                        (pcpu_num_used - (i+1)) * sizeof(pcpu_size[0]));
-        }
-}
-static int percpu_modinit(void)
-{
-        pcpu_num_used = 2;
-        pcpu_num_allocated = 2;
-        pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated,
-                            GFP_KERNEL);
-        /* Static in-kernel percpu data (used). */
-        pcpu_size[0] = -(__per_cpu_end-__per_cpu_start);
-        /* Free room. */
-        pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0];
-        if (pcpu_size[1] < 0) {
-                printk(KERN_ERR "No per-cpu room for modules.\n");
-                pcpu_num_used = 1;
-        }
-        return 0;
-}
-__initcall(percpu_modinit);
-#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */
 static unsigned int find_pcpusec(Elf_Ehdr *hdr,
                                 Elf_Shdr *sechdrs,
                                 const char *secstrings)
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 40a996ec39fa..e73e53c7582f 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -36,7 +36,7 @@
 /*
 * Each CPU has a list of per CPU events:
 */
-DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
+static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
 int perf_max_events __read_mostly = 1;
 static int perf_reserved_percpu __read_mostly;
@@ -567,7 +567,7 @@ static void __perf_event_disable(void *info)
 * is the current context on this CPU and preemption is disabled,
 * hence we can't get into perf_event_task_sched_out for this context.
 */
-static void perf_event_disable(struct perf_event *event)
+void perf_event_disable(struct perf_event *event)
 {
        struct perf_event_context *ctx = event->ctx;
        struct task_struct *task = ctx->task;
@@ -971,7 +971,7 @@ static void __perf_event_enable(void *info)
 * perf_event_for_each_child or perf_event_for_each as described
 * for perf_event_disable.
 */
-static void perf_event_enable(struct perf_event *event)
+void perf_event_enable(struct perf_event *event)
 {
        struct perf_event_context *ctx = event->ctx;
        struct task_struct *task = ctx->task;
@@ -1579,7 +1579,6 @@ static void
 __perf_event_init_context(struct perf_event_context *ctx,
                            struct task_struct *task)
 {
-        memset(ctx, 0, sizeof(*ctx));
        spin_lock_init(&ctx->lock);
        mutex_init(&ctx->mutex);
        INIT_LIST_HEAD(&ctx->group_list);
@@ -1654,7 +1653,7 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
        }
        if (!ctx) {
-                ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
+                ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
                err = -ENOMEM;
                if (!ctx)
                        goto errout;
@@ -4011,6 +4010,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
        event->pmu->read(event);
        data.addr = 0;
+        data.raw = NULL;
        data.period = event->hw.last_period;
        regs = get_irq_regs();
        /*
@@ -4080,8 +4080,7 @@ static void cpu_clock_perf_event_update(struct perf_event *event)
        u64 now;
        now = cpu_clock(cpu);
-        prev = atomic64_read(&event->hw.prev_count);
+        prev = atomic64_xchg(&event->hw.prev_count, now);
-        atomic64_set(&event->hw.prev_count, now);
        atomic64_add(now - prev, &event->count);
 }
@@ -4286,15 +4285,8 @@ static void bp_perf_event_destroy(struct perf_event *event)
 static const struct pmu *bp_perf_event_init(struct perf_event *bp)
 {
        int err;
-        /*
-         * The breakpoint is already filled if we haven't created the counter
+        err = register_perf_hw_breakpoint(bp);
-         * through perf syscall
-         * FIXME: manage to get trigerred to NULL if it comes from syscalls
-         */
-        if (!bp->callback)
-                err = register_perf_hw_breakpoint(bp);
-        else
-                err = __register_perf_hw_breakpoint(bp);
        if (err)
                return ERR_PTR(err);
@@ -4308,6 +4300,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
        struct perf_sample_data sample;
        struct pt_regs *regs = data;
+        sample.raw = NULL;
        sample.addr = bp->attr.bp_addr;
        if (!perf_exclude_event(bp, regs))
@@ -4390,7 +4383,7 @@ perf_event_alloc(struct perf_event_attr *attr,
                   struct perf_event_context *ctx,
                   struct perf_event *group_leader,
                   struct perf_event *parent_event,
-                   perf_callback_t callback,
+                   perf_overflow_handler_t overflow_handler,
                   gfp_t gfpflags)
 {
        const struct pmu *pmu;
@@ -4433,10 +4426,10 @@ perf_event_alloc(struct perf_event_attr *attr,
        event->state            = PERF_EVENT_STATE_INACTIVE;
-        if (!callback && parent_event)
+        if (!overflow_handler && parent_event)
-                callback = parent_event->callback;
+                overflow_handler = parent_event->overflow_handler;
        
-        event->callback = callback;
+        event->overflow_handler = overflow_handler;
        if (attr->disabled)
                event->state = PERF_EVENT_STATE_OFF;
@@ -4776,7 +4769,8 @@ err_put_context:
 */
 struct perf_event *
 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
-                                 pid_t pid, perf_callback_t callback)
+                                 pid_t pid,
+                                 perf_overflow_handler_t overflow_handler)
 {
        struct perf_event *event;
        struct perf_event_context *ctx;
@@ -4793,7 +4787,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
        }
        event = perf_event_alloc(attr, cpu, ctx, NULL,
-                                     NULL, callback, GFP_KERNEL);
+                                 NULL, overflow_handler, GFP_KERNEL);
        if (IS_ERR(event)) {
                err = PTR_ERR(event);
                goto err_put_context;
@@ -5090,7 +5084,7 @@ again:
 */
 int perf_event_init_task(struct task_struct *child)
 {
-        struct perf_event_context *child_ctx, *parent_ctx;
+        struct perf_event_context *child_ctx = NULL, *parent_ctx;
        struct perf_event_context *cloned_ctx;
        struct perf_event *event;
        struct task_struct *parent = current;
@@ -5106,20 +5100,6 @@ int perf_event_init_task(struct task_struct *child)
                return 0;
        /*
-         * This is executed from the parent task context, so inherit
-         * events that have been marked for cloning.
-         * First allocate and initialize a context for the child.
-         */
-        child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
-        if (!child_ctx)
-                return -ENOMEM;
-        __perf_event_init_context(child_ctx, child);
-        child->perf_event_ctxp = child_ctx;
-        get_task_struct(child);
-        /*
         * If the parent's context is a clone, pin it so it won't get
         * swapped under us.
         */
@@ -5149,6 +5129,26 @@ int perf_event_init_task(struct task_struct *child)
                        continue;
                }
+                if (!child->perf_event_ctxp) {
+                        /*
+                         * This is executed from the parent task context, so
+                         * inherit events that have been marked for cloning.
+                         * First allocate and initialize a context for the
+                         * child.
+                         */
+                        child_ctx = kzalloc(sizeof(struct perf_event_context),
+                                            GFP_KERNEL);
+                        if (!child_ctx) {
+                                ret = -ENOMEM;
+                                goto exit;
+                        }
+                        __perf_event_init_context(child_ctx, child);
+                        child->perf_event_ctxp = child_ctx;
+                        get_task_struct(child);
+                }
                ret = inherit_group(event, parent, parent_ctx,
                                             child, child_ctx);
                if (ret) {
@@ -5177,6 +5177,7 @@ int perf_event_init_task(struct task_struct *child)
                get_ctx(child_ctx->parent_ctx);
        }
+exit:
        mutex_unlock(&parent_ctx->mutex);
        perf_unpin_context(parent_ctx);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index a621a67ef4e3..9bb52177af02 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -763,13 +763,13 @@ static void rcu_torture_timer(unsigned long unused)
                /* Should not happen, but... */
                pipe_count = RCU_TORTURE_PIPE_LEN;
        }
-        ++__get_cpu_var(rcu_torture_count)[pipe_count];
+        __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]);
        completed = cur_ops->completed() - completed;
        if (completed > RCU_TORTURE_PIPE_LEN) {
                /* Should not happen, but... */
                completed = RCU_TORTURE_PIPE_LEN;
        }
-        ++__get_cpu_var(rcu_torture_batch)[completed];
+        __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]);
        preempt_enable();
        cur_ops->readunlock(idx);
 }
@@ -818,13 +818,13 @@ rcu_torture_reader(void *arg)
                        /* Should not happen, but... */
                        pipe_count = RCU_TORTURE_PIPE_LEN;
                }
-                ++__get_cpu_var(rcu_torture_count)[pipe_count];
+                __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]);
                completed = cur_ops->completed() - completed;
                if (completed > RCU_TORTURE_PIPE_LEN) {
                        /* Should not happen, but... */
                        completed = RCU_TORTURE_PIPE_LEN;
                }
-                ++__get_cpu_var(rcu_torture_batch)[completed];
+                __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]);
                preempt_enable();
                cur_ops->readunlock(idx);
                schedule();
diff --git a/kernel/resource.c b/kernel/resource.c
index fb11a58b9594..dc15686b7a77 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -308,35 +308,37 @@ static int find_resource(struct resource *root, struct resource *new,
                         void *alignf_data)
 {
        struct resource *this = root->child;
+        resource_size_t start, end;
-        new->start = root->start;
+        start = root->start;
        /*
         * Skip past an allocated resource that starts at 0, since the assignment
         * of this->start - 1 to new->end below would cause an underflow.
         */
        if (this && this->start == 0) {
-                new->start = this->end + 1;
+                start = this->end + 1;
                this = this->sibling;
        }
        for(;;) {
                if (this)
-                        new->end = this->start - 1;
+                        end = this->start - 1;
                else
-                        new->end = root->end;
+                        end = root->end;
-                if (new->start < min)
+                if (start < min)
-                        new->start = min;
+                        start = min;
-                if (new->end > max)
+                if (end > max)
-                        new->end = max;
+                        end = max;
-                new->start = ALIGN(new->start, align);
+                start = ALIGN(start, align);
                if (alignf)
                        alignf(alignf_data, new, size, align);
-                if (new->start < new->end && new->end - new->start >= size - 1) {
+                if (start < end && end - start >= size - 1) {
-                        new->end = new->start + size - 1;
+                        new->start = start;
+                        new->end = start + size - 1;
                        return 0;
                }
                if (!this)
                        break;
-                new->start = this->end + 1;
+                start = this->end + 1;
                this = this->sibling;
        }
        return -EBUSY;
diff --git a/kernel/sched.c b/kernel/sched.c
index e7f2cfa6a257..fd05861b2111 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -298,7 +298,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
 #ifdef CONFIG_RT_GROUP_SCHED
 static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var);
 #endif /* CONFIG_RT_GROUP_SCHED */
 #else /* !CONFIG_USER_SCHED */
 #define root_task_group init_task_group
@@ -814,6 +814,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
 * default: 0.25ms
 */
 unsigned int sysctl_sched_shares_ratelimit = 250000;
+unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
 /*
 * Inject some fuzzyness into changing the per-cpu group shares
@@ -1614,7 +1615,7 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
 */
 static int tg_shares_up(struct task_group *tg, void *data)
 {
-        unsigned long weight, rq_weight = 0, shares = 0;
+        unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
        unsigned long *usd_rq_weight;
        struct sched_domain *sd = data;
        unsigned long flags;
@@ -1630,6 +1631,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
                weight = tg->cfs_rq[i]->load.weight;
                usd_rq_weight[i] = weight;
+                rq_weight += weight;
                /*
                 * If there are currently no tasks on the cpu pretend there
                 * is one of average load so that when a new task gets to
@@ -1638,10 +1640,13 @@ static int tg_shares_up(struct task_group *tg, void *data)
                if (!weight)
                        weight = NICE_0_LOAD;
-                rq_weight += weight;
+                sum_weight += weight;
                shares += tg->cfs_rq[i]->shares;
        }
+        if (!rq_weight)
+                rq_weight = sum_weight;
        if ((!shares && rq_weight) || shares > tg->shares)
                shares = tg->shares;
@@ -1810,6 +1815,22 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 #endif
 static void calc_load_account_active(struct rq *this_rq);
+static void update_sysctl(void);
+static int get_update_sysctl_factor(void);
+static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
+{
+        set_task_rq(p, cpu);
+#ifdef CONFIG_SMP
+        /*
+         * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
+         * successfully executed on another CPU. We must ensure that updates of
+         * per-task data have been completed by this moment.
+         */
+        smp_wmb();
+        task_thread_info(p)->cpu = cpu;
+#endif
+}
 #include "sched_stats.h"
 #include "sched_idletask.c"
@@ -1967,20 +1988,6 @@ inline int task_curr(const struct task_struct *p)
        return cpu_curr(task_cpu(p)) == p;
 }
-static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
-{
-        set_task_rq(p, cpu);
-#ifdef CONFIG_SMP
-        /*
-         * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
-         * successfuly executed on another CPU. We must ensure that updates of
-         * per-task data have been completed by this moment.
-         */
-        smp_wmb();
-        task_thread_info(p)->cpu = cpu;
-#endif
-}
 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
                                       const struct sched_class *prev_class,
                                       int oldprio, int running)
@@ -2060,29 +2067,13 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
        int old_cpu = task_cpu(p);
-        struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
        struct cfs_rq *old_cfsrq = task_cfs_rq(p),
                      *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
-        u64 clock_offset;
-        clock_offset = old_rq->clock - new_rq->clock;
        trace_sched_migrate_task(p, new_cpu);
-#ifdef CONFIG_SCHEDSTATS
-        if (p->se.wait_start)
-                p->se.wait_start -= clock_offset;
-        if (p->se.sleep_start)
-                p->se.sleep_start -= clock_offset;
-        if (p->se.block_start)
-                p->se.block_start -= clock_offset;
-#endif
        if (old_cpu != new_cpu) {
                p->se.nr_migrations++;
-#ifdef CONFIG_SCHEDSTATS
-                if (task_hot(p, old_rq->clock, NULL))
-                        schedstat_inc(p, se.nr_forced2_migrations);
-#endif
                perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
                                     1, 1, NULL, 0);
        }
@@ -2323,6 +2314,14 @@ void task_oncpu_function_call(struct task_struct *p,
        preempt_enable();
 }
+#ifdef CONFIG_SMP
+static inline
+int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
+{
+        return p->sched_class->select_task_rq(p, sd_flags, wake_flags);
+}
+#endif
 /***
 * try_to_wake_up - wake up a thread
 * @p: the to-be-woken-up thread
@@ -2374,17 +2373,14 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
        if (task_contributes_to_load(p))
                rq->nr_uninterruptible--;
        p->state = TASK_WAKING;
-        task_rq_unlock(rq, &flags);
+        __task_rq_unlock(rq);
-        cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
+        cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
-        if (cpu != orig_cpu) {
+        if (cpu != orig_cpu)
-                local_irq_save(flags);
-                rq = cpu_rq(cpu);
-                update_rq_clock(rq);
                set_task_cpu(p, cpu);
-                local_irq_restore(flags);
-        }
+        rq = __task_rq_lock(p);
-        rq = task_rq_lock(p, &flags);
+        update_rq_clock(rq);
        WARN_ON(p->state != TASK_WAKING);
        cpu = task_cpu(p);
@@ -2499,7 +2495,6 @@ static void __sched_fork(struct task_struct *p)
        p->se.avg_overlap               = 0;
        p->se.start_runtime             = 0;
        p->se.avg_wakeup                = sysctl_sched_wakeup_granularity;
-        p->se.avg_running               = 0;
 #ifdef CONFIG_SCHEDSTATS
        p->se.wait_start                        = 0;
@@ -2521,7 +2516,6 @@ static void __sched_fork(struct task_struct *p)
        p->se.nr_failed_migrations_running      = 0;
        p->se.nr_failed_migrations_hot          = 0;
        p->se.nr_forced_migrations              = 0;
-        p->se.nr_forced2_migrations             = 0;
        p->se.nr_wakeups                        = 0;
        p->se.nr_wakeups_sync                   = 0;
@@ -2558,7 +2552,6 @@ static void __sched_fork(struct task_struct *p)
 void sched_fork(struct task_struct *p, int clone_flags)
 {
        int cpu = get_cpu();
-        unsigned long flags;
        __sched_fork(p);
@@ -2592,13 +2585,13 @@ void sched_fork(struct task_struct *p, int clone_flags)
        if (!rt_prio(p->prio))
                p->sched_class = &fair_sched_class;
+        if (p->sched_class->task_fork)
+                p->sched_class->task_fork(p);
 #ifdef CONFIG_SMP
-        cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
+        cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
 #endif
-        local_irq_save(flags);
-        update_rq_clock(cpu_rq(cpu));
        set_task_cpu(p, cpu);
-        local_irq_restore(flags);
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
        if (likely(sched_info_on()))
@@ -2631,17 +2624,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
        rq = task_rq_lock(p, &flags);
        BUG_ON(p->state != TASK_RUNNING);
        update_rq_clock(rq);
+        activate_task(rq, p, 0);
-        if (!p->sched_class->task_new || !current->se.on_rq) {
-                activate_task(rq, p, 0);
-        } else {
-                /*
-                 * Let the scheduling class do new task startup
-                 * management (if any):
-                 */
-                p->sched_class->task_new(rq, p);
-                inc_nr_running(rq);
-        }
        trace_sched_wakeup_new(rq, p, 1);
        check_preempt_curr(rq, p, WF_FORK);
 #ifdef CONFIG_SMP
@@ -3156,7 +3139,7 @@ out:
 void sched_exec(void)
 {
        int new_cpu, this_cpu = get_cpu();
-        new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
+        new_cpu = select_task_rq(current, SD_BALANCE_EXEC, 0);
        put_cpu();
        if (new_cpu != this_cpu)
                sched_migrate_task(current, new_cpu);
@@ -3172,10 +3155,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
        deactivate_task(src_rq, p, 0);
        set_task_cpu(p, this_cpu);
        activate_task(this_rq, p, 0);
-        /*
-         * Note that idle threads have a prio of MAX_PRIO, for this test
-         * to be always true for them.
-         */
        check_preempt_curr(this_rq, p, 0);
 }
@@ -4134,7 +4113,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
        unsigned long flags;
        struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
-        cpumask_copy(cpus, cpu_online_mask);
+        cpumask_copy(cpus, cpu_active_mask);
        /*
         * When power savings policy is enabled for the parent domain, idle
@@ -4297,7 +4276,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
        int all_pinned = 0;
        struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
-        cpumask_copy(cpus, cpu_online_mask);
+        cpumask_copy(cpus, cpu_active_mask);
        /*
         * When power savings policy is enabled for the parent domain, idle
@@ -4694,7 +4673,7 @@ int select_nohz_load_balancer(int stop_tick)
                cpumask_set_cpu(cpu, nohz.cpu_mask);
                /* time for ilb owner also to sleep */
-                if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
+                if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
                        if (atomic_read(&nohz.load_balancer) == cpu)
                                atomic_set(&nohz.load_balancer, -1);
                        return 0;
@@ -5396,13 +5375,14 @@ static inline void schedule_debug(struct task_struct *prev)
 #endif
 }
-static void put_prev_task(struct rq *rq, struct task_struct *p)
+static void put_prev_task(struct rq *rq, struct task_struct *prev)
 {
-        u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime;
+        if (prev->state == TASK_RUNNING) {
+                u64 runtime = prev->se.sum_exec_runtime;
-        update_avg(&p->se.avg_running, runtime);
+                runtime -= prev->se.prev_sum_exec_runtime;
+                runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
-        if (p->state == TASK_RUNNING) {
                /*
                 * In order to avoid avg_overlap growing stale when we are
                 * indeed overlapping and hence not getting put to sleep, grow
@@ -5412,12 +5392,9 @@ static void put_prev_task(struct rq *rq, struct task_struct *p)
                 * correlates to the amount of cache footprint a task can
                 * build up.
                 */
-                runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
+                update_avg(&prev->se.avg_overlap, runtime);
-                update_avg(&p->se.avg_overlap, runtime);
-        } else {
-                update_avg(&p->se.avg_running, 0);
        }
-        p->sched_class->put_prev_task(rq, p);
+        prev->sched_class->put_prev_task(rq, prev);
 }
 /*
@@ -6631,6 +6608,8 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
 long sched_getaffinity(pid_t pid, struct cpumask *mask)
 {
        struct task_struct *p;
+        unsigned long flags;
+        struct rq *rq;
        int retval;
        get_online_cpus();
@@ -6645,7 +6624,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
        if (retval)
                goto out_unlock;
+        rq = task_rq_lock(p, &flags);
        cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
+        task_rq_unlock(rq, &flags);
 out_unlock:
        read_unlock(&tasklist_lock);
@@ -6883,6 +6864,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
 {
        struct task_struct *p;
        unsigned int time_slice;
+        unsigned long flags;
+        struct rq *rq;
        int retval;
        struct timespec t;
@@ -6899,7 +6882,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
        if (retval)
                goto out_unlock;
-        time_slice = p->sched_class->get_rr_interval(p);
+        rq = task_rq_lock(p, &flags);
+        time_slice = p->sched_class->get_rr_interval(rq, p);
+        task_rq_unlock(rq, &flags);
        read_unlock(&tasklist_lock);
        jiffies_to_timespec(time_slice, &t);
@@ -7000,7 +6985,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
        __sched_fork(idle);
        idle->se.exec_start = sched_clock();
-        idle->prio = idle->normal_prio = MAX_PRIO;
        cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
        __set_task_cpu(idle, cpu);
@@ -7041,22 +7025,43 @@ cpumask_var_t nohz_cpu_mask;
 *
 * This idea comes from the SD scheduler of Con Kolivas:
 */
-static inline void sched_init_granularity(void)
+static int get_update_sysctl_factor(void)
 {
-        unsigned int factor = 1 + ilog2(num_online_cpus());
+        unsigned int cpus = min_t(int, num_online_cpus(), 8);
-        const unsigned long limit = 200000000;
+        unsigned int factor;
+        switch (sysctl_sched_tunable_scaling) {
+        case SCHED_TUNABLESCALING_NONE:
+                factor = 1;
+                break;
+        case SCHED_TUNABLESCALING_LINEAR:
+                factor = cpus;
+                break;
+        case SCHED_TUNABLESCALING_LOG:
+        default:
+                factor = 1 + ilog2(cpus);
+                break;
+        }
-        sysctl_sched_min_granularity *= factor;
+        return factor;
-        if (sysctl_sched_min_granularity > limit)
+}
-                sysctl_sched_min_granularity = limit;
-        sysctl_sched_latency *= factor;
+static void update_sysctl(void)
-        if (sysctl_sched_latency > limit)
+{
-                sysctl_sched_latency = limit;
+        unsigned int factor = get_update_sysctl_factor();
-        sysctl_sched_wakeup_granularity *= factor;
+#define SET_SYSCTL(name) \
+        (sysctl_##name = (factor) * normalized_sysctl_##name)
+        SET_SYSCTL(sched_min_granularity);
+        SET_SYSCTL(sched_latency);
+        SET_SYSCTL(sched_wakeup_granularity);
+        SET_SYSCTL(sched_shares_ratelimit);
+#undef SET_SYSCTL
+}
-        sysctl_sched_shares_ratelimit *= factor;
+static inline void sched_init_granularity(void)
+{
+        update_sysctl();
 }
 #ifdef CONFIG_SMP
@@ -7093,7 +7098,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
        int ret = 0;
        rq = task_rq_lock(p, &flags);
-        if (!cpumask_intersects(new_mask, cpu_online_mask)) {
+        if (!cpumask_intersects(new_mask, cpu_active_mask)) {
                ret = -EINVAL;
                goto out;
        }
@@ -7115,7 +7120,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
        if (cpumask_test_cpu(task_cpu(p), new_mask))
                goto out;
-        if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
+        if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) {
                /* Need help from migration thread: drop lock and wait. */
                struct task_struct *mt = rq->migration_thread;
@@ -7269,19 +7274,19 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 again:
        /* Look for allowed, online CPU in same node. */
-        for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
+        for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
                if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
                        goto move;
        /* Any allowed, online CPU? */
-        dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
+        dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
        if (dest_cpu < nr_cpu_ids)
                goto move;
        /* No more Mr. Nice Guy. */
        if (dest_cpu >= nr_cpu_ids) {
                cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
-                dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
+                dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
                /*
                 * Don't tell them about moving exiting tasks or
@@ -7310,7 +7315,7 @@ move:
 */
 static void migrate_nr_uninterruptible(struct rq *rq_src)
 {
-        struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask));
+        struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
        unsigned long flags;
        local_irq_save(flags);
@@ -7563,7 +7568,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
 static struct ctl_table_header *sd_sysctl_header;
 static void register_sched_domain_sysctl(void)
 {
-        int i, cpu_num = num_online_cpus();
+        int i, cpu_num = num_possible_cpus();
        struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
        char buf[32];
@@ -7573,7 +7578,7 @@ static void register_sched_domain_sysctl(void)
        if (entry == NULL)
                return;
-        for_each_online_cpu(i) {
+        for_each_possible_cpu(i) {
                snprintf(buf, 32, "cpu%d", i);
                entry->procname = kstrdup(buf, GFP_KERNEL);
                entry->mode = 0555;
@@ -7703,7 +7708,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                spin_lock_irq(&rq->lock);
                update_rq_clock(rq);
                deactivate_task(rq, rq->idle, 0);
-                rq->idle->static_prio = MAX_PRIO;
                __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
                rq->idle->sched_class = &idle_sched_class;
                migrate_dead_tasks(cpu);
@@ -8282,14 +8286,14 @@ enum s_alloc {
 */
 #ifdef CONFIG_SCHED_SMT
 static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus);
+static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
 static int
 cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
                 struct sched_group **sg, struct cpumask *unused)
 {
        if (sg)
-                *sg = &per_cpu(sched_group_cpus, cpu).sg;
+                *sg = &per_cpu(sched_groups, cpu).sg;
        return cpu;
 }
 #endif /* CONFIG_SCHED_SMT */
@@ -9099,7 +9103,7 @@ match1:
        if (doms_new == NULL) {
                ndoms_cur = 0;
                doms_new = &fallback_doms;
-                cpumask_andnot(doms_new[0], cpu_online_mask, cpu_isolated_map);
+                cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
                WARN_ON_ONCE(dattr_new);
        }
@@ -9230,8 +9234,10 @@ static int update_sched_domains(struct notifier_block *nfb,
        switch (action) {
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
-        case CPU_DEAD:
+        case CPU_DOWN_PREPARE:
-        case CPU_DEAD_FROZEN:
+        case CPU_DOWN_PREPARE_FROZEN:
+        case CPU_DOWN_FAILED:
+        case CPU_DOWN_FAILED_FROZEN:
                partition_sched_domains(1, NULL, NULL);
                return NOTIFY_OK;
@@ -9278,7 +9284,7 @@ void __init sched_init_smp(void)
 #endif
        get_online_cpus();
        mutex_lock(&sched_domains_mutex);
-        arch_init_sched_domains(cpu_online_mask);
+        arch_init_sched_domains(cpu_active_mask);
        cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
        if (cpumask_empty(non_isolated_cpus))
                cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -9577,7 +9583,7 @@ void __init sched_init(void)
 #elif defined CONFIG_USER_SCHED
                init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
                init_tg_rt_entry(&init_task_group,
-                                &per_cpu(init_rt_rq, i),
+                                &per_cpu(init_rt_rq_var, i),
                                &per_cpu(init_sched_rt_entity, i), i, 1,
                                root_task_group.rt_se[i]);
 #endif
@@ -9842,13 +9848,15 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
                se = kzalloc_node(sizeof(struct sched_entity),
                                  GFP_KERNEL, cpu_to_node(i));
                if (!se)
-                        goto err;
+                        goto err_free_rq;
                init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
        }
        return 1;
+ err_free_rq:
+        kfree(cfs_rq);
 err:
        return 0;
 }
@@ -9930,13 +9938,15 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
                rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
                                     GFP_KERNEL, cpu_to_node(i));
                if (!rt_se)
-                        goto err;
+                        goto err_free_rq;
                init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
        }
        return 1;
+ err_free_rq:
+        kfree(rt_rq);
 err:
        return 0;
 }
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 6988cf08f705..5ae24fc65d75 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -309,6 +309,12 @@ static void print_cpu(struct seq_file *m, int cpu)
        print_rq(m, rq, cpu);
 }
+static const char * const sched_tunable_scaling_names[] = {
+        "none",         /* SCHED_TUNABLESCALING_NONE */
+        "logarithmic",  /* SCHED_TUNABLESCALING_LOG */
+        "linear"        /* SCHED_TUNABLESCALING_LINEAR */
+};
 static int sched_debug_show(struct seq_file *m, void *v)
 {
        u64 now = ktime_to_ns(ktime_get());
@@ -334,6 +340,10 @@ static int sched_debug_show(struct seq_file *m, void *v)
 #undef PN
 #undef P
+        SEQ_printf(m, "  .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling",
+                sysctl_sched_tunable_scaling,
+                sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
        for_each_online_cpu(cpu)
                print_cpu(m, cpu);
@@ -399,7 +409,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
        PN(se.sum_exec_runtime);
        PN(se.avg_overlap);
        PN(se.avg_wakeup);
-        PN(se.avg_running);
        nr_switches = p->nvcsw + p->nivcsw;
@@ -423,7 +432,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
        P(se.nr_failed_migrations_running);
        P(se.nr_failed_migrations_hot);
        P(se.nr_forced_migrations);
-        P(se.nr_forced2_migrations);
        P(se.nr_wakeups);
        P(se.nr_wakeups_sync);
        P(se.nr_wakeups_migrate);
@@ -499,7 +507,6 @@ void proc_sched_set_task(struct task_struct *p)
        p->se.nr_failed_migrations_running      = 0;
        p->se.nr_failed_migrations_hot          = 0;
        p->se.nr_forced_migrations              = 0;
-        p->se.nr_forced2_migrations             = 0;
        p->se.nr_wakeups                        = 0;
        p->se.nr_wakeups_sync                   = 0;
        p->se.nr_wakeups_migrate                = 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f61837ad336d..804a411838f1 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -21,6 +21,7 @@
 */
 #include <linux/latencytop.h>
+#include <linux/sched.h>
 /*
 * Targeted preemption latency for CPU-bound tasks:
@@ -35,12 +36,26 @@
 *  run vmstat and monitor the context-switches (cs) field)
 */
 unsigned int sysctl_sched_latency = 5000000ULL;
+unsigned int normalized_sysctl_sched_latency = 5000000ULL;
+/*
+ * The initial- and re-scaling of tunables is configurable
+ * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
+ *
+ * Options are:
+ * SCHED_TUNABLESCALING_NONE - unscaled, always *1
+ * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *(1+ilog(ncpus))
+ * SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
+ */
+enum sched_tunable_scaling sysctl_sched_tunable_scaling
+        = SCHED_TUNABLESCALING_LOG;
 /*
 * Minimal preemption granularity for CPU-bound tasks:
 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
 unsigned int sysctl_sched_min_granularity = 1000000ULL;
+unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL;
 /*
 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
@@ -70,6 +85,7 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
 * have immediate wakeup/sleep latencies.
 */
 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
+unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
@@ -383,11 +399,12 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 */
 #ifdef CONFIG_SCHED_DEBUG
-int sched_nr_latency_handler(struct ctl_table *table, int write,
+int sched_proc_update_handler(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp,
                loff_t *ppos)
 {
        int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+        int factor = get_update_sysctl_factor();
        if (ret || !write)
                return ret;
@@ -395,6 +412,14 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
        sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
                                        sysctl_sched_min_granularity);
+#define WRT_SYSCTL(name) \
+        (normalized_sysctl_##name = sysctl_##name / (factor))
+        WRT_SYSCTL(sched_min_granularity);
+        WRT_SYSCTL(sched_latency);
+        WRT_SYSCTL(sched_wakeup_granularity);
+        WRT_SYSCTL(sched_shares_ratelimit);
+#undef WRT_SYSCTL
        return 0;
 }
 #endif
@@ -1403,7 +1428,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
                new_cpu = prev_cpu;
        }
-        rcu_read_lock();
        for_each_domain(cpu, tmp) {
                /*
                 * If power savings logic is enabled for a domain, see if we
@@ -1484,10 +1508,8 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
                        update_shares(tmp);
        }
-        if (affine_sd && wake_affine(affine_sd, p, sync)) {
+        if (affine_sd && wake_affine(affine_sd, p, sync))
-                new_cpu = cpu;
+                return cpu;
-                goto out;
-        }
        while (sd) {
                int load_idx = sd->forkexec_idx;
@@ -1528,8 +1550,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
                /* while loop will break here if sd == NULL */
        }
-out:
-        rcu_read_unlock();
        return new_cpu;
 }
 #endif /* CONFIG_SMP */
@@ -1651,12 +1671,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
        int sync = wake_flags & WF_SYNC;
        int scale = cfs_rq->nr_running >= sched_nr_latency;
-        update_curr(cfs_rq);
+        if (unlikely(rt_prio(p->prio)))
+                goto preempt;
-        if (unlikely(rt_prio(p->prio))) {
-                resched_task(curr);
-                return;
-        }
        if (unlikely(p->sched_class != &fair_sched_class))
                return;
@@ -1682,50 +1698,44 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
                return;
        /* Idle tasks are by definition preempted by everybody. */
-        if (unlikely(curr->policy == SCHED_IDLE)) {
+        if (unlikely(curr->policy == SCHED_IDLE))
-                resched_task(curr);
+                goto preempt;
-                return;
-        }
-        if ((sched_feat(WAKEUP_SYNC) && sync) ||
+        if (sched_feat(WAKEUP_SYNC) && sync)
-            (sched_feat(WAKEUP_OVERLAP) &&
+                goto preempt;
-             (se->avg_overlap < sysctl_sched_migration_cost &&
-              pse->avg_overlap < sysctl_sched_migration_cost))) {
-                resched_task(curr);
-                return;
-        }
-        if (sched_feat(WAKEUP_RUNNING)) {
+        if (sched_feat(WAKEUP_OVERLAP) &&
-                if (pse->avg_running < se->avg_running) {
+                        se->avg_overlap < sysctl_sched_migration_cost &&
-                        set_next_buddy(pse);
+                        pse->avg_overlap < sysctl_sched_migration_cost)
-                        resched_task(curr);
+                goto preempt;
-                        return;
-                }
-        }
        if (!sched_feat(WAKEUP_PREEMPT))
                return;
+        update_curr(cfs_rq);
        find_matching_se(&se, &pse);
        BUG_ON(!pse);
+        if (wakeup_preempt_entity(se, pse) == 1)
+                goto preempt;
-        if (wakeup_preempt_entity(se, pse) == 1) {
+        return;
-                resched_task(curr);
-                /*
+preempt:
-                 * Only set the backward buddy when the current task is still
+        resched_task(curr);
-                 * on the rq. This can happen when a wakeup gets interleaved
+        /*
-                 * with schedule on the ->pre_schedule() or idle_balance()
+         * Only set the backward buddy when the current task is still
-                 * point, either of which can * drop the rq lock.
+         * on the rq. This can happen when a wakeup gets interleaved
-                 *
+         * with schedule on the ->pre_schedule() or idle_balance()
-                 * Also, during early boot the idle thread is in the fair class,
+         * point, either of which can drop the rq lock.
-                 * for obvious reasons its a bad idea to schedule back to it.
+         *
-                 */
+         * Also, during early boot the idle thread is in the fair class,
-                if (unlikely(!se->on_rq || curr == rq->idle))
+         * for obvious reasons it's a bad idea to schedule back to it.
-                        return;
+         */
-                if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
+        if (unlikely(!se->on_rq || curr == rq->idle))
-                        set_last_buddy(se);
+                return;
-        }
+        if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
+                set_last_buddy(se);
 }
 static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1905,6 +1915,17 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
        return 0;
 }
+static void rq_online_fair(struct rq *rq)
+{
+        update_sysctl();
+}
+static void rq_offline_fair(struct rq *rq)
+{
+        update_sysctl();
+}
 #endif /* CONFIG_SMP */
 /*
@@ -1922,28 +1943,30 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 }
 /*
- * Share the fairness runtime between parent and child, thus the
+ * called on fork with the child task as argument from the parent's context
- * total amount of pressure for CPU stays equal - new tasks
+ *  - child not yet on the tasklist
- * get a chance to run but frequent forkers are not allowed to
+ *  - preemption disabled
- * monopolize the CPU. Note: the parent runqueue is locked,
- * the child is not running yet.
 */
-static void task_new_fair(struct rq *rq, struct task_struct *p)
+static void task_fork_fair(struct task_struct *p)
 {
-        struct cfs_rq *cfs_rq = task_cfs_rq(p);
+        struct cfs_rq *cfs_rq = task_cfs_rq(current);
        struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
        int this_cpu = smp_processor_id();
+        struct rq *rq = this_rq();
+        unsigned long flags;
+        spin_lock_irqsave(&rq->lock, flags);
-        sched_info_queued(p);
+        if (unlikely(task_cpu(p) != this_cpu))
+                __set_task_cpu(p, this_cpu);
        update_curr(cfs_rq);
        if (curr)
                se->vruntime = curr->vruntime;
        place_entity(cfs_rq, se, 1);
-        /* 'curr' will be NULL if the child belongs to a different group */
+        if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
-        if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
-                        curr && entity_before(curr, se)) {
                /*
                 * Upon rescheduling, sched_class::put_prev_task() will place
                 * 'current' within the tree based on its new key value.
@@ -1952,7 +1975,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
                resched_task(rq->curr);
        }
-        enqueue_task_fair(rq, p, 0);
+        spin_unlock_irqrestore(&rq->lock, flags);
 }
 /*
@@ -2014,21 +2037,17 @@ static void moved_group_fair(struct task_struct *p)
 }
 #endif
-unsigned int get_rr_interval_fair(struct task_struct *task)
+unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
 {
        struct sched_entity *se = &task->se;
-        unsigned long flags;
-        struct rq *rq;
        unsigned int rr_interval = 0;
        /*
         * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
         * idle runqueue:
         */
-        rq = task_rq_lock(task, &flags);
        if (rq->cfs.load.weight)
                rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
-        task_rq_unlock(rq, &flags);
        return rr_interval;
 }
@@ -2052,11 +2071,13 @@ static const struct sched_class fair_sched_class = {
        .load_balance           = load_balance_fair,
        .move_one_task          = move_one_task_fair,
+        .rq_online              = rq_online_fair,
+        .rq_offline             = rq_offline_fair,
 #endif
        .set_curr_task          = set_curr_task_fair,
        .task_tick              = task_tick_fair,
-        .task_new               = task_new_fair,
+        .task_fork              = task_fork_fair,
        .prio_changed           = prio_changed_fair,
        .switched_to            = switched_to_fair,
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 0d94083582c7..d5059fd761d9 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -54,11 +54,6 @@ SCHED_FEAT(WAKEUP_SYNC, 0)
 SCHED_FEAT(WAKEUP_OVERLAP, 0)
 /*
- * Wakeup preemption towards tasks that run short
- */
-SCHED_FEAT(WAKEUP_RUNNING, 0)
-/*
 * Use the SYNC wakeup hint, pipes and the likes use this to indicate
 * the remote end is likely to consume the data we just wrote, and
 * therefore has cache benefit from being placed on the same cpu, see
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index b133a28fcde3..33d5384a73a8 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -97,7 +97,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
                check_preempt_curr(rq, p, 0);
 }
-unsigned int get_rr_interval_idle(struct task_struct *task)
+unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
 {
        return 0;
 }
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 5c5fef378415..aecbd9c6b20c 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1721,7 +1721,7 @@ static void set_curr_task_rt(struct rq *rq)
        dequeue_pushable_task(rq, p);
 }
-unsigned int get_rr_interval_rt(struct task_struct *task)
+unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
 {
        /*
         * Time slice is 0 for SCHED_FIFO tasks
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 21939d9e830e..a09502e2ef75 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -697,7 +697,7 @@ void __init softirq_init(void)
        open_softirq(HI_SOFTIRQ, tasklet_hi_action);
 }
-static int ksoftirqd(void * __bind_cpu)
+static int run_ksoftirqd(void * __bind_cpu)
 {
        set_current_state(TASK_INTERRUPTIBLE);
@@ -810,7 +810,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
        switch (action) {
        case CPU_UP_PREPARE:
        case CPU_UP_PREPARE_FROZEN:
-                p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
+                p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
                if (IS_ERR(p)) {
                        printk("ksoftirqd for %i failed\n", hotcpu);
                        return NOTIFY_BAD;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 81324d12eb35..d22579087e27 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -22,9 +22,9 @@
 static DEFINE_SPINLOCK(print_lock);
-static DEFINE_PER_CPU(unsigned long, touch_timestamp);
+static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */
-static DEFINE_PER_CPU(unsigned long, print_timestamp);
+static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */
-static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
+static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
 static int __read_mostly did_panic;
 int __read_mostly softlockup_thresh = 60;
@@ -70,12 +70,12 @@ static void __touch_softlockup_watchdog(void)
 {
        int this_cpu = raw_smp_processor_id();
-        __raw_get_cpu_var(touch_timestamp) = get_timestamp(this_cpu);
+        __raw_get_cpu_var(softlockup_touch_ts) = get_timestamp(this_cpu);
 }
 void touch_softlockup_watchdog(void)
 {
-        __raw_get_cpu_var(touch_timestamp) = 0;
+        __raw_get_cpu_var(softlockup_touch_ts) = 0;
 }
 EXPORT_SYMBOL(touch_softlockup_watchdog);
@@ -85,7 +85,7 @@ void touch_all_softlockup_watchdogs(void)
        /* Cause each CPU to re-update its timestamp rather than complain */
        for_each_online_cpu(cpu)
-                per_cpu(touch_timestamp, cpu) = 0;
+                per_cpu(softlockup_touch_ts, cpu) = 0;
 }
 EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
@@ -104,28 +104,28 @@ int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
 void softlockup_tick(void)
 {
        int this_cpu = smp_processor_id();
-        unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu);
+        unsigned long touch_ts = per_cpu(softlockup_touch_ts, this_cpu);
-        unsigned long print_timestamp;
+        unsigned long print_ts;
        struct pt_regs *regs = get_irq_regs();
        unsigned long now;
        /* Is detection switched off? */
-        if (!per_cpu(watchdog_task, this_cpu) || softlockup_thresh <= 0) {
+        if (!per_cpu(softlockup_watchdog, this_cpu) || softlockup_thresh <= 0) {
                /* Be sure we don't false trigger if switched back on */
-                if (touch_timestamp)
+                if (touch_ts)
-                        per_cpu(touch_timestamp, this_cpu) = 0;
+                        per_cpu(softlockup_touch_ts, this_cpu) = 0;
                return;
        }
-        if (touch_timestamp == 0) {
+        if (touch_ts == 0) {
                __touch_softlockup_watchdog();
                return;
        }
-        print_timestamp = per_cpu(print_timestamp, this_cpu);
+        print_ts = per_cpu(softlockup_print_ts, this_cpu);
        /* report at most once a second */
-        if (print_timestamp == touch_timestamp || did_panic)
+        if (print_ts == touch_ts || did_panic)
                return;
        /* do not print during early bootup: */
@@ -140,18 +140,18 @@ void softlockup_tick(void)
         * Wake up the high-prio watchdog task twice per
         * threshold timespan.
         */
-        if (now > touch_timestamp + softlockup_thresh/2)
+        if (now > touch_ts + softlockup_thresh/2)
-                wake_up_process(per_cpu(watchdog_task, this_cpu));
+                wake_up_process(per_cpu(softlockup_watchdog, this_cpu));
        /* Warn about unreasonable delays: */
-        if (now <= (touch_timestamp + softlockup_thresh))
+        if (now <= (touch_ts + softlockup_thresh))
                return;
-        per_cpu(print_timestamp, this_cpu) = touch_timestamp;
+        per_cpu(softlockup_print_ts, this_cpu) = touch_ts;
        spin_lock(&print_lock);
        printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n",
-                        this_cpu, now - touch_timestamp,
+                        this_cpu, now - touch_ts,
                        current->comm, task_pid_nr(current));
        print_modules();
        print_irqtrace_events(current);
@@ -209,32 +209,32 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
        switch (action) {
        case CPU_UP_PREPARE:
        case CPU_UP_PREPARE_FROZEN:
-                BUG_ON(per_cpu(watchdog_task, hotcpu));
+                BUG_ON(per_cpu(softlockup_watchdog, hotcpu));
                p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
                if (IS_ERR(p)) {
                        printk(KERN_ERR "watchdog for %i failed\n", hotcpu);
                        return NOTIFY_BAD;
                }
-                per_cpu(touch_timestamp, hotcpu) = 0;
+                per_cpu(softlockup_touch_ts, hotcpu) = 0;
-                per_cpu(watchdog_task, hotcpu) = p;
+                per_cpu(softlockup_watchdog, hotcpu) = p;
                kthread_bind(p, hotcpu);
                break;
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
-                wake_up_process(per_cpu(watchdog_task, hotcpu));
+                wake_up_process(per_cpu(softlockup_watchdog, hotcpu));
                break;
 #ifdef CONFIG_HOTPLUG_CPU
        case CPU_UP_CANCELED:
        case CPU_UP_CANCELED_FROZEN:
-                if (!per_cpu(watchdog_task, hotcpu))
+                if (!per_cpu(softlockup_watchdog, hotcpu))
                        break;
                /* Unbind so it can run.  Fall thru. */
-                kthread_bind(per_cpu(watchdog_task, hotcpu),
+                kthread_bind(per_cpu(softlockup_watchdog, hotcpu),
                             cpumask_any(cpu_online_mask));
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
-                p = per_cpu(watchdog_task, hotcpu);
+                p = per_cpu(softlockup_watchdog, hotcpu);
-                per_cpu(watchdog_task, hotcpu) = NULL;
+                per_cpu(softlockup_watchdog, hotcpu) = NULL;
                kthread_stop(p);
                break;
 #endif /* CONFIG_HOTPLUG_CPU */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9327a26765c5..554ac4894f0f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -244,6 +244,10 @@ static int min_sched_granularity_ns = 100000;		/* 100 usecs */
 static int max_sched_granularity_ns = NSEC_PER_SEC;     /* 1 second */
 static int min_wakeup_granularity_ns;                   /* 0 usecs */
 static int max_wakeup_granularity_ns = NSEC_PER_SEC;    /* 1 second */
+static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
+static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
+static int min_sched_shares_ratelimit = 100000; /* 100 usec */
+static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
 #endif
 static struct ctl_table kern_table[] = {
@@ -260,7 +264,7 @@ static struct ctl_table kern_table[] = {
                .data           = &sysctl_sched_min_granularity,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
-                .proc_handler   = sched_nr_latency_handler,
+                .proc_handler   = sched_proc_update_handler,
                .extra1         = &min_sched_granularity_ns,
                .extra2         = &max_sched_granularity_ns,
        },
@@ -269,7 +273,7 @@ static struct ctl_table kern_table[] = {
                .data           = &sysctl_sched_latency,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
-                .proc_handler   = sched_nr_latency_handler,
+                .proc_handler   = sched_proc_update_handler,
                .extra1         = &min_sched_granularity_ns,
                .extra2         = &max_sched_granularity_ns,
        },
@@ -278,7 +282,7 @@ static struct ctl_table kern_table[] = {
                .data           = &sysctl_sched_wakeup_granularity,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
-                .proc_handler   = proc_dointvec_minmax,
+                .proc_handler   = sched_proc_update_handler,
                .extra1         = &min_wakeup_granularity_ns,
                .extra2         = &max_wakeup_granularity_ns,
        },
@@ -287,7 +291,18 @@ static struct ctl_table kern_table[] = {
                .data           = &sysctl_sched_shares_ratelimit,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
-                .proc_handler   = proc_dointvec,
+                .proc_handler   = sched_proc_update_handler,
+                .extra1         = &min_sched_shares_ratelimit,
+                .extra2         = &max_sched_shares_ratelimit,
+        },
+        {
+                .procname       = "sched_tunable_scaling",
+                .data           = &sysctl_sched_tunable_scaling,
+                .maxlen         = sizeof(enum sched_tunable_scaling),
+                .mode           = 0644,
+                .proc_handler   = sched_proc_update_handler,
+                .extra1         = &min_sched_tunable_scaling,
+                .extra2         = &max_sched_tunable_scaling,
        },
        {
                .procname       = "sched_shares_thresh",
@@ -298,13 +313,6 @@ static struct ctl_table kern_table[] = {
                .extra1         = &zero,
        },
        {
-                .procname       = "sched_features",
-                .data           = &sysctl_sched_features,
-                .maxlen         = sizeof(unsigned int),
-                .mode           = 0644,
-                .proc_handler   = proc_dointvec,
-        },
-        {
                .procname       = "sched_migration_cost",
                .data           = &sysctl_sched_migration_cost,
                .maxlen         = sizeof(unsigned int),
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 665c76edbf17..9d80db4747d4 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -150,6 +150,9 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
        P_ns(expires_next);
        P(hres_active);
        P(nr_events);
+        P(nr_retries);
+        P(nr_hangs);
+        P_ns(max_hang_time);
 #endif
 #undef P
 #undef P_ns
@@ -254,7 +257,7 @@ static int timer_list_show(struct seq_file *m, void *v)
        u64 now = ktime_to_ns(ktime_get());
        int cpu;
-        SEQ_printf(m, "Timer List Version: v0.4\n");
+        SEQ_printf(m, "Timer List Version: v0.5\n");
        SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
        SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index ee5681f8d7ec..63b117e9eba1 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -86,7 +86,7 @@ static DEFINE_SPINLOCK(table_lock);
 /*
 * Per-CPU lookup locks for fast hash lookup:
 */
-static DEFINE_PER_CPU(spinlock_t, lookup_lock);
+static DEFINE_PER_CPU(spinlock_t, tstats_lookup_lock);
 /*
 * Mutex to serialize state changes with show-stats activities:
@@ -245,7 +245,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
        if (likely(!timer_stats_active))
                return;
-        lock = &per_cpu(lookup_lock, raw_smp_processor_id());
+        lock = &per_cpu(tstats_lookup_lock, raw_smp_processor_id());
        input.timer = timer;
        input.start_func = startf;
@@ -348,9 +348,10 @@ static void sync_access(void)
        int cpu;
        for_each_online_cpu(cpu) {
-                spin_lock_irqsave(&per_cpu(lookup_lock, cpu), flags);
+                spinlock_t *lock = &per_cpu(tstats_lookup_lock, cpu);
+                spin_lock_irqsave(lock, flags);
                /* nothing */
-                spin_unlock_irqrestore(&per_cpu(lookup_lock, cpu), flags);
+                spin_unlock_irqrestore(lock, flags);
        }
 }
@@ -408,7 +409,7 @@ void __init init_timer_stats(void)
        int cpu;
        for_each_possible_cpu(cpu)
-                spin_lock_init(&per_cpu(lookup_lock, cpu));
+                spin_lock_init(&per_cpu(tstats_lookup_lock, cpu));
 }
 static int __init init_tstats_procfs(void)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 874f2893cff0..c82dfd92fdfd 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -86,17 +86,17 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
 */
 static int tracing_disabled = 1;
-DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
+DEFINE_PER_CPU(int, ftrace_cpu_disabled);
 static inline void ftrace_disable_cpu(void)
 {
        preempt_disable();
-        local_inc(&__get_cpu_var(ftrace_cpu_disabled));
+        __this_cpu_inc(per_cpu_var(ftrace_cpu_disabled));
 }
 static inline void ftrace_enable_cpu(void)
 {
-        local_dec(&__get_cpu_var(ftrace_cpu_disabled));
+        __this_cpu_dec(per_cpu_var(ftrace_cpu_disabled));
        preempt_enable();
 }
@@ -203,7 +203,7 @@ cycle_t ftrace_now(int cpu)
 */
 static struct trace_array       max_tr;
-static DEFINE_PER_CPU(struct trace_array_cpu, max_data);
+static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data);
 /* tracer_enabled is used to toggle activation of a tracer */
 static int                      tracer_enabled = 1;
@@ -1085,7 +1085,7 @@ trace_function(struct trace_array *tr,
        struct ftrace_entry *entry;
        /* If we are reading the ring buffer, don't trace */
-        if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
+        if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled))))
                return;
        event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
@@ -1361,11 +1361,7 @@ int trace_array_vprintk(struct trace_array *tr,
        pause_graph_tracing();
        raw_local_irq_save(irq_flags);
        __raw_spin_lock(&trace_buf_lock);
-        if (args == NULL) {
+        len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
-                strncpy(trace_buf, fmt, TRACE_BUF_SIZE);
-                len = strlen(trace_buf);
-        } else
-                len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
        size = sizeof(*entry) + len + 1;
        buffer = tr->buffer;
@@ -1516,6 +1512,8 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
        int i = (int)*pos;
        void *ent;
+        WARN_ON_ONCE(iter->leftover);
        (*pos)++;
        /* can't go backwards */
@@ -1614,8 +1612,16 @@ static void *s_start(struct seq_file *m, loff_t *pos)
                        ;
        } else {
-                l = *pos - 1;
+                /*
-                p = s_next(m, p, &l);
+                 * If we overflowed the seq_file before, then we want
+                 * to just reuse the trace_seq buffer again.
+                 */
+                if (iter->leftover)
+                        p = iter;
+                else {
+                        l = *pos - 1;
+                        p = s_next(m, p, &l);
+                }
        }
        trace_event_read_lock();
@@ -1923,6 +1929,7 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
 static int s_show(struct seq_file *m, void *v)
 {
        struct trace_iterator *iter = v;
+        int ret;
        if (iter->ent == NULL) {
                if (iter->tr) {
@@ -1942,9 +1949,27 @@ static int s_show(struct seq_file *m, void *v)
                        if (!(trace_flags & TRACE_ITER_VERBOSE))
                                print_func_help_header(m);
                }
+        } else if (iter->leftover) {
+                /*
+                 * If we filled the seq_file buffer earlier, we
+                 * want to just show it now.
+                 */
+                ret = trace_print_seq(m, &iter->seq);
+                /* ret should be zero this time, but you never know */
+                iter->leftover = ret;
        } else {
                print_trace_line(iter);
-                trace_print_seq(m, &iter->seq);
+                ret = trace_print_seq(m, &iter->seq);
+                /*
+                 * If we overflow the seq_file buffer, then it will
+                 * ask us for this data again at start up.
+                 * Use that instead.
+                 *  ret is 0 if seq_file write succeeded.
+                 *        -1 otherwise.
+                 */
+                iter->leftover = ret;
        }
        return 0;
@@ -2898,6 +2923,10 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
        else
                cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask);
+        if (iter->trace->pipe_close)
+                iter->trace->pipe_close(iter);
        mutex_unlock(&trace_types_lock);
        free_cpumask_var(iter->started);
@@ -3320,6 +3349,16 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
        return cnt;
 }
+static int mark_printk(const char *fmt, ...)
+{
+        int ret;
+        va_list args;
+        va_start(args, fmt);
+        ret = trace_vprintk(0, fmt, args);
+        va_end(args);
+        return ret;
+}
 static ssize_t
 tracing_mark_write(struct file *filp, const char __user *ubuf,
                                        size_t cnt, loff_t *fpos)
@@ -3346,7 +3385,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
        } else
                buf[cnt] = '\0';
-        cnt = trace_vprintk(0, buf, NULL);
+        cnt = mark_printk("%s", buf);
        kfree(buf);
        *fpos += cnt;
@@ -4415,7 +4454,7 @@ __init static int tracer_alloc_buffers(void)
        /* Allocate the first page for all buffers */
        for_each_tracing_cpu(i) {
                global_trace.data[i] = &per_cpu(global_trace_cpu, i);
-                max_tr.data[i] = &per_cpu(max_data, i);
+                max_tr.data[i] = &per_cpu(max_tr_data, i);
        }
        trace_init_cmdlines();
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 1d7f4830a80d..a52bed2eedd8 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -272,6 +272,7 @@ struct tracer_flags {
 * @pipe_open: called when the trace_pipe file is opened
 * @wait_pipe: override how the user waits for traces on trace_pipe
 * @close: called when the trace file is released
+ * @pipe_close: called when the trace_pipe file is released
 * @read: override the default read callback on trace_pipe
 * @splice_read: override the default splice_read callback on trace_pipe
 * @selftest: selftest to run on boot (see trace_selftest.c)
@@ -290,6 +291,7 @@ struct tracer {
        void                    (*pipe_open)(struct trace_iterator *iter);
        void                    (*wait_pipe)(struct trace_iterator *iter);
        void                    (*close)(struct trace_iterator *iter);
+        void                    (*pipe_close)(struct trace_iterator *iter);
        ssize_t                 (*read)(struct trace_iterator *iter,
                                        struct file *filp, char __user *ubuf,
                                        size_t cnt, loff_t *ppos);
@@ -441,7 +443,7 @@ extern int DYN_FTRACE_TEST_NAME(void);
 extern int ring_buffer_expanded;
 extern bool tracing_selftest_disabled;
-DECLARE_PER_CPU(local_t, ftrace_cpu_disabled);
+DECLARE_PER_CPU(int, ftrace_cpu_disabled);
 #ifdef CONFIG_FTRACE_STARTUP_TEST
 extern int trace_selftest_startup_function(struct tracer *trace,
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 45e6c01b2e4d..b1342c5d37cf 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -14,9 +14,20 @@
 #include "trace.h"
 #include "trace_output.h"
-struct fgraph_data {
+struct fgraph_cpu_data {
        pid_t           last_pid;
        int             depth;
+        int             ignore;
+};
+struct fgraph_data {
+        struct fgraph_cpu_data          *cpu_data;
+        /* Place to preserve last processed entry. */
+        struct ftrace_graph_ent_entry   ent;
+        struct ftrace_graph_ret_entry   ret;
+        int                             failed;
+        int                             cpu;
 };
 #define TRACE_GRAPH_INDENT      2
@@ -176,7 +187,7 @@ static int __trace_graph_entry(struct trace_array *tr,
        struct ring_buffer *buffer = tr->buffer;
        struct ftrace_graph_ent_entry *entry;
-        if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
+        if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled))))
                return 0;
        event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
@@ -240,7 +251,7 @@ static void __trace_graph_return(struct trace_array *tr,
        struct ring_buffer *buffer = tr->buffer;
        struct ftrace_graph_ret_entry *entry;
-        if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
+        if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled))))
                return;
        event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
@@ -384,7 +395,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
        if (!data)
                return TRACE_TYPE_HANDLED;
-        last_pid = &(per_cpu_ptr(data, cpu)->last_pid);
+        last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
        if (*last_pid == pid)
                return TRACE_TYPE_HANDLED;
@@ -435,26 +446,49 @@ static struct ftrace_graph_ret_entry *
 get_return_for_leaf(struct trace_iterator *iter,
                struct ftrace_graph_ent_entry *curr)
 {
-        struct ring_buffer_iter *ring_iter;
+        struct fgraph_data *data = iter->private;
+        struct ring_buffer_iter *ring_iter = NULL;
        struct ring_buffer_event *event;
        struct ftrace_graph_ret_entry *next;
-        ring_iter = iter->buffer_iter[iter->cpu];
+        /*
+         * If the previous output failed to write to the seq buffer,
+         * then we just reuse the data from before.
+         */
+        if (data && data->failed) {
+                curr = &data->ent;
+                next = &data->ret;
+        } else {
-        /* First peek to compare current entry and the next one */
+                ring_iter = iter->buffer_iter[iter->cpu];
-        if (ring_iter)
-                event = ring_buffer_iter_peek(ring_iter, NULL);
+                /* First peek to compare current entry and the next one */
-        else {
+                if (ring_iter)
-        /* We need to consume the current entry to see the next one */
+                        event = ring_buffer_iter_peek(ring_iter, NULL);
-                ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
+                else {
-                event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
+                        /*
-                                        NULL);
+                         * We need to consume the current entry to see
-        }
+                         * the next one.
+                         */
+                        ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
+                        event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
+                                                 NULL);
+                }
-        if (!event)
+                if (!event)
-                return NULL;
+                        return NULL;
+                next = ring_buffer_event_data(event);
-        next = ring_buffer_event_data(event);
+                if (data) {
+                        /*
+                         * Save current and next entries for later reference
+                         * if the output fails.
+                         */
+                        data->ent = *curr;
+                        data->ret = *next;
+                }
+        }
        if (next->ent.type != TRACE_GRAPH_RET)
                return NULL;
@@ -640,7 +674,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
        if (data) {
                int cpu = iter->cpu;
-                int *depth = &(per_cpu_ptr(data, cpu)->depth);
+                int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
                /*
                 * Comments display at + 1 to depth. Since
@@ -688,7 +722,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
        if (data) {
                int cpu = iter->cpu;
-                int *depth = &(per_cpu_ptr(data, cpu)->depth);
+                int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
                *depth = call->depth;
        }
@@ -782,19 +816,34 @@ static enum print_line_t
 print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
                        struct trace_iterator *iter)
 {
-        int cpu = iter->cpu;
+        struct fgraph_data *data = iter->private;
        struct ftrace_graph_ent *call = &field->graph_ent;
        struct ftrace_graph_ret_entry *leaf_ret;
+        enum print_line_t ret;
+        int cpu = iter->cpu;
        if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func))
                return TRACE_TYPE_PARTIAL_LINE;
        leaf_ret = get_return_for_leaf(iter, field);
        if (leaf_ret)
-                return print_graph_entry_leaf(iter, field, leaf_ret, s);
+                ret = print_graph_entry_leaf(iter, field, leaf_ret, s);
        else
-                return print_graph_entry_nested(iter, field, s, cpu);
+                ret = print_graph_entry_nested(iter, field, s, cpu);
+        if (data) {
+                /*
+                 * If we failed to write our output, then we need to make
+                 * note of it, because we already consumed our entry.
+                 */
+                if (s->full) {
+                        data->failed = 1;
+                        data->cpu = cpu;
+                } else
+                        data->failed = 0;
+        }
+        return ret;
 }
 static enum print_line_t
@@ -810,7 +859,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
        if (data) {
                int cpu = iter->cpu;
-                int *depth = &(per_cpu_ptr(data, cpu)->depth);
+                int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
                /*
                 * Comments display at + 1 to depth. This is the
@@ -873,7 +922,7 @@ print_graph_comment(struct trace_seq *s,  struct trace_entry *ent,
        int i;
        if (data)
-                depth = per_cpu_ptr(data, iter->cpu)->depth;
+                depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
        if (print_graph_prologue(iter, s, 0, 0))
                return TRACE_TYPE_PARTIAL_LINE;
@@ -941,8 +990,33 @@ print_graph_comment(struct trace_seq *s,  struct trace_entry *ent,
 enum print_line_t
 print_graph_function(struct trace_iterator *iter)
 {
+        struct ftrace_graph_ent_entry *field;
+        struct fgraph_data *data = iter->private;
        struct trace_entry *entry = iter->ent;
        struct trace_seq *s = &iter->seq;
+        int cpu = iter->cpu;
+        int ret;
+        if (data && per_cpu_ptr(data->cpu_data, cpu)->ignore) {
+                per_cpu_ptr(data->cpu_data, cpu)->ignore = 0;
+                return TRACE_TYPE_HANDLED;
+        }
+        /*
+         * If the last output failed, there's a possibility we need
+         * to print out the missing entry which would never go out.
+         */
+        if (data && data->failed) {
+                field = &data->ent;
+                iter->cpu = data->cpu;
+                ret = print_graph_entry(field, s, iter);
+                if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) {
+                        per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1;
+                        ret = TRACE_TYPE_NO_CONSUME;
+                }
+                iter->cpu = cpu;
+                return ret;
+        }
        switch (entry->type) {
        case TRACE_GRAPH_ENT: {
@@ -952,7 +1026,7 @@ print_graph_function(struct trace_iterator *iter)
                 * sizeof(struct ftrace_graph_ent_entry) is very small,
                 * it can be safely saved at the stack.
                 */
-                struct ftrace_graph_ent_entry *field, saved;
+                struct ftrace_graph_ent_entry saved;
                trace_assign_type(field, entry);
                saved = *field;
                return print_graph_entry(&saved, s, iter);
@@ -1030,31 +1104,54 @@ static void print_graph_headers(struct seq_file *s)
 static void graph_trace_open(struct trace_iterator *iter)
 {
        /* pid and depth on the last trace processed */
-        struct fgraph_data *data = alloc_percpu(struct fgraph_data);
+        struct fgraph_data *data;
        int cpu;
+        iter->private = NULL;
+        data = kzalloc(sizeof(*data), GFP_KERNEL);
        if (!data)
-                pr_warning("function graph tracer: not enough memory\n");
+                goto out_err;
-        else
-                for_each_possible_cpu(cpu) {
+        data->cpu_data = alloc_percpu(struct fgraph_cpu_data);
-                        pid_t *pid = &(per_cpu_ptr(data, cpu)->last_pid);
+        if (!data->cpu_data)
-                        int *depth = &(per_cpu_ptr(data, cpu)->depth);
+                goto out_err_free;
-                        *pid = -1;
-                        *depth = 0;
+        for_each_possible_cpu(cpu) {
-                }
+                pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
+                int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
+                int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore);
+                *pid = -1;
+                *depth = 0;
+                *ignore = 0;
+        }
        iter->private = data;
+        return;
+ out_err_free:
+        kfree(data);
+ out_err:
+        pr_warning("function graph tracer: not enough memory\n");
 }
 static void graph_trace_close(struct trace_iterator *iter)
 {
-        free_percpu(iter->private);
+        struct fgraph_data *data = iter->private;
+        if (data) {
+                free_percpu(data->cpu_data);
+                kfree(data);
+        }
 }
 static struct tracer graph_trace __read_mostly = {
        .name           = "function_graph",
        .open           = graph_trace_open,
+        .pipe_open      = graph_trace_open,
        .close          = graph_trace_close,
+        .pipe_close     = graph_trace_close,
        .wait_pipe      = poll_wait_pipe,
        .init           = graph_trace_init,
        .reset          = graph_trace_reset,
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 69543a905cd5..7b97000745f5 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -20,10 +20,10 @@
 #define BTS_BUFFER_SIZE (1 << 13)
-static DEFINE_PER_CPU(struct bts_tracer *, tracer);
+static DEFINE_PER_CPU(struct bts_tracer *, hwb_tracer);
-static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer);
+static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], hwb_buffer);
-#define this_tracer per_cpu(tracer, smp_processor_id())
+#define this_tracer per_cpu(hwb_tracer, smp_processor_id())
 static int trace_hw_branches_enabled __read_mostly;
 static int trace_hw_branches_suspended __read_mostly;
@@ -32,12 +32,13 @@ static struct trace_array *hw_branch_trace __read_mostly;
 static void bts_trace_init_cpu(int cpu)
 {
-        per_cpu(tracer, cpu) =
+        per_cpu(hwb_tracer, cpu) =
-                ds_request_bts_cpu(cpu, per_cpu(buffer, cpu), BTS_BUFFER_SIZE,
+                ds_request_bts_cpu(cpu, per_cpu(hwb_buffer, cpu),
-                                   NULL, (size_t)-1, BTS_KERNEL);
+                                   BTS_BUFFER_SIZE, NULL, (size_t)-1,
+                                   BTS_KERNEL);
-        if (IS_ERR(per_cpu(tracer, cpu)))
+        if (IS_ERR(per_cpu(hwb_tracer, cpu)))
-                per_cpu(tracer, cpu) = NULL;
+                per_cpu(hwb_tracer, cpu) = NULL;
 }
 static int bts_trace_init(struct trace_array *tr)
@@ -51,7 +52,7 @@ static int bts_trace_init(struct trace_array *tr)
        for_each_online_cpu(cpu) {
                bts_trace_init_cpu(cpu);
-                if (likely(per_cpu(tracer, cpu)))
+                if (likely(per_cpu(hwb_tracer, cpu)))
                        trace_hw_branches_enabled = 1;
        }
        trace_hw_branches_suspended = 0;
@@ -67,9 +68,9 @@ static void bts_trace_reset(struct trace_array *tr)
        get_online_cpus();
        for_each_online_cpu(cpu) {
-                if (likely(per_cpu(tracer, cpu))) {
+                if (likely(per_cpu(hwb_tracer, cpu))) {
-                        ds_release_bts(per_cpu(tracer, cpu));
+                        ds_release_bts(per_cpu(hwb_tracer, cpu));
-                        per_cpu(tracer, cpu) = NULL;
+                        per_cpu(hwb_tracer, cpu) = NULL;
                }
        }
        trace_hw_branches_enabled = 0;
@@ -83,8 +84,8 @@ static void bts_trace_start(struct trace_array *tr)
        get_online_cpus();
        for_each_online_cpu(cpu)
-                if (likely(per_cpu(tracer, cpu)))
+                if (likely(per_cpu(hwb_tracer, cpu)))
-                        ds_resume_bts(per_cpu(tracer, cpu));
+                        ds_resume_bts(per_cpu(hwb_tracer, cpu));
        trace_hw_branches_suspended = 0;
        put_online_cpus();
 }
@@ -95,8 +96,8 @@ static void bts_trace_stop(struct trace_array *tr)
        get_online_cpus();
        for_each_online_cpu(cpu)
-                if (likely(per_cpu(tracer, cpu)))
+                if (likely(per_cpu(hwb_tracer, cpu)))
-                        ds_suspend_bts(per_cpu(tracer, cpu));
+                        ds_suspend_bts(per_cpu(hwb_tracer, cpu));
        trace_hw_branches_suspended = 1;
        put_online_cpus();
 }
@@ -114,16 +115,16 @@ static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
                        bts_trace_init_cpu(cpu);
                        if (trace_hw_branches_suspended &&
-                            likely(per_cpu(tracer, cpu)))
+                            likely(per_cpu(hwb_tracer, cpu)))
-                                ds_suspend_bts(per_cpu(tracer, cpu));
+                                ds_suspend_bts(per_cpu(hwb_tracer, cpu));
                }
                break;
        case CPU_DOWN_PREPARE:
                /* The notification is sent with interrupts enabled. */
-                if (likely(per_cpu(tracer, cpu))) {
+                if (likely(per_cpu(hwb_tracer, cpu))) {
-                        ds_release_bts(per_cpu(tracer, cpu));
+                        ds_release_bts(per_cpu(hwb_tracer, cpu));
-                        per_cpu(tracer, cpu) = NULL;
+                        per_cpu(hwb_tracer, cpu) = NULL;
                }
        }
@@ -258,8 +259,8 @@ static void trace_bts_prepare(struct trace_iterator *iter)
        get_online_cpus();
        for_each_online_cpu(cpu)
-                if (likely(per_cpu(tracer, cpu)))
+                if (likely(per_cpu(hwb_tracer, cpu)))
-                        ds_suspend_bts(per_cpu(tracer, cpu));
+                        ds_suspend_bts(per_cpu(hwb_tracer, cpu));
        /*
         * We need to collect the trace on the respective cpu since ftrace
         * implicitly adds the record for the current cpu.
@@ -268,8 +269,8 @@ static void trace_bts_prepare(struct trace_iterator *iter)
        on_each_cpu(trace_bts_cpu, iter->tr, 1);
        for_each_online_cpu(cpu)
-                if (likely(per_cpu(tracer, cpu)))
+                if (likely(per_cpu(hwb_tracer, cpu)))
-                        ds_resume_bts(per_cpu(tracer, cpu));
+                        ds_resume_bts(per_cpu(hwb_tracer, cpu));
        put_online_cpus();
 }
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index aff5f80b59b8..b52d397e57eb 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -606,23 +606,22 @@ static int create_trace_probe(int argc, char **argv)
         */
        struct trace_probe *tp;
        int i, ret = 0;
-        int is_return = 0;
+        int is_return = 0, is_delete = 0;
        char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL;
        unsigned long offset = 0;
        void *addr = NULL;
        char buf[MAX_EVENT_NAME_LEN];
-        if (argc < 2) {
+        /* argc must be >= 1 */
-                pr_info("Probe point is not specified.\n");
-                return -EINVAL;
-        }
        if (argv[0][0] == 'p')
                is_return = 0;
        else if (argv[0][0] == 'r')
                is_return = 1;
+        else if (argv[0][0] == '-')
+                is_delete = 1;
        else {
-                pr_info("Probe definition must be started with 'p' or 'r'.\n");
+                pr_info("Probe definition must be started with 'p', 'r' or"
+                        " '-'.\n");
                return -EINVAL;
        }
@@ -642,7 +641,29 @@ static int create_trace_probe(int argc, char **argv)
                        return -EINVAL;
                }
        }
+        if (!group)
+                group = KPROBE_EVENT_SYSTEM;
+        if (is_delete) {
+                if (!event) {
+                        pr_info("Delete command needs an event name.\n");
+                        return -EINVAL;
+                }
+                tp = find_probe_event(event, group);
+                if (!tp) {
+                        pr_info("Event %s/%s doesn't exist.\n", group, event);
+                        return -ENOENT;
+                }
+                /* delete an event */
+                unregister_trace_probe(tp);
+                free_trace_probe(tp);
+                return 0;
+        }
+        if (argc < 2) {
+                pr_info("Probe point is not specified.\n");
+                return -EINVAL;
+        }
        if (isdigit(argv[1][0])) {
                if (is_return) {
                        pr_info("Return probe point must be a symbol.\n");
@@ -671,8 +692,6 @@ static int create_trace_probe(int argc, char **argv)
        argc -= 2; argv += 2;
        /* setup a probe */
-        if (!group)
-                group = KPROBE_EVENT_SYSTEM;
        if (!event) {
                /* Make a new event name */
                if (symbol)
@@ -1114,7 +1133,7 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
        struct trace_probe *tp = (struct trace_probe *)event_call->data;
        ret = trace_define_common_fields(event_call);
-        if (!ret)
+        if (ret)
                return ret;
        DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
@@ -1132,7 +1151,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
        struct trace_probe *tp = (struct trace_probe *)event_call->data;
        ret = trace_define_common_fields(event_call);
-        if (!ret)
+        if (ret)
                return ret;
        DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index ddfa0fd43bc0..acb87d4a4ac1 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -79,11 +79,12 @@ void ksym_collect_stats(unsigned long hbp_hit_addr)
 }
 #endif /* CONFIG_PROFILE_KSYM_TRACER */
-void ksym_hbp_handler(struct perf_event *hbp, void *data)
+void ksym_hbp_handler(struct perf_event *hbp, int nmi,
+                      struct perf_sample_data *data,
+                      struct pt_regs *regs)
 {
        struct ring_buffer_event *event;
        struct ksym_trace_entry *entry;
-        struct pt_regs *regs = data;
        struct ring_buffer *buffer;
        int pc;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index b6c12c6a1bcd..8e46b3323cdc 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -23,13 +23,21 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
 static int next_event_type = __TRACE_LAST_TYPE + 1;
-void trace_print_seq(struct seq_file *m, struct trace_seq *s)
+int trace_print_seq(struct seq_file *m, struct trace_seq *s)
 {
        int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
+        int ret;
+        ret = seq_write(m, s->buffer, len);
-        seq_write(m, s->buffer, len);
+        /*
+         * Only reset this buffer if we successfully wrote to the
+         * seq_file buffer.
+         */
+        if (!ret)
+                trace_seq_init(s);
-        trace_seq_init(s);
+        return ret;
 }
 enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
@@ -85,7 +93,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
        va_list ap;
        int ret;
-        if (!len)
+        if (s->full || !len)
                return 0;
        va_start(ap, fmt);
@@ -93,8 +101,10 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
        va_end(ap);
        /* If we can't write it all, don't bother writing anything */
-        if (ret >= len)
+        if (ret >= len) {
+                s->full = 1;
                return 0;
+        }
        s->len += ret;
@@ -119,14 +129,16 @@ trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
        int len = (PAGE_SIZE - 1) - s->len;
        int ret;
-        if (!len)
+        if (s->full || !len)
                return 0;
        ret = vsnprintf(s->buffer + s->len, len, fmt, args);
        /* If we can't write it all, don't bother writing anything */
-        if (ret >= len)
+        if (ret >= len) {
+                s->full = 1;
                return 0;
+        }
        s->len += ret;
@@ -139,14 +151,16 @@ int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
        int len = (PAGE_SIZE - 1) - s->len;
        int ret;
-        if (!len)
+        if (s->full || !len)
                return 0;
        ret = bstr_printf(s->buffer + s->len, len, fmt, binary);
        /* If we can't write it all, don't bother writing anything */
-        if (ret >= len)
+        if (ret >= len) {
+                s->full = 1;
                return 0;
+        }
        s->len += ret;
@@ -167,8 +181,13 @@ int trace_seq_puts(struct trace_seq *s, const char *str)
 {
        int len = strlen(str);
-        if (len > ((PAGE_SIZE - 1) - s->len))
+        if (s->full)
+                return 0;
+        if (len > ((PAGE_SIZE - 1) - s->len)) {
+                s->full = 1;
                return 0;
+        }
        memcpy(s->buffer + s->len, str, len);
        s->len += len;
@@ -178,9 +197,14 @@ int trace_seq_puts(struct trace_seq *s, const char *str)
 int trace_seq_putc(struct trace_seq *s, unsigned char c)
 {
-        if (s->len >= (PAGE_SIZE - 1))
+        if (s->full)
                return 0;
+        if (s->len >= (PAGE_SIZE - 1)) {
+                s->full = 1;
+                return 0;
+        }
        s->buffer[s->len++] = c;
        return 1;
@@ -188,9 +212,14 @@ int trace_seq_putc(struct trace_seq *s, unsigned char c)
 int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
 {
-        if (len > ((PAGE_SIZE - 1) - s->len))
+        if (s->full)
                return 0;
+        if (len > ((PAGE_SIZE - 1) - s->len)) {
+                s->full = 1;
+                return 0;
+        }
        memcpy(s->buffer + s->len, mem, len);
        s->len += len;
@@ -203,6 +232,9 @@ int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len)
        const unsigned char *data = mem;
        int i, j;
+        if (s->full)
+                return 0;
 #ifdef __BIG_ENDIAN
        for (i = 0, j = 0; i < len; i++) {
 #else
@@ -220,8 +252,13 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)
 {
        void *ret;
-        if (len > ((PAGE_SIZE - 1) - s->len))
+        /* s->full is set once a request did not fit; refuse all later ones. */
+        if (s->full)
+                return NULL;
+        if (len > ((PAGE_SIZE - 1) - s->len)) {
+                s->full = 1;
                return NULL;
+        }
        ret = s->buffer + s->len;
        s->len += len;
@@ -233,8 +270,14 @@ int trace_seq_path(struct trace_seq *s, struct path *path)
 {
        unsigned char *p;
-        if (s->len >= (PAGE_SIZE - 1))
+        if (s->full)
+                return 0;
+        if (s->len >= (PAGE_SIZE - 1)) {
+                s->full = 1;
                return 0;
+        }
        p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
        if (!IS_ERR(p)) {
                p = mangle_path(s->buffer + s->len, p, "\n");
@@ -247,6 +290,7 @@ int trace_seq_path(struct trace_seq *s, struct path *path)
                return 1;
        }
+        s->full = 1;
        return 0;
 }
@@ -373,6 +417,9 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
        unsigned long vmstart = 0;
        int ret = 1;
+        if (s->full)
+                return 0;
        if (mm) {
                const struct vm_area_struct *vma;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 67e526b6ae81..dee48658805c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -68,6 +68,116 @@ struct workqueue_struct {
 #endif
 };
+#ifdef CONFIG_DEBUG_OBJECTS_WORK
+/*
+ * Forward declaration: the fixup callbacks below pass this descriptor to
+ * the debug_object_*() calls before its initializer appears.
+ */
+static struct debug_obj_descr work_debug_descr;
+/*
+ * fixup_init is called when:
+ * - an active object is initialized
+ *
+ * @addr is the work_struct being initialized, @state the debug-object
+ * state handed in by the caller.  For ODEBUG_STATE_ACTIVE the work is
+ * cancelled synchronously and then registered again through
+ * debug_object_init(), and 1 is returned.  Every other state returns 0
+ * without touching the work.
+ */
+static int work_fixup_init(void *addr, enum debug_obj_state state)
+{
+        struct work_struct *work = addr;
+        switch (state) {
+        case ODEBUG_STATE_ACTIVE:
+                cancel_work_sync(work);
+                debug_object_init(work, &work_debug_descr);
+                return 1;
+        default:
+                return 0;
+        }
+}
+/*
+ * fixup_activate is called when:
+ * - an active object is activated
+ * - an unknown object is activated (might be a statically initialized object)
+ *
+ * @addr is the work_struct being activated, @state the debug-object
+ * state handed in by the caller.  An untracked work that carries the
+ * WORK_STRUCT_STATIC bit is registered and activated here; an untracked
+ * work without that bit only produces a one-time warning.  Activating an
+ * already active work produces a warning.  Returns 0 on every path.
+ */
+static int work_fixup_activate(void *addr, enum debug_obj_state state)
+{
+        struct work_struct *work = addr;
+        switch (state) {
+        case ODEBUG_STATE_NOTAVAILABLE:
+                /*
+                 * This is not really a fixup. The work struct was
+                 * statically initialized. We just make sure that it
+                 * is tracked in the object tracker.
+                 */
+                if (test_bit(WORK_STRUCT_STATIC, work_data_bits(work))) {
+                        debug_object_init(work, &work_debug_descr);
+                        debug_object_activate(work, &work_debug_descr);
+                        return 0;
+                }
+                WARN_ON_ONCE(1);
+                return 0;
+        case ODEBUG_STATE_ACTIVE:
+                WARN_ON(1);     /* fall through */
+        default:
+                return 0;
+        }
+}
+/*
+ * fixup_free is called when:
+ * - an active object is freed
+ *
+ * @addr is the work_struct being freed, @state the debug-object state
+ * handed in by the caller.  For ODEBUG_STATE_ACTIVE the work is
+ * cancelled synchronously, then released from tracking through
+ * debug_object_free(), and 1 is returned.  Every other state returns 0
+ * without touching the work.
+ */
+static int work_fixup_free(void *addr, enum debug_obj_state state)
+{
+        struct work_struct *work = addr;
+        switch (state) {
+        case ODEBUG_STATE_ACTIVE:
+                cancel_work_sync(work);
+                debug_object_free(work, &work_debug_descr);
+                return 1;
+        default:
+                return 0;
+        }
+}
+/*
+ * Descriptor handed to every debug_object_*() call in this file; it names
+ * the object type and wires up the three work_fixup_*() callbacks.
+ */
+static struct debug_obj_descr work_debug_descr = {
+        .name           = "work_struct",
+        .fixup_init     = work_fixup_init,
+        .fixup_activate = work_fixup_activate,
+        .fixup_free     = work_fixup_free,
+};
+/*
+ * Report @work to the debug-object code as activated.  This patch calls
+ * it from __queue_work() and insert_wq_barrier() just before the work is
+ * inserted into a worklist.
+ */
+static inline void debug_work_activate(struct work_struct *work)
+{
+        debug_object_activate(work, &work_debug_descr);
+}
+/*
+ * Report @work to the debug-object code as deactivated.  This patch calls
+ * it from run_workqueue() and try_to_grab_pending() just before the work
+ * is unlinked from its list.
+ */
+static inline void debug_work_deactivate(struct work_struct *work)
+{
+        debug_object_deactivate(work, &work_debug_descr);
+}
+/*
+ * Register @work with the debug-object code.  A non-zero @onstack selects
+ * debug_object_init_on_stack(), zero selects debug_object_init().
+ * NOTE(review): presumably reached through the INIT_WORK*() macros, which
+ * are defined outside this hunk - confirm against the header.
+ */
+void __init_work(struct work_struct *work, int onstack)
+{
+        if (onstack)
+                debug_object_init_on_stack(work, &work_debug_descr);
+        else
+                debug_object_init(work, &work_debug_descr);
+}
+EXPORT_SYMBOL_GPL(__init_work);
+/*
+ * Release @work from debug-object tracking via debug_object_free().  The
+ * callers added by this patch invoke it on an on-stack barrier work once
+ * wait_for_completion() on that barrier has returned.
+ */
+void destroy_work_on_stack(struct work_struct *work)
+{
+        debug_object_free(work, &work_debug_descr);
+}
+EXPORT_SYMBOL_GPL(destroy_work_on_stack);
+#else
+/* Empty stubs used when CONFIG_DEBUG_OBJECTS_WORK is not defined. */
+static inline void debug_work_activate(struct work_struct *work) { }
+static inline void debug_work_deactivate(struct work_struct *work) { }
+#endif
 /* Serializes the accesses to the list of workqueues. */
 static DEFINE_SPINLOCK(workqueue_lock);
 static LIST_HEAD(workqueues);
@@ -145,6 +255,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
 {
        unsigned long flags;
+        debug_work_activate(work);
        spin_lock_irqsave(&cwq->lock, flags);
        insert_work(cwq, work, &cwq->worklist);
        spin_unlock_irqrestore(&cwq->lock, flags);
@@ -280,6 +391,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
                struct lockdep_map lockdep_map = work->lockdep_map;
 #endif
                trace_workqueue_execution(cwq->thread, work);
+                debug_work_deactivate(work);
                cwq->current_work = work;
                list_del_init(cwq->worklist.next);
                spin_unlock_irq(&cwq->lock);
@@ -350,11 +462,18 @@ static void wq_barrier_func(struct work_struct *work)
 static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
                        struct wq_barrier *barr, struct list_head *head)
 {
-        INIT_WORK(&barr->work, wq_barrier_func);
+        /*
+         * debugobject calls are safe here even with cwq->lock locked
+         * as we know for sure that this will not trigger any of the
+         * checks and call back into the fixup functions where we
+         * might deadlock.
+         */
+        INIT_WORK_ON_STACK(&barr->work, wq_barrier_func);
        __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
        init_completion(&barr->done);
+        debug_work_activate(&barr->work);
        insert_work(cwq, &barr->work, head);
 }
@@ -372,8 +491,10 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
        }
        spin_unlock_irq(&cwq->lock);
-        if (active)
+        if (active) {
                wait_for_completion(&barr.done);
+                destroy_work_on_stack(&barr.work);
+        }
        return active;
 }
@@ -451,6 +572,7 @@ out:
                return 0;
        wait_for_completion(&barr.done);
+        destroy_work_on_stack(&barr.work);
        return 1;
 }
 EXPORT_SYMBOL_GPL(flush_work);
@@ -485,6 +607,7 @@ static int try_to_grab_pending(struct work_struct *work)
                 */
                smp_rmb();
                if (cwq == get_wq_data(work)) {
+                        debug_work_deactivate(work);
                        list_del_init(&work->entry);
                        ret = 1;
                }
@@ -507,8 +630,10 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
        }
        spin_unlock_irq(&cwq->lock);
-        if (unlikely(running))
+        if (unlikely(running)) {
                wait_for_completion(&barr.done);
+                destroy_work_on_stack(&barr.work);
+        }
 }
 static void wait_on_work(struct work_struct *work)