37 files changed, 1246 insertions, 599 deletions
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6ba0f1ecb212..291ac586f37f 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -212,6 +212,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
        err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
                                        hcpu, -1, &nr_calls);
        if (err == NOTIFY_BAD) {
+                set_cpu_active(cpu, true);
                nr_calls--;
                __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
                                          hcpu, nr_calls, NULL);
@@ -223,11 +225,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
        /* Ensure that we are not runnable on dying cpu */
        cpumask_copy(old_allowed, &current->cpus_allowed);
-        set_cpus_allowed_ptr(current,
+        set_cpus_allowed_ptr(current, cpu_active_mask);
-                             cpumask_of(cpumask_any_but(cpu_online_mask, cpu)));
        err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
        if (err) {
+                set_cpu_active(cpu, true);
                /* CPU didn't die: tell everyone.  Can't complain. */
                if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
                                            hcpu) == NOTIFY_BAD)
@@ -292,9 +294,6 @@ int __ref cpu_down(unsigned int cpu)
        err = _cpu_down(cpu, 0);
-        if (cpu_online(cpu))
-                set_cpu_active(cpu, true);
 out:
        cpu_maps_update_done();
        stop_machine_destroy();
@@ -387,15 +386,23 @@ int disable_nonboot_cpus(void)
         * with the userspace trying to use the CPU hotplug at the same time
         */
        cpumask_clear(frozen_cpus);
+        for_each_online_cpu(cpu) {
+                if (cpu == first_cpu)
+                        continue;
+                set_cpu_active(cpu, false);
+        }
+        synchronize_sched();
        printk("Disabling non-boot CPUs ...\n");
        for_each_online_cpu(cpu) {
                if (cpu == first_cpu)
                        continue;
                error = _cpu_down(cpu, 1);
-                if (!error) {
+                if (!error)
                        cpumask_set_cpu(cpu, frozen_cpus);
-                        printk("CPU%d is down\n", cpu);
+                else {
-                } else {
                        printk(KERN_ERR "Error taking CPU%d down: %d\n",
                                cpu, error);
                        break;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3cf2183b472d..ba401fab459f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -737,7 +737,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused)
 {
 }
-static int generate_sched_domains(struct cpumask **domains,
+static int generate_sched_domains(cpumask_var_t **domains,
                        struct sched_domain_attr **attributes)
 {
        *domains = NULL;
@@ -872,7 +872,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
                if (retval < 0)
                        return retval;
-                if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask))
+                if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
                        return -EINVAL;
        }
        retval = validate_change(cs, trialcs);
@@ -2010,7 +2010,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
                }
                /* Continue past cpusets with all cpus, mems online */
-                if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) &&
+                if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
                    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
                        continue;
@@ -2019,7 +2019,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
                /* Remove offline cpus and mems from this cpuset. */
                mutex_lock(&callback_mutex);
                cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
-                            cpu_online_mask);
+                            cpu_active_mask);
                nodes_and(cp->mems_allowed, cp->mems_allowed,
                                                node_states[N_HIGH_MEMORY]);
                mutex_unlock(&callback_mutex);
@@ -2057,8 +2057,10 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
        switch (phase) {
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
-        case CPU_DEAD:
+        case CPU_DOWN_PREPARE:
-        case CPU_DEAD_FROZEN:
+        case CPU_DOWN_PREPARE_FROZEN:
+        case CPU_DOWN_FAILED:
+        case CPU_DOWN_FAILED_FROZEN:
                break;
        default:
@@ -2067,7 +2069,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
        cgroup_lock();
        mutex_lock(&callback_mutex);
-        cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
+        cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
        mutex_unlock(&callback_mutex);
        scan_for_empty_cpusets(&top_cpuset);
        ndoms = generate_sched_domains(&doms, &attr);
@@ -2114,7 +2116,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
 void __init cpuset_init_smp(void)
 {
-        cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
+        cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
        top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
        hotcpu_notifier(cpuset_track_online_cpus, 0);
diff --git a/kernel/exit.c b/kernel/exit.c
index 1143012951e9..6f50ef55a6f3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -971,7 +971,7 @@ NORET_TYPE void do_exit(long code)
        exit_thread();
        cgroup_exit(tsk, 1);
-        if (group_dead && tsk->signal->leader)
+        if (group_dead)
                disassociate_ctty(1);
        module_put(task_thread_info(tsk)->exec_domain->module);
diff --git a/kernel/futex.c b/kernel/futex.c
index fb65e822fc41..d73ef1f3e55d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -304,8 +304,14 @@ void put_futex_key(int fshared, union futex_key *key)
 */
 static int fault_in_user_writeable(u32 __user *uaddr)
 {
-        int ret = get_user_pages(current, current->mm, (unsigned long)uaddr,
+        struct mm_struct *mm = current->mm;
-                                 1, 1, 0, NULL, NULL);
+        int ret;
+        down_read(&mm->mmap_sem);
+        ret = get_user_pages(current, mm, (unsigned long)uaddr,
+                             1, 1, 0, NULL, NULL);
+        up_read(&mm->mmap_sem);
        return ret < 0 ? ret : 0;
 }
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 3e1c36e7998f..d2f9239dc6ba 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -557,7 +557,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
 static int hrtimer_reprogram(struct hrtimer *timer,
                             struct hrtimer_clock_base *base)
 {
-        ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next;
+        struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
        ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
        int res;
@@ -582,7 +582,16 @@ static int hrtimer_reprogram(struct hrtimer *timer,
        if (expires.tv64 < 0)
                return -ETIME;
-        if (expires.tv64 >= expires_next->tv64)
+        if (expires.tv64 >= cpu_base->expires_next.tv64)
+                return 0;
+        /*
+         * If a hang was detected in the last timer interrupt then we
+         * do not schedule a timer which is earlier than the expiry
+         * which we enforced in the hang detection. We want the system
+         * to make progress.
+         */
+        if (cpu_base->hang_detected)
                return 0;
        /*
@@ -590,7 +599,7 @@ static int hrtimer_reprogram(struct hrtimer *timer,
         */
        res = tick_program_event(expires, 0);
        if (!IS_ERR_VALUE(res))
-                *expires_next = expires;
+                cpu_base->expires_next = expires;
        return res;
 }
@@ -747,17 +756,33 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
 #endif /* CONFIG_HIGH_RES_TIMERS */
-#ifdef CONFIG_TIMER_STATS
+static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
-void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr)
 {
+#ifdef CONFIG_TIMER_STATS
        if (timer->start_site)
                return;
+        timer->start_site = __builtin_return_address(0);
-        timer->start_site = addr;
        memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
        timer->start_pid = current->pid;
+#endif
 }
+static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer)
+{
+#ifdef CONFIG_TIMER_STATS
+        timer->start_site = NULL;
+#endif
+}
+static inline void timer_stats_account_hrtimer(struct hrtimer *timer)
+{
+#ifdef CONFIG_TIMER_STATS
+        if (likely(!timer_stats_active))
+                return;
+        timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
+                                 timer->function, timer->start_comm, 0);
 #endif
+}
 /*
 * Counterpart to lock_hrtimer_base above:
@@ -1217,29 +1242,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
 #ifdef CONFIG_HIGH_RES_TIMERS
-static int force_clock_reprogram;
-/*
- * After 5 iteration's attempts, we consider that hrtimer_interrupt()
- * is hanging, which could happen with something that slows the interrupt
- * such as the tracing. Then we force the clock reprogramming for each future
- * hrtimer interrupts to avoid infinite loops and use the min_delta_ns
- * threshold that we will overwrite.
- * The next tick event will be scheduled to 3 times we currently spend on
- * hrtimer_interrupt(). This gives a good compromise, the cpus will spend
- * 1/4 of their time to process the hrtimer interrupts. This is enough to
- * let it running without serious starvation.
- */
-static inline void
-hrtimer_interrupt_hanging(struct clock_event_device *dev,
-                        ktime_t try_time)
-{
-        force_clock_reprogram = 1;
-        dev->min_delta_ns = (unsigned long)try_time.tv64 * 3;
-        printk(KERN_WARNING "hrtimer: interrupt too slow, "
-                "forcing clock min delta to %lu ns\n", dev->min_delta_ns);
-}
 /*
 * High resolution timer interrupt
 * Called with interrupts disabled
@@ -1248,21 +1250,15 @@ void hrtimer_interrupt(struct clock_event_device *dev)
 {
        struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
        struct hrtimer_clock_base *base;
-        ktime_t expires_next, now;
+        ktime_t expires_next, now, entry_time, delta;
-        int nr_retries = 0;
+        int i, retries = 0;
-        int i;
        BUG_ON(!cpu_base->hres_active);
        cpu_base->nr_events++;
        dev->next_event.tv64 = KTIME_MAX;
- retry:
+        entry_time = now = ktime_get();
-        /* 5 retries is enough to notice a hang */
+retry:
-        if (!(++nr_retries % 5))
-                hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now));
-        now = ktime_get();
        expires_next.tv64 = KTIME_MAX;
        spin_lock(&cpu_base->lock);
@@ -1324,10 +1320,48 @@ void hrtimer_interrupt(struct clock_event_device *dev)
        spin_unlock(&cpu_base->lock);
        /* Reprogramming necessary ? */
-        if (expires_next.tv64 != KTIME_MAX) {
+        if (expires_next.tv64 == KTIME_MAX ||
-                if (tick_program_event(expires_next, force_clock_reprogram))
+            !tick_program_event(expires_next, 0)) {
-                        goto retry;
+                cpu_base->hang_detected = 0;
+                return;
        }
+        /*
+         * The next timer was already expired due to:
+         * - tracing
+         * - long lasting callbacks
+         * - being scheduled away when running in a VM
+         *
+         * We need to prevent that we loop forever in the hrtimer
+         * interrupt routine. We give it 3 attempts to avoid
+         * overreacting on some spurious event.
+         */
+        now = ktime_get();
+        cpu_base->nr_retries++;
+        if (++retries < 3)
+                goto retry;
+        /*
+         * Give the system a chance to do something else than looping
+         * here. We stored the entry time, so we know exactly how long
+         * we spent here. We schedule the next event this amount of
+         * time away.
+         */
+        cpu_base->nr_hangs++;
+        cpu_base->hang_detected = 1;
+        delta = ktime_sub(now, entry_time);
+        if (delta.tv64 > cpu_base->max_hang_time.tv64)
+                cpu_base->max_hang_time = delta;
+        /*
+         * Limit it to a sensible value as we enforce a longer
+         * delay. Give the CPU at least 100ms to catch up.
+         */
+        if (delta.tv64 > 100 * NSEC_PER_MSEC)
+                expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
+        else
+                expires_next = ktime_add(now, delta);
+        tick_program_event(expires_next, 1);
+        printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n",
+                    ktime_to_ns(delta));
 }
 /*
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index cf5ee1628411..366eedf949c0 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -52,7 +52,7 @@
 static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned);
 /* Number of pinned task breakpoints in a cpu */
-static DEFINE_PER_CPU(unsigned int, task_bp_pinned[HBP_NUM]);
+static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[HBP_NUM]);
 /* Number of non-pinned cpu/task breakpoints in a cpu */
 static DEFINE_PER_CPU(unsigned int, nr_bp_flexible);
@@ -73,7 +73,7 @@ static DEFINE_MUTEX(nr_bp_mutex);
 static unsigned int max_task_bp_pinned(int cpu)
 {
        int i;
-        unsigned int *tsk_pinned = per_cpu(task_bp_pinned, cpu);
+        unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned, cpu);
        for (i = HBP_NUM -1; i >= 0; i--) {
                if (tsk_pinned[i] > 0)
@@ -83,15 +83,51 @@ static unsigned int max_task_bp_pinned(int cpu)
        return 0;
 }
+static int task_bp_pinned(struct task_struct *tsk)
+{
+        struct perf_event_context *ctx = tsk->perf_event_ctxp;
+        struct list_head *list;
+        struct perf_event *bp;
+        unsigned long flags;
+        int count = 0;
+        if (WARN_ONCE(!ctx, "No perf context for this task"))
+                return 0;
+        list = &ctx->event_list;
+        spin_lock_irqsave(&ctx->lock, flags);
+        /*
+         * The current breakpoint counter is not included in the list
+         * at the open() callback time
+         */
+        list_for_each_entry(bp, list, event_entry) {
+                if (bp->attr.type == PERF_TYPE_BREAKPOINT)
+                        count++;
+        }
+        spin_unlock_irqrestore(&ctx->lock, flags);
+        return count;
+}
 /*
 * Report the number of pinned/un-pinned breakpoints we have in
 * a given cpu (cpu > -1) or in all of them (cpu = -1).
 */
-static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu)
+static void
+fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
 {
+        int cpu = bp->cpu;
+        struct task_struct *tsk = bp->ctx->task;
        if (cpu >= 0) {
                slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu);
-                slots->pinned += max_task_bp_pinned(cpu);
+                if (!tsk)
+                        slots->pinned += max_task_bp_pinned(cpu);
+                else
+                        slots->pinned += task_bp_pinned(tsk);
                slots->flexible = per_cpu(nr_bp_flexible, cpu);
                return;
@@ -101,7 +137,10 @@ static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu)
                unsigned int nr;
                nr = per_cpu(nr_cpu_bp_pinned, cpu);
-                nr += max_task_bp_pinned(cpu);
+                if (!tsk)
+                        nr += max_task_bp_pinned(cpu);
+                else
+                        nr += task_bp_pinned(tsk);
                if (nr > slots->pinned)
                        slots->pinned = nr;
@@ -118,35 +157,12 @@ static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu)
 */
 static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable)
 {
-        int count = 0;
-        struct perf_event *bp;
-        struct perf_event_context *ctx = tsk->perf_event_ctxp;
        unsigned int *tsk_pinned;
-        struct list_head *list;
+        int count = 0;
-        unsigned long flags;
-        if (WARN_ONCE(!ctx, "No perf context for this task"))
-                return;
-        list = &ctx->event_list;
-        spin_lock_irqsave(&ctx->lock, flags);
-        /*
-         * The current breakpoint counter is not included in the list
-         * at the open() callback time
-         */
-        list_for_each_entry(bp, list, event_entry) {
-                if (bp->attr.type == PERF_TYPE_BREAKPOINT)
-                        count++;
-        }
-        spin_unlock_irqrestore(&ctx->lock, flags);
+        count = task_bp_pinned(tsk);
-        if (WARN_ONCE(count < 0, "No breakpoint counter found in the counter list"))
+        tsk_pinned = per_cpu(nr_task_bp_pinned, cpu);
-                return;
-        tsk_pinned = per_cpu(task_bp_pinned, cpu);
        if (enable) {
                tsk_pinned[count]++;
                if (count > 0)
@@ -193,7 +209,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
 *   - If attached to a single cpu, check:
 *
 *       (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu)
- *           + max(per_cpu(task_bp_pinned, cpu)))) < HBP_NUM
+ *           + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM
 *
 *       -> If there are already non-pinned counters in this cpu, it means
 *          there is already a free slot for them.
@@ -204,7 +220,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
 *   - If attached to every cpus, check:
 *
 *       (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *))
- *           + max(per_cpu(task_bp_pinned, *)))) < HBP_NUM
+ *           + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM
 *
 *       -> This is roughly the same, except we check the number of per cpu
 *          bp for every cpu and we keep the max one. Same for the per tasks
@@ -216,7 +232,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
 *   - If attached to a single cpu, check:
 *
 *       ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu)
- *            + max(per_cpu(task_bp_pinned, cpu))) < HBP_NUM
+ *            + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM
 *
 *       -> Same checks as before. But now the nr_bp_flexible, if any, must keep
 *          one register at least (or they will never be fed).
@@ -224,7 +240,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
 *   - If attached to every cpus, check:
 *
 *       ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *))
- *            + max(per_cpu(task_bp_pinned, *))) < HBP_NUM
+ *            + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM
 */
 int reserve_bp_slot(struct perf_event *bp)
 {
@@ -233,7 +249,7 @@ int reserve_bp_slot(struct perf_event *bp)
        mutex_lock(&nr_bp_mutex);
-        fetch_bp_busy_slots(&slots, bp->cpu);
+        fetch_bp_busy_slots(&slots, bp);
        /* Flexible counters need to keep at least one slot */
        if (slots.pinned + (!!slots.flexible) == HBP_NUM) {
@@ -259,7 +275,7 @@ void release_bp_slot(struct perf_event *bp)
 }
-int __register_perf_hw_breakpoint(struct perf_event *bp)
+int register_perf_hw_breakpoint(struct perf_event *bp)
 {
        int ret;
@@ -276,19 +292,12 @@ int __register_perf_hw_breakpoint(struct perf_event *bp)
         * This is a quick hack that will be removed soon, once we remove
         * the tmp breakpoints from ptrace
         */
-        if (!bp->attr.disabled || bp->callback == perf_bp_event)
+        if (!bp->attr.disabled || !bp->overflow_handler)
                ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
        return ret;
 }
-int register_perf_hw_breakpoint(struct perf_event *bp)
-{
-        bp->callback = perf_bp_event;
-        return __register_perf_hw_breakpoint(bp);
-}
 /**
 * register_user_hw_breakpoint - register a hardware breakpoint for user space
 * @attr: breakpoint attributes
@@ -297,7 +306,7 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
 */
 struct perf_event *
 register_user_hw_breakpoint(struct perf_event_attr *attr,
-                            perf_callback_t triggered,
+                            perf_overflow_handler_t triggered,
                            struct task_struct *tsk)
 {
        return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered);
@@ -311,19 +320,40 @@ EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
 * @triggered: callback to trigger when we hit the breakpoint
 * @tsk: pointer to 'task_struct' of the process to which the address belongs
 */
-struct perf_event *
+int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr)
-modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr,
-                          perf_callback_t triggered,
-                          struct task_struct *tsk)
 {
-        /*
+        u64 old_addr = bp->attr.bp_addr;
-         * FIXME: do it without unregistering
+        int old_type = bp->attr.bp_type;
-         * - We don't want to lose our slot
+        int old_len = bp->attr.bp_len;
-         * - If the new bp is incorrect, don't lose the older one
+        int err = 0;
-         */
-        unregister_hw_breakpoint(bp);
-        return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered);
+        perf_event_disable(bp);
+        bp->attr.bp_addr = attr->bp_addr;
+        bp->attr.bp_type = attr->bp_type;
+        bp->attr.bp_len = attr->bp_len;
+        if (attr->disabled)
+                goto end;
+        err = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
+        if (!err)
+                perf_event_enable(bp);
+        if (err) {
+                bp->attr.bp_addr = old_addr;
+                bp->attr.bp_type = old_type;
+                bp->attr.bp_len = old_len;
+                if (!bp->attr.disabled)
+                        perf_event_enable(bp);
+                return err;
+        }
+end:
+        bp->attr.disabled = attr->disabled;
+        return 0;
 }
 EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
@@ -348,7 +378,7 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
 */
 struct perf_event **
 register_wide_hw_breakpoint(struct perf_event_attr *attr,
-                            perf_callback_t triggered)
+                            perf_overflow_handler_t triggered)
 {
        struct perf_event **cpu_events, **pevent, *bp;
        long err;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index bde4c667d24d..7305b297d1eb 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1067,7 +1067,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
                kfree(action);
 #ifdef CONFIG_DEBUG_SHIRQ
-        if (irqflags & IRQF_SHARED) {
+        if (!retval && (irqflags & IRQF_SHARED)) {
                /*
                 * It's a shared IRQ -- the driver ought to be prepared for it
                 * to happen immediately, so let's make sure....
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 22b0a6eedf24..e49ea1c5232d 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -220,7 +220,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
                /*
                 * If we are seeing only the odd spurious IRQ caused by
                 * bus asynchronicity then don't eventually trigger an error,
-                 * otherwise the couter becomes a doomsday timer for otherwise
+                 * otherwise the counter becomes a doomsday timer for otherwise
                 * working systems
                 */
                if (time_after(jiffies, desc->last_unhandled + HZ/10))
diff --git a/kernel/itimer.c b/kernel/itimer.c
index b03451ede528..d802883153da 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -146,6 +146,7 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
 {
        cputime_t cval, nval, cinterval, ninterval;
        s64 ns_ninterval, ns_nval;
+        u32 error, incr_error;
        struct cpu_itimer *it = &tsk->signal->it[clock_id];
        nval = timeval_to_cputime(&value->it_value);
@@ -153,8 +154,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
        ninterval = timeval_to_cputime(&value->it_interval);
        ns_ninterval = timeval_to_ns(&value->it_interval);
-        it->incr_error = cputime_sub_ns(ninterval, ns_ninterval);
+        error = cputime_sub_ns(nval, ns_nval);
-        it->error = cputime_sub_ns(nval, ns_nval);
+        incr_error = cputime_sub_ns(ninterval, ns_ninterval);
        spin_lock_irq(&tsk->sighand->siglock);
@@ -168,6 +169,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
        }
        it->expires = nval;
        it->incr = ninterval;
+        it->error = error;
+        it->incr_error = incr_error;
        trace_itimer_state(clock_id == CPUCLOCK_VIRT ?
                           ITIMER_VIRTUAL : ITIMER_PROF, value, nval);
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 7d7014634022..2eb517e23514 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -129,6 +129,7 @@ struct task_struct		*kgdb_usethread;
 struct task_struct              *kgdb_contthread;
 int                             kgdb_single_step;
+pid_t                           kgdb_sstep_pid;
 /* Our I/O buffers. */
 static char                     remcom_in_buffer[BUFMAX];
@@ -541,12 +542,17 @@ static struct task_struct *getthread(struct pt_regs *regs, int tid)
         */
        if (tid == 0 || tid == -1)
                tid = -atomic_read(&kgdb_active) - 2;
-        if (tid < 0) {
+        if (tid < -1 && tid > -NR_CPUS - 2) {
                if (kgdb_info[-tid - 2].task)
                        return kgdb_info[-tid - 2].task;
                else
                        return idle_task(-tid - 2);
        }
+        if (tid <= 0) {
+                printk(KERN_ERR "KGDB: Internal thread select error\n");
+                dump_stack();
+                return NULL;
+        }
        /*
         * find_task_by_pid_ns() does not take the tasklist lock anymore
@@ -619,7 +625,8 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
 static int kgdb_activate_sw_breakpoints(void)
 {
        unsigned long addr;
-        int error = 0;
+        int error;
+        int ret = 0;
        int i;
        for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
@@ -629,13 +636,16 @@ static int kgdb_activate_sw_breakpoints(void)
                addr = kgdb_break[i].bpt_addr;
                error = kgdb_arch_set_breakpoint(addr,
                                kgdb_break[i].saved_instr);
-                if (error)
+                if (error) {
-                        return error;
+                        ret = error;
+                        printk(KERN_INFO "KGDB: BP install failed: %lx", addr);
+                        continue;
+                }
                kgdb_flush_swbreak_addr(addr);
                kgdb_break[i].state = BP_ACTIVE;
        }
-        return 0;
+        return ret;
 }
 static int kgdb_set_sw_break(unsigned long addr)
@@ -682,7 +692,8 @@ static int kgdb_set_sw_break(unsigned long addr)
 static int kgdb_deactivate_sw_breakpoints(void)
 {
        unsigned long addr;
-        int error = 0;
+        int error;
+        int ret = 0;
        int i;
        for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
@@ -691,13 +702,15 @@ static int kgdb_deactivate_sw_breakpoints(void)
                addr = kgdb_break[i].bpt_addr;
                error = kgdb_arch_remove_breakpoint(addr,
                                        kgdb_break[i].saved_instr);
-                if (error)
+                if (error) {
-                        return error;
+                        printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr);
+                        ret = error;
+                }
                kgdb_flush_swbreak_addr(addr);
                kgdb_break[i].state = BP_SET;
        }
-        return 0;
+        return ret;
 }
 static int kgdb_remove_sw_break(unsigned long addr)
@@ -1204,8 +1217,10 @@ static int gdb_cmd_exception_pass(struct kgdb_state *ks)
                return 1;
        } else {
-                error_packet(remcom_out_buffer, -EINVAL);
+                kgdb_msg_write("KGDB only knows signal 9 (pass)"
-                return 0;
+                        " and 15 (pass and disconnect)\n"
+                        "Executing a continue without signal passing\n", 0);
+                remcom_in_buffer[0] = 'c';
        }
        /* Indicate fall through */
@@ -1395,6 +1410,7 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
        struct kgdb_state kgdb_var;
        struct kgdb_state *ks = &kgdb_var;
        unsigned long flags;
+        int sstep_tries = 100;
        int error = 0;
        int i, cpu;
@@ -1425,13 +1441,14 @@ acquirelock:
                cpu_relax();
        /*
-         * Do not start the debugger connection on this CPU if the last
+         * For single stepping, try to only enter on the processor
-         * instance of the exception handler wanted to come into the
+         * that was single stepping.  To gaurd against a deadlock, the
-         * debugger on a different CPU via a single step
+         * kernel will only try for the value of sstep_tries before
+         * giving up and continuing on.
         */
        if (atomic_read(&kgdb_cpu_doing_single_step) != -1 &&
-            atomic_read(&kgdb_cpu_doing_single_step) != cpu) {
+            (kgdb_info[cpu].task &&
+             kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
                atomic_set(&kgdb_active, -1);
                touch_softlockup_watchdog();
                clocksource_touch_watchdog();
@@ -1524,6 +1541,13 @@ acquirelock:
        }
 kgdb_restore:
+        if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
+                int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step);
+                if (kgdb_info[sstep_cpu].task)
+                        kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid;
+                else
+                        kgdb_sstep_pid = 0;
+        }
        /* Free kgdb_active */
        atomic_set(&kgdb_active, -1);
        touch_softlockup_watchdog();
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index f5dcd36d3151..4f8df01dbe51 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -168,7 +168,7 @@ static void lock_time_inc(struct lock_time *lt, u64 time)
        if (time > lt->max)
                lt->max = time;
-        if (time < lt->min || !lt->min)
+        if (time < lt->min || !lt->nr)
                lt->min = time;
        lt->total += time;
@@ -177,8 +177,15 @@ static void lock_time_inc(struct lock_time *lt, u64 time)
 static inline void lock_time_add(struct lock_time *src, struct lock_time *dst)
 {
-        dst->min += src->min;
+        if (!src->nr)
-        dst->max += src->max;
+                return;
+        if (src->max > dst->max)
+                dst->max = src->max;
+        if (src->min < dst->min || !dst->nr)
+                dst->min = src->min;
        dst->total += src->total;
        dst->nr += src->nr;
 }
@@ -379,7 +386,8 @@ static int save_trace(struct stack_trace *trace)
         * complete trace that maxes out the entries provided will be reported
         * as incomplete, friggin useless </rant>
         */
-        if (trace->entries[trace->nr_entries-1] == ULONG_MAX)
+        if (trace->nr_entries != 0 &&
+            trace->entries[trace->nr_entries-1] == ULONG_MAX)
                trace->nr_entries--;
        trace->max_entries = trace->nr_entries;
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 6b7ddba1dd64..e73e53c7582f 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -36,7 +36,7 @@
 /*
 * Each CPU has a list of per CPU events:
 */
-DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
+static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
 int perf_max_events __read_mostly = 1;
 static int perf_reserved_percpu __read_mostly;
@@ -476,7 +476,7 @@ static void perf_event_remove_from_context(struct perf_event *event)
        if (!task) {
                /*
                 * Per cpu events are removed via an smp call and
-                 * the removal is always sucessful.
+                 * the removal is always successful.
                 */
                smp_call_function_single(event->cpu,
                                         __perf_event_remove_from_context,
@@ -567,7 +567,7 @@ static void __perf_event_disable(void *info)
 * is the current context on this CPU and preemption is disabled,
 * hence we can't get into perf_event_task_sched_out for this context.
 */
-static void perf_event_disable(struct perf_event *event)
+void perf_event_disable(struct perf_event *event)
 {
        struct perf_event_context *ctx = event->ctx;
        struct task_struct *task = ctx->task;
@@ -845,7 +845,7 @@ perf_install_in_context(struct perf_event_context *ctx,
        if (!task) {
                /*
                 * Per cpu events are installed via an smp call and
-                 * the install is always sucessful.
+                 * the install is always successful.
                 */
                smp_call_function_single(cpu, __perf_install_in_context,
                                         event, 1);
@@ -971,7 +971,7 @@ static void __perf_event_enable(void *info)
 * perf_event_for_each_child or perf_event_for_each as described
 * for perf_event_disable.
 */
-static void perf_event_enable(struct perf_event *event)
+void perf_event_enable(struct perf_event *event)
 {
        struct perf_event_context *ctx = event->ctx;
        struct task_struct *task = ctx->task;
@@ -1579,7 +1579,6 @@ static void
 __perf_event_init_context(struct perf_event_context *ctx,
                            struct task_struct *task)
 {
-        memset(ctx, 0, sizeof(*ctx));
        spin_lock_init(&ctx->lock);
        mutex_init(&ctx->mutex);
        INIT_LIST_HEAD(&ctx->group_list);
@@ -1654,7 +1653,7 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
        }
        if (!ctx) {
-                ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
+                ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
                err = -ENOMEM;
                if (!ctx)
                        goto errout;
@@ -4011,6 +4010,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
        event->pmu->read(event);
        data.addr = 0;
+        data.raw = NULL;
        data.period = event->hw.last_period;
        regs = get_irq_regs();
        /*
@@ -4080,8 +4080,7 @@ static void cpu_clock_perf_event_update(struct perf_event *event)
        u64 now;
        now = cpu_clock(cpu);
-        prev = atomic64_read(&event->hw.prev_count);
+        prev = atomic64_xchg(&event->hw.prev_count, now);
-        atomic64_set(&event->hw.prev_count, now);
        atomic64_add(now - prev, &event->count);
 }
@@ -4286,15 +4285,8 @@ static void bp_perf_event_destroy(struct perf_event *event)
 static const struct pmu *bp_perf_event_init(struct perf_event *bp)
 {
        int err;
-        /*
-         * The breakpoint is already filled if we haven't created the counter
+        err = register_perf_hw_breakpoint(bp);
-         * through perf syscall
-         * FIXME: manage to get trigerred to NULL if it comes from syscalls
-         */
-        if (!bp->callback)
-                err = register_perf_hw_breakpoint(bp);
-        else
-                err = __register_perf_hw_breakpoint(bp);
        if (err)
                return ERR_PTR(err);
@@ -4308,6 +4300,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
        struct perf_sample_data sample;
        struct pt_regs *regs = data;
+        sample.raw = NULL;
        sample.addr = bp->attr.bp_addr;
        if (!perf_exclude_event(bp, regs))
@@ -4390,7 +4383,7 @@ perf_event_alloc(struct perf_event_attr *attr,
                   struct perf_event_context *ctx,
                   struct perf_event *group_leader,
                   struct perf_event *parent_event,
-                   perf_callback_t callback,
+                   perf_overflow_handler_t overflow_handler,
                   gfp_t gfpflags)
 {
        const struct pmu *pmu;
@@ -4433,10 +4426,10 @@ perf_event_alloc(struct perf_event_attr *attr,
        event->state            = PERF_EVENT_STATE_INACTIVE;
-        if (!callback && parent_event)
+        if (!overflow_handler && parent_event)
-                callback = parent_event->callback;
+                overflow_handler = parent_event->overflow_handler;
        
-        event->callback = callback;
+        event->overflow_handler = overflow_handler;
        if (attr->disabled)
                event->state = PERF_EVENT_STATE_OFF;
@@ -4776,7 +4769,8 @@ err_put_context:
 */
 struct perf_event *
 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
-                                 pid_t pid, perf_callback_t callback)
+                                 pid_t pid,
+                                 perf_overflow_handler_t overflow_handler)
 {
        struct perf_event *event;
        struct perf_event_context *ctx;
@@ -4793,7 +4787,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
        }
        event = perf_event_alloc(attr, cpu, ctx, NULL,
-                                     NULL, callback, GFP_KERNEL);
+                                 NULL, overflow_handler, GFP_KERNEL);
        if (IS_ERR(event)) {
                err = PTR_ERR(event);
                goto err_put_context;
@@ -5090,7 +5084,7 @@ again:
 */
 int perf_event_init_task(struct task_struct *child)
 {
-        struct perf_event_context *child_ctx, *parent_ctx;
+        struct perf_event_context *child_ctx = NULL, *parent_ctx;
        struct perf_event_context *cloned_ctx;
        struct perf_event *event;
        struct task_struct *parent = current;
@@ -5106,20 +5100,6 @@ int perf_event_init_task(struct task_struct *child)
                return 0;
        /*
-         * This is executed from the parent task context, so inherit
-         * events that have been marked for cloning.
-         * First allocate and initialize a context for the child.
-         */
-        child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
-        if (!child_ctx)
-                return -ENOMEM;
-        __perf_event_init_context(child_ctx, child);
-        child->perf_event_ctxp = child_ctx;
-        get_task_struct(child);
-        /*
         * If the parent's context is a clone, pin it so it won't get
         * swapped under us.
         */
@@ -5149,6 +5129,26 @@ int perf_event_init_task(struct task_struct *child)
                        continue;
                }
+                if (!child->perf_event_ctxp) {
+                        /*
+                         * This is executed from the parent task context, so
+                         * inherit events that have been marked for cloning.
+                         * First allocate and initialize a context for the
+                         * child.
+                         */
+                        child_ctx = kzalloc(sizeof(struct perf_event_context),
+                                            GFP_KERNEL);
+                        if (!child_ctx) {
+                                ret = -ENOMEM;
+                                goto exit;
+                        }
+                        __perf_event_init_context(child_ctx, child);
+                        child->perf_event_ctxp = child_ctx;
+                        get_task_struct(child);
+                }
                ret = inherit_group(event, parent, parent_ctx,
                                             child, child_ctx);
                if (ret) {
@@ -5177,6 +5177,7 @@ int perf_event_init_task(struct task_struct *child)
                get_ctx(child_ctx->parent_ctx);
        }
+exit:
        mutex_unlock(&parent_ctx->mutex);
        perf_unpin_context(parent_ctx);
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index dfdec524d1b7..3db49b9ca374 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -29,7 +29,6 @@
 #include <linux/pm_qos_params.h>
 #include <linux/sched.h>
-#include <linux/smp_lock.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/time.h>
@@ -344,37 +343,33 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
 }
 EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
-#define PID_NAME_LEN sizeof("process_1234567890")
+#define PID_NAME_LEN 32
-static char name[PID_NAME_LEN];
 static int pm_qos_power_open(struct inode *inode, struct file *filp)
 {
        int ret;
        long pm_qos_class;
+        char name[PID_NAME_LEN];
-        lock_kernel();
        pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
        if (pm_qos_class >= 0) {
                filp->private_data = (void *)pm_qos_class;
-                sprintf(name, "process_%d", current->pid);
+                snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
                ret = pm_qos_add_requirement(pm_qos_class, name,
                                        PM_QOS_DEFAULT_VALUE);
-                if (ret >= 0) {
+                if (ret >= 0)
-                        unlock_kernel();
                        return 0;
-                }
        }
-        unlock_kernel();
        return -EPERM;
 }
 static int pm_qos_power_release(struct inode *inode, struct file *filp)
 {
        int pm_qos_class;
+        char name[PID_NAME_LEN];
        pm_qos_class = (long)filp->private_data;
-        sprintf(name, "process_%d", current->pid);
+        snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
        pm_qos_remove_requirement(pm_qos_class, name);
        return 0;
@@ -385,13 +380,14 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
 {
        s32 value;
        int pm_qos_class;
+        char name[PID_NAME_LEN];
        pm_qos_class = (long)filp->private_data;
        if (count != sizeof(s32))
                return -EINVAL;
        if (copy_from_user(&value, buf, sizeof(s32)))
                return -EFAULT;
-        sprintf(name, "process_%d", current->pid);
+        snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
        pm_qos_update_requirement(pm_qos_class, name, value);
        return  sizeof(s32);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 5c9dc228747b..438ff4523513 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -384,7 +384,8 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
 /*
 * Validate the clockid_t for a new CPU-clock timer, and initialize the timer.
- * This is called from sys_timer_create with the new timer already locked.
+ * This is called from sys_timer_create() and do_cpu_nanosleep() with the
+ * new timer already all-zeros initialized.
 */
 int posix_cpu_timer_create(struct k_itimer *new_timer)
 {
@@ -396,8 +397,6 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
                return -EINVAL;
        INIT_LIST_HEAD(&new_timer->it.cpu.entry);
-        new_timer->it.cpu.incr.sched = 0;
-        new_timer->it.cpu.expires.sched = 0;
        read_lock(&tasklist_lock);
        if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) {
diff --git a/kernel/resource.c b/kernel/resource.c
index fb11a58b9594..dc15686b7a77 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -308,35 +308,37 @@ static int find_resource(struct resource *root, struct resource *new,
                         void *alignf_data)
 {
        struct resource *this = root->child;
+        resource_size_t start, end;
-        new->start = root->start;
+        start = root->start;
        /*
         * Skip past an allocated resource that starts at 0, since the assignment
         * of this->start - 1 to new->end below would cause an underflow.
         */
        if (this && this->start == 0) {
-                new->start = this->end + 1;
+                start = this->end + 1;
                this = this->sibling;
        }
        for(;;) {
                if (this)
-                        new->end = this->start - 1;
+                        end = this->start - 1;
                else
-                        new->end = root->end;
+                        end = root->end;
-                if (new->start < min)
+                if (start < min)
-                        new->start = min;
+                        start = min;
-                if (new->end > max)
+                if (end > max)
-                        new->end = max;
+                        end = max;
-                new->start = ALIGN(new->start, align);
+                start = ALIGN(start, align);
                if (alignf)
                        alignf(alignf_data, new, size, align);
-                if (new->start < new->end && new->end - new->start >= size - 1) {
+                if (start < end && end - start >= size - 1) {
-                        new->end = new->start + size - 1;
+                        new->start = start;
+                        new->end = start + size - 1;
                        return 0;
                }
                if (!this)
                        break;
-                new->start = this->end + 1;
+                start = this->end + 1;
                this = this->sibling;
        }
        return -EBUSY;
diff --git a/kernel/sched.c b/kernel/sched.c
index e7f2cfa6a257..ff39cadf621e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -814,6 +814,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
 * default: 0.25ms
 */
 unsigned int sysctl_sched_shares_ratelimit = 250000;
+unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
 /*
 * Inject some fuzzyness into changing the per-cpu group shares
@@ -1614,7 +1615,7 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
 */
 static int tg_shares_up(struct task_group *tg, void *data)
 {
-        unsigned long weight, rq_weight = 0, shares = 0;
+        unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
        unsigned long *usd_rq_weight;
        struct sched_domain *sd = data;
        unsigned long flags;
@@ -1630,6 +1631,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
                weight = tg->cfs_rq[i]->load.weight;
                usd_rq_weight[i] = weight;
+                rq_weight += weight;
                /*
                 * If there are currently no tasks on the cpu pretend there
                 * is one of average load so that when a new task gets to
@@ -1638,10 +1640,13 @@ static int tg_shares_up(struct task_group *tg, void *data)
                if (!weight)
                        weight = NICE_0_LOAD;
-                rq_weight += weight;
+                sum_weight += weight;
                shares += tg->cfs_rq[i]->shares;
        }
+        if (!rq_weight)
+                rq_weight = sum_weight;
        if ((!shares && rq_weight) || shares > tg->shares)
                shares = tg->shares;
@@ -1810,6 +1815,22 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 #endif
 static void calc_load_account_active(struct rq *this_rq);
+static void update_sysctl(void);
+static int get_update_sysctl_factor(void);
+static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
+{
+        set_task_rq(p, cpu);
+#ifdef CONFIG_SMP
+        /*
+         * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
+         * successfuly executed on another CPU. We must ensure that updates of
+         * per-task data have been completed by this moment.
+         */
+        smp_wmb();
+        task_thread_info(p)->cpu = cpu;
+#endif
+}
 #include "sched_stats.h"
 #include "sched_idletask.c"
@@ -1967,20 +1988,6 @@ inline int task_curr(const struct task_struct *p)
        return cpu_curr(task_cpu(p)) == p;
 }
-static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
-{
-        set_task_rq(p, cpu);
-#ifdef CONFIG_SMP
-        /*
-         * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
-         * successfuly executed on another CPU. We must ensure that updates of
-         * per-task data have been completed by this moment.
-         */
-        smp_wmb();
-        task_thread_info(p)->cpu = cpu;
-#endif
-}
 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
                                       const struct sched_class *prev_class,
                                       int oldprio, int running)
@@ -2060,29 +2067,13 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
        int old_cpu = task_cpu(p);
-        struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
        struct cfs_rq *old_cfsrq = task_cfs_rq(p),
                      *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
-        u64 clock_offset;
-        clock_offset = old_rq->clock - new_rq->clock;
        trace_sched_migrate_task(p, new_cpu);
-#ifdef CONFIG_SCHEDSTATS
-        if (p->se.wait_start)
-                p->se.wait_start -= clock_offset;
-        if (p->se.sleep_start)
-                p->se.sleep_start -= clock_offset;
-        if (p->se.block_start)
-                p->se.block_start -= clock_offset;
-#endif
        if (old_cpu != new_cpu) {
                p->se.nr_migrations++;
-#ifdef CONFIG_SCHEDSTATS
-                if (task_hot(p, old_rq->clock, NULL))
-                        schedstat_inc(p, se.nr_forced2_migrations);
-#endif
                perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
                                     1, 1, NULL, 0);
        }
@@ -2323,6 +2314,14 @@ void task_oncpu_function_call(struct task_struct *p,
        preempt_enable();
 }
+#ifdef CONFIG_SMP
+static inline
+int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
+{
+        return p->sched_class->select_task_rq(p, sd_flags, wake_flags);
+}
+#endif
 /***
 * try_to_wake_up - wake up a thread
 * @p: the to-be-woken-up thread
@@ -2374,17 +2373,14 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
        if (task_contributes_to_load(p))
                rq->nr_uninterruptible--;
        p->state = TASK_WAKING;
-        task_rq_unlock(rq, &flags);
+        __task_rq_unlock(rq);
-        cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
+        cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
-        if (cpu != orig_cpu) {
+        if (cpu != orig_cpu)
-                local_irq_save(flags);
-                rq = cpu_rq(cpu);
-                update_rq_clock(rq);
                set_task_cpu(p, cpu);
-                local_irq_restore(flags);
-        }
+        rq = __task_rq_lock(p);
-        rq = task_rq_lock(p, &flags);
+        update_rq_clock(rq);
        WARN_ON(p->state != TASK_WAKING);
        cpu = task_cpu(p);
@@ -2499,7 +2495,6 @@ static void __sched_fork(struct task_struct *p)
        p->se.avg_overlap               = 0;
        p->se.start_runtime             = 0;
        p->se.avg_wakeup                = sysctl_sched_wakeup_granularity;
-        p->se.avg_running               = 0;
 #ifdef CONFIG_SCHEDSTATS
        p->se.wait_start                        = 0;
@@ -2521,7 +2516,6 @@ static void __sched_fork(struct task_struct *p)
        p->se.nr_failed_migrations_running      = 0;
        p->se.nr_failed_migrations_hot          = 0;
        p->se.nr_forced_migrations              = 0;
-        p->se.nr_forced2_migrations             = 0;
        p->se.nr_wakeups                        = 0;
        p->se.nr_wakeups_sync                   = 0;
@@ -2558,7 +2552,6 @@ static void __sched_fork(struct task_struct *p)
 void sched_fork(struct task_struct *p, int clone_flags)
 {
        int cpu = get_cpu();
-        unsigned long flags;
        __sched_fork(p);
@@ -2592,13 +2585,13 @@ void sched_fork(struct task_struct *p, int clone_flags)
        if (!rt_prio(p->prio))
                p->sched_class = &fair_sched_class;
+        if (p->sched_class->task_fork)
+                p->sched_class->task_fork(p);
 #ifdef CONFIG_SMP
-        cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
+        cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
 #endif
-        local_irq_save(flags);
-        update_rq_clock(cpu_rq(cpu));
        set_task_cpu(p, cpu);
-        local_irq_restore(flags);
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
        if (likely(sched_info_on()))
@@ -2631,17 +2624,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
        rq = task_rq_lock(p, &flags);
        BUG_ON(p->state != TASK_RUNNING);
        update_rq_clock(rq);
+        activate_task(rq, p, 0);
-        if (!p->sched_class->task_new || !current->se.on_rq) {
-                activate_task(rq, p, 0);
-        } else {
-                /*
-                 * Let the scheduling class do new task startup
-                 * management (if any):
-                 */
-                p->sched_class->task_new(rq, p);
-                inc_nr_running(rq);
-        }
        trace_sched_wakeup_new(rq, p, 1);
        check_preempt_curr(rq, p, WF_FORK);
 #ifdef CONFIG_SMP
@@ -3156,7 +3139,7 @@ out:
 void sched_exec(void)
 {
        int new_cpu, this_cpu = get_cpu();
-        new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
+        new_cpu = select_task_rq(current, SD_BALANCE_EXEC, 0);
        put_cpu();
        if (new_cpu != this_cpu)
                sched_migrate_task(current, new_cpu);
@@ -3172,10 +3155,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
        deactivate_task(src_rq, p, 0);
        set_task_cpu(p, this_cpu);
        activate_task(this_rq, p, 0);
-        /*
-         * Note that idle threads have a prio of MAX_PRIO, for this test
-         * to be always true for them.
-         */
        check_preempt_curr(this_rq, p, 0);
 }
@@ -4134,7 +4113,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
        unsigned long flags;
        struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
-        cpumask_copy(cpus, cpu_online_mask);
+        cpumask_copy(cpus, cpu_active_mask);
        /*
         * When power savings policy is enabled for the parent domain, idle
@@ -4297,7 +4276,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
        int all_pinned = 0;
        struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
-        cpumask_copy(cpus, cpu_online_mask);
+        cpumask_copy(cpus, cpu_active_mask);
        /*
         * When power savings policy is enabled for the parent domain, idle
@@ -4694,7 +4673,7 @@ int select_nohz_load_balancer(int stop_tick)
                cpumask_set_cpu(cpu, nohz.cpu_mask);
                /* time for ilb owner also to sleep */
-                if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
+                if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
                        if (atomic_read(&nohz.load_balancer) == cpu)
                                atomic_set(&nohz.load_balancer, -1);
                        return 0;
@@ -5396,13 +5375,14 @@ static inline void schedule_debug(struct task_struct *prev)
 #endif
 }
-static void put_prev_task(struct rq *rq, struct task_struct *p)
+static void put_prev_task(struct rq *rq, struct task_struct *prev)
 {
-        u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime;
+        if (prev->state == TASK_RUNNING) {
+                u64 runtime = prev->se.sum_exec_runtime;
-        update_avg(&p->se.avg_running, runtime);
+                runtime -= prev->se.prev_sum_exec_runtime;
+                runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
-        if (p->state == TASK_RUNNING) {
                /*
                 * In order to avoid avg_overlap growing stale when we are
                 * indeed overlapping and hence not getting put to sleep, grow
@@ -5412,12 +5392,9 @@ static void put_prev_task(struct rq *rq, struct task_struct *p)
                 * correlates to the amount of cache footprint a task can
                 * build up.
                 */
-                runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
+                update_avg(&prev->se.avg_overlap, runtime);
-                update_avg(&p->se.avg_overlap, runtime);
-        } else {
-                update_avg(&p->se.avg_running, 0);
        }
-        p->sched_class->put_prev_task(rq, p);
+        prev->sched_class->put_prev_task(rq, prev);
 }
 /*
@@ -6631,6 +6608,8 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
 long sched_getaffinity(pid_t pid, struct cpumask *mask)
 {
        struct task_struct *p;
+        unsigned long flags;
+        struct rq *rq;
        int retval;
        get_online_cpus();
@@ -6645,7 +6624,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
        if (retval)
                goto out_unlock;
+        rq = task_rq_lock(p, &flags);
        cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
+        task_rq_unlock(rq, &flags);
 out_unlock:
        read_unlock(&tasklist_lock);
@@ -6883,6 +6864,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
 {
        struct task_struct *p;
        unsigned int time_slice;
+        unsigned long flags;
+        struct rq *rq;
        int retval;
        struct timespec t;
@@ -6899,7 +6882,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
        if (retval)
                goto out_unlock;
-        time_slice = p->sched_class->get_rr_interval(p);
+        rq = task_rq_lock(p, &flags);
+        time_slice = p->sched_class->get_rr_interval(rq, p);
+        task_rq_unlock(rq, &flags);
        read_unlock(&tasklist_lock);
        jiffies_to_timespec(time_slice, &t);
@@ -7000,7 +6985,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
        __sched_fork(idle);
        idle->se.exec_start = sched_clock();
-        idle->prio = idle->normal_prio = MAX_PRIO;
        cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
        __set_task_cpu(idle, cpu);
@@ -7041,22 +7025,43 @@ cpumask_var_t nohz_cpu_mask;
 *
 * This idea comes from the SD scheduler of Con Kolivas:
 */
-static inline void sched_init_granularity(void)
+static int get_update_sysctl_factor(void)
 {
-        unsigned int factor = 1 + ilog2(num_online_cpus());
+        unsigned int cpus = min_t(int, num_online_cpus(), 8);
-        const unsigned long limit = 200000000;
+        unsigned int factor;
+        switch (sysctl_sched_tunable_scaling) {
+        case SCHED_TUNABLESCALING_NONE:
+                factor = 1;
+                break;
+        case SCHED_TUNABLESCALING_LINEAR:
+                factor = cpus;
+                break;
+        case SCHED_TUNABLESCALING_LOG:
+        default:
+                factor = 1 + ilog2(cpus);
+                break;
+        }
-        sysctl_sched_min_granularity *= factor;
+        return factor;
-        if (sysctl_sched_min_granularity > limit)
+}
-                sysctl_sched_min_granularity = limit;
-        sysctl_sched_latency *= factor;
+static void update_sysctl(void)
-        if (sysctl_sched_latency > limit)
+{
-                sysctl_sched_latency = limit;
+        unsigned int factor = get_update_sysctl_factor();
-        sysctl_sched_wakeup_granularity *= factor;
+#define SET_SYSCTL(name) \
+        (sysctl_##name = (factor) * normalized_sysctl_##name)
+        SET_SYSCTL(sched_min_granularity);
+        SET_SYSCTL(sched_latency);
+        SET_SYSCTL(sched_wakeup_granularity);
+        SET_SYSCTL(sched_shares_ratelimit);
+#undef SET_SYSCTL
+}
-        sysctl_sched_shares_ratelimit *= factor;
+static inline void sched_init_granularity(void)
+{
+        update_sysctl();
 }
 #ifdef CONFIG_SMP
@@ -7093,7 +7098,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
        int ret = 0;
        rq = task_rq_lock(p, &flags);
-        if (!cpumask_intersects(new_mask, cpu_online_mask)) {
+        if (!cpumask_intersects(new_mask, cpu_active_mask)) {
                ret = -EINVAL;
                goto out;
        }
@@ -7115,7 +7120,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
        if (cpumask_test_cpu(task_cpu(p), new_mask))
                goto out;
-        if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
+        if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) {
                /* Need help from migration thread: drop lock and wait. */
                struct task_struct *mt = rq->migration_thread;
@@ -7269,19 +7274,19 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 again:
        /* Look for allowed, online CPU in same node. */
-        for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
+        for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
                if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
                        goto move;
        /* Any allowed, online CPU? */
-        dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
+        dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
        if (dest_cpu < nr_cpu_ids)
                goto move;
        /* No more Mr. Nice Guy. */
        if (dest_cpu >= nr_cpu_ids) {
                cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
-                dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
+                dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
                /*
                 * Don't tell them about moving exiting tasks or
@@ -7310,7 +7315,7 @@ move:
 */
 static void migrate_nr_uninterruptible(struct rq *rq_src)
 {
-        struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask));
+        struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
        unsigned long flags;
        local_irq_save(flags);
@@ -7563,7 +7568,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
 static struct ctl_table_header *sd_sysctl_header;
 static void register_sched_domain_sysctl(void)
 {
-        int i, cpu_num = num_online_cpus();
+        int i, cpu_num = num_possible_cpus();
        struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
        char buf[32];
@@ -7573,7 +7578,7 @@ static void register_sched_domain_sysctl(void)
        if (entry == NULL)
                return;
-        for_each_online_cpu(i) {
+        for_each_possible_cpu(i) {
                snprintf(buf, 32, "cpu%d", i);
                entry->procname = kstrdup(buf, GFP_KERNEL);
                entry->mode = 0555;
@@ -7703,7 +7708,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                spin_lock_irq(&rq->lock);
                update_rq_clock(rq);
                deactivate_task(rq, rq->idle, 0);
-                rq->idle->static_prio = MAX_PRIO;
                __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
                rq->idle->sched_class = &idle_sched_class;
                migrate_dead_tasks(cpu);
@@ -9099,7 +9103,7 @@ match1:
        if (doms_new == NULL) {
                ndoms_cur = 0;
                doms_new = &fallback_doms;
-                cpumask_andnot(doms_new[0], cpu_online_mask, cpu_isolated_map);
+                cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
                WARN_ON_ONCE(dattr_new);
        }
@@ -9230,8 +9234,10 @@ static int update_sched_domains(struct notifier_block *nfb,
        switch (action) {
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
-        case CPU_DEAD:
+        case CPU_DOWN_PREPARE:
-        case CPU_DEAD_FROZEN:
+        case CPU_DOWN_PREPARE_FROZEN:
+        case CPU_DOWN_FAILED:
+        case CPU_DOWN_FAILED_FROZEN:
                partition_sched_domains(1, NULL, NULL);
                return NOTIFY_OK;
@@ -9278,7 +9284,7 @@ void __init sched_init_smp(void)
 #endif
        get_online_cpus();
        mutex_lock(&sched_domains_mutex);
-        arch_init_sched_domains(cpu_online_mask);
+        arch_init_sched_domains(cpu_active_mask);
        cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
        if (cpumask_empty(non_isolated_cpus))
                cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -9842,13 +9848,15 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
                se = kzalloc_node(sizeof(struct sched_entity),
                                  GFP_KERNEL, cpu_to_node(i));
                if (!se)
-                        goto err;
+                        goto err_free_rq;
                init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
        }
        return 1;
+ err_free_rq:
+        kfree(cfs_rq);
 err:
        return 0;
 }
@@ -9930,13 +9938,15 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
                rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
                                     GFP_KERNEL, cpu_to_node(i));
                if (!rt_se)
-                        goto err;
+                        goto err_free_rq;
                init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
        }
        return 1;
+ err_free_rq:
+        kfree(rt_rq);
 err:
        return 0;
 }
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 6988cf08f705..5ae24fc65d75 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -309,6 +309,12 @@ static void print_cpu(struct seq_file *m, int cpu)
        print_rq(m, rq, cpu);
 }
+static const char *sched_tunable_scaling_names[] = {
+        "none",
+        "logaritmic",
+        "linear"
+};
 static int sched_debug_show(struct seq_file *m, void *v)
 {
        u64 now = ktime_to_ns(ktime_get());
@@ -334,6 +340,10 @@ static int sched_debug_show(struct seq_file *m, void *v)
 #undef PN
 #undef P
+        SEQ_printf(m, "  .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling",
+                sysctl_sched_tunable_scaling,
+                sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
        for_each_online_cpu(cpu)
                print_cpu(m, cpu);
@@ -399,7 +409,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
        PN(se.sum_exec_runtime);
        PN(se.avg_overlap);
        PN(se.avg_wakeup);
-        PN(se.avg_running);
        nr_switches = p->nvcsw + p->nivcsw;
@@ -423,7 +432,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
        P(se.nr_failed_migrations_running);
        P(se.nr_failed_migrations_hot);
        P(se.nr_forced_migrations);
-        P(se.nr_forced2_migrations);
        P(se.nr_wakeups);
        P(se.nr_wakeups_sync);
        P(se.nr_wakeups_migrate);
@@ -499,7 +507,6 @@ void proc_sched_set_task(struct task_struct *p)
        p->se.nr_failed_migrations_running      = 0;
        p->se.nr_failed_migrations_hot          = 0;
        p->se.nr_forced_migrations              = 0;
-        p->se.nr_forced2_migrations             = 0;
        p->se.nr_wakeups                        = 0;
        p->se.nr_wakeups_sync                   = 0;
        p->se.nr_wakeups_migrate                = 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f61837ad336d..804a411838f1 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -21,6 +21,7 @@
 */
 #include <linux/latencytop.h>
+#include <linux/sched.h>
 /*
 * Targeted preemption latency for CPU-bound tasks:
@@ -35,12 +36,26 @@
 *  run vmstat and monitor the context-switches (cs) field)
 */
 unsigned int sysctl_sched_latency = 5000000ULL;
+unsigned int normalized_sysctl_sched_latency = 5000000ULL;
+/*
+ * The initial- and re-scaling of tunables is configurable
+ * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
+ *
+ * Options are:
+ * SCHED_TUNABLESCALING_NONE - unscaled, always *1
+ * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
+ * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
+ */
+enum sched_tunable_scaling sysctl_sched_tunable_scaling
+        = SCHED_TUNABLESCALING_LOG;
 /*
 * Minimal preemption granularity for CPU-bound tasks:
 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
 unsigned int sysctl_sched_min_granularity = 1000000ULL;
+unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL;
 /*
 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
@@ -70,6 +85,7 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
 * have immediate wakeup/sleep latencies.
 */
 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
+unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
@@ -383,11 +399,12 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 */
 #ifdef CONFIG_SCHED_DEBUG
-int sched_nr_latency_handler(struct ctl_table *table, int write,
+int sched_proc_update_handler(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp,
                loff_t *ppos)
 {
        int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+        int factor = get_update_sysctl_factor();
        if (ret || !write)
                return ret;
@@ -395,6 +412,14 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
        sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
                                        sysctl_sched_min_granularity);
+#define WRT_SYSCTL(name) \
+        (normalized_sysctl_##name = sysctl_##name / (factor))
+        WRT_SYSCTL(sched_min_granularity);
+        WRT_SYSCTL(sched_latency);
+        WRT_SYSCTL(sched_wakeup_granularity);
+        WRT_SYSCTL(sched_shares_ratelimit);
+#undef WRT_SYSCTL
        return 0;
 }
 #endif
@@ -1403,7 +1428,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
                new_cpu = prev_cpu;
        }
-        rcu_read_lock();
        for_each_domain(cpu, tmp) {
                /*
                 * If power savings logic is enabled for a domain, see if we
@@ -1484,10 +1508,8 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
                        update_shares(tmp);
        }
-        if (affine_sd && wake_affine(affine_sd, p, sync)) {
+        if (affine_sd && wake_affine(affine_sd, p, sync))
-                new_cpu = cpu;
+                return cpu;
-                goto out;
-        }
        while (sd) {
                int load_idx = sd->forkexec_idx;
@@ -1528,8 +1550,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
                /* while loop will break here if sd == NULL */
        }
-out:
-        rcu_read_unlock();
        return new_cpu;
 }
 #endif /* CONFIG_SMP */
@@ -1651,12 +1671,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
        int sync = wake_flags & WF_SYNC;
        int scale = cfs_rq->nr_running >= sched_nr_latency;
-        update_curr(cfs_rq);
+        if (unlikely(rt_prio(p->prio)))
+                goto preempt;
-        if (unlikely(rt_prio(p->prio))) {
-                resched_task(curr);
-                return;
-        }
        if (unlikely(p->sched_class != &fair_sched_class))
                return;
@@ -1682,50 +1698,44 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
                return;
        /* Idle tasks are by definition preempted by everybody. */
-        if (unlikely(curr->policy == SCHED_IDLE)) {
+        if (unlikely(curr->policy == SCHED_IDLE))
-                resched_task(curr);
+                goto preempt;
-                return;
-        }
-        if ((sched_feat(WAKEUP_SYNC) && sync) ||
+        if (sched_feat(WAKEUP_SYNC) && sync)
-            (sched_feat(WAKEUP_OVERLAP) &&
+                goto preempt;
-             (se->avg_overlap < sysctl_sched_migration_cost &&
-              pse->avg_overlap < sysctl_sched_migration_cost))) {
-                resched_task(curr);
-                return;
-        }
-        if (sched_feat(WAKEUP_RUNNING)) {
+        if (sched_feat(WAKEUP_OVERLAP) &&
-                if (pse->avg_running < se->avg_running) {
+                        se->avg_overlap < sysctl_sched_migration_cost &&
-                        set_next_buddy(pse);
+                        pse->avg_overlap < sysctl_sched_migration_cost)
-                        resched_task(curr);
+                goto preempt;
-                        return;
-                }
-        }
        if (!sched_feat(WAKEUP_PREEMPT))
                return;
+        update_curr(cfs_rq);
        find_matching_se(&se, &pse);
        BUG_ON(!pse);
+        if (wakeup_preempt_entity(se, pse) == 1)
+                goto preempt;
-        if (wakeup_preempt_entity(se, pse) == 1) {
+        return;
-                resched_task(curr);
-                /*
+preempt:
-                 * Only set the backward buddy when the current task is still
+        resched_task(curr);
-                 * on the rq. This can happen when a wakeup gets interleaved
+        /*
-                 * with schedule on the ->pre_schedule() or idle_balance()
+         * Only set the backward buddy when the current task is still
-                 * point, either of which can * drop the rq lock.
+         * on the rq. This can happen when a wakeup gets interleaved
-                 *
+         * with schedule on the ->pre_schedule() or idle_balance()
-                 * Also, during early boot the idle thread is in the fair class,
+         * point, either of which can * drop the rq lock.
-                 * for obvious reasons its a bad idea to schedule back to it.
+         *
-                 */
+         * Also, during early boot the idle thread is in the fair class,
-                if (unlikely(!se->on_rq || curr == rq->idle))
+         * for obvious reasons its a bad idea to schedule back to it.
-                        return;
+         */
-                if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
+        if (unlikely(!se->on_rq || curr == rq->idle))
-                        set_last_buddy(se);
+                return;
-        }
+        if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
+                set_last_buddy(se);
 }
 static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1905,6 +1915,17 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
        return 0;
 }
+static void rq_online_fair(struct rq *rq)
+{
+        update_sysctl();
+}
+static void rq_offline_fair(struct rq *rq)
+{
+        update_sysctl();
+}
 #endif /* CONFIG_SMP */
 /*
@@ -1922,28 +1943,30 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 }
 /*
- * Share the fairness runtime between parent and child, thus the
+ * called on fork with the child task as argument from the parent's context
- * total amount of pressure for CPU stays equal - new tasks
+ *  - child not yet on the tasklist
- * get a chance to run but frequent forkers are not allowed to
+ *  - preemption disabled
- * monopolize the CPU. Note: the parent runqueue is locked,
- * the child is not running yet.
 */
-static void task_new_fair(struct rq *rq, struct task_struct *p)
+static void task_fork_fair(struct task_struct *p)
 {
-        struct cfs_rq *cfs_rq = task_cfs_rq(p);
+        struct cfs_rq *cfs_rq = task_cfs_rq(current);
        struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
        int this_cpu = smp_processor_id();
+        struct rq *rq = this_rq();
+        unsigned long flags;
+        spin_lock_irqsave(&rq->lock, flags);
-        sched_info_queued(p);
+        if (unlikely(task_cpu(p) != this_cpu))
+                __set_task_cpu(p, this_cpu);
        update_curr(cfs_rq);
        if (curr)
                se->vruntime = curr->vruntime;
        place_entity(cfs_rq, se, 1);
-        /* 'curr' will be NULL if the child belongs to a different group */
+        if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
-        if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
-                        curr && entity_before(curr, se)) {
                /*
                 * Upon rescheduling, sched_class::put_prev_task() will place
                 * 'current' within the tree based on its new key value.
@@ -1952,7 +1975,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
                resched_task(rq->curr);
        }
-        enqueue_task_fair(rq, p, 0);
+        spin_unlock_irqrestore(&rq->lock, flags);
 }
 /*
@@ -2014,21 +2037,17 @@ static void moved_group_fair(struct task_struct *p)
 }
 #endif
-unsigned int get_rr_interval_fair(struct task_struct *task)
+unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
 {
        struct sched_entity *se = &task->se;
-        unsigned long flags;
-        struct rq *rq;
        unsigned int rr_interval = 0;
        /*
         * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
         * idle runqueue:
         */
-        rq = task_rq_lock(task, &flags);
        if (rq->cfs.load.weight)
                rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
-        task_rq_unlock(rq, &flags);
        return rr_interval;
 }
@@ -2052,11 +2071,13 @@ static const struct sched_class fair_sched_class = {
        .load_balance           = load_balance_fair,
        .move_one_task          = move_one_task_fair,
+        .rq_online              = rq_online_fair,
+        .rq_offline             = rq_offline_fair,
 #endif
        .set_curr_task          = set_curr_task_fair,
        .task_tick              = task_tick_fair,
-        .task_new               = task_new_fair,
+        .task_fork              = task_fork_fair,
        .prio_changed           = prio_changed_fair,
        .switched_to            = switched_to_fair,
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 0d94083582c7..d5059fd761d9 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -54,11 +54,6 @@ SCHED_FEAT(WAKEUP_SYNC, 0)
 SCHED_FEAT(WAKEUP_OVERLAP, 0)
 /*
- * Wakeup preemption towards tasks that run short
- */
-SCHED_FEAT(WAKEUP_RUNNING, 0)
-/*
 * Use the SYNC wakeup hint, pipes and the likes use this to indicate
 * the remote end is likely to consume the data we just wrote, and
 * therefore has cache benefit from being placed on the same cpu, see
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index b133a28fcde3..33d5384a73a8 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -97,7 +97,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
                check_preempt_curr(rq, p, 0);
 }
-unsigned int get_rr_interval_idle(struct task_struct *task)
+unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
 {
        return 0;
 }
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 5c5fef378415..aecbd9c6b20c 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1721,7 +1721,7 @@ static void set_curr_task_rt(struct rq *rq)
        dequeue_pushable_task(rq, p);
 }
-unsigned int get_rr_interval_rt(struct task_struct *task)
+unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
 {
        /*
         * Time slice is 0 for SCHED_FIFO tasks
diff --git a/kernel/sys.c b/kernel/sys.c
index 9968c5fb55b9..585d6cd10040 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -8,7 +8,6 @@
 #include <linux/mm.h>
 #include <linux/utsname.h>
 #include <linux/mman.h>
-#include <linux/smp_lock.h>
 #include <linux/notifier.h>
 #include <linux/reboot.h>
 #include <linux/prctl.h>
@@ -349,6 +348,9 @@ void kernel_power_off(void)
        machine_power_off();
 }
 EXPORT_SYMBOL_GPL(kernel_power_off);
+static DEFINE_MUTEX(reboot_mutex);
 /*
 * Reboot system call: for obvious reasons only root may call it,
 * and even root needs to set up some magic numbers in the registers
@@ -381,7 +383,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
        if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
                cmd = LINUX_REBOOT_CMD_HALT;
-        lock_kernel();
+        mutex_lock(&reboot_mutex);
        switch (cmd) {
        case LINUX_REBOOT_CMD_RESTART:
                kernel_restart(NULL);
@@ -397,20 +399,18 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
        case LINUX_REBOOT_CMD_HALT:
                kernel_halt();
-                unlock_kernel();
                do_exit(0);
                panic("cannot halt");
        case LINUX_REBOOT_CMD_POWER_OFF:
                kernel_power_off();
-                unlock_kernel();
                do_exit(0);
                break;
        case LINUX_REBOOT_CMD_RESTART2:
                if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) {
-                        unlock_kernel();
+                        ret = -EFAULT;
-                        return -EFAULT;
+                        break;
                }
                buffer[sizeof(buffer) - 1] = '\0';
@@ -433,7 +433,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
                ret = -EINVAL;
                break;
        }
-        unlock_kernel();
+        mutex_unlock(&reboot_mutex);
        return ret;
 }
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9327a26765c5..554ac4894f0f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -244,6 +244,10 @@ static int min_sched_granularity_ns = 100000;		/* 100 usecs */
 static int max_sched_granularity_ns = NSEC_PER_SEC;     /* 1 second */
 static int min_wakeup_granularity_ns;                   /* 0 usecs */
 static int max_wakeup_granularity_ns = NSEC_PER_SEC;    /* 1 second */
+static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
+static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
+static int min_sched_shares_ratelimit = 100000; /* 100 usec */
+static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
 #endif
 static struct ctl_table kern_table[] = {
@@ -260,7 +264,7 @@ static struct ctl_table kern_table[] = {
                .data           = &sysctl_sched_min_granularity,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
-                .proc_handler   = sched_nr_latency_handler,
+                .proc_handler   = sched_proc_update_handler,
                .extra1         = &min_sched_granularity_ns,
                .extra2         = &max_sched_granularity_ns,
        },
@@ -269,7 +273,7 @@ static struct ctl_table kern_table[] = {
                .data           = &sysctl_sched_latency,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
-                .proc_handler   = sched_nr_latency_handler,
+                .proc_handler   = sched_proc_update_handler,
                .extra1         = &min_sched_granularity_ns,
                .extra2         = &max_sched_granularity_ns,
        },
@@ -278,7 +282,7 @@ static struct ctl_table kern_table[] = {
                .data           = &sysctl_sched_wakeup_granularity,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
-                .proc_handler   = proc_dointvec_minmax,
+                .proc_handler   = sched_proc_update_handler,
                .extra1         = &min_wakeup_granularity_ns,
                .extra2         = &max_wakeup_granularity_ns,
        },
@@ -287,7 +291,18 @@ static struct ctl_table kern_table[] = {
                .data           = &sysctl_sched_shares_ratelimit,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
-                .proc_handler   = proc_dointvec,
+                .proc_handler   = sched_proc_update_handler,
+                .extra1         = &min_sched_shares_ratelimit,
+                .extra2         = &max_sched_shares_ratelimit,
+        },
+        {
+                .procname       = "sched_tunable_scaling",
+                .data           = &sysctl_sched_tunable_scaling,
+                .maxlen         = sizeof(enum sched_tunable_scaling),
+                .mode           = 0644,
+                .proc_handler   = sched_proc_update_handler,
+                .extra1         = &min_sched_tunable_scaling,
+                .extra2         = &max_sched_tunable_scaling,
        },
        {
                .procname       = "sched_shares_thresh",
@@ -298,13 +313,6 @@ static struct ctl_table kern_table[] = {
                .extra1         = &zero,
        },
        {
-                .procname       = "sched_features",
-                .data           = &sysctl_sched_features,
-                .maxlen         = sizeof(unsigned int),
-                .mode           = 0644,
-                .proc_handler   = proc_dointvec,
-        },
-        {
                .procname       = "sched_migration_cost",
                .data           = &sysctl_sched_migration_cost,
                .maxlen         = sizeof(unsigned int),
diff --git a/kernel/time.c b/kernel/time.c
index 804798005d19..c6324d96009e 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -136,7 +136,6 @@ static inline void warp_clock(void)
        write_seqlock_irq(&xtime_lock);
        wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60;
        xtime.tv_sec += sys_tz.tz_minuteswest * 60;
-        update_xtime_cache(0);
        write_sequnlock_irq(&xtime_lock);
        clock_was_set();
 }
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 620b58abdc32..20a8920029ee 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -20,6 +20,8 @@
 #include <linux/sysdev.h>
 #include <linux/tick.h>
+#include "tick-internal.h"
 /* The registered clock event devices */
 static LIST_HEAD(clockevent_devices);
 static LIST_HEAD(clockevents_released);
@@ -37,10 +39,9 @@ static DEFINE_SPINLOCK(clockevents_lock);
 *
 * Math helper, returns latch value converted to nanoseconds (bound checked)
 */
-unsigned long clockevent_delta2ns(unsigned long latch,
+u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
-                                  struct clock_event_device *evt)
 {
-        u64 clc = ((u64) latch << evt->shift);
+        u64 clc = (u64) latch << evt->shift;
        if (unlikely(!evt->mult)) {
                evt->mult = 1;
@@ -50,10 +51,10 @@ unsigned long clockevent_delta2ns(unsigned long latch,
        do_div(clc, evt->mult);
        if (clc < 1000)
                clc = 1000;
-        if (clc > LONG_MAX)
+        if (clc > KTIME_MAX)
-                clc = LONG_MAX;
+                clc = KTIME_MAX;
-        return (unsigned long) clc;
+        return clc;
 }
 EXPORT_SYMBOL_GPL(clockevent_delta2ns);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 4a310906b3e8..e85c23404d34 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -107,6 +107,59 @@ u64 timecounter_cyc2time(struct timecounter *tc,
 }
 EXPORT_SYMBOL_GPL(timecounter_cyc2time);
+/**
+ * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
+ * @mult:       pointer to mult variable
+ * @shift:      pointer to shift variable
+ * @from:       frequency to convert from
+ * @to:         frequency to convert to
+ * @minsec:     guaranteed runtime conversion range in seconds
+ *
+ * The function evaluates the shift/mult pair for the scaled math
+ * operations of clocksources and clockevents.
+ *
+ * @to and @from are frequency values in HZ. For clock sources @to is
+ * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock
+ * event @to is the counter frequency and @from is NSEC_PER_SEC.
+ *
+ * The @minsec conversion range argument controls the time frame in
+ * seconds which must be covered by the runtime conversion with the
+ * calculated mult and shift factors. This guarantees that no 64bit
+ * overflow happens when the input value of the conversion is
+ * multiplied with the calculated mult factor. Larger ranges may
+ * reduce the conversion accuracy by chosing smaller mult and shift
+ * factors.
+ */
+void
+clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
+{
+        u64 tmp;
+        u32 sft, sftacc= 32;
+        /*
+         * Calculate the shift factor which is limiting the conversion
+         * range:
+         */
+        tmp = ((u64)minsec * from) >> 32;
+        while (tmp) {
+                tmp >>=1;
+                sftacc--;
+        }
+        /*
+         * Find the conversion shift/mult pair which has the best
+         * accuracy and fits the maxsec conversion range:
+         */
+        for (sft = 32; sft > 0; sft--) {
+                tmp = (u64) to << sft;
+                do_div(tmp, from);
+                if ((tmp >> sftacc) == 0)
+                        break;
+        }
+        *mult = tmp;
+        *shift = sft;
+}
 /*[Clocksource internal variables]---------
 * curr_clocksource:
 *      currently selected clocksource.
@@ -413,6 +466,47 @@ void clocksource_touch_watchdog(void)
        clocksource_resume_watchdog();
 }
+/**
+ * clocksource_max_deferment - Returns max time the clocksource can be deferred
+ * @cs:         Pointer to clocksource
+ *
+ */
+static u64 clocksource_max_deferment(struct clocksource *cs)
+{
+        u64 max_nsecs, max_cycles;
+        /*
+         * Calculate the maximum number of cycles that we can pass to the
+         * cyc2ns function without overflowing a 64-bit signed result. The
+         * maximum number of cycles is equal to ULLONG_MAX/cs->mult which
+         * is equivalent to the below.
+         * max_cycles < (2^63)/cs->mult
+         * max_cycles < 2^(log2((2^63)/cs->mult))
+         * max_cycles < 2^(log2(2^63) - log2(cs->mult))
+         * max_cycles < 2^(63 - log2(cs->mult))
+         * max_cycles < 1 << (63 - log2(cs->mult))
+         * Please note that we add 1 to the result of the log2 to account for
+         * any rounding errors, ensure the above inequality is satisfied and
+         * no overflow will occur.
+         */
+        max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1));
+        /*
+         * The actual maximum number of cycles we can defer the clocksource is
+         * determined by the minimum of max_cycles and cs->mask.
+         */
+        max_cycles = min_t(u64, max_cycles, (u64) cs->mask);
+        max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift);
+        /*
+         * To ensure that the clocksource does not wrap whilst we are idle,
+         * limit the time the clocksource can be deferred by 12.5%. Please
+         * note a margin of 12.5% is used because this can be computed with
+         * a shift, versus say 10% which would require division.
+         */
+        return max_nsecs - (max_nsecs >> 5);
+}
 #ifdef CONFIG_GENERIC_TIME
 /**
@@ -511,6 +605,9 @@ static void clocksource_enqueue(struct clocksource *cs)
 */
 int clocksource_register(struct clocksource *cs)
 {
+        /* calculate max idle time permitted for this clocksource */
+        cs->max_idle_ns = clocksource_max_deferment(cs);
        mutex_lock(&clocksource_mutex);
        clocksource_enqueue(cs);
        clocksource_select();
@@ -580,7 +677,7 @@ sysfs_show_current_clocksources(struct sys_device *dev,
 * @count:      length of buffer
 *
 * Takes input from sysfs interface for manually overriding the default
- * clocksource selction.
+ * clocksource selection.
 */
 static ssize_t sysfs_override_clocksource(struct sys_device *dev,
                                          struct sysdev_attribute *attr,
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index a96c0e2b89cf..0a8a213016f0 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -50,9 +50,9 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
                                dev->min_delta_ns += dev->min_delta_ns >> 1;
                        printk(KERN_WARNING
-                               "CE: %s increasing min_delta_ns to %lu nsec\n",
+                               "CE: %s increasing min_delta_ns to %llu nsec\n",
                               dev->name ? dev->name : "?",
-                               dev->min_delta_ns << 1);
+                               (unsigned long long) dev->min_delta_ns << 1);
                        i = 0;
                }
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 89aed5933ed4..f992762d7f51 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -134,18 +134,13 @@ __setup("nohz=", setup_tick_nohz);
 * value. We do this unconditionally on any cpu, as we don't know whether the
 * cpu, which has the update task assigned is in a long sleep.
 */
-static void tick_nohz_update_jiffies(void)
+static void tick_nohz_update_jiffies(ktime_t now)
 {
        int cpu = smp_processor_id();
        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
        unsigned long flags;
-        ktime_t now;
-        if (!ts->tick_stopped)
-                return;
        cpumask_clear_cpu(cpu, nohz_cpu_mask);
-        now = ktime_get();
        ts->idle_waketime = now;
        local_irq_save(flags);
@@ -155,20 +150,17 @@ static void tick_nohz_update_jiffies(void)
        touch_softlockup_watchdog();
 }
-static void tick_nohz_stop_idle(int cpu)
+static void tick_nohz_stop_idle(int cpu, ktime_t now)
 {
        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+        ktime_t delta;
-        if (ts->idle_active) {
+        delta = ktime_sub(now, ts->idle_entrytime);
-                ktime_t now, delta;
+        ts->idle_lastupdate = now;
-                now = ktime_get();
+        ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
-                delta = ktime_sub(now, ts->idle_entrytime);
+        ts->idle_active = 0;
-                ts->idle_lastupdate = now;
-                ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
-                ts->idle_active = 0;
-                sched_clock_idle_wakeup_event(0);
+        sched_clock_idle_wakeup_event(0);
-        }
 }
 static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
@@ -216,6 +208,7 @@ void tick_nohz_stop_sched_tick(int inidle)
        struct tick_sched *ts;
        ktime_t last_update, expires, now;
        struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
+        u64 time_delta;
        int cpu;
        local_irq_save(flags);
@@ -263,7 +256,7 @@ void tick_nohz_stop_sched_tick(int inidle)
                if (ratelimit < 10) {
                        printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
-                               local_softirq_pending());
+                               (unsigned int) local_softirq_pending());
                        ratelimit++;
                }
                goto end;
@@ -275,14 +268,18 @@ void tick_nohz_stop_sched_tick(int inidle)
                seq = read_seqbegin(&xtime_lock);
                last_update = last_jiffies_update;
                last_jiffies = jiffies;
+                time_delta = timekeeping_max_deferment();
        } while (read_seqretry(&xtime_lock, seq));
-        /* Get the next timer wheel timer */
+        if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
-        next_jiffies = get_next_timer_interrupt(last_jiffies);
+            arch_needs_cpu(cpu)) {
-        delta_jiffies = next_jiffies - last_jiffies;
+                next_jiffies = last_jiffies + 1;
-        if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu))
                delta_jiffies = 1;
+        } else {
+                /* Get the next timer wheel timer */
+                next_jiffies = get_next_timer_interrupt(last_jiffies);
+                delta_jiffies = next_jiffies - last_jiffies;
+        }
        /*
         * Do not stop the tick, if we are only one off
         * or if the cpu is required for rcu
@@ -294,22 +291,51 @@ void tick_nohz_stop_sched_tick(int inidle)
        if ((long)delta_jiffies >= 1) {
                /*
-                * calculate the expiry time for the next timer wheel
-                * timer
-                */
-                expires = ktime_add_ns(last_update, tick_period.tv64 *
-                                   delta_jiffies);
-                /*
                 * If this cpu is the one which updates jiffies, then
                 * give up the assignment and let it be taken by the
                 * cpu which runs the tick timer next, which might be
                 * this cpu as well. If we don't drop this here the
                 * jiffies might be stale and do_timer() never
-                 * invoked.
+                 * invoked. Keep track of the fact that it was the one
+                 * which had the do_timer() duty last. If this cpu is
+                 * the one which had the do_timer() duty last, we
+                 * limit the sleep time to the timekeeping
+                 * max_deferement value which we retrieved
+                 * above. Otherwise we can sleep as long as we want.
                 */
-                if (cpu == tick_do_timer_cpu)
+                if (cpu == tick_do_timer_cpu) {
                        tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+                        ts->do_timer_last = 1;
+                } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
+                        time_delta = KTIME_MAX;
+                        ts->do_timer_last = 0;
+                } else if (!ts->do_timer_last) {
+                        time_delta = KTIME_MAX;
+                }
+                /*
+                 * calculate the expiry time for the next timer wheel
+                 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
+                 * that there is no timer pending or at least extremely
+                 * far into the future (12 days for HZ=1000). In this
+                 * case we set the expiry to the end of time.
+                 */
+                if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) {
+                        /*
+                         * Calculate the time delta for the next timer event.
+                         * If the time delta exceeds the maximum time delta
+                         * permitted by the current clocksource then adjust
+                         * the time delta accordingly to ensure the
+                         * clocksource does not wrap.
+                         */
+                        time_delta = min_t(u64, time_delta,
+                                           tick_period.tv64 * delta_jiffies);
+                }
+                if (time_delta < KTIME_MAX)
+                        expires = ktime_add_ns(last_update, time_delta);
+                else
+                        expires.tv64 = KTIME_MAX;
                if (delta_jiffies > 1)
                        cpumask_set_cpu(cpu, nohz_cpu_mask);
@@ -342,22 +368,19 @@ void tick_nohz_stop_sched_tick(int inidle)
                ts->idle_sleeps++;
+                /* Mark expires */
+                ts->idle_expires = expires;
                /*
-                 * delta_jiffies >= NEXT_TIMER_MAX_DELTA signals that
+                 * If the expiration time == KTIME_MAX, then
-                 * there is no timer pending or at least extremly far
+                 * in this case we simply stop the tick timer.
-                 * into the future (12 days for HZ=1000). In this case
-                 * we simply stop the tick timer:
                 */
-                if (unlikely(delta_jiffies >= NEXT_TIMER_MAX_DELTA)) {
+                 if (unlikely(expires.tv64 == KTIME_MAX)) {
-                        ts->idle_expires.tv64 = KTIME_MAX;
                        if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
                                hrtimer_cancel(&ts->sched_timer);
                        goto out;
                }
-                /* Mark expiries */
-                ts->idle_expires = expires;
                if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
                        hrtimer_start(&ts->sched_timer, expires,
                                      HRTIMER_MODE_ABS_PINNED);
@@ -436,7 +459,11 @@ void tick_nohz_restart_sched_tick(void)
        ktime_t now;
        local_irq_disable();
-        tick_nohz_stop_idle(cpu);
+        if (ts->idle_active || (ts->inidle && ts->tick_stopped))
+                now = ktime_get();
+        if (ts->idle_active)
+                tick_nohz_stop_idle(cpu, now);
        if (!ts->inidle || !ts->tick_stopped) {
                ts->inidle = 0;
@@ -450,7 +477,6 @@ void tick_nohz_restart_sched_tick(void)
        /* Update jiffies first */
        select_nohz_load_balancer(0);
-        now = ktime_get();
        tick_do_update_jiffies64(now);
        cpumask_clear_cpu(cpu, nohz_cpu_mask);
@@ -584,22 +610,18 @@ static void tick_nohz_switch_to_nohz(void)
 * timer and do not touch the other magic bits which need to be done
 * when idle is left.
 */
-static void tick_nohz_kick_tick(int cpu)
+static void tick_nohz_kick_tick(int cpu, ktime_t now)
 {
 #if 0
        /* Switch back to 2.6.27 behaviour */
        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
-        ktime_t delta, now;
+        ktime_t delta;
-        if (!ts->tick_stopped)
-                return;
        /*
         * Do not touch the tick device, when the next expiry is either
         * already reached or less/equal than the tick period.
         */
-        now = ktime_get();
        delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
        if (delta.tv64 <= tick_period.tv64)
                return;
@@ -608,9 +630,26 @@ static void tick_nohz_kick_tick(int cpu)
 #endif
 }
+static inline void tick_check_nohz(int cpu)
+{
+        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+        ktime_t now;
+        if (!ts->idle_active && !ts->tick_stopped)
+                return;
+        now = ktime_get();
+        if (ts->idle_active)
+                tick_nohz_stop_idle(cpu, now);
+        if (ts->tick_stopped) {
+                tick_nohz_update_jiffies(now);
+                tick_nohz_kick_tick(cpu, now);
+        }
+}
 #else
 static inline void tick_nohz_switch_to_nohz(void) { }
+static inline void tick_check_nohz(int cpu) { }
 #endif /* NO_HZ */
@@ -620,11 +659,7 @@ static inline void tick_nohz_switch_to_nohz(void) { }
 void tick_check_idle(int cpu)
 {
        tick_check_oneshot_broadcast(cpu);
-#ifdef CONFIG_NO_HZ
+        tick_check_nohz(cpu);
-        tick_nohz_stop_idle(cpu);
-        tick_nohz_update_jiffies();
-        tick_nohz_kick_tick(cpu);
-#endif
 }
 /*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index c3a4e2907eaa..af4135f05825 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -165,19 +165,12 @@ struct timespec raw_time;
 /* flag for if timekeeping is suspended */
 int __read_mostly timekeeping_suspended;
-static struct timespec xtime_cache __attribute__ ((aligned (16)));
-void update_xtime_cache(u64 nsec)
-{
-        xtime_cache = xtime;
-        timespec_add_ns(&xtime_cache, nsec);
-}
 /* must hold xtime_lock */
 void timekeeping_leap_insert(int leapsecond)
 {
        xtime.tv_sec += leapsecond;
        wall_to_monotonic.tv_sec -= leapsecond;
-        update_vsyscall(&xtime, timekeeper.clock);
+        update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
 }
 #ifdef CONFIG_GENERIC_TIME
@@ -332,12 +325,10 @@ int do_settimeofday(struct timespec *tv)
        xtime = *tv;
-        update_xtime_cache(0);
        timekeeper.ntp_error = 0;
        ntp_clear();
-        update_vsyscall(&xtime, timekeeper.clock);
+        update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
        write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -488,6 +479,17 @@ int timekeeping_valid_for_hres(void)
 }
 /**
+ * timekeeping_max_deferment - Returns max time the clocksource can be deferred
+ *
+ * Caller must observe xtime_lock via read_seqbegin/read_seqretry to
+ * ensure that the clocksource does not change!
+ */
+u64 timekeeping_max_deferment(void)
+{
+        return timekeeper.clock->max_idle_ns;
+}
+/**
 * read_persistent_clock -  Return time from the persistent clock.
 *
 * Weak dummy function for arches that do not yet support it.
@@ -548,7 +550,6 @@ void __init timekeeping_init(void)
        }
        set_normalized_timespec(&wall_to_monotonic,
                                -boot.tv_sec, -boot.tv_nsec);
-        update_xtime_cache(0);
        total_sleep_time.tv_sec = 0;
        total_sleep_time.tv_nsec = 0;
        write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -582,7 +583,6 @@ static int timekeeping_resume(struct sys_device *dev)
                wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
                total_sleep_time = timespec_add_safe(total_sleep_time, ts);
        }
-        update_xtime_cache(0);
        /* re-base the last cycle value */
        timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
        timekeeper.ntp_error = 0;
@@ -723,6 +723,49 @@ static void timekeeping_adjust(s64 offset)
 }
 /**
+ * logarithmic_accumulation - shifted accumulation of cycles
+ *
+ * This functions accumulates a shifted interval of cycles into
+ * into a shifted interval nanoseconds. Allows for O(log) accumulation
+ * loop.
+ *
+ * Returns the unconsumed cycles.
+ */
+static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
+{
+        u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
+        /* If the offset is smaller then a shifted interval, do nothing */
+        if (offset < timekeeper.cycle_interval<<shift)
+                return offset;
+        /* Accumulate one shifted interval */
+        offset -= timekeeper.cycle_interval << shift;
+        timekeeper.clock->cycle_last += timekeeper.cycle_interval << shift;
+        timekeeper.xtime_nsec += timekeeper.xtime_interval << shift;
+        while (timekeeper.xtime_nsec >= nsecps) {
+                timekeeper.xtime_nsec -= nsecps;
+                xtime.tv_sec++;
+                second_overflow();
+        }
+        /* Accumulate into raw time */
+        raw_time.tv_nsec += timekeeper.raw_interval << shift;;
+        while (raw_time.tv_nsec >= NSEC_PER_SEC) {
+                raw_time.tv_nsec -= NSEC_PER_SEC;
+                raw_time.tv_sec++;
+        }
+        /* Accumulate error between NTP and clock interval */
+        timekeeper.ntp_error += tick_length << shift;
+        timekeeper.ntp_error -= timekeeper.xtime_interval <<
+                                (timekeeper.ntp_error_shift + shift);
+        return offset;
+}
+/**
 * update_wall_time - Uses the current clocksource to increment the wall time
 *
 * Called from the timer interrupt, must hold a write on xtime_lock.
@@ -731,7 +774,7 @@ void update_wall_time(void)
 {
        struct clocksource *clock;
        cycle_t offset;
-        u64 nsecs;
+        int shift = 0, maxshift;
        /* Make sure we're fully resumed: */
        if (unlikely(timekeeping_suspended))
@@ -745,33 +788,22 @@ void update_wall_time(void)
 #endif
        timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
-        /* normally this loop will run just once, however in the
+        /*
-         * case of lost or late ticks, it will accumulate correctly.
+         * With NO_HZ we may have to accumulate many cycle_intervals
+         * (think "ticks") worth of time at once. To do this efficiently,
+         * we calculate the largest doubling multiple of cycle_intervals
+         * that is smaller then the offset. We then accumulate that
+         * chunk in one go, and then try to consume the next smaller
+         * doubled multiple.
         */
+        shift = ilog2(offset) - ilog2(timekeeper.cycle_interval);
+        shift = max(0, shift);
+        /* Bound shift to one less then what overflows tick_length */
+        maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1;
+        shift = min(shift, maxshift);
        while (offset >= timekeeper.cycle_interval) {
-                u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
+                offset = logarithmic_accumulation(offset, shift);
+                shift--;
-                /* accumulate one interval */
-                offset -= timekeeper.cycle_interval;
-                clock->cycle_last += timekeeper.cycle_interval;
-                timekeeper.xtime_nsec += timekeeper.xtime_interval;
-                if (timekeeper.xtime_nsec >= nsecps) {
-                        timekeeper.xtime_nsec -= nsecps;
-                        xtime.tv_sec++;
-                        second_overflow();
-                }
-                raw_time.tv_nsec += timekeeper.raw_interval;
-                if (raw_time.tv_nsec >= NSEC_PER_SEC) {
-                        raw_time.tv_nsec -= NSEC_PER_SEC;
-                        raw_time.tv_sec++;
-                }
-                /* accumulate error between NTP and clock interval */
-                timekeeper.ntp_error += tick_length;
-                timekeeper.ntp_error -= timekeeper.xtime_interval <<
-                                        timekeeper.ntp_error_shift;
        }
        /* correct the clock when NTP error is too big */
@@ -807,11 +839,8 @@ void update_wall_time(void)
        timekeeper.ntp_error += timekeeper.xtime_nsec <<
                                timekeeper.ntp_error_shift;
-        nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift);
-        update_xtime_cache(nsecs);
        /* check to see if there is a new clocksource to use */
-        update_vsyscall(&xtime, timekeeper.clock);
+        update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
 }
 /**
@@ -846,13 +875,13 @@ void monotonic_to_bootbased(struct timespec *ts)
 unsigned long get_seconds(void)
 {
-        return xtime_cache.tv_sec;
+        return xtime.tv_sec;
 }
 EXPORT_SYMBOL(get_seconds);
 struct timespec __current_kernel_time(void)
 {
-        return xtime_cache;
+        return xtime;
 }
 struct timespec current_kernel_time(void)
@@ -862,8 +891,7 @@ struct timespec current_kernel_time(void)
        do {
                seq = read_seqbegin(&xtime_lock);
+                now = xtime;
-                now = xtime_cache;
        } while (read_seqretry(&xtime_lock, seq));
        return now;
@@ -877,8 +905,7 @@ struct timespec get_monotonic_coarse(void)
        do {
                seq = read_seqbegin(&xtime_lock);
+                now = xtime;
-                now = xtime_cache;
                mono = wall_to_monotonic;
        } while (read_seqretry(&xtime_lock, seq));
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 1b5b7aa2fdfd..9d80db4747d4 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -150,6 +150,9 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
        P_ns(expires_next);
        P(hres_active);
        P(nr_events);
+        P(nr_retries);
+        P(nr_hangs);
+        P_ns(max_hang_time);
 #endif
 #undef P
 #undef P_ns
@@ -204,10 +207,12 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
                return;
        }
        SEQ_printf(m, "%s\n", dev->name);
-        SEQ_printf(m, " max_delta_ns:   %lu\n", dev->max_delta_ns);
+        SEQ_printf(m, " max_delta_ns:   %llu\n",
-        SEQ_printf(m, " min_delta_ns:   %lu\n", dev->min_delta_ns);
+                   (unsigned long long) dev->max_delta_ns);
-        SEQ_printf(m, " mult:           %lu\n", dev->mult);
+        SEQ_printf(m, " min_delta_ns:   %llu\n",
-        SEQ_printf(m, " shift:          %d\n", dev->shift);
+                   (unsigned long long) dev->min_delta_ns);
+        SEQ_printf(m, " mult:           %u\n", dev->mult);
+        SEQ_printf(m, " shift:          %u\n", dev->shift);
        SEQ_printf(m, " mode:           %d\n", dev->mode);
        SEQ_printf(m, " next_event:     %Ld nsecs\n",
                   (unsigned long long) ktime_to_ns(dev->next_event));
@@ -252,7 +257,7 @@ static int timer_list_show(struct seq_file *m, void *v)
        u64 now = ktime_to_ns(ktime_get());
        int cpu;
-        SEQ_printf(m, "Timer List Version: v0.4\n");
+        SEQ_printf(m, "Timer List Version: v0.5\n");
        SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
        SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 874f2893cff0..88bd9ae2a9ed 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1361,11 +1361,7 @@ int trace_array_vprintk(struct trace_array *tr,
        pause_graph_tracing();
        raw_local_irq_save(irq_flags);
        __raw_spin_lock(&trace_buf_lock);
-        if (args == NULL) {
+        len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
-                strncpy(trace_buf, fmt, TRACE_BUF_SIZE);
-                len = strlen(trace_buf);
-        } else
-                len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
        size = sizeof(*entry) + len + 1;
        buffer = tr->buffer;
@@ -1516,6 +1512,8 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
        int i = (int)*pos;
        void *ent;
+        WARN_ON_ONCE(iter->leftover);
        (*pos)++;
        /* can't go backwards */
@@ -1614,8 +1612,16 @@ static void *s_start(struct seq_file *m, loff_t *pos)
                        ;
        } else {
-                l = *pos - 1;
+                /*
-                p = s_next(m, p, &l);
+                 * If we overflowed the seq_file before, then we want
+                 * to just reuse the trace_seq buffer again.
+                 */
+                if (iter->leftover)
+                        p = iter;
+                else {
+                        l = *pos - 1;
+                        p = s_next(m, p, &l);
+                }
        }
        trace_event_read_lock();
@@ -1923,6 +1929,7 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
 static int s_show(struct seq_file *m, void *v)
 {
        struct trace_iterator *iter = v;
+        int ret;
        if (iter->ent == NULL) {
                if (iter->tr) {
@@ -1942,9 +1949,27 @@ static int s_show(struct seq_file *m, void *v)
                        if (!(trace_flags & TRACE_ITER_VERBOSE))
                                print_func_help_header(m);
                }
+        } else if (iter->leftover) {
+                /*
+                 * If we filled the seq_file buffer earlier, we
+                 * want to just show it now.
+                 */
+                ret = trace_print_seq(m, &iter->seq);
+                /* ret should this time be zero, but you never know */
+                iter->leftover = ret;
        } else {
                print_trace_line(iter);
-                trace_print_seq(m, &iter->seq);
+                ret = trace_print_seq(m, &iter->seq);
+                /*
+                 * If we overflow the seq_file buffer, then it will
+                 * ask us for this data again at start up.
+                 * Use that instead.
+                 *  ret is 0 if seq_file write succeeded.
+                 *        -1 otherwise.
+                 */
+                iter->leftover = ret;
        }
        return 0;
@@ -2898,6 +2923,10 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
        else
                cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask);
+        if (iter->trace->pipe_close)
+                iter->trace->pipe_close(iter);
        mutex_unlock(&trace_types_lock);
        free_cpumask_var(iter->started);
@@ -3320,6 +3349,16 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
        return cnt;
 }
+static int mark_printk(const char *fmt, ...)
+{
+        int ret;
+        va_list args;
+        va_start(args, fmt);
+        ret = trace_vprintk(0, fmt, args);
+        va_end(args);
+        return ret;
+}
 static ssize_t
 tracing_mark_write(struct file *filp, const char __user *ubuf,
                                        size_t cnt, loff_t *fpos)
@@ -3346,7 +3385,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
        } else
                buf[cnt] = '\0';
-        cnt = trace_vprintk(0, buf, NULL);
+        cnt = mark_printk("%s", buf);
        kfree(buf);
        *fpos += cnt;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 1d7f4830a80d..7fa33cab6962 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -272,6 +272,7 @@ struct tracer_flags {
 * @pipe_open: called when the trace_pipe file is opened
 * @wait_pipe: override how the user waits for traces on trace_pipe
 * @close: called when the trace file is released
+ * @pipe_close: called when the trace_pipe file is released
 * @read: override the default read callback on trace_pipe
 * @splice_read: override the default splice_read callback on trace_pipe
 * @selftest: selftest to run on boot (see trace_selftest.c)
@@ -290,6 +291,7 @@ struct tracer {
        void                    (*pipe_open)(struct trace_iterator *iter);
        void                    (*wait_pipe)(struct trace_iterator *iter);
        void                    (*close)(struct trace_iterator *iter);
+        void                    (*pipe_close)(struct trace_iterator *iter);
        ssize_t                 (*read)(struct trace_iterator *iter,
                                        struct file *filp, char __user *ubuf,
                                        size_t cnt, loff_t *ppos);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 45e6c01b2e4d..a43d009c561a 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -14,9 +14,20 @@
 #include "trace.h"
 #include "trace_output.h"
-struct fgraph_data {
+struct fgraph_cpu_data {
        pid_t           last_pid;
        int             depth;
+        int             ignore;
+};
+struct fgraph_data {
+        struct fgraph_cpu_data          *cpu_data;
+        /* Place to preserve last processed entry. */
+        struct ftrace_graph_ent_entry   ent;
+        struct ftrace_graph_ret_entry   ret;
+        int                             failed;
+        int                             cpu;
 };
 #define TRACE_GRAPH_INDENT      2
@@ -384,7 +395,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
        if (!data)
                return TRACE_TYPE_HANDLED;
-        last_pid = &(per_cpu_ptr(data, cpu)->last_pid);
+        last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
        if (*last_pid == pid)
                return TRACE_TYPE_HANDLED;
@@ -435,26 +446,49 @@ static struct ftrace_graph_ret_entry *
 get_return_for_leaf(struct trace_iterator *iter,
                struct ftrace_graph_ent_entry *curr)
 {
-        struct ring_buffer_iter *ring_iter;
+        struct fgraph_data *data = iter->private;
+        struct ring_buffer_iter *ring_iter = NULL;
        struct ring_buffer_event *event;
        struct ftrace_graph_ret_entry *next;
-        ring_iter = iter->buffer_iter[iter->cpu];
+        /*
+         * If the previous output failed to write to the seq buffer,
+         * then we just reuse the data from before.
+         */
+        if (data && data->failed) {
+                curr = &data->ent;
+                next = &data->ret;
+        } else {
-        /* First peek to compare current entry and the next one */
+                ring_iter = iter->buffer_iter[iter->cpu];
-        if (ring_iter)
-                event = ring_buffer_iter_peek(ring_iter, NULL);
+                /* First peek to compare current entry and the next one */
-        else {
+                if (ring_iter)
-        /* We need to consume the current entry to see the next one */
+                        event = ring_buffer_iter_peek(ring_iter, NULL);
-                ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
+                else {
-                event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
+                        /*
-                                        NULL);
+                         * We need to consume the current entry to see
-        }
+                         * the next one.
+                         */
+                        ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
+                        event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
+                                                 NULL);
+                }
-        if (!event)
+                if (!event)
-                return NULL;
+                        return NULL;
+                next = ring_buffer_event_data(event);
-        next = ring_buffer_event_data(event);
+                if (data) {
+                        /*
+                         * Save current and next entries for later reference
+                         * if the output fails.
+                         */
+                        data->ent = *curr;
+                        data->ret = *next;
+                }
+        }
        if (next->ent.type != TRACE_GRAPH_RET)
                return NULL;
@@ -640,7 +674,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
        if (data) {
                int cpu = iter->cpu;
-                int *depth = &(per_cpu_ptr(data, cpu)->depth);
+                int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
                /*
                 * Comments display at + 1 to depth. Since
@@ -688,7 +722,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
        if (data) {
                int cpu = iter->cpu;
-                int *depth = &(per_cpu_ptr(data, cpu)->depth);
+                int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
                *depth = call->depth;
        }
@@ -782,19 +816,34 @@ static enum print_line_t
 print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
                        struct trace_iterator *iter)
 {
-        int cpu = iter->cpu;
+        struct fgraph_data *data = iter->private;
        struct ftrace_graph_ent *call = &field->graph_ent;
        struct ftrace_graph_ret_entry *leaf_ret;
+        static enum print_line_t ret;
+        int cpu = iter->cpu;
        if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func))
                return TRACE_TYPE_PARTIAL_LINE;
        leaf_ret = get_return_for_leaf(iter, field);
        if (leaf_ret)
-                return print_graph_entry_leaf(iter, field, leaf_ret, s);
+                ret = print_graph_entry_leaf(iter, field, leaf_ret, s);
        else
-                return print_graph_entry_nested(iter, field, s, cpu);
+                ret = print_graph_entry_nested(iter, field, s, cpu);
+        if (data) {
+                /*
+                 * If we failed to write our output, then we need to make
+                 * note of it. Because we already consumed our entry.
+                 */
+                if (s->full) {
+                        data->failed = 1;
+                        data->cpu = cpu;
+                } else
+                        data->failed = 0;
+        }
+        return ret;
 }
 static enum print_line_t
@@ -810,7 +859,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
        if (data) {
                int cpu = iter->cpu;
-                int *depth = &(per_cpu_ptr(data, cpu)->depth);
+                int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
                /*
                 * Comments display at + 1 to depth. This is the
@@ -873,7 +922,7 @@ print_graph_comment(struct trace_seq *s,  struct trace_entry *ent,
        int i;
        if (data)
-                depth = per_cpu_ptr(data, iter->cpu)->depth;
+                depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
        if (print_graph_prologue(iter, s, 0, 0))
                return TRACE_TYPE_PARTIAL_LINE;
@@ -941,8 +990,33 @@ print_graph_comment(struct trace_seq *s,  struct trace_entry *ent,
 enum print_line_t
 print_graph_function(struct trace_iterator *iter)
 {
+        struct ftrace_graph_ent_entry *field;
+        struct fgraph_data *data = iter->private;
        struct trace_entry *entry = iter->ent;
        struct trace_seq *s = &iter->seq;
+        int cpu = iter->cpu;
+        int ret;
+        if (data && per_cpu_ptr(data->cpu_data, cpu)->ignore) {
+                per_cpu_ptr(data->cpu_data, cpu)->ignore = 0;
+                return TRACE_TYPE_HANDLED;
+        }
+        /*
+         * If the last output failed, there's a possibility we need
+         * to print out the missing entry which would never go out.
+         */
+        if (data && data->failed) {
+                field = &data->ent;
+                iter->cpu = data->cpu;
+                ret = print_graph_entry(field, s, iter);
+                if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) {
+                        per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1;
+                        ret = TRACE_TYPE_NO_CONSUME;
+                }
+                iter->cpu = cpu;
+                return ret;
+        }
        switch (entry->type) {
        case TRACE_GRAPH_ENT: {
@@ -952,7 +1026,7 @@ print_graph_function(struct trace_iterator *iter)
                 * sizeof(struct ftrace_graph_ent_entry) is very small,
                 * it can be safely saved at the stack.
                 */
-                struct ftrace_graph_ent_entry *field, saved;
+                struct ftrace_graph_ent_entry saved;
                trace_assign_type(field, entry);
                saved = *field;
                return print_graph_entry(&saved, s, iter);
@@ -1030,31 +1104,54 @@ static void print_graph_headers(struct seq_file *s)
 static void graph_trace_open(struct trace_iterator *iter)
 {
        /* pid and depth on the last trace processed */
-        struct fgraph_data *data = alloc_percpu(struct fgraph_data);
+        struct fgraph_data *data;
        int cpu;
+        iter->private = NULL;
+        data = kzalloc(sizeof(*data), GFP_KERNEL);
        if (!data)
-                pr_warning("function graph tracer: not enough memory\n");
+                goto out_err;
-        else
-                for_each_possible_cpu(cpu) {
+        data->cpu_data = alloc_percpu(struct fgraph_cpu_data);
-                        pid_t *pid = &(per_cpu_ptr(data, cpu)->last_pid);
+        if (!data->cpu_data)
-                        int *depth = &(per_cpu_ptr(data, cpu)->depth);
+                goto out_err_free;
-                        *pid = -1;
-                        *depth = 0;
+        for_each_possible_cpu(cpu) {
-                }
+                pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
+                int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
+                int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore);
+                *pid = -1;
+                *depth = 0;
+                *ignore = 0;
+        }
        iter->private = data;
+        return;
+ out_err_free:
+        kfree(data);
+ out_err:
+        pr_warning("function graph tracer: not enough memory\n");
 }
 static void graph_trace_close(struct trace_iterator *iter)
 {
-        free_percpu(iter->private);
+        struct fgraph_data *data = iter->private;
+        if (data) {
+                free_percpu(data->cpu_data);
+                kfree(data);
+        }
 }
 static struct tracer graph_trace __read_mostly = {
        .name           = "function_graph",
        .open           = graph_trace_open,
+        .pipe_open      = graph_trace_open,
        .close          = graph_trace_close,
+        .pipe_close     = graph_trace_close,
        .wait_pipe      = poll_wait_pipe,
        .init           = graph_trace_init,
        .reset          = graph_trace_reset,
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index aff5f80b59b8..b52d397e57eb 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -606,23 +606,22 @@ static int create_trace_probe(int argc, char **argv)
         */
        struct trace_probe *tp;
        int i, ret = 0;
-        int is_return = 0;
+        int is_return = 0, is_delete = 0;
        char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL;
        unsigned long offset = 0;
        void *addr = NULL;
        char buf[MAX_EVENT_NAME_LEN];
-        if (argc < 2) {
+        /* argc must be >= 1 */
-                pr_info("Probe point is not specified.\n");
-                return -EINVAL;
-        }
        if (argv[0][0] == 'p')
                is_return = 0;
        else if (argv[0][0] == 'r')
                is_return = 1;
+        else if (argv[0][0] == '-')
+                is_delete = 1;
        else {
-                pr_info("Probe definition must be started with 'p' or 'r'.\n");
+                pr_info("Probe definition must be started with 'p', 'r' or"
+                        " '-'.\n");
                return -EINVAL;
        }
@@ -642,7 +641,29 @@ static int create_trace_probe(int argc, char **argv)
                        return -EINVAL;
                }
        }
+        if (!group)
+                group = KPROBE_EVENT_SYSTEM;
+        if (is_delete) {
+                if (!event) {
+                        pr_info("Delete command needs an event name.\n");
+                        return -EINVAL;
+                }
+                tp = find_probe_event(event, group);
+                if (!tp) {
+                        pr_info("Event %s/%s doesn't exist.\n", group, event);
+                        return -ENOENT;
+                }
+                /* delete an event */
+                unregister_trace_probe(tp);
+                free_trace_probe(tp);
+                return 0;
+        }
+        if (argc < 2) {
+                pr_info("Probe point is not specified.\n");
+                return -EINVAL;
+        }
        if (isdigit(argv[1][0])) {
                if (is_return) {
                        pr_info("Return probe point must be a symbol.\n");
@@ -671,8 +692,6 @@ static int create_trace_probe(int argc, char **argv)
        argc -= 2; argv += 2;
        /* setup a probe */
-        if (!group)
-                group = KPROBE_EVENT_SYSTEM;
        if (!event) {
                /* Make a new event name */
                if (symbol)
@@ -1114,7 +1133,7 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
        struct trace_probe *tp = (struct trace_probe *)event_call->data;
        ret = trace_define_common_fields(event_call);
-        if (!ret)
+        if (ret)
                return ret;
        DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
@@ -1132,7 +1151,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
        struct trace_probe *tp = (struct trace_probe *)event_call->data;
        ret = trace_define_common_fields(event_call);
-        if (!ret)
+        if (ret)
                return ret;
        DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index ddfa0fd43bc0..acb87d4a4ac1 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -79,11 +79,12 @@ void ksym_collect_stats(unsigned long hbp_hit_addr)
 }
 #endif /* CONFIG_PROFILE_KSYM_TRACER */
-void ksym_hbp_handler(struct perf_event *hbp, void *data)
+void ksym_hbp_handler(struct perf_event *hbp, int nmi,
+                      struct perf_sample_data *data,
+                      struct pt_regs *regs)
 {
        struct ring_buffer_event *event;
        struct ksym_trace_entry *entry;
-        struct pt_regs *regs = data;
        struct ring_buffer *buffer;
        int pc;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index b6c12c6a1bcd..8e46b3323cdc 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -23,13 +23,21 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
 static int next_event_type = __TRACE_LAST_TYPE + 1;
-void trace_print_seq(struct seq_file *m, struct trace_seq *s)
+int trace_print_seq(struct seq_file *m, struct trace_seq *s)
 {
        int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
+        int ret;
+        ret = seq_write(m, s->buffer, len);
-        seq_write(m, s->buffer, len);
+        /*
+         * Only reset this buffer if we successfully wrote to the
+         * seq_file buffer.
+         */
+        if (!ret)
+                trace_seq_init(s);
-        trace_seq_init(s);
+        return ret;
 }
 enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
@@ -85,7 +93,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
        va_list ap;
        int ret;
-        if (!len)
+        if (s->full || !len)
                return 0;
        va_start(ap, fmt);
@@ -93,8 +101,10 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
        va_end(ap);
        /* If we can't write it all, don't bother writing anything */
-        if (ret >= len)
+        if (ret >= len) {
+                s->full = 1;
                return 0;
+        }
        s->len += ret;
@@ -119,14 +129,16 @@ trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
        int len = (PAGE_SIZE - 1) - s->len;
        int ret;
-        if (!len)
+        if (s->full || !len)
                return 0;
        ret = vsnprintf(s->buffer + s->len, len, fmt, args);
        /* If we can't write it all, don't bother writing anything */
-        if (ret >= len)
+        if (ret >= len) {
+                s->full = 1;
                return 0;
+        }
        s->len += ret;
@@ -139,14 +151,16 @@ int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
        int len = (PAGE_SIZE - 1) - s->len;
        int ret;
-        if (!len)
+        if (s->full || !len)
                return 0;
        ret = bstr_printf(s->buffer + s->len, len, fmt, binary);
        /* If we can't write it all, don't bother writing anything */
-        if (ret >= len)
+        if (ret >= len) {
+                s->full = 1;
                return 0;
+        }
        s->len += ret;
@@ -167,8 +181,13 @@ int trace_seq_puts(struct trace_seq *s, const char *str)
 {
        int len = strlen(str);
-        if (len > ((PAGE_SIZE - 1) - s->len))
+        if (s->full)
+                return 0;
+        if (len > ((PAGE_SIZE - 1) - s->len)) {
+                s->full = 1;
                return 0;
+        }
        memcpy(s->buffer + s->len, str, len);
        s->len += len;
@@ -178,9 +197,14 @@ int trace_seq_puts(struct trace_seq *s, const char *str)
 int trace_seq_putc(struct trace_seq *s, unsigned char c)
 {
-        if (s->len >= (PAGE_SIZE - 1))
+        if (s->full)
                return 0;
+        if (s->len >= (PAGE_SIZE - 1)) {
+                s->full = 1;
+                return 0;
+        }
        s->buffer[s->len++] = c;
        return 1;
@@ -188,9 +212,14 @@ int trace_seq_putc(struct trace_seq *s, unsigned char c)
 int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
 {
-        if (len > ((PAGE_SIZE - 1) - s->len))
+        if (s->full)
                return 0;
+        if (len > ((PAGE_SIZE - 1) - s->len)) {
+                s->full = 1;
+                return 0;
+        }
        memcpy(s->buffer + s->len, mem, len);
        s->len += len;
@@ -203,6 +232,9 @@ int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len)
        const unsigned char *data = mem;
        int i, j;
+        if (s->full)
+                return 0;
 #ifdef __BIG_ENDIAN
        for (i = 0, j = 0; i < len; i++) {
 #else
@@ -220,8 +252,13 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)
 {
        void *ret;
-        if (len > ((PAGE_SIZE - 1) - s->len))
+        if (s->full)
+                return 0;
+        if (len > ((PAGE_SIZE - 1) - s->len)) {
+                s->full = 1;
                return NULL;
+        }
        ret = s->buffer + s->len;
        s->len += len;
@@ -233,8 +270,14 @@ int trace_seq_path(struct trace_seq *s, struct path *path)
 {
        unsigned char *p;
-        if (s->len >= (PAGE_SIZE - 1))
+        if (s->full)
+                return 0;
+        if (s->len >= (PAGE_SIZE - 1)) {
+                s->full = 1;
                return 0;
+        }
        p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
        if (!IS_ERR(p)) {
                p = mangle_path(s->buffer + s->len, p, "\n");
@@ -247,6 +290,7 @@ int trace_seq_path(struct trace_seq *s, struct path *path)
                return 1;
        }
+        s->full = 1;
        return 0;
 }
@@ -373,6 +417,9 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
        unsigned long vmstart = 0;
        int ret = 1;
+        if (s->full)
+                return 0;
        if (mm) {
                const struct vm_area_struct *vma;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 67e526b6ae81..dee48658805c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -68,6 +68,116 @@ struct workqueue_struct {
 #endif
 };
+#ifdef CONFIG_DEBUG_OBJECTS_WORK
+static struct debug_obj_descr work_debug_descr;
+/*
+ * fixup_init is called when:
+ * - an active object is initialized
+ */
+static int work_fixup_init(void *addr, enum debug_obj_state state)
+{
+        struct work_struct *work = addr;
+        switch (state) {
+        case ODEBUG_STATE_ACTIVE:
+                cancel_work_sync(work);
+                debug_object_init(work, &work_debug_descr);
+                return 1;
+        default:
+                return 0;
+        }
+}
+/*
+ * fixup_activate is called when:
+ * - an active object is activated
+ * - an unknown object is activated (might be a statically initialized object)
+ */
+static int work_fixup_activate(void *addr, enum debug_obj_state state)
+{
+        struct work_struct *work = addr;
+        switch (state) {
+        case ODEBUG_STATE_NOTAVAILABLE:
+                /*
+                 * This is not really a fixup. The work struct was
+                 * statically initialized. We just make sure that it
+                 * is tracked in the object tracker.
+                 */
+                if (test_bit(WORK_STRUCT_STATIC, work_data_bits(work))) {
+                        debug_object_init(work, &work_debug_descr);
+                        debug_object_activate(work, &work_debug_descr);
+                        return 0;
+                }
+                WARN_ON_ONCE(1);
+                return 0;
+        case ODEBUG_STATE_ACTIVE:
+                WARN_ON(1);
+        default:
+                return 0;
+        }
+}
+/*
+ * fixup_free is called when:
+ * - an active object is freed
+ */
+static int work_fixup_free(void *addr, enum debug_obj_state state)
+{
+        struct work_struct *work = addr;
+        switch (state) {
+        case ODEBUG_STATE_ACTIVE:
+                cancel_work_sync(work);
+                debug_object_free(work, &work_debug_descr);
+                return 1;
+        default:
+                return 0;
+        }
+}
+static struct debug_obj_descr work_debug_descr = {
+        .name           = "work_struct",
+        .fixup_init     = work_fixup_init,
+        .fixup_activate = work_fixup_activate,
+        .fixup_free     = work_fixup_free,
+};
+static inline void debug_work_activate(struct work_struct *work)
+{
+        debug_object_activate(work, &work_debug_descr);
+}
+static inline void debug_work_deactivate(struct work_struct *work)
+{
+        debug_object_deactivate(work, &work_debug_descr);
+}
+void __init_work(struct work_struct *work, int onstack)
+{
+        if (onstack)
+                debug_object_init_on_stack(work, &work_debug_descr);
+        else
+                debug_object_init(work, &work_debug_descr);
+}
+EXPORT_SYMBOL_GPL(__init_work);
+void destroy_work_on_stack(struct work_struct *work)
+{
+        debug_object_free(work, &work_debug_descr);
+}
+EXPORT_SYMBOL_GPL(destroy_work_on_stack);
+#else
+static inline void debug_work_activate(struct work_struct *work) { }
+static inline void debug_work_deactivate(struct work_struct *work) { }
+#endif
 /* Serializes the accesses to the list of workqueues. */
 static DEFINE_SPINLOCK(workqueue_lock);
 static LIST_HEAD(workqueues);
@@ -145,6 +255,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
 {
        unsigned long flags;
+        debug_work_activate(work);
        spin_lock_irqsave(&cwq->lock, flags);
        insert_work(cwq, work, &cwq->worklist);
        spin_unlock_irqrestore(&cwq->lock, flags);
@@ -280,6 +391,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
                struct lockdep_map lockdep_map = work->lockdep_map;
 #endif
                trace_workqueue_execution(cwq->thread, work);
+                debug_work_deactivate(work);
                cwq->current_work = work;
                list_del_init(cwq->worklist.next);
                spin_unlock_irq(&cwq->lock);
@@ -350,11 +462,18 @@ static void wq_barrier_func(struct work_struct *work)
 static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
                        struct wq_barrier *barr, struct list_head *head)
 {
-        INIT_WORK(&barr->work, wq_barrier_func);
+        /*
+         * debugobject calls are safe here even with cwq->lock locked
+         * as we know for sure that this will not trigger any of the
+         * checks and call back into the fixup functions where we
+         * might deadlock.
+         */
+        INIT_WORK_ON_STACK(&barr->work, wq_barrier_func);
        __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
        init_completion(&barr->done);
+        debug_work_activate(&barr->work);
        insert_work(cwq, &barr->work, head);
 }
@@ -372,8 +491,10 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
        }
        spin_unlock_irq(&cwq->lock);
-        if (active)
+        if (active) {
                wait_for_completion(&barr.done);
+                destroy_work_on_stack(&barr.work);
+        }
        return active;
 }
@@ -451,6 +572,7 @@ out:
                return 0;
        wait_for_completion(&barr.done);
+        destroy_work_on_stack(&barr.work);
        return 1;
 }
 EXPORT_SYMBOL_GPL(flush_work);
@@ -485,6 +607,7 @@ static int try_to_grab_pending(struct work_struct *work)
                 */
                smp_rmb();
                if (cwq == get_wq_data(work)) {
+                        debug_work_deactivate(work);
                        list_del_init(&work->entry);
                        ret = 1;
                }
@@ -507,8 +630,10 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
        }
        spin_unlock_irq(&cwq->lock);
-        if (unlikely(running))
+        if (unlikely(running)) {
                wait_for_completion(&barr.done);
+                destroy_work_on_stack(&barr.work);
+        }
 }
 static void wait_on_work(struct work_struct *work)