19 files changed, 252 insertions, 62 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c29831076e7a..5a54ff42874e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1115,8 +1115,10 @@ static void cgroup_kill_sb(struct super_block *sb) {
        }
        write_unlock(&css_set_lock);
-        list_del(&root->root_list);
+        if (!list_empty(&root->root_list)) {
-        root_count--;
+                list_del(&root->root_list);
+                root_count--;
+        }
        mutex_unlock(&cgroup_mutex);
@@ -2434,7 +2436,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 err_remove:
+        cgroup_lock_hierarchy(root);
        list_del(&cgrp->sibling);
+        cgroup_unlock_hierarchy(root);
        root->number_of_cgroups--;
 err_destroy:
@@ -2507,7 +2511,7 @@ static int cgroup_clear_css_refs(struct cgroup *cgrp)
        for_each_subsys(cgrp->root, ss) {
                struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
                int refcnt;
-                do {
+                while (1) {
                        /* We can only remove a CSS with a refcnt==1 */
                        refcnt = atomic_read(&css->refcnt);
                        if (refcnt > 1) {
@@ -2521,7 +2525,10 @@ static int cgroup_clear_css_refs(struct cgroup *cgrp)
                         * css_tryget() to spin until we set the
                         * CSS_REMOVED bits or abort
                         */
-                } while (atomic_cmpxchg(&css->refcnt, refcnt, 0) != refcnt);
+                        if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
+                                break;
+                        cpu_relax();
+                }
        }
 done:
        for_each_subsys(cgrp->root, ss) {
@@ -2991,20 +2998,21 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
                mutex_unlock(&cgroup_mutex);
                return 0;
        }
-        task_lock(tsk);
-        cg = tsk->cgroups;
-        parent = task_cgroup(tsk, subsys->subsys_id);
        /* Pin the hierarchy */
-        if (!atomic_inc_not_zero(&parent->root->sb->s_active)) {
+        if (!atomic_inc_not_zero(&root->sb->s_active)) {
                /* We race with the final deactivate_super() */
                mutex_unlock(&cgroup_mutex);
                return 0;
        }
        /* Keep the cgroup alive */
+        task_lock(tsk);
+        parent = task_cgroup(tsk, subsys->subsys_id);
+        cg = tsk->cgroups;
        get_css_set(cg);
        task_unlock(tsk);
        mutex_unlock(&cgroup_mutex);
        /* Now do the VFS work to create a cgroup */
@@ -3043,7 +3051,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
                mutex_unlock(&inode->i_mutex);
                put_css_set(cg);
-                deactivate_super(parent->root->sb);
+                deactivate_super(root->sb);
                /* The cgroup is still accessible in the VFS, but
                 * we're not going to try to rmdir() it at this
                 * point. */
@@ -3069,7 +3077,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
        mutex_lock(&cgroup_mutex);
        put_css_set(cg);
        mutex_unlock(&cgroup_mutex);
-        deactivate_super(parent->root->sb);
+        deactivate_super(root->sb);
        return ret;
 }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index a85678865c5e..f76db9dcaa05 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,6 +61,14 @@
 #include <linux/cgroup.h>
 /*
+ * Workqueue for cpuset related tasks.
+ *
+ * Using the kevent workqueue may cause a deadlock when memory_migrate
+ * is set, so we create a separate workqueue thread for cpuset.
+ */
+static struct workqueue_struct *cpuset_wq;
+/*
 * Tracks how many cpusets are currently defined in system.
 * When there is only one cpuset (the root cpuset) we can
 * short circuit some hooks.
@@ -831,7 +839,7 @@ static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
 */
 static void async_rebuild_sched_domains(void)
 {
-        schedule_work(&rebuild_sched_domains_work);
+        queue_work(cpuset_wq, &rebuild_sched_domains_work);
 }
 /*
@@ -2111,6 +2119,9 @@ void __init cpuset_init_smp(void)
        hotcpu_notifier(cpuset_track_online_cpus, 0);
        hotplug_memory_notifier(cpuset_track_online_nodes, 10);
+        cpuset_wq = create_singlethread_workqueue("cpuset");
+        BUG_ON(!cpuset_wq);
 }
 /**
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index f33afb0407bc..f394d2a42ca3 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -501,6 +501,13 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base)
                        continue;
                timer = rb_entry(base->first, struct hrtimer, node);
                expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
+                /*
+                 * clock_was_set() has changed base->offset so the
+                 * result might be negative. Fix it up to prevent a
+                 * false positive in clockevents_program_event()
+                 */
+                if (expires.tv64 < 0)
+                        expires.tv64 = 0;
                if (expires.tv64 < cpu_base->expires_next.tv64)
                        cpu_base->expires_next = expires;
        }
@@ -1158,6 +1165,29 @@ static void __run_hrtimer(struct hrtimer *timer)
 #ifdef CONFIG_HIGH_RES_TIMERS
+static int force_clock_reprogram;
+/*
+ * After 5 retry iterations, we consider hrtimer_interrupt() to be
+ * hanging, which can happen when something slows the interrupt down,
+ * such as tracing. We then force clock reprogramming for all future
+ * hrtimer interrupts to avoid infinite loops, using the min_delta_ns
+ * threshold, which we overwrite here.
+ * The next tick event will be scheduled at 3 times the time we currently
+ * spend in hrtimer_interrupt(). This is a good compromise: the CPUs will
+ * spend 1/4 of their time processing hrtimer interrupts, which is enough
+ * to keep the system running without serious starvation.
+ */
+static inline void
+hrtimer_interrupt_hanging(struct clock_event_device *dev,
+                        ktime_t try_time)
+{
+        force_clock_reprogram = 1;
+        dev->min_delta_ns = (unsigned long)try_time.tv64 * 3;
+        printk(KERN_WARNING "hrtimer: interrupt too slow, "
+                "forcing clock min delta to %lu ns\n", dev->min_delta_ns);
+}
 /*
 * High resolution timer interrupt
 * Called with interrupts disabled
@@ -1167,6 +1197,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
        struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
        struct hrtimer_clock_base *base;
        ktime_t expires_next, now;
+        int nr_retries = 0;
        int i;
        BUG_ON(!cpu_base->hres_active);
@@ -1174,6 +1205,10 @@ void hrtimer_interrupt(struct clock_event_device *dev)
        dev->next_event.tv64 = KTIME_MAX;
 retry:
+        /* 5 retries is enough to notice a hang */
+        if (!(++nr_retries % 5))
+                hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now));
        now = ktime_get();
        expires_next.tv64 = KTIME_MAX;
@@ -1226,7 +1261,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
        /* Reprogramming necessary ? */
        if (expires_next.tv64 != KTIME_MAX) {
-                if (tick_program_event(expires_next, 0))
+                if (tick_program_event(expires_next, force_clock_reprogram))
                        goto retry;
        }
 }
@@ -1580,6 +1615,10 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
                break;
 #ifdef CONFIG_HOTPLUG_CPU
+        case CPU_DYING:
+        case CPU_DYING_FROZEN:
+                clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu);
+                break;
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
        {
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index f63c706d25e1..7de11bd64dfe 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -383,6 +383,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
 out_unlock:
        spin_unlock(&desc->lock);
 }
+EXPORT_SYMBOL_GPL(handle_level_irq);
 /**
 *      handle_fasteoi_irq - irq handler for transparent controllers
@@ -593,6 +594,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
        }
        spin_unlock_irqrestore(&desc->lock, flags);
 }
+EXPORT_SYMBOL_GPL(__set_irq_handler);
 void
 set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip,
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index c20db0be9173..3aba8d12f328 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -39,6 +39,18 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
        ack_bad_irq(irq);
 }
+#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
+static void __init init_irq_default_affinity(void)
+{
+        alloc_bootmem_cpumask_var(&irq_default_affinity);
+        cpumask_setall(irq_default_affinity);
+}
+#else
+static void __init init_irq_default_affinity(void)
+{
+}
+#endif
 /*
 * Linux has a controller-independent interrupt architecture.
 * Every controller has a 'controller-template', that is used
@@ -134,6 +146,8 @@ int __init early_irq_init(void)
        int legacy_count;
        int i;
+        init_irq_default_affinity();
        desc = irq_desc_legacy;
        legacy_count = ARRAY_SIZE(irq_desc_legacy);
@@ -219,6 +233,8 @@ int __init early_irq_init(void)
        int count;
        int i;
+        init_irq_default_affinity();
        desc = irq_desc;
        count = ARRAY_SIZE(irq_desc);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index cd0cd8dcb345..291f03664552 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -15,17 +15,9 @@
 #include "internals.h"
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
 cpumask_var_t irq_default_affinity;
-static int init_irq_default_affinity(void)
-{
-        alloc_cpumask_var(&irq_default_affinity, GFP_KERNEL);
-        cpumask_setall(irq_default_affinity);
-        return 0;
-}
-core_initcall(init_irq_default_affinity);
 /**
 *      synchronize_irq - wait for pending IRQ handlers (on other CPUs)
 *      @irq: interrupt number to wait for
diff --git a/kernel/module.c b/kernel/module.c
index e8b51d41dd72..ba22484a987e 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -573,13 +573,13 @@ static char last_unloaded_module[MODULE_NAME_LEN+1];
 /* Init the unload section of the module. */
 static void module_unload_init(struct module *mod)
 {
-        unsigned int i;
+        int cpu;
        INIT_LIST_HEAD(&mod->modules_which_use_me);
-        for (i = 0; i < NR_CPUS; i++)
+        for_each_possible_cpu(cpu)
-                local_set(&mod->ref[i].count, 0);
+                local_set(__module_ref_addr(mod, cpu), 0);
        /* Hold reference count during initialization. */
-        local_set(&mod->ref[raw_smp_processor_id()].count, 1);
+        local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1);
        /* Backwards compatibility macros put refcount during init. */
        mod->waiter = current;
 }
@@ -717,10 +717,11 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
 unsigned int module_refcount(struct module *mod)
 {
-        unsigned int i, total = 0;
+        unsigned int total = 0;
+        int cpu;
-        for (i = 0; i < NR_CPUS; i++)
+        for_each_possible_cpu(cpu)
-                total += local_read(&mod->ref[i].count);
+                total += local_read(__module_ref_addr(mod, cpu));
        return total;
 }
 EXPORT_SYMBOL(module_refcount);
@@ -894,7 +895,7 @@ void module_put(struct module *module)
 {
        if (module) {
                unsigned int cpu = get_cpu();
-                local_dec(&module->ref[cpu].count);
+                local_dec(__module_ref_addr(module, cpu));
                /* Maybe they're waiting for us to drop reference? */
                if (unlikely(!module_is_live(module)))
                        wake_up_process(module->waiter);
@@ -1464,7 +1465,10 @@ static void free_module(struct module *mod)
        kfree(mod->args);
        if (mod->percpu)
                percpu_modfree(mod->percpu);
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+        if (mod->refptr)
+                percpu_modfree(mod->refptr);
+#endif
        /* Free lock-classes: */
        lockdep_free_key_range(mod->module_core, mod->core_size);
@@ -2011,6 +2015,14 @@ static noinline struct module *load_module(void __user *umod,
        if (err < 0)
                goto free_mod;
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+        mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t),
+                                      mod->name);
+        if (!mod->refptr) {
+                err = -ENOMEM;
+                goto free_mod;
+        }
+#endif
        if (pcpuindex) {
                /* We have a special allocation for this section. */
                percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size,
@@ -2018,7 +2030,7 @@ static noinline struct module *load_module(void __user *umod,
                                         mod->name);
                if (!percpu) {
                        err = -ENOMEM;
-                        goto free_mod;
+                        goto free_percpu;
                }
                sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
                mod->percpu = percpu;
@@ -2282,6 +2294,9 @@ static noinline struct module *load_module(void __user *umod,
 free_percpu:
        if (percpu)
                percpu_modfree(percpu);
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+        percpu_modfree(mod->refptr);
+#endif
 free_mod:
        kfree(args);
 free_hdr:
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 45e8541ab7e3..432ee575c9ee 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -71,6 +71,14 @@ void hibernation_set_ops(struct platform_hibernation_ops *ops)
        mutex_unlock(&pm_mutex);
 }
+static bool entering_platform_hibernation;
+bool system_entering_hibernation(void)
+{
+        return entering_platform_hibernation;
+}
+EXPORT_SYMBOL(system_entering_hibernation);
 #ifdef CONFIG_PM_DEBUG
 static void hibernation_debug_sleep(void)
 {
@@ -411,6 +419,7 @@ int hibernation_platform_enter(void)
        if (error)
                goto Close;
+        entering_platform_hibernation = true;
        suspend_console();
        error = device_suspend(PMSG_HIBERNATE);
        if (error) {
@@ -445,6 +454,7 @@ int hibernation_platform_enter(void)
 Finish:
        hibernation_ops->finish();
 Resume_devices:
+        entering_platform_hibernation = false;
        device_resume(PMSG_RESTORE);
        resume_console();
 Close:
diff --git a/kernel/sched.c b/kernel/sched.c
index 52bbf1c842a8..242d0d47a70d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2266,6 +2266,16 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
        if (!sched_feat(SYNC_WAKEUPS))
                sync = 0;
+        if (!sync) {
+                if (current->se.avg_overlap < sysctl_sched_migration_cost &&
+                          p->se.avg_overlap < sysctl_sched_migration_cost)
+                        sync = 1;
+        } else {
+                if (current->se.avg_overlap >= sysctl_sched_migration_cost ||
+                          p->se.avg_overlap >= sysctl_sched_migration_cost)
+                        sync = 0;
+        }
 #ifdef CONFIG_SMP
        if (sched_feat(LB_WAKEUP_UPDATE)) {
                struct sched_domain *sd;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5cc1c162044f..a7e50ba185ac 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -719,7 +719,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
                __enqueue_entity(cfs_rq, se);
 }
-static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
        if (cfs_rq->last == se)
                cfs_rq->last = NULL;
@@ -728,6 +728,12 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
                cfs_rq->next = NULL;
 }
+static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+        for_each_sched_entity(se)
+                __clear_buddies(cfs_rq_of(se), se);
+}
 static void
 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 {
@@ -768,8 +774,14 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
        ideal_runtime = sched_slice(cfs_rq, curr);
        delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
-        if (delta_exec > ideal_runtime)
+        if (delta_exec > ideal_runtime) {
                resched_task(rq_of(cfs_rq)->curr);
+                /*
+                 * The current task ran long enough, ensure it doesn't get
+                 * re-elected due to buddy favours.
+                 */
+                clear_buddies(cfs_rq, curr);
+        }
 }
 static void
@@ -1179,20 +1191,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
            int idx, unsigned long load, unsigned long this_load,
            unsigned int imbalance)
 {
-        struct task_struct *curr = this_rq->curr;
-        struct task_group *tg;
        unsigned long tl = this_load;
        unsigned long tl_per_task;
+        struct task_group *tg;
        unsigned long weight;
        int balanced;
        if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
                return 0;
-        if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
-                        p->se.avg_overlap > sysctl_sched_migration_cost))
-                sync = 0;
        /*
         * If sync wakeup then subtract the (maximum possible)
         * effect of the currently running task from the load
@@ -1419,9 +1426,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
        if (!sched_feat(WAKEUP_PREEMPT))
                return;
-        if (sched_feat(WAKEUP_OVERLAP) && (sync ||
+        if (sched_feat(WAKEUP_OVERLAP) && sync) {
-                        (se->avg_overlap < sysctl_sched_migration_cost &&
-                         pse->avg_overlap < sysctl_sched_migration_cost))) {
                resched_task(curr);
                return;
        }
@@ -1452,6 +1457,11 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
        do {
                se = pick_next_entity(cfs_rq);
+                /*
+                 * If se was a buddy, clear it so that it will have to earn
+                 * the favour again.
+                 */
+                __clear_buddies(cfs_rq, se);
                set_next_entity(cfs_rq, se);
                cfs_rq = group_cfs_rq(se);
        } while (cfs_rq);
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 954e1a81b796..bac1061cea2f 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -968,8 +968,8 @@ static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
        if ((this_cpu != -1) && cpu_isset(this_cpu, *mask))
                return this_cpu;
-        first = first_cpu(*mask);
+        first = cpumask_first(mask);
-        if (first != NR_CPUS)
+        if (first < nr_cpu_ids)
                return first;
        return -1;
diff --git a/kernel/signal.c b/kernel/signal.c
index e73759783dc8..b6b36768b758 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -909,7 +909,9 @@ static void print_fatal_signal(struct pt_regs *regs, int signr)
        }
 #endif
        printk("\n");
+        preempt_disable();
        show_regs(regs);
+        preempt_enable();
 }
 static int __init setup_print_fatal_signals(char *str)
diff --git a/kernel/smp.c b/kernel/smp.c
index 5cfa0e5e3e88..bbedbb7efe32 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -18,6 +18,7 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(call_function_lock);
 enum {
        CSD_FLAG_WAIT           = 0x01,
        CSD_FLAG_ALLOC          = 0x02,
+        CSD_FLAG_LOCK           = 0x04,
 };
 struct call_function_data {
@@ -186,6 +187,9 @@ void generic_smp_call_function_single_interrupt(void)
                        if (data_flags & CSD_FLAG_WAIT) {
                                smp_wmb();
                                data->flags &= ~CSD_FLAG_WAIT;
+                        } else if (data_flags & CSD_FLAG_LOCK) {
+                                smp_wmb();
+                                data->flags &= ~CSD_FLAG_LOCK;
                        } else if (data_flags & CSD_FLAG_ALLOC)
                                kfree(data);
                }
@@ -196,6 +200,8 @@ void generic_smp_call_function_single_interrupt(void)
        }
 }
+static DEFINE_PER_CPU(struct call_single_data, csd_data);
 /*
 * smp_call_function_single - Run a function on a specific CPU
 * @func: The function to run. This must be fast and non-blocking.
@@ -224,14 +230,38 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
                func(info);
                local_irq_restore(flags);
        } else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
-                struct call_single_data *data = NULL;
+                struct call_single_data *data;
                if (!wait) {
+                        /*
+                         * We are calling a function on a single CPU
+                         * and we are not going to wait for it to finish.
+                         * We first try to allocate the data, but if we
+                         * fail, we fall back to using per-cpu data to pass
+                         * the information to that CPU. Since all callers
+                         * of this code will use the same data, we must
+                         * synchronize the callers to prevent a new caller
+                         * from corrupting the data before the callee
+                         * can access it.
+                         *
+                         * The CSD_FLAG_LOCK is used to let us know when
+                         * the IPI handler is done with the data.
+                         * The first caller will set it, and the callee
+                         * will clear it. The next caller must wait for
+                         * it to clear before we set it again. This
+                         * will make sure the callee is done with the
+                         * data before a new caller will use it.
+                         */
                        data = kmalloc(sizeof(*data), GFP_ATOMIC);
                        if (data)
                                data->flags = CSD_FLAG_ALLOC;
-                }
+                        else {
-                if (!data) {
+                                data = &per_cpu(csd_data, me);
+                                while (data->flags & CSD_FLAG_LOCK)
+                                        cpu_relax();
+                                data->flags = CSD_FLAG_LOCK;
+                        }
+                } else {
                        data = &d;
                        data->flags = CSD_FLAG_WAIT;
                }
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 63e05d423a09..21a5ca849514 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -274,6 +274,21 @@ out_bc:
 }
 /*
+ * Transfer the do_timer job away from a dying cpu.
+ *
+ * Called with interrupts disabled.
+ */
+static void tick_handover_do_timer(int *cpup)
+{
+        if (*cpup == tick_do_timer_cpu) {
+                int cpu = cpumask_first(cpu_online_mask);
+                tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu :
+                        TICK_DO_TIMER_NONE;
+        }
+}
+/*
 * Shutdown an event device on a given cpu:
 *
 * This is called on a life CPU, when a CPU is dead. So we cannot
@@ -297,13 +312,6 @@ static void tick_shutdown(unsigned int *cpup)
                clockevents_exchange_device(dev, NULL);
                td->evtdev = NULL;
        }
-        /* Transfer the do_timer job away from this cpu */
-        if (*cpup == tick_do_timer_cpu) {
-                int cpu = cpumask_first(cpu_online_mask);
-                tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu :
-                        TICK_DO_TIMER_NONE;
-        }
        spin_unlock_irqrestore(&tick_device_lock, flags);
 }
@@ -357,6 +365,10 @@ static int tick_notify(struct notifier_block *nb, unsigned long reason,
                tick_broadcast_oneshot_control(reason);
                break;
+        case CLOCK_EVT_NOTIFY_CPU_DYING:
+                tick_handover_do_timer(dev);
+                break;
        case CLOCK_EVT_NOTIFY_CPU_DEAD:
                tick_shutdown_broadcast_oneshot(dev);
                tick_shutdown_broadcast(dev);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2f32969c09df..7dcf6e9f2b04 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -17,6 +17,7 @@
 #include <linux/clocksource.h>
 #include <linux/kallsyms.h>
 #include <linux/seq_file.h>
+#include <linux/suspend.h>
 #include <linux/debugfs.h>
 #include <linux/hardirq.h>
 #include <linux/kthread.h>
@@ -1965,6 +1966,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 static atomic_t ftrace_graph_active;
+static struct notifier_block ftrace_suspend_notifier;
 int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
 {
@@ -2043,6 +2045,27 @@ static int start_graph_tracing(void)
        return ret;
 }
+/*
+ * Hibernation protection.
+ * The state of the current task is too unstable during
+ * suspend/restore to disk. We want to protect against that.
+ */
+static int
+ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,
+                                                        void *unused)
+{
+        switch (state) {
+        case PM_HIBERNATION_PREPARE:
+                pause_graph_tracing();
+                break;
+        case PM_POST_HIBERNATION:
+                unpause_graph_tracing();
+                break;
+        }
+        return NOTIFY_DONE;
+}
 int register_ftrace_graph(trace_func_graph_ret_t retfunc,
                        trace_func_graph_ent_t entryfunc)
 {
@@ -2050,6 +2073,9 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
        mutex_lock(&ftrace_sysctl_lock);
+        ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;
+        register_pm_notifier(&ftrace_suspend_notifier);
        atomic_inc(&ftrace_graph_active);
        ret = start_graph_tracing();
        if (ret) {
@@ -2075,6 +2101,7 @@ void unregister_ftrace_graph(void)
        ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
        ftrace_graph_entry = ftrace_graph_entry_stub;
        ftrace_shutdown(FTRACE_STOP_FUNC_RET);
+        unregister_pm_notifier(&ftrace_suspend_notifier);
        mutex_unlock(&ftrace_sysctl_lock);
 }
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 8b0daf0662ef..bd38c5cfd8ad 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -246,7 +246,7 @@ static inline int test_time_stamp(u64 delta)
        return 0;
 }
-#define BUF_PAGE_SIZE (PAGE_SIZE - sizeof(struct buffer_data_page))
+#define BUF_PAGE_SIZE (PAGE_SIZE - offsetof(struct buffer_data_page, data))
 /*
 * head_page == tail_page && head == tail then buffer is empty.
@@ -1025,12 +1025,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
                }
                if (next_page == head_page) {
-                        if (!(buffer->flags & RB_FL_OVERWRITE)) {
+                        if (!(buffer->flags & RB_FL_OVERWRITE))
-                                /* reset write */
-                                if (tail <= BUF_PAGE_SIZE)
-                                        local_set(&tail_page->write, tail);
                                goto out_unlock;
-                        }
                        /* tail_page has not moved yet? */
                        if (tail_page == cpu_buffer->tail_page) {
@@ -1105,6 +1101,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
        return event;
 out_unlock:
+        /* reset write */
+        if (tail <= BUF_PAGE_SIZE)
+                local_set(&tail_page->write, tail);
        __raw_spin_unlock(&cpu_buffer->lock);
        local_irq_restore(flags);
        return NULL;
@@ -2174,6 +2174,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
        cpu_buffer->overrun = 0;
        cpu_buffer->entries = 0;
+        cpu_buffer->write_stamp = 0;
+        cpu_buffer->read_stamp = 0;
 }
 /**
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c580233add95..17bb88d86ac2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -40,7 +40,7 @@
 #define TRACE_BUFFER_FLAGS      (RB_FL_OVERWRITE)
-unsigned long __read_mostly     tracing_max_latency = (cycle_t)ULONG_MAX;
+unsigned long __read_mostly     tracing_max_latency;
 unsigned long __read_mostly     tracing_thresh;
 /*
@@ -3736,7 +3736,7 @@ static struct notifier_block trace_die_notifier = {
 * it if we decide to change what log level the ftrace dump
 * should be at.
 */
-#define KERN_TRACE              KERN_INFO
+#define KERN_TRACE              KERN_EMERG
 static void
 trace_printk_seq(struct trace_seq *s)
@@ -3770,6 +3770,7 @@ void ftrace_dump(void)
        dump_ran = 1;
        /* No turning back! */
+        tracing_off();
        ftrace_kill();
        for_each_tracing_cpu(cpu) {
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 7c2e326bbc8b..62a78d943534 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -380,6 +380,7 @@ static void stop_irqsoff_tracer(struct trace_array *tr)
 static void __irqsoff_tracer_init(struct trace_array *tr)
 {
+        tracing_max_latency = 0;
        irqsoff_trace = tr;
        /* make sure that the tracer is visible */
        smp_wmb();
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 43586b689e31..42ae1e77b6b3 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -333,6 +333,7 @@ static void stop_wakeup_tracer(struct trace_array *tr)
 static int wakeup_tracer_init(struct trace_array *tr)
 {
+        tracing_max_latency = 0;
        wakeup_trace = tr;
        start_wakeup_tracer(tr);
        return 0;