Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.locks            |   9
-rw-r--r--  kernel/events/core.c            |  34
-rw-r--r--  kernel/kexec.c                  |   4
-rw-r--r--  kernel/kprobes.c                |  14
-rw-r--r--  kernel/locking/mcs_spinlock.c   |  64
-rw-r--r--  kernel/locking/mcs_spinlock.h   |   9
-rw-r--r--  kernel/locking/mutex.c          |   2
-rw-r--r--  kernel/locking/rwsem-spinlock.c |  28
-rw-r--r--  kernel/locking/rwsem-xadd.c     |  16
-rw-r--r--  kernel/locking/rwsem.c          |   2
-rw-r--r--  kernel/power/process.c          |   1
-rw-r--r--  kernel/power/suspend.c          |   4
-rw-r--r--  kernel/rcu/rcutorture.c         |   4
-rw-r--r--  kernel/rcu/tree.c               | 140
-rw-r--r--  kernel/rcu/tree.h               |   6
-rw-r--r--  kernel/rcu/tree_plugin.h        |   2
-rw-r--r--  kernel/rcu/update.c             |  22
-rw-r--r--  kernel/sched/core.c             |   7
-rw-r--r--  kernel/sched/debug.c            |   2
-rw-r--r--  kernel/time/alarmtimer.c        |  20
-rw-r--r--  kernel/time/clockevents.c       |  10
-rw-r--r--  kernel/time/sched_clock.c       |   4
-rw-r--r--  kernel/trace/ftrace.c           |   4
-rw-r--r--  kernel/trace/ring_buffer.c      |   4
-rw-r--r--  kernel/trace/trace.c            |  20
-rw-r--r--  kernel/trace/trace_clock.c      |   9
-rw-r--r--  kernel/trace/trace_events.c     |   1
27 files changed, 308 insertions(+), 134 deletions(-)
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 35536d9c0964..76768ee812b2 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -220,9 +220,16 @@ config INLINE_WRITE_UNLOCK_IRQRESTORE
 
 endif
 
+config ARCH_SUPPORTS_ATOMIC_RMW
+	bool
+
 config MUTEX_SPIN_ON_OWNER
 	def_bool y
-	depends on SMP && !DEBUG_MUTEXES
+	depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
+
+config RWSEM_SPIN_ON_OWNER
+	def_bool y
+	depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
 
 config ARCH_USE_QUEUE_RWLOCK
 	bool
diff --git a/kernel/events/core.c b/kernel/events/core.c
index a33d9a2bcbd7..6b17ac1b0c2a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2320,7 +2320,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 	next_parent = rcu_dereference(next_ctx->parent_ctx);
 
 	/* If neither context have a parent context; they cannot be clones. */
-	if (!parent && !next_parent)
+	if (!parent || !next_parent)
 		goto unlock;
 
 	if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
@@ -7458,7 +7458,19 @@ __perf_event_exit_task(struct perf_event *child_event,
 			 struct perf_event_context *child_ctx,
 			 struct task_struct *child)
 {
-	perf_remove_from_context(child_event, true);
+	/*
+	 * Do not destroy the 'original' grouping; because of the context
+	 * switch optimization the original events could've ended up in a
+	 * random child task.
+	 *
+	 * If we were to destroy the original group, all group related
+	 * operations would cease to function properly after this random
+	 * child dies.
+	 *
+	 * Do destroy all inherited groups, we don't care about those
+	 * and being thorough is better.
+	 */
+	perf_remove_from_context(child_event, !!child_event->parent);
 
 	/*
 	 * It can happen that the parent exits first, and has events
@@ -7474,7 +7486,7 @@ __perf_event_exit_task(struct perf_event *child_event,
 static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 {
 	struct perf_event *child_event, *next;
-	struct perf_event_context *child_ctx;
+	struct perf_event_context *child_ctx, *parent_ctx;
 	unsigned long flags;
 
 	if (likely(!child->perf_event_ctxp[ctxn])) {
@@ -7499,6 +7511,15 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 	raw_spin_lock(&child_ctx->lock);
 	task_ctx_sched_out(child_ctx);
 	child->perf_event_ctxp[ctxn] = NULL;
+
+	/*
+	 * In order to avoid freeing: child_ctx->parent_ctx->task
+	 * under perf_event_context::lock, grab another reference.
+	 */
+	parent_ctx = child_ctx->parent_ctx;
+	if (parent_ctx)
+		get_ctx(parent_ctx);
+
 	/*
 	 * If this context is a clone; unclone it so it can't get
 	 * swapped to another process while we're removing all
@@ -7509,6 +7530,13 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 	raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
 
 	/*
+	 * Now that we no longer hold perf_event_context::lock, drop
+	 * our extra child_ctx->parent_ctx reference.
+	 */
+	if (parent_ctx)
+		put_ctx(parent_ctx);
+
+	/*
 	 * Report the task dead after unscheduling the events so that we
 	 * won't get any samples after PERF_RECORD_EXIT. We can however still
 	 * get a few PERF_RECORD_READ events.
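
Note: the get_ctx()/put_ctx() pair added above is an instance of a general pattern: take an extra reference to an object while the lock protecting the pointer is still held, and drop that reference only after the lock is released, so that the final free never runs under the lock. A minimal userspace sketch of the pattern follows; the names (obj_get, obj_put, detach_shared) are invented for illustration and are not the perf helpers.

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct obj {
	atomic_int refcount;
	/* ... payload ... */
};

static struct obj *obj_get(struct obj *o)
{
	atomic_fetch_add(&o->refcount, 1);
	return o;
}

static void obj_put(struct obj *o)
{
	/* Last reference frees the object; freeing may do arbitrary work. */
	if (atomic_fetch_sub(&o->refcount, 1) == 1)
		free(o);
}

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj *shared;	/* protected by 'lock' */

static void detach_shared(void)
{
	struct obj *o;

	pthread_mutex_lock(&lock);
	o = shared;
	if (o)
		obj_get(o);	/* pin it while still under the lock */
	shared = NULL;
	pthread_mutex_unlock(&lock);

	if (o)
		obj_put(o);	/* potential free happens outside the lock */
}

int main(void)
{
	shared = calloc(1, sizeof(*shared));
	atomic_init(&shared->refcount, 1);
	detach_shared();
	return 0;
}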
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 369f41a94124..4b8f0c925884 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -33,6 +33,7 @@
 #include <linux/swap.h>
 #include <linux/syscore_ops.h>
 #include <linux/compiler.h>
+#include <linux/hugetlb.h>
 
 #include <asm/page.h>
 #include <asm/uaccess.h>
@@ -1619,6 +1620,9 @@ static int __init crash_save_vmcoreinfo_init(void)
 #endif
 	VMCOREINFO_NUMBER(PG_head_mask);
 	VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
+#ifdef CONFIG_HUGETLBFS
+	VMCOREINFO_SYMBOL(free_huge_page);
+#endif
 
 	arch_crash_save_vmcoreinfo();
 	update_vmcoreinfo_note();
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3214289df5a7..734e9a7d280b 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2037,19 +2037,23 @@ static int __init populate_kprobe_blacklist(unsigned long *start,
 {
 	unsigned long *iter;
 	struct kprobe_blacklist_entry *ent;
-	unsigned long offset = 0, size = 0;
+	unsigned long entry, offset = 0, size = 0;
 
 	for (iter = start; iter < end; iter++) {
-		if (!kallsyms_lookup_size_offset(*iter, &size, &offset)) {
-			pr_err("Failed to find blacklist %p\n", (void *)*iter);
+		entry = arch_deref_entry_point((void *)*iter);
+
+		if (!kernel_text_address(entry) ||
+		    !kallsyms_lookup_size_offset(entry, &size, &offset)) {
+			pr_err("Failed to find blacklist at %p\n",
+				(void *)entry);
 			continue;
 		}
 
 		ent = kmalloc(sizeof(*ent), GFP_KERNEL);
 		if (!ent)
 			return -ENOMEM;
-		ent->start_addr = *iter;
-		ent->end_addr = *iter + size;
+		ent->start_addr = entry;
+		ent->end_addr = entry + size;
 		INIT_LIST_HEAD(&ent->list);
 		list_add_tail(&ent->list, &kprobe_blacklist);
 	}
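
Note: arch_deref_entry_point() is needed here because on some architectures (ia64, or powerpc64 with the ELFv1 ABI, for example) a function symbol points at a function descriptor rather than at the first instruction, so the raw blacklist address is not a kernel text address. A rough userspace model of that indirection follows; the descriptor layout and helper name are illustrative only, not the kernel's actual definitions.

#include <stdio.h>

/* Hypothetical "function descriptor": points at the real entry. */
struct func_desc {
	void *entry;	/* address of the first instruction */
	void *toc;	/* table-of-contents / gp pointer on such ABIs */
};

/* With descriptors, dereference once to reach the text address. */
static void *deref_entry_point(void *addr, int abi_uses_descriptors)
{
	if (abi_uses_descriptors)
		return ((struct func_desc *)addr)->entry;
	return addr;	/* default: the symbol already is the entry */
}

static void target(void) { }

int main(void)
{
	struct func_desc desc = { .entry = (void *)target, .toc = 0 };

	printf("direct: %p\n", deref_entry_point((void *)target, 0));
	printf("via descriptor: %p\n", deref_entry_point(&desc, 1));
	return 0;
}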
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
index 838dc9e00669..be9ee1559fca 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/mcs_spinlock.c
@@ -14,21 +14,47 @@
  * called from interrupt context and we have preemption disabled while
  * spinning.
  */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_queue, osq_node);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
+
+/*
+ * We use the value 0 to represent "no CPU", thus the encoded value
+ * will be the CPU number incremented by 1.
+ */
+static inline int encode_cpu(int cpu_nr)
+{
+	return cpu_nr + 1;
+}
+
+static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
+{
+	int cpu_nr = encoded_cpu_val - 1;
+
+	return per_cpu_ptr(&osq_node, cpu_nr);
+}
 
 /*
  * Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
  * Can return NULL in case we were the last queued and we updated @lock instead.
  */
-static inline struct optimistic_spin_queue *
-osq_wait_next(struct optimistic_spin_queue **lock,
-	      struct optimistic_spin_queue *node,
-	      struct optimistic_spin_queue *prev)
+static inline struct optimistic_spin_node *
+osq_wait_next(struct optimistic_spin_queue *lock,
+	      struct optimistic_spin_node *node,
+	      struct optimistic_spin_node *prev)
 {
-	struct optimistic_spin_queue *next = NULL;
+	struct optimistic_spin_node *next = NULL;
+	int curr = encode_cpu(smp_processor_id());
+	int old;
+
+	/*
+	 * If there is a prev node in queue, then the 'old' value will be
+	 * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if
+	 * we're currently last in queue, then the queue will then become empty.
+	 */
+	old = prev ? prev->cpu : OSQ_UNLOCKED_VAL;
 
 	for (;;) {
-		if (*lock == node && cmpxchg(lock, node, prev) == node) {
+		if (atomic_read(&lock->tail) == curr &&
+		    atomic_cmpxchg(&lock->tail, curr, old) == curr) {
 			/*
 			 * We were the last queued, we moved @lock back. @prev
 			 * will now observe @lock and will complete its
@@ -59,18 +85,23 @@ osq_wait_next(struct optimistic_spin_queue **lock,
 	return next;
 }
 
-bool osq_lock(struct optimistic_spin_queue **lock)
+bool osq_lock(struct optimistic_spin_queue *lock)
 {
-	struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node);
-	struct optimistic_spin_queue *prev, *next;
+	struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
+	struct optimistic_spin_node *prev, *next;
+	int curr = encode_cpu(smp_processor_id());
+	int old;
 
 	node->locked = 0;
 	node->next = NULL;
+	node->cpu = curr;
 
-	node->prev = prev = xchg(lock, node);
-	if (likely(prev == NULL))
+	old = atomic_xchg(&lock->tail, curr);
+	if (old == OSQ_UNLOCKED_VAL)
 		return true;
 
+	prev = decode_cpu(old);
+	node->prev = prev;
 	ACCESS_ONCE(prev->next) = node;
 
 	/*
@@ -149,20 +180,21 @@ unqueue:
 	return false;
 }
 
-void osq_unlock(struct optimistic_spin_queue **lock)
+void osq_unlock(struct optimistic_spin_queue *lock)
 {
-	struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node);
-	struct optimistic_spin_queue *next;
+	struct optimistic_spin_node *node, *next;
+	int curr = encode_cpu(smp_processor_id());
 
 	/*
 	 * Fast path for the uncontended case.
 	 */
-	if (likely(cmpxchg(lock, node, NULL) == node))
+	if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr))
 		return;
 
 	/*
 	 * Second most likely case.
 	 */
+	node = this_cpu_ptr(&osq_node);
 	next = xchg(&node->next, NULL);
 	if (next) {
 		ACCESS_ONCE(next->locked) = 1;
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index a2dbac4aca6b..74356dc0ce29 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -118,12 +118,13 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
  * mutex_lock()/rwsem_down_{read,write}() etc.
  */
 
-struct optimistic_spin_queue {
-	struct optimistic_spin_queue *next, *prev;
+struct optimistic_spin_node {
+	struct optimistic_spin_node *next, *prev;
 	int locked; /* 1 if lock acquired */
+	int cpu; /* encoded CPU # value */
 };
 
-extern bool osq_lock(struct optimistic_spin_queue **lock);
-extern void osq_unlock(struct optimistic_spin_queue **lock);
+extern bool osq_lock(struct optimistic_spin_queue *lock);
+extern void osq_unlock(struct optimistic_spin_queue *lock);
 
 #endif /* __LINUX_MCS_SPINLOCK_H */
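
Note: taken together with the mcs_spinlock.c changes above, the point of the split is that the lock word (struct optimistic_spin_queue) holds only an atomic tail encoding a CPU number plus one, so 0 can mean "unlocked", while the queue nodes live in per-CPU data as struct optimistic_spin_node. The simplified, single-file userspace sketch below illustrates only that encoding; it uses a plain array in place of per-CPU variables and models the uncontended fast paths, not the queueing or unqueue logic.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS			4
#define OSQ_UNLOCKED_VAL	0

struct spin_node {
	struct spin_node *next, *prev;
	int locked;
	int cpu;		/* encoded CPU # value */
};

struct spin_queue {
	atomic_int tail;	/* 0 = unlocked, else cpu + 1 */
};

static struct spin_node nodes[NR_CPUS];	/* stand-in for per-CPU data */

static int encode_cpu(int cpu)			{ return cpu + 1; }
static struct spin_node *decode_cpu(int val)	{ return &nodes[val - 1]; }

/* Uncontended fast path only: queueing behind a previous owner omitted. */
static bool osq_trylock(struct spin_queue *lock, int cpu)
{
	int curr = encode_cpu(cpu);
	struct spin_node *node = &nodes[cpu];

	node->locked = 0;
	node->next = NULL;
	node->cpu = curr;

	/* xchg installs us as tail; the old value says whether it was free. */
	return atomic_exchange(&lock->tail, curr) == OSQ_UNLOCKED_VAL;
}

static bool osq_unlock_fast(struct spin_queue *lock, int cpu)
{
	int curr = encode_cpu(cpu);

	return atomic_compare_exchange_strong(&lock->tail, &curr,
					      OSQ_UNLOCKED_VAL);
}

int main(void)
{
	struct spin_queue lock = { .tail = OSQ_UNLOCKED_VAL };

	printf("cpu1 trylock: %d\n", osq_trylock(&lock, 1));
	printf("tail decodes to node of cpu %d\n",
	       decode_cpu(atomic_load(&lock.tail))->cpu - 1);
	printf("cpu1 unlock fast path: %d\n", osq_unlock_fast(&lock, 1));
	return 0;
}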
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index bc73d33c6760..acca2c1a3c5e 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -60,7 +60,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
 	INIT_LIST_HEAD(&lock->wait_list);
 	mutex_clear_owner(lock);
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-	lock->osq = NULL;
+	osq_lock_init(&lock->osq);
 #endif
 
 	debug_mutex_init(lock, name, key);
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 9be8a9144978..2c93571162cb 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -26,7 +26,7 @@ int rwsem_is_locked(struct rw_semaphore *sem)
 	unsigned long flags;
 
 	if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) {
-		ret = (sem->activity != 0);
+		ret = (sem->count != 0);
 		raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
 	}
 	return ret;
@@ -46,7 +46,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
 	debug_check_no_locks_freed((void *)sem, sizeof(*sem));
 	lockdep_init_map(&sem->dep_map, name, key, 0);
 #endif
-	sem->activity = 0;
+	sem->count = 0;
 	raw_spin_lock_init(&sem->wait_lock);
 	INIT_LIST_HEAD(&sem->wait_list);
 }
@@ -95,7 +95,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
 		waiter = list_entry(next, struct rwsem_waiter, list);
 	} while (waiter->type != RWSEM_WAITING_FOR_WRITE);
 
-	sem->activity += woken;
+	sem->count += woken;
 
  out:
 	return sem;
@@ -126,9 +126,9 @@ void __sched __down_read(struct rw_semaphore *sem)
 
 	raw_spin_lock_irqsave(&sem->wait_lock, flags);
 
-	if (sem->activity >= 0 && list_empty(&sem->wait_list)) {
+	if (sem->count >= 0 && list_empty(&sem->wait_list)) {
 		/* granted */
-		sem->activity++;
+		sem->count++;
 		raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
 		goto out;
 	}
@@ -170,9 +170,9 @@ int __down_read_trylock(struct rw_semaphore *sem)
 
 	raw_spin_lock_irqsave(&sem->wait_lock, flags);
 
-	if (sem->activity >= 0 && list_empty(&sem->wait_list)) {
+	if (sem->count >= 0 && list_empty(&sem->wait_list)) {
 		/* granted */
-		sem->activity++;
+		sem->count++;
 		ret = 1;
 	}
 
@@ -206,7 +206,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
 	 * itself into sleep and waiting for system woke it or someone
 	 * else in the head of the wait list up.
 	 */
-	if (sem->activity == 0)
+	if (sem->count == 0)
 		break;
 	set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 	raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -214,7 +214,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
 		raw_spin_lock_irqsave(&sem->wait_lock, flags);
 	}
 	/* got the lock */
-	sem->activity = -1;
+	sem->count = -1;
 	list_del(&waiter.list);
 
 	raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -235,9 +235,9 @@ int __down_write_trylock(struct rw_semaphore *sem)
 
 	raw_spin_lock_irqsave(&sem->wait_lock, flags);
 
-	if (sem->activity == 0) {
+	if (sem->count == 0) {
 		/* got the lock */
-		sem->activity = -1;
+		sem->count = -1;
 		ret = 1;
 	}
 
@@ -255,7 +255,7 @@ void __up_read(struct rw_semaphore *sem)
 
 	raw_spin_lock_irqsave(&sem->wait_lock, flags);
 
-	if (--sem->activity == 0 && !list_empty(&sem->wait_list))
+	if (--sem->count == 0 && !list_empty(&sem->wait_list))
 		sem = __rwsem_wake_one_writer(sem);
 
 	raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -270,7 +270,7 @@ void __up_write(struct rw_semaphore *sem)
 
 	raw_spin_lock_irqsave(&sem->wait_lock, flags);
 
-	sem->activity = 0;
+	sem->count = 0;
 	if (!list_empty(&sem->wait_list))
 		sem = __rwsem_do_wake(sem, 1);
 
@@ -287,7 +287,7 @@ void __downgrade_write(struct rw_semaphore *sem)
 
 	raw_spin_lock_irqsave(&sem->wait_lock, flags);
 
-	sem->activity = 1;
+	sem->count = 1;
 	if (!list_empty(&sem->wait_list))
 		sem = __rwsem_do_wake(sem, 0);
 
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index dacc32142fcc..a2391ac135c8 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -82,9 +82,9 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
 	sem->count = RWSEM_UNLOCKED_VALUE;
 	raw_spin_lock_init(&sem->wait_lock);
 	INIT_LIST_HEAD(&sem->wait_list);
-#ifdef CONFIG_SMP
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 	sem->owner = NULL;
-	sem->osq = NULL;
+	osq_lock_init(&sem->osq);
 #endif
 }
 
@@ -262,7 +262,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
 	return false;
 }
 
-#ifdef CONFIG_SMP
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 /*
  * Try to acquire write lock before the writer has been put on wait queue.
  */
@@ -285,10 +285,10 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
 static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
 {
 	struct task_struct *owner;
-	bool on_cpu = true;
+	bool on_cpu = false;
 
 	if (need_resched())
-		return 0;
+		return false;
 
 	rcu_read_lock();
 	owner = ACCESS_ONCE(sem->owner);
@@ -297,9 +297,9 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
 	rcu_read_unlock();
 
 	/*
-	 * If sem->owner is not set, the rwsem owner may have
-	 * just acquired it and not set the owner yet or the rwsem
-	 * has been released.
+	 * If sem->owner is not set, yet we have just recently entered the
+	 * slowpath, then there is a possibility reader(s) may have the lock.
+	 * To be safe, avoid spinning in these situations.
 	 */
 	return on_cpu;
 }
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 42f806de49d4..e2d3bc7f03b4 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -12,7 +12,7 @@
 
 #include <linux/atomic.h>
 
-#if defined(CONFIG_SMP) && defined(CONFIG_RWSEM_XCHGADD_ALGORITHM)
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 static inline void rwsem_set_owner(struct rw_semaphore *sem)
 {
 	sem->owner = current;
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 0ca8d83e2369..4ee194eb524b 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -186,6 +186,7 @@ void thaw_processes(void)
 
 	printk("Restarting tasks ... ");
 
+	__usermodehelper_set_disable_depth(UMH_FREEZING);
 	thaw_workqueues();
 
 	read_lock(&tasklist_lock);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 4dd8822f732a..ed35a4790afe 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -306,7 +306,7 @@ int suspend_devices_and_enter(suspend_state_t state)
 		error = suspend_ops->begin(state);
 		if (error)
 			goto Close;
-	} else if (state == PM_SUSPEND_FREEZE && freeze_ops->begin) {
+	} else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) {
 		error = freeze_ops->begin();
 		if (error)
 			goto Close;
@@ -335,7 +335,7 @@ int suspend_devices_and_enter(suspend_state_t state)
  Close:
 	if (need_suspend_ops(state) && suspend_ops->end)
 		suspend_ops->end();
-	else if (state == PM_SUSPEND_FREEZE && freeze_ops->end)
+	else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end)
 		freeze_ops->end();
 
 	return error;
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 7fa34f86e5ba..948a7693748e 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -18,7 +18,7 @@
  * Copyright (C) IBM Corporation, 2005, 2006
  *
  * Authors: Paul E. McKenney <paulmck@us.ibm.com>
- *	    Josh Triplett <josh@freedesktop.org>
+ *	    Josh Triplett <josh@joshtriplett.org>
  *
  * See also: Documentation/RCU/torture.txt
  */
@@ -51,7 +51,7 @@
 #include <linux/torture.h>
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
 
 
 torture_param(int, fqs_duration, 0,
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f1ba77363fbb..625d0b0cd75a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -206,6 +206,70 @@ void rcu_bh_qs(int cpu)
 	rdp->passed_quiesce = 1;
 }
 
+static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
+
+static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
+	.dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
+	.dynticks = ATOMIC_INIT(1),
+#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
+	.dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
+	.dynticks_idle = ATOMIC_INIT(1),
+#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+};
+
+/*
+ * Let the RCU core know that this CPU has gone through the scheduler,
+ * which is a quiescent state. This is called when the need for a
+ * quiescent state is urgent, so we burn an atomic operation and full
+ * memory barriers to let the RCU core know about it, regardless of what
+ * this CPU might (or might not) do in the near future.
+ *
+ * We inform the RCU core by emulating a zero-duration dyntick-idle
+ * period, which we in turn do by incrementing the ->dynticks counter
+ * by two.
+ */
+static void rcu_momentary_dyntick_idle(void)
+{
+	unsigned long flags;
+	struct rcu_data *rdp;
+	struct rcu_dynticks *rdtp;
+	int resched_mask;
+	struct rcu_state *rsp;
+
+	local_irq_save(flags);
+
+	/*
+	 * Yes, we can lose flag-setting operations. This is OK, because
+	 * the flag will be set again after some delay.
+	 */
+	resched_mask = raw_cpu_read(rcu_sched_qs_mask);
+	raw_cpu_write(rcu_sched_qs_mask, 0);
+
+	/* Find the flavor that needs a quiescent state. */
+	for_each_rcu_flavor(rsp) {
+		rdp = raw_cpu_ptr(rsp->rda);
+		if (!(resched_mask & rsp->flavor_mask))
+			continue;
+		smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */
+		if (ACCESS_ONCE(rdp->mynode->completed) !=
+		    ACCESS_ONCE(rdp->cond_resched_completed))
+			continue;
+
+		/*
+		 * Pretend to be momentarily idle for the quiescent state.
+		 * This allows the grace-period kthread to record the
+		 * quiescent state, with no need for this CPU to do anything
+		 * further.
+		 */
+		rdtp = this_cpu_ptr(&rcu_dynticks);
+		smp_mb__before_atomic(); /* Earlier stuff before QS. */
+		atomic_add(2, &rdtp->dynticks); /* QS. */
+		smp_mb__after_atomic(); /* Later stuff after QS. */
+		break;
+	}
+	local_irq_restore(flags);
+}
+
 /*
  * Note a context switch. This is a quiescent state for RCU-sched,
  * and requires special handling for preemptible RCU.
@@ -216,19 +280,12 @@ void rcu_note_context_switch(int cpu)
 	trace_rcu_utilization(TPS("Start context switch"));
 	rcu_sched_qs(cpu);
 	rcu_preempt_note_context_switch(cpu);
+	if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
+		rcu_momentary_dyntick_idle();
 	trace_rcu_utilization(TPS("End context switch"));
 }
 EXPORT_SYMBOL_GPL(rcu_note_context_switch);
 
-static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
-	.dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
-	.dynticks = ATOMIC_INIT(1),
-#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
-	.dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
-	.dynticks_idle = ATOMIC_INIT(1),
-#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
-};
-
 static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
 static long qhimark = 10000; /* If this many pending, ignore blimit. */
 static long qlowmark = 100; /* Once only this many pending, use blimit. */
@@ -243,6 +300,13 @@ static ulong jiffies_till_next_fqs = ULONG_MAX;
 module_param(jiffies_till_first_fqs, ulong, 0644);
 module_param(jiffies_till_next_fqs, ulong, 0644);
 
+/*
+ * How long the grace period must be before we start recruiting
+ * quiescent-state help from rcu_note_context_switch().
+ */
+static ulong jiffies_till_sched_qs = HZ / 20;
+module_param(jiffies_till_sched_qs, ulong, 0644);
+
 static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
 				  struct rcu_data *rdp);
 static void force_qs_rnp(struct rcu_state *rsp,
@@ -853,6 +917,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
 				    bool *isidle, unsigned long *maxj)
 {
 	unsigned int curr;
+	int *rcrmp;
 	unsigned int snap;
 
 	curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
@@ -893,27 +958,43 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
 	}
 
 	/*
-	 * There is a possibility that a CPU in adaptive-ticks state
-	 * might run in the kernel with the scheduling-clock tick disabled
-	 * for an extended time period. Invoke rcu_kick_nohz_cpu() to
-	 * force the CPU to restart the scheduling-clock tick in this
-	 * CPU is in this state.
-	 */
-	rcu_kick_nohz_cpu(rdp->cpu);
-
-	/*
-	 * Alternatively, the CPU might be running in the kernel
-	 * for an extended period of time without a quiescent state.
-	 * Attempt to force the CPU through the scheduler to gain the
-	 * needed quiescent state, but only if the grace period has gone
-	 * on for an uncommonly long time. If there are many stuck CPUs,
-	 * we will beat on the first one until it gets unstuck, then move
-	 * to the next. Only do this for the primary flavor of RCU.
+	 * A CPU running for an extended time within the kernel can
+	 * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode,
+	 * even context-switching back and forth between a pair of
+	 * in-kernel CPU-bound tasks cannot advance grace periods.
+	 * So if the grace period is old enough, make the CPU pay attention.
+	 * Note that the unsynchronized assignments to the per-CPU
+	 * rcu_sched_qs_mask variable are safe. Yes, setting of
+	 * bits can be lost, but they will be set again on the next
+	 * force-quiescent-state pass. So lost bit sets do not result
+	 * in incorrect behavior, merely in a grace period lasting
+	 * a few jiffies longer than it might otherwise. Because
+	 * there are at most four threads involved, and because the
+	 * updates are only once every few jiffies, the probability of
+	 * lossage (and thus of slight grace-period extension) is
+	 * quite low.
+	 *
+	 * Note that if the jiffies_till_sched_qs boot/sysfs parameter
+	 * is set too high, we override with half of the RCU CPU stall
+	 * warning delay.
 	 */
-	if (rdp->rsp == rcu_state_p &&
+	rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu);
+	if (ULONG_CMP_GE(jiffies,
+			 rdp->rsp->gp_start + jiffies_till_sched_qs) ||
 	    ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
-		rdp->rsp->jiffies_resched += 5;
-		resched_cpu(rdp->cpu);
+		if (!(ACCESS_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
+			ACCESS_ONCE(rdp->cond_resched_completed) =
+				ACCESS_ONCE(rdp->mynode->completed);
+			smp_mb(); /* ->cond_resched_completed before *rcrmp. */
+			ACCESS_ONCE(*rcrmp) =
+				ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask;
+			resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
+			rdp->rsp->jiffies_resched += 5; /* Enable beating. */
+		} else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
+			/* Time to beat on that CPU again! */
+			resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
+			rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
+		}
 	}
 
 	return 0;
@@ -3491,6 +3572,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
 					   "rcu_node_fqs_1",
 					   "rcu_node_fqs_2",
 					   "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
+	static u8 fl_mask = 0x1;
 	int cpustride = 1;
 	int i;
 	int j;
@@ -3509,6 +3591,8 @@ static void __init rcu_init_one(struct rcu_state *rsp,
 	for (i = 1; i < rcu_num_lvls; i++)
 		rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
 	rcu_init_levelspread(rsp);
+	rsp->flavor_mask = fl_mask;
+	fl_mask <<= 1;
 
 	/* Initialize the elements themselves, starting from the leaves. */
 
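
Note: the rcu_momentary_dyntick_idle() comment above depends on the ->dynticks convention: the counter is odd while the CPU is non-idle and even while idle, and an observer that sees the value change (or sees an even value) since its snapshot may count the CPU as having passed a quiescent state. Incrementing by two keeps the parity but changes the value, i.e. a zero-duration idle period. The following is only a toy userspace model of that observation, with invented names; it is not the kernel's RCU machinery.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_uint dynticks = 1;	/* odd: "CPU is not idle" */

/* Observer side: take a snapshot, re-check later. */
static unsigned int snapshot(void)
{
	return atomic_load(&dynticks);
}

static bool passed_quiescent_state(unsigned int snap)
{
	unsigned int curr = atomic_load(&dynticks);

	/* Even means idle right now; any change since the snapshot means
	 * the CPU was idle (or pretended to be) at some point in between. */
	return (curr & 1) == 0 || curr != snap;
}

/* CPU side: emulate a zero-duration idle period. */
static void momentary_dyntick_idle(void)
{
	atomic_fetch_add(&dynticks, 2);	/* parity preserved, value changed */
}

int main(void)
{
	unsigned int snap = snapshot();

	printf("before: passed QS? %d\n", passed_quiescent_state(snap));
	momentary_dyntick_idle();
	printf("after:  passed QS? %d\n", passed_quiescent_state(snap));
	return 0;
}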
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index bf2c1e669691..0f69a79c5b7d 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -307,6 +307,9 @@ struct rcu_data {
 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
 	unsigned long dynticks_fqs;	/* Kicked due to dynticks idle. */
 	unsigned long offline_fqs;	/* Kicked due to being offline. */
+	unsigned long cond_resched_completed;
+					/* Grace period that needs help */
+					/*  from cond_resched(). */
 
 	/* 5) __rcu_pending() statistics. */
 	unsigned long n_rcu_pending;	/* rcu_pending() calls since boot. */
@@ -392,6 +395,7 @@ struct rcu_state {
 	struct rcu_node *level[RCU_NUM_LVLS];	/* Hierarchy levels. */
 	u32 levelcnt[MAX_RCU_LVLS + 1];		/* # nodes in each level. */
 	u8 levelspread[RCU_NUM_LVLS];		/* kids/node in each level. */
+	u8 flavor_mask;				/* bit in flavor mask. */
 	struct rcu_data __percpu *rda;		/* pointer of percu rcu_data. */
 	void (*call)(struct rcu_head *head,	/* call_rcu() flavor. */
 		     void (*func)(struct rcu_head *head));
@@ -563,7 +567,7 @@ static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
 static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
 static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
 static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
-static void rcu_kick_nohz_cpu(int cpu);
+static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
 static bool init_nocb_callback_list(struct rcu_data *rdp);
 static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
 static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index cbc2c45265e2..02ac0fb186b8 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2404,7 +2404,7 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
  * if an adaptive-ticks CPU is failing to respond to the current grace
  * period and has not be idle from an RCU perspective, kick it.
  */
-static void rcu_kick_nohz_cpu(int cpu)
+static void __maybe_unused rcu_kick_nohz_cpu(int cpu)
 {
 #ifdef CONFIG_NO_HZ_FULL
 	if (tick_nohz_full_cpu(cpu))
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index a2aeb4df0f60..bc7883570530 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -200,12 +200,12 @@ void wait_rcu_gp(call_rcu_func_t crf)
 EXPORT_SYMBOL_GPL(wait_rcu_gp);
 
 #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
-static inline void debug_init_rcu_head(struct rcu_head *head)
+void init_rcu_head(struct rcu_head *head)
 {
 	debug_object_init(head, &rcuhead_debug_descr);
 }
 
-static inline void debug_rcu_head_free(struct rcu_head *head)
+void destroy_rcu_head(struct rcu_head *head)
 {
 	debug_object_free(head, &rcuhead_debug_descr);
 }
@@ -350,21 +350,3 @@ static int __init check_cpu_stall_init(void)
 early_initcall(check_cpu_stall_init);
 
 #endif /* #ifdef CONFIG_RCU_STALL_COMMON */
-
-/*
- * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings.
- */
-
-DEFINE_PER_CPU(int, rcu_cond_resched_count);
-
-/*
- * Report a set of RCU quiescent states, for use by cond_resched()
- * and friends. Out of line due to being called infrequently.
- */
-void rcu_resched(void)
-{
-	preempt_disable();
-	__this_cpu_write(rcu_cond_resched_count, 0);
-	rcu_note_context_switch(smp_processor_id());
-	preempt_enable();
-}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3bdf01b494fe..bc1638b33449 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4147,7 +4147,6 @@ static void __cond_resched(void)
 
 int __sched _cond_resched(void)
 {
-	rcu_cond_resched();
 	if (should_resched()) {
 		__cond_resched();
 		return 1;
@@ -4166,18 +4165,15 @@ EXPORT_SYMBOL(_cond_resched);
  */
 int __cond_resched_lock(spinlock_t *lock)
 {
-	bool need_rcu_resched = rcu_should_resched();
 	int resched = should_resched();
 	int ret = 0;
 
 	lockdep_assert_held(lock);
 
-	if (spin_needbreak(lock) || resched || need_rcu_resched) {
+	if (spin_needbreak(lock) || resched) {
 		spin_unlock(lock);
 		if (resched)
 			__cond_resched();
-		else if (unlikely(need_rcu_resched))
-			rcu_resched();
 		else
 			cpu_relax();
 		ret = 1;
@@ -4191,7 +4187,6 @@ int __sched __cond_resched_softirq(void)
 {
 	BUG_ON(!in_softirq());
 
-	rcu_cond_resched(); /* BH disabled OK, just recording QSes. */
 	if (should_resched()) {
 		local_bh_enable();
 		__cond_resched();
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 695f9773bb60..627b3c34b821 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -608,7 +608,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 
 	avg_atom = p->se.sum_exec_runtime;
 	if (nr_switches)
-		do_div(avg_atom, nr_switches);
+		avg_atom = div64_ul(avg_atom, nr_switches);
 	else
 		avg_atom = -1LL;
 
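
Note: the do_div() to div64_ul() switch matters because do_div() divides a 64-bit dividend by a 32-bit divisor; nr_switches is a 64-bit count here, and truncating a large value to 32 bits can even yield zero. div64_ul() does the full 64-bit division. The userspace sketch below emulates the two helpers (it does not use the kernel ones) to show the truncation hazard.

#include <stdint.h>
#include <stdio.h>

/* Rough stand-in for do_div(): the divisor is only 32 bits wide. */
static uint64_t div_u64_by_u32(uint64_t dividend, uint32_t divisor)
{
	return dividend / divisor;	/* divide-by-zero if truncation gave 0 */
}

/* Rough stand-in for div64_ul(): full 64-bit by 64-bit division. */
static uint64_t div_u64_by_u64(uint64_t dividend, uint64_t divisor)
{
	return dividend / divisor;
}

int main(void)
{
	uint64_t sum_exec_runtime = 123456789012345ULL;	/* made-up numbers */
	uint64_t nr_switches = 0x100000001ULL;		/* wider than 32 bits */

	/* Truncating the divisor gives a wildly wrong (or crashing) answer. */
	uint32_t truncated = (uint32_t)nr_switches;	/* == 1 here */

	printf("truncated divisor: %u\n", truncated);
	printf("wrong avg:   %llu\n",
	       (unsigned long long)div_u64_by_u32(sum_exec_runtime, truncated));
	printf("correct avg: %llu\n",
	       (unsigned long long)div_u64_by_u64(sum_exec_runtime, nr_switches));
	return 0;
}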
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 88c9c65a430d..fe75444ae7ec 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -585,9 +585,14 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
 			   struct itimerspec *new_setting,
 			   struct itimerspec *old_setting)
 {
+	ktime_t exp;
+
 	if (!rtcdev)
 		return -ENOTSUPP;
 
+	if (flags & ~TIMER_ABSTIME)
+		return -EINVAL;
+
 	if (old_setting)
 		alarm_timer_get(timr, old_setting);
 
@@ -597,8 +602,16 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
 
 	/* start the timer */
 	timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval);
-	alarm_start(&timr->it.alarm.alarmtimer,
-		    timespec_to_ktime(new_setting->it_value));
+	exp = timespec_to_ktime(new_setting->it_value);
+	/* Convert (if necessary) to absolute time */
+	if (flags != TIMER_ABSTIME) {
+		ktime_t now;
+
+		now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime();
+		exp = ktime_add(now, exp);
+	}
+
+	alarm_start(&timr->it.alarm.alarmtimer, exp);
 	return 0;
 }
 
@@ -730,6 +743,9 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
 	if (!alarmtimer_get_rtcdev())
 		return -ENOTSUPP;
 
+	if (flags & ~TIMER_ABSTIME)
+		return -EINVAL;
+
 	if (!capable(CAP_WAKE_ALARM))
 		return -EPERM;
 
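
Note: two small patterns appear in the alarmtimer hunks: unknown flag bits are rejected with `flags & ~TIMER_ABSTIME`, and a relative expiry is converted to an absolute one by adding the current time of the underlying clock before the timer is armed. A minimal userspace sketch of the same conversion follows, using plain timespec arithmetic and an invented flag constant rather than the kernel's ktime helpers.

#include <stdio.h>
#include <time.h>

#define MY_TIMER_ABSTIME	0x01	/* stand-in for TIMER_ABSTIME */

static struct timespec timespec_add(struct timespec a, struct timespec b)
{
	struct timespec r = { a.tv_sec + b.tv_sec, a.tv_nsec + b.tv_nsec };

	if (r.tv_nsec >= 1000000000L) {
		r.tv_sec++;
		r.tv_nsec -= 1000000000L;
	}
	return r;
}

/* Computes the absolute expiry; returns -1 on an unsupported flag. */
static int arm_expiry(int flags, struct timespec value, struct timespec *exp)
{
	if (flags & ~MY_TIMER_ABSTIME)
		return -1;			/* unknown flag bits: reject */

	if (flags != MY_TIMER_ABSTIME) {	/* relative: make it absolute */
		struct timespec now;

		clock_gettime(CLOCK_REALTIME, &now);
		value = timespec_add(now, value);
	}
	*exp = value;
	return 0;
}

int main(void)
{
	struct timespec in = { .tv_sec = 5 }, exp;

	if (arm_expiry(0, in, &exp) == 0)
		printf("relative 5s -> absolute %lld.%09ld\n",
		       (long long)exp.tv_sec, exp.tv_nsec);
	printf("bogus flags rejected: %d\n", arm_expiry(0xff, in, &exp));
	return 0;
}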
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index ad362c260ef4..9c94c19f1305 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -146,7 +146,8 @@ static int clockevents_increase_min_delta(struct clock_event_device *dev)
 {
 	/* Nothing to do if we already reached the limit */
 	if (dev->min_delta_ns >= MIN_DELTA_LIMIT) {
-		printk(KERN_WARNING "CE: Reprogramming failure. Giving up\n");
+		printk_deferred(KERN_WARNING
+				"CE: Reprogramming failure. Giving up\n");
 		dev->next_event.tv64 = KTIME_MAX;
 		return -ETIME;
 	}
@@ -159,9 +160,10 @@ static int clockevents_increase_min_delta(struct clock_event_device *dev)
 	if (dev->min_delta_ns > MIN_DELTA_LIMIT)
 		dev->min_delta_ns = MIN_DELTA_LIMIT;
 
-	printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n",
-	       dev->name ? dev->name : "?",
-	       (unsigned long long) dev->min_delta_ns);
+	printk_deferred(KERN_WARNING
+			"CE: %s increased min_delta_ns to %llu nsec\n",
+			dev->name ? dev->name : "?",
+			(unsigned long long) dev->min_delta_ns);
 	return 0;
 }
 
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 445106d2c729..01d2d15aa662 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -191,7 +191,8 @@ void __init sched_clock_postinit(void)
 
 static int sched_clock_suspend(void)
 {
-	sched_clock_poll(&sched_clock_timer);
+	update_sched_clock();
+	hrtimer_cancel(&sched_clock_timer);
 	cd.suspended = true;
 	return 0;
 }
@@ -199,6 +200,7 @@ static int sched_clock_suspend(void)
 static void sched_clock_resume(void)
 {
 	cd.epoch_cyc = read_sched_clock();
+	hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
 	cd.suspended = false;
 }
 
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 5b372e3ed675..ac9d1dad630b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -265,12 +265,12 @@ static void update_ftrace_function(void)
 		func = ftrace_ops_list_func;
 	}
 
+	update_function_graph_func();
+
 	/* If there's no change, then do nothing more here */
 	if (ftrace_trace_function == func)
 		return;
 
-	update_function_graph_func();
-
 	/*
 	 * If we are using the list function, it doesn't care
 	 * about the function_trace_ops.
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7c56c3d06943..ff7027199a9a 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -616,10 +616,6 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct rb_irq_work *work;
 
-	if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
-	    (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
-		return POLLIN | POLLRDNORM;
-
 	if (cpu == RING_BUFFER_ALL_CPUS)
 		work = &buffer->irq_work;
 	else {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f243444a3772..291397e66669 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -466,6 +466,12 @@ int __trace_puts(unsigned long ip, const char *str, int size)
 	struct print_entry *entry;
 	unsigned long irq_flags;
 	int alloc;
+	int pc;
+
+	if (!(trace_flags & TRACE_ITER_PRINTK))
+		return 0;
+
+	pc = preempt_count();
 
 	if (unlikely(tracing_selftest_running || tracing_disabled))
 		return 0;
@@ -475,7 +481,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
 	local_save_flags(irq_flags);
 	buffer = global_trace.trace_buffer.buffer;
 	event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
-					  irq_flags, preempt_count());
+					  irq_flags, pc);
 	if (!event)
 		return 0;
 
@@ -492,6 +498,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
 	entry->buf[size] = '\0';
 
 	__buffer_unlock_commit(buffer, event);
+	ftrace_trace_stack(buffer, irq_flags, 4, pc);
 
 	return size;
 }
@@ -509,6 +516,12 @@ int __trace_bputs(unsigned long ip, const char *str)
 	struct bputs_entry *entry;
 	unsigned long irq_flags;
 	int size = sizeof(struct bputs_entry);
+	int pc;
+
+	if (!(trace_flags & TRACE_ITER_PRINTK))
+		return 0;
+
+	pc = preempt_count();
 
 	if (unlikely(tracing_selftest_running || tracing_disabled))
 		return 0;
@@ -516,7 +529,7 @@ int __trace_bputs(unsigned long ip, const char *str)
 	local_save_flags(irq_flags);
 	buffer = global_trace.trace_buffer.buffer;
 	event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
-					  irq_flags, preempt_count());
+					  irq_flags, pc);
 	if (!event)
 		return 0;
 
@@ -525,6 +538,7 @@ int __trace_bputs(unsigned long ip, const char *str)
 	entry->str = str;
 
 	__buffer_unlock_commit(buffer, event);
+	ftrace_trace_stack(buffer, irq_flags, 4, pc);
 
 	return 1;
 }
@@ -809,7 +823,7 @@ static struct {
 	{ trace_clock_local,	"local",	1 },
 	{ trace_clock_global,	"global",	1 },
 	{ trace_clock_counter,	"counter",	0 },
-	{ trace_clock_jiffies,	"uptime",	1 },
+	{ trace_clock_jiffies,	"uptime",	0 },
 	{ trace_clock,		"perf",		1 },
 	ARCH_TRACE_CLOCKS
 };
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 26dc348332b7..57b67b1f24d1 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -59,13 +59,14 @@ u64 notrace trace_clock(void)
 
 /*
  * trace_jiffy_clock(): Simply use jiffies as a clock counter.
+ * Note that this use of jiffies_64 is not completely safe on
+ * 32-bit systems. But the window is tiny, and the effect if
+ * we are affected is that we will have an obviously bogus
+ * timestamp on a trace event - i.e. not life threatening.
  */
 u64 notrace trace_clock_jiffies(void)
 {
-	u64 jiffy = jiffies - INITIAL_JIFFIES;
-
-	/* Return nsecs */
-	return (u64)jiffies_to_usecs(jiffy) * 1000ULL;
+	return jiffies_64_to_clock_t(jiffies_64 - INITIAL_JIFFIES);
 }
 
 /*
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f99e0b3bca8c..2de53628689f 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -470,6 +470,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file)
 
 	list_del(&file->list);
 	remove_subsystem(file->system);
+	free_event_filter(file->filter);
 	kmem_cache_free(file_cachep, file);
 }
 