Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup.c            |  30
-rw-r--r--  kernel/events/core.c       |  11
-rw-r--r--  kernel/exit.c              |  19
-rw-r--r--  kernel/irq/chip.c          |   8
-rw-r--r--  kernel/irq/internals.h     |   3
-rw-r--r--  kernel/irq/manage.c        |  39
-rw-r--r--  kernel/irq/migration.c     |  13
-rw-r--r--  kernel/panic.c             |   6
-rw-r--r--  kernel/pid_namespace.c     |  20
-rw-r--r--  kernel/printk.c            | 532
-rw-r--r--  kernel/rcutree.c           |  16
-rw-r--r--  kernel/rcutree.h           |  14
-rw-r--r--  kernel/rcutree_plugin.h    | 165
-rw-r--r--  kernel/sched/core.c        | 249
-rw-r--r--  kernel/sched/fair.c        |  71
-rw-r--r--  kernel/sched/rt.c          |  53
-rw-r--r--  kernel/sched/sched.h       |   2
-rw-r--r--  kernel/smpboot.c           |  17
-rw-r--r--  kernel/sys.c               |  60
-rw-r--r--  kernel/time/clockevents.c  |   3
-rw-r--r--  kernel/time/tick-sched.c   |  26
-rw-r--r--  kernel/time/timekeeping.c  |   2
-rw-r--r--  kernel/trace/trace.c       |   2
-rw-r--r--  kernel/watchdog.c          |  19
24 files changed, 996 insertions(+), 384 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0f3527d6184a..2097684cf194 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -255,12 +255,17 @@ int cgroup_lock_is_held(void)
255 255
256EXPORT_SYMBOL_GPL(cgroup_lock_is_held); 256EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
257 257
258static int css_unbias_refcnt(int refcnt)
259{
260 return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
261}
262
258/* the current nr of refs, always >= 0 whether @css is deactivated or not */ 263/* the current nr of refs, always >= 0 whether @css is deactivated or not */
259static int css_refcnt(struct cgroup_subsys_state *css) 264static int css_refcnt(struct cgroup_subsys_state *css)
260{ 265{
261 int v = atomic_read(&css->refcnt); 266 int v = atomic_read(&css->refcnt);
262 267
263 return v >= 0 ? v : v - CSS_DEACT_BIAS; 268 return css_unbias_refcnt(v);
264} 269}
265 270
266/* convenient tests for these bits */ 271/* convenient tests for these bits */
@@ -896,10 +901,13 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
896 mutex_unlock(&cgroup_mutex); 901 mutex_unlock(&cgroup_mutex);
897 902
898 /* 903 /*
899 * Drop the active superblock reference that we took when we 904 * We want to drop the active superblock reference from the
900 * created the cgroup 905 * cgroup creation after all the dentry refs are gone -
906 * kill_sb gets mighty unhappy otherwise. Mark
907 * dentry->d_fsdata with cgroup_diput() to tell
908 * cgroup_d_release() to call deactivate_super().
901 */ 909 */
902 deactivate_super(cgrp->root->sb); 910 dentry->d_fsdata = cgroup_diput;
903 911
904 /* 912 /*
905 * if we're getting rid of the cgroup, refcount should ensure 913 * if we're getting rid of the cgroup, refcount should ensure
@@ -925,6 +933,13 @@ static int cgroup_delete(const struct dentry *d)
925 return 1; 933 return 1;
926} 934}
927 935
936static void cgroup_d_release(struct dentry *dentry)
937{
938 /* did cgroup_diput() tell me to deactivate super? */
939 if (dentry->d_fsdata == cgroup_diput)
940 deactivate_super(dentry->d_sb);
941}
942
928static void remove_dir(struct dentry *d) 943static void remove_dir(struct dentry *d)
929{ 944{
930 struct dentry *parent = dget(d->d_parent); 945 struct dentry *parent = dget(d->d_parent);
@@ -1532,6 +1547,7 @@ static int cgroup_get_rootdir(struct super_block *sb)
1532 static const struct dentry_operations cgroup_dops = { 1547 static const struct dentry_operations cgroup_dops = {
1533 .d_iput = cgroup_diput, 1548 .d_iput = cgroup_diput,
1534 .d_delete = cgroup_delete, 1549 .d_delete = cgroup_delete,
1550 .d_release = cgroup_d_release,
1535 }; 1551 };
1536 1552
1537 struct inode *inode = 1553 struct inode *inode =
@@ -4971,10 +4987,12 @@ EXPORT_SYMBOL_GPL(__css_tryget);
4971void __css_put(struct cgroup_subsys_state *css) 4987void __css_put(struct cgroup_subsys_state *css)
4972{ 4988{
4973 struct cgroup *cgrp = css->cgroup; 4989 struct cgroup *cgrp = css->cgroup;
4990 int v;
4974 4991
4975 rcu_read_lock(); 4992 rcu_read_lock();
4976 atomic_dec(&css->refcnt); 4993 v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
4977 switch (css_refcnt(css)) { 4994
4995 switch (v) {
4978 case 1: 4996 case 1:
4979 if (notify_on_release(cgrp)) { 4997 if (notify_on_release(cgrp)) {
4980 set_bit(CGRP_RELEASABLE, &cgrp->flags); 4998 set_bit(CGRP_RELEASABLE, &cgrp->flags);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5b06cbbf6931..d7d71d6ec972 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -253,9 +253,9 @@ perf_cgroup_match(struct perf_event *event)
253 return !event->cgrp || event->cgrp == cpuctx->cgrp; 253 return !event->cgrp || event->cgrp == cpuctx->cgrp;
254} 254}
255 255
256static inline void perf_get_cgroup(struct perf_event *event) 256static inline bool perf_tryget_cgroup(struct perf_event *event)
257{ 257{
258 css_get(&event->cgrp->css); 258 return css_tryget(&event->cgrp->css);
259} 259}
260 260
261static inline void perf_put_cgroup(struct perf_event *event) 261static inline void perf_put_cgroup(struct perf_event *event)
@@ -484,7 +484,11 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
484 event->cgrp = cgrp; 484 event->cgrp = cgrp;
485 485
486 /* must be done before we fput() the file */ 486 /* must be done before we fput() the file */
487 perf_get_cgroup(event); 487 if (!perf_tryget_cgroup(event)) {
488 event->cgrp = NULL;
489 ret = -ENOENT;
490 goto out;
491 }
488 492
489 /* 493 /*
490 * all events in a group must monitor 494 * all events in a group must monitor
@@ -3181,7 +3185,6 @@ static void perf_event_for_each(struct perf_event *event,
3181 event = event->group_leader; 3185 event = event->group_leader;
3182 3186
3183 perf_event_for_each_child(event, func); 3187 perf_event_for_each_child(event, func);
3184 func(event);
3185 list_for_each_entry(sibling, &event->sibling_list, group_entry) 3188 list_for_each_entry(sibling, &event->sibling_list, group_entry)
3186 perf_event_for_each_child(sibling, func); 3189 perf_event_for_each_child(sibling, func);
3187 mutex_unlock(&ctx->mutex); 3190 mutex_unlock(&ctx->mutex);
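Two changes in events/core.c: perf_get_cgroup() becomes perf_tryget_cgroup(), so attaching an event to a cgroup whose css is already being torn down fails with -ENOENT instead of taking a reference on a dying object, and the stray second func(event) call in perf_event_for_each() is dropped so the group leader is visited only once. The general try-get pattern looks roughly like the sketch below (C11 atomics, purely illustrative; css_tryget() itself is implemented differently, on top of the deactivation bias):

#include <stdatomic.h>
#include <stdbool.h>

/* take a reference only if the object is still live */
static bool tryget(atomic_int *refcnt)
{
        int old = atomic_load(refcnt);

        while (old > 0) {
                /* on failure, 'old' is reloaded with the current value */
                if (atomic_compare_exchange_weak(refcnt, &old, old + 1))
                        return true;
        }
        return false;   /* object already dying, caller must back off */
}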
diff --git a/kernel/exit.c b/kernel/exit.c
index 34867cc5b42a..2f59cc334516 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -72,6 +72,18 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
72 list_del_rcu(&p->tasks); 72 list_del_rcu(&p->tasks);
73 list_del_init(&p->sibling); 73 list_del_init(&p->sibling);
74 __this_cpu_dec(process_counts); 74 __this_cpu_dec(process_counts);
75 /*
76 * If we are the last child process in a pid namespace to be
 77 * reaped, notify the reaper sleeping in zap_pid_ns_processes().
78 */
79 if (IS_ENABLED(CONFIG_PID_NS)) {
80 struct task_struct *parent = p->real_parent;
81
82 if ((task_active_pid_ns(parent)->child_reaper == parent) &&
83 list_empty(&parent->children) &&
84 (parent->flags & PF_EXITING))
85 wake_up_process(parent);
86 }
75 } 87 }
76 list_del_rcu(&p->thread_group); 88 list_del_rcu(&p->thread_group);
77} 89}
@@ -643,6 +655,7 @@ static void exit_mm(struct task_struct * tsk)
643 mm_release(tsk, mm); 655 mm_release(tsk, mm);
644 if (!mm) 656 if (!mm)
645 return; 657 return;
658 sync_mm_rss(mm);
646 /* 659 /*
647 * Serialize with any possible pending coredump. 660 * Serialize with any possible pending coredump.
648 * We must hold mmap_sem around checking core_state 661 * We must hold mmap_sem around checking core_state
@@ -719,12 +732,6 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
719 732
720 zap_pid_ns_processes(pid_ns); 733 zap_pid_ns_processes(pid_ns);
721 write_lock_irq(&tasklist_lock); 734 write_lock_irq(&tasklist_lock);
722 /*
723 * We can not clear ->child_reaper or leave it alone.
724 * There may by stealth EXIT_DEAD tasks on ->children,
725 * forget_original_parent() must move them somewhere.
726 */
727 pid_ns->child_reaper = init_pid_ns.child_reaper;
728 } else if (father->signal->has_child_subreaper) { 735 } else if (father->signal->has_child_subreaper) {
729 struct task_struct *reaper; 736 struct task_struct *reaper;
730 737
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index fc275e4f629b..eebd6d5cfb44 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -275,8 +275,10 @@ void handle_nested_irq(unsigned int irq)
275 kstat_incr_irqs_this_cpu(irq, desc); 275 kstat_incr_irqs_this_cpu(irq, desc);
276 276
277 action = desc->action; 277 action = desc->action;
278 if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) 278 if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) {
279 desc->istate |= IRQS_PENDING;
279 goto out_unlock; 280 goto out_unlock;
281 }
280 282
281 irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); 283 irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
282 raw_spin_unlock_irq(&desc->lock); 284 raw_spin_unlock_irq(&desc->lock);
@@ -324,8 +326,10 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
324 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); 326 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
325 kstat_incr_irqs_this_cpu(irq, desc); 327 kstat_incr_irqs_this_cpu(irq, desc);
326 328
327 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) 329 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
330 desc->istate |= IRQS_PENDING;
328 goto out_unlock; 331 goto out_unlock;
332 }
329 333
330 handle_irq_event(desc); 334 handle_irq_event(desc);
331 335
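Both flow handlers now latch IRQS_PENDING when an interrupt arrives while the line has no action or is disabled, so the event can be resent once the interrupt is enabled again rather than silently dropped. The shape of that latch-and-replay pattern, reduced to a toy sketch (not the genirq code, which does the replay in the resend path):

#include <stdbool.h>

static bool pending;            /* stands in for the IRQS_PENDING bit */

static void handle_event(void)
{
        /* the work the real flow handler would do */
}

static void irq_arrives(bool can_handle)
{
        if (!can_handle) {
                pending = true; /* latch the event instead of losing it */
                return;
        }
        handle_event();
}

static void irq_reenabled(void)
{
        if (pending) {
                pending = false;
                handle_event(); /* replay the latched event */
        }
}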
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 8e5c56b3b7d9..001fa5bab490 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -101,6 +101,9 @@ extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
101 101
102extern void irq_set_thread_affinity(struct irq_desc *desc); 102extern void irq_set_thread_affinity(struct irq_desc *desc);
103 103
104extern int irq_do_set_affinity(struct irq_data *data,
105 const struct cpumask *dest, bool force);
106
104/* Inline functions for support of irq chips on slow busses */ 107/* Inline functions for support of irq chips on slow busses */
105static inline void chip_bus_lock(struct irq_desc *desc) 108static inline void chip_bus_lock(struct irq_desc *desc)
106{ 109{
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ea0c6c2ae6f7..8c548232ba39 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -142,6 +142,25 @@ static inline void
142irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } 142irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
143#endif 143#endif
144 144
145int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
146 bool force)
147{
148 struct irq_desc *desc = irq_data_to_desc(data);
149 struct irq_chip *chip = irq_data_get_irq_chip(data);
150 int ret;
151
152 ret = chip->irq_set_affinity(data, mask, false);
153 switch (ret) {
154 case IRQ_SET_MASK_OK:
155 cpumask_copy(data->affinity, mask);
156 case IRQ_SET_MASK_OK_NOCOPY:
157 irq_set_thread_affinity(desc);
158 ret = 0;
159 }
160
161 return ret;
162}
163
145int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) 164int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
146{ 165{
147 struct irq_chip *chip = irq_data_get_irq_chip(data); 166 struct irq_chip *chip = irq_data_get_irq_chip(data);
@@ -152,14 +171,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
152 return -EINVAL; 171 return -EINVAL;
153 172
154 if (irq_can_move_pcntxt(data)) { 173 if (irq_can_move_pcntxt(data)) {
155 ret = chip->irq_set_affinity(data, mask, false); 174 ret = irq_do_set_affinity(data, mask, false);
156 switch (ret) {
157 case IRQ_SET_MASK_OK:
158 cpumask_copy(data->affinity, mask);
159 case IRQ_SET_MASK_OK_NOCOPY:
160 irq_set_thread_affinity(desc);
161 ret = 0;
162 }
163 } else { 175 } else {
164 irqd_set_move_pending(data); 176 irqd_set_move_pending(data);
165 irq_copy_pending(desc, mask); 177 irq_copy_pending(desc, mask);
@@ -283,9 +295,8 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
283static int 295static int
284setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) 296setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
285{ 297{
286 struct irq_chip *chip = irq_desc_get_chip(desc);
287 struct cpumask *set = irq_default_affinity; 298 struct cpumask *set = irq_default_affinity;
288 int ret, node = desc->irq_data.node; 299 int node = desc->irq_data.node;
289 300
290 /* Excludes PER_CPU and NO_BALANCE interrupts */ 301 /* Excludes PER_CPU and NO_BALANCE interrupts */
291 if (!irq_can_set_affinity(irq)) 302 if (!irq_can_set_affinity(irq))
@@ -311,13 +322,7 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
311 if (cpumask_intersects(mask, nodemask)) 322 if (cpumask_intersects(mask, nodemask))
312 cpumask_and(mask, mask, nodemask); 323 cpumask_and(mask, mask, nodemask);
313 } 324 }
314 ret = chip->irq_set_affinity(&desc->irq_data, mask, false); 325 irq_do_set_affinity(&desc->irq_data, mask, false);
315 switch (ret) {
316 case IRQ_SET_MASK_OK:
317 cpumask_copy(desc->irq_data.affinity, mask);
318 case IRQ_SET_MASK_OK_NOCOPY:
319 irq_set_thread_affinity(desc);
320 }
321 return 0; 326 return 0;
322} 327}
323#else 328#else
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index c3c89751b327..ca3f4aaff707 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -42,17 +42,8 @@ void irq_move_masked_irq(struct irq_data *idata)
42 * For correct operation this depends on the caller 42 * For correct operation this depends on the caller
43 * masking the irqs. 43 * masking the irqs.
44 */ 44 */
45 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) 45 if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids)
46 < nr_cpu_ids)) { 46 irq_do_set_affinity(&desc->irq_data, desc->pending_mask, false);
47 int ret = chip->irq_set_affinity(&desc->irq_data,
48 desc->pending_mask, false);
49 switch (ret) {
50 case IRQ_SET_MASK_OK:
51 cpumask_copy(desc->irq_data.affinity, desc->pending_mask);
52 case IRQ_SET_MASK_OK_NOCOPY:
53 irq_set_thread_affinity(desc);
54 }
55 }
56 47
57 cpumask_clear(desc->pending_mask); 48 cpumask_clear(desc->pending_mask);
58} 49}
diff --git a/kernel/panic.c b/kernel/panic.c
index 8ed89a175d79..d2a5f4ecc6dd 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -27,7 +27,7 @@
27#define PANIC_TIMER_STEP 100 27#define PANIC_TIMER_STEP 100
28#define PANIC_BLINK_SPD 18 28#define PANIC_BLINK_SPD 18
29 29
30int panic_on_oops; 30int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE;
31static unsigned long tainted_mask; 31static unsigned long tainted_mask;
32static int pause_on_oops; 32static int pause_on_oops;
33static int pause_on_oops_flag; 33static int pause_on_oops_flag;
@@ -108,8 +108,6 @@ void panic(const char *fmt, ...)
108 */ 108 */
109 crash_kexec(NULL); 109 crash_kexec(NULL);
110 110
111 kmsg_dump(KMSG_DUMP_PANIC);
112
113 /* 111 /*
114 * Note smp_send_stop is the usual smp shutdown function, which 112 * Note smp_send_stop is the usual smp shutdown function, which
115 * unfortunately means it may not be hardened to work in a panic 113 * unfortunately means it may not be hardened to work in a panic
@@ -117,6 +115,8 @@ void panic(const char *fmt, ...)
117 */ 115 */
118 smp_send_stop(); 116 smp_send_stop();
119 117
118 kmsg_dump(KMSG_DUMP_PANIC);
119
120 atomic_notifier_call_chain(&panic_notifier_list, 0, buf); 120 atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
121 121
122 bust_spinlocks(0); 122 bust_spinlocks(0);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 16b20e38c4a1..b3c7fd554250 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -184,11 +184,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
184 } 184 }
185 read_unlock(&tasklist_lock); 185 read_unlock(&tasklist_lock);
186 186
187 /* Firstly reap the EXIT_ZOMBIE children we may have. */
187 do { 188 do {
188 clear_thread_flag(TIF_SIGPENDING); 189 clear_thread_flag(TIF_SIGPENDING);
189 rc = sys_wait4(-1, NULL, __WALL, NULL); 190 rc = sys_wait4(-1, NULL, __WALL, NULL);
190 } while (rc != -ECHILD); 191 } while (rc != -ECHILD);
191 192
193 /*
194 * sys_wait4() above can't reap the TASK_DEAD children.
195 * Make sure they all go away, see __unhash_process().
196 */
197 for (;;) {
198 bool need_wait = false;
199
200 read_lock(&tasklist_lock);
201 if (!list_empty(&current->children)) {
202 __set_current_state(TASK_UNINTERRUPTIBLE);
203 need_wait = true;
204 }
205 read_unlock(&tasklist_lock);
206
207 if (!need_wait)
208 break;
209 schedule();
210 }
211
192 if (pid_ns->reboot) 212 if (pid_ns->reboot)
193 current->signal->group_exit_code = pid_ns->reboot; 213 current->signal->group_exit_code = pid_ns->reboot;
194 214
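Together with the exit.c hunk earlier, this closes a window in which zap_pid_ns_processes() could return while EXIT_DEAD children were still on ->children: sys_wait4() cannot reap those, so the reaper now sleeps until its child list empties, and __unhash_process() wakes it when the last child goes away. A user-space analogue of that handshake, sketched with pthreads rather than tasklist_lock and schedule():

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  gone = PTHREAD_COND_INITIALIZER;
static int nr_children;

/* last child being torn down, cf. the wake_up_process() in __unhash_process() */
static void child_unhashed(void)
{
        pthread_mutex_lock(&lock);
        if (--nr_children == 0)
                pthread_cond_signal(&gone);
        pthread_mutex_unlock(&lock);
}

/* tail of the namespace reaper, cf. the new loop in zap_pid_ns_processes() */
static void wait_for_children(void)
{
        pthread_mutex_lock(&lock);
        while (nr_children > 0)
                pthread_cond_wait(&gone, &lock);
        pthread_mutex_unlock(&lock);
}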
diff --git a/kernel/printk.c b/kernel/printk.c
index 32462d2b364a..dba18211685e 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -193,12 +193,19 @@ static int console_may_schedule;
193 * separated by ',', and find the message after the ';' character. 193 * separated by ',', and find the message after the ';' character.
194 */ 194 */
195 195
196enum log_flags {
197 LOG_DEFAULT = 0,
198 LOG_NOCONS = 1, /* already flushed, do not print to console */
199};
200
196struct log { 201struct log {
197 u64 ts_nsec; /* timestamp in nanoseconds */ 202 u64 ts_nsec; /* timestamp in nanoseconds */
198 u16 len; /* length of entire record */ 203 u16 len; /* length of entire record */
199 u16 text_len; /* length of text buffer */ 204 u16 text_len; /* length of text buffer */
200 u16 dict_len; /* length of dictionary buffer */ 205 u16 dict_len; /* length of dictionary buffer */
201 u16 level; /* syslog level + facility */ 206 u8 facility; /* syslog facility */
207 u8 flags:5; /* internal record flags */
208 u8 level:3; /* syslog level */
202}; 209};
203 210
204/* 211/*
@@ -227,10 +234,10 @@ static u32 clear_idx;
227#define LOG_LINE_MAX 1024 234#define LOG_LINE_MAX 1024
228 235
229/* record buffer */ 236/* record buffer */
230#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) 237#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
231#define LOG_ALIGN 4 238#define LOG_ALIGN 4
232#else 239#else
233#define LOG_ALIGN 8 240#define LOG_ALIGN __alignof__(struct log)
234#endif 241#endif
235#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) 242#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
236static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); 243static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
@@ -286,6 +293,7 @@ static u32 log_next(u32 idx)
286 293
287/* insert record into the buffer, discard old ones, update heads */ 294/* insert record into the buffer, discard old ones, update heads */
288static void log_store(int facility, int level, 295static void log_store(int facility, int level,
296 enum log_flags flags, u64 ts_nsec,
289 const char *dict, u16 dict_len, 297 const char *dict, u16 dict_len,
290 const char *text, u16 text_len) 298 const char *text, u16 text_len)
291{ 299{
@@ -329,8 +337,13 @@ static void log_store(int facility, int level,
329 msg->text_len = text_len; 337 msg->text_len = text_len;
330 memcpy(log_dict(msg), dict, dict_len); 338 memcpy(log_dict(msg), dict, dict_len);
331 msg->dict_len = dict_len; 339 msg->dict_len = dict_len;
332 msg->level = (facility << 3) | (level & 7); 340 msg->facility = facility;
333 msg->ts_nsec = local_clock(); 341 msg->level = level & 7;
342 msg->flags = flags & 0x1f;
343 if (ts_nsec > 0)
344 msg->ts_nsec = ts_nsec;
345 else
346 msg->ts_nsec = local_clock();
334 memset(log_dict(msg) + dict_len, 0, pad_len); 347 memset(log_dict(msg) + dict_len, 0, pad_len);
335 msg->len = sizeof(struct log) + text_len + dict_len + pad_len; 348 msg->len = sizeof(struct log) + text_len + dict_len + pad_len;
336 349
@@ -414,7 +427,9 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
414 if (!user) 427 if (!user)
415 return -EBADF; 428 return -EBADF;
416 429
417 mutex_lock(&user->lock); 430 ret = mutex_lock_interruptible(&user->lock);
431 if (ret)
432 return ret;
418 raw_spin_lock(&logbuf_lock); 433 raw_spin_lock(&logbuf_lock);
419 while (user->seq == log_next_seq) { 434 while (user->seq == log_next_seq) {
420 if (file->f_flags & O_NONBLOCK) { 435 if (file->f_flags & O_NONBLOCK) {
@@ -444,7 +459,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
444 ts_usec = msg->ts_nsec; 459 ts_usec = msg->ts_nsec;
445 do_div(ts_usec, 1000); 460 do_div(ts_usec, 1000);
446 len = sprintf(user->buf, "%u,%llu,%llu;", 461 len = sprintf(user->buf, "%u,%llu,%llu;",
447 msg->level, user->seq, ts_usec); 462 (msg->facility << 3) | msg->level, user->seq, ts_usec);
448 463
449 /* escape non-printable characters */ 464 /* escape non-printable characters */
450 for (i = 0; i < msg->text_len; i++) { 465 for (i = 0; i < msg->text_len; i++) {
@@ -785,6 +800,21 @@ static bool printk_time;
785#endif 800#endif
786module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); 801module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
787 802
803static size_t print_time(u64 ts, char *buf)
804{
805 unsigned long rem_nsec;
806
807 if (!printk_time)
808 return 0;
809
810 if (!buf)
811 return 15;
812
813 rem_nsec = do_div(ts, 1000000000);
814 return sprintf(buf, "[%5lu.%06lu] ",
815 (unsigned long)ts, rem_nsec / 1000);
816}
817
788static size_t print_prefix(const struct log *msg, bool syslog, char *buf) 818static size_t print_prefix(const struct log *msg, bool syslog, char *buf)
789{ 819{
790 size_t len = 0; 820 size_t len = 0;
@@ -801,18 +831,7 @@ static size_t print_prefix(const struct log *msg, bool syslog, char *buf)
801 } 831 }
802 } 832 }
803 833
804 if (printk_time) { 834 len += print_time(msg->ts_nsec, buf ? buf + len : NULL);
805 if (buf) {
806 unsigned long long ts = msg->ts_nsec;
807 unsigned long rem_nsec = do_div(ts, 1000000000);
808
809 len += sprintf(buf + len, "[%5lu.%06lu] ",
810 (unsigned long) ts, rem_nsec / 1000);
811 } else {
812 len += 15;
813 }
814 }
815
816 return len; 835 return len;
817} 836}
818 837
@@ -860,26 +879,49 @@ static int syslog_print(char __user *buf, int size)
860{ 879{
861 char *text; 880 char *text;
862 struct log *msg; 881 struct log *msg;
863 int len; 882 int len = 0;
864 883
865 text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); 884 text = kmalloc(LOG_LINE_MAX, GFP_KERNEL);
866 if (!text) 885 if (!text)
867 return -ENOMEM; 886 return -ENOMEM;
868 887
869 raw_spin_lock_irq(&logbuf_lock); 888 while (size > 0) {
870 if (syslog_seq < log_first_seq) { 889 size_t n;
871 /* messages are gone, move to first one */ 890
872 syslog_seq = log_first_seq; 891 raw_spin_lock_irq(&logbuf_lock);
873 syslog_idx = log_first_idx; 892 if (syslog_seq < log_first_seq) {
874 } 893 /* messages are gone, move to first one */
875 msg = log_from_idx(syslog_idx); 894 syslog_seq = log_first_seq;
876 len = msg_print_text(msg, true, text, LOG_LINE_MAX); 895 syslog_idx = log_first_idx;
877 syslog_idx = log_next(syslog_idx); 896 }
878 syslog_seq++; 897 if (syslog_seq == log_next_seq) {
879 raw_spin_unlock_irq(&logbuf_lock); 898 raw_spin_unlock_irq(&logbuf_lock);
899 break;
900 }
901 msg = log_from_idx(syslog_idx);
902 n = msg_print_text(msg, true, text, LOG_LINE_MAX);
903 if (n <= size) {
904 syslog_idx = log_next(syslog_idx);
905 syslog_seq++;
906 } else
907 n = 0;
908 raw_spin_unlock_irq(&logbuf_lock);
909
910 if (!n)
911 break;
912
913 len += n;
914 size -= n;
915 buf += n;
916 n = copy_to_user(buf - n, text, n);
880 917
881 if (len > 0 && copy_to_user(buf, text, len)) 918 if (n) {
882 len = -EFAULT; 919 len -= n;
920 if (!len)
921 len = -EFAULT;
922 break;
923 }
924 }
883 925
884 kfree(text); 926 kfree(text);
885 return len; 927 return len;
@@ -909,7 +951,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
909 /* 951 /*
910 * Find first record that fits, including all following records, 952 * Find first record that fits, including all following records,
911 * into the user-provided buffer for this dump. 953 * into the user-provided buffer for this dump.
912 */ 954 */
913 seq = clear_seq; 955 seq = clear_seq;
914 idx = clear_idx; 956 idx = clear_idx;
915 while (seq < log_next_seq) { 957 while (seq < log_next_seq) {
@@ -919,6 +961,8 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
919 idx = log_next(idx); 961 idx = log_next(idx);
920 seq++; 962 seq++;
921 } 963 }
964
965 /* move first record forward until length fits into the buffer */
922 seq = clear_seq; 966 seq = clear_seq;
923 idx = clear_idx; 967 idx = clear_idx;
924 while (len > size && seq < log_next_seq) { 968 while (len > size && seq < log_next_seq) {
@@ -929,7 +973,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
929 seq++; 973 seq++;
930 } 974 }
931 975
932 /* last message in this dump */ 976 /* last message fitting into this dump */
933 next_seq = log_next_seq; 977 next_seq = log_next_seq;
934 978
935 len = 0; 979 len = 0;
@@ -974,6 +1018,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
974{ 1018{
975 bool clear = false; 1019 bool clear = false;
976 static int saved_console_loglevel = -1; 1020 static int saved_console_loglevel = -1;
1021 static DEFINE_MUTEX(syslog_mutex);
977 int error; 1022 int error;
978 1023
979 error = check_syslog_permissions(type, from_file); 1024 error = check_syslog_permissions(type, from_file);
@@ -1000,11 +1045,17 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1000 error = -EFAULT; 1045 error = -EFAULT;
1001 goto out; 1046 goto out;
1002 } 1047 }
1048 error = mutex_lock_interruptible(&syslog_mutex);
1049 if (error)
1050 goto out;
1003 error = wait_event_interruptible(log_wait, 1051 error = wait_event_interruptible(log_wait,
1004 syslog_seq != log_next_seq); 1052 syslog_seq != log_next_seq);
1005 if (error) 1053 if (error) {
1054 mutex_unlock(&syslog_mutex);
1006 goto out; 1055 goto out;
1056 }
1007 error = syslog_print(buf, len); 1057 error = syslog_print(buf, len);
1058 mutex_unlock(&syslog_mutex);
1008 break; 1059 break;
1009 /* Read/clear last kernel messages */ 1060 /* Read/clear last kernel messages */
1010 case SYSLOG_ACTION_READ_CLEAR: 1061 case SYSLOG_ACTION_READ_CLEAR:
@@ -1027,6 +1078,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1027 /* Clear ring buffer */ 1078 /* Clear ring buffer */
1028 case SYSLOG_ACTION_CLEAR: 1079 case SYSLOG_ACTION_CLEAR:
1029 syslog_print_all(NULL, 0, true); 1080 syslog_print_all(NULL, 0, true);
1081 break;
1030 /* Disable logging to console */ 1082 /* Disable logging to console */
1031 case SYSLOG_ACTION_CONSOLE_OFF: 1083 case SYSLOG_ACTION_CONSOLE_OFF:
1032 if (saved_console_loglevel == -1) 1084 if (saved_console_loglevel == -1)
@@ -1259,15 +1311,92 @@ static inline void printk_delay(void)
1259 } 1311 }
1260} 1312}
1261 1313
1314/*
1315 * Continuation lines are buffered, and not committed to the record buffer
1316 * until the line is complete, or a race forces it. The line fragments
1317 * though, are printed immediately to the consoles to ensure everything has
1318 * reached the console in case of a kernel crash.
1319 */
1320static struct cont {
1321 char buf[LOG_LINE_MAX];
1322 size_t len; /* length == 0 means unused buffer */
1323 size_t cons; /* bytes written to console */
 1324 struct task_struct *owner; /* task of first print */
1325 u64 ts_nsec; /* time of first print */
1326 u8 level; /* log level of first message */
 1327 u8 facility; /* log facility of first message */
1328 bool flushed:1; /* buffer sealed and committed */
1329} cont;
1330
1331static void cont_flush(void)
1332{
1333 if (cont.flushed)
1334 return;
1335 if (cont.len == 0)
1336 return;
1337
1338 log_store(cont.facility, cont.level, LOG_NOCONS, cont.ts_nsec,
1339 NULL, 0, cont.buf, cont.len);
1340
1341 cont.flushed = true;
1342}
1343
1344static bool cont_add(int facility, int level, const char *text, size_t len)
1345{
1346 if (cont.len && cont.flushed)
1347 return false;
1348
1349 if (cont.len + len > sizeof(cont.buf)) {
1350 cont_flush();
1351 return false;
1352 }
1353
1354 if (!cont.len) {
1355 cont.facility = facility;
1356 cont.level = level;
1357 cont.owner = current;
1358 cont.ts_nsec = local_clock();
1359 cont.cons = 0;
1360 cont.flushed = false;
1361 }
1362
1363 memcpy(cont.buf + cont.len, text, len);
1364 cont.len += len;
1365 return true;
1366}
1367
1368static size_t cont_print_text(char *text, size_t size)
1369{
1370 size_t textlen = 0;
1371 size_t len;
1372
1373 if (cont.cons == 0) {
1374 textlen += print_time(cont.ts_nsec, text);
1375 size -= textlen;
1376 }
1377
1378 len = cont.len - cont.cons;
1379 if (len > 0) {
1380 if (len+1 > size)
1381 len = size-1;
1382 memcpy(text + textlen, cont.buf + cont.cons, len);
1383 textlen += len;
1384 cont.cons = cont.len;
1385 }
1386
1387 if (cont.flushed) {
1388 text[textlen++] = '\n';
1389 /* got everything, release buffer */
1390 cont.len = 0;
1391 }
1392 return textlen;
1393}
1394
1262asmlinkage int vprintk_emit(int facility, int level, 1395asmlinkage int vprintk_emit(int facility, int level,
1263 const char *dict, size_t dictlen, 1396 const char *dict, size_t dictlen,
1264 const char *fmt, va_list args) 1397 const char *fmt, va_list args)
1265{ 1398{
1266 static int recursion_bug; 1399 static int recursion_bug;
1267 static char cont_buf[LOG_LINE_MAX];
1268 static size_t cont_len;
1269 static int cont_level;
1270 static struct task_struct *cont_task;
1271 static char textbuf[LOG_LINE_MAX]; 1400 static char textbuf[LOG_LINE_MAX];
1272 char *text = textbuf; 1401 char *text = textbuf;
1273 size_t text_len; 1402 size_t text_len;
@@ -1313,7 +1442,8 @@ asmlinkage int vprintk_emit(int facility, int level,
1313 recursion_bug = 0; 1442 recursion_bug = 0;
1314 printed_len += strlen(recursion_msg); 1443 printed_len += strlen(recursion_msg);
1315 /* emit KERN_CRIT message */ 1444 /* emit KERN_CRIT message */
1316 log_store(0, 2, NULL, 0, recursion_msg, printed_len); 1445 log_store(0, 2, LOG_DEFAULT, 0,
1446 NULL, 0, recursion_msg, printed_len);
1317 } 1447 }
1318 1448
1319 /* 1449 /*
@@ -1351,55 +1481,37 @@ asmlinkage int vprintk_emit(int facility, int level,
1351 } 1481 }
1352 1482
1353 if (!newline) { 1483 if (!newline) {
1354 if (cont_len && (prefix || cont_task != current)) { 1484 /*
1355 /* 1485 * Flush the conflicting buffer. An earlier newline was missing,
1356 * Flush earlier buffer, which is either from a 1486 * or another task also prints continuation lines.
1357 * different thread, or when we got a new prefix. 1487 */
1358 */ 1488 if (cont.len && (prefix || cont.owner != current))
1359 log_store(facility, cont_level, NULL, 0, cont_buf, cont_len); 1489 cont_flush();
1360 cont_len = 0;
1361 }
1362
1363 if (!cont_len) {
1364 cont_level = level;
1365 cont_task = current;
1366 }
1367 1490
1368 /* buffer or append to earlier buffer from the same thread */ 1491 /* buffer line if possible, otherwise store it right away */
1369 if (cont_len + text_len > sizeof(cont_buf)) 1492 if (!cont_add(facility, level, text, text_len))
1370 text_len = sizeof(cont_buf) - cont_len; 1493 log_store(facility, level, LOG_DEFAULT, 0,
1371 memcpy(cont_buf + cont_len, text, text_len); 1494 dict, dictlen, text, text_len);
1372 cont_len += text_len;
1373 } else { 1495 } else {
1374 if (cont_len && cont_task == current) { 1496 bool stored = false;
1375 if (prefix) {
1376 /*
1377 * New prefix from the same thread; flush. We
1378 * either got no earlier newline, or we race
1379 * with an interrupt.
1380 */
1381 log_store(facility, cont_level,
1382 NULL, 0, cont_buf, cont_len);
1383 cont_len = 0;
1384 }
1385 1497
1386 /* append to the earlier buffer and flush */ 1498 /*
1387 if (cont_len + text_len > sizeof(cont_buf)) 1499 * If an earlier newline was missing and it was the same task,
1388 text_len = sizeof(cont_buf) - cont_len; 1500 * either merge it with the current buffer and flush, or if
1389 memcpy(cont_buf + cont_len, text, text_len); 1501 * there was a race with interrupts (prefix == true) then just
1390 cont_len += text_len; 1502 * flush it out and store this line separately.
1391 log_store(facility, cont_level, 1503 */
1392 NULL, 0, cont_buf, cont_len); 1504 if (cont.len && cont.owner == current) {
1393 cont_len = 0; 1505 if (!prefix)
1394 cont_task = NULL; 1506 stored = cont_add(facility, level, text, text_len);
1395 printed_len = cont_len; 1507 cont_flush();
1396 } else {
1397 /* ordinary single and terminated line */
1398 log_store(facility, level,
1399 dict, dictlen, text, text_len);
1400 printed_len = text_len;
1401 } 1508 }
1509
1510 if (!stored)
1511 log_store(facility, level, LOG_DEFAULT, 0,
1512 dict, dictlen, text, text_len);
1402 } 1513 }
1514 printed_len += text_len;
1403 1515
1404 /* 1516 /*
1405 * Try to acquire and then immediately release the console semaphore. 1517 * Try to acquire and then immediately release the console semaphore.
@@ -1486,11 +1598,18 @@ EXPORT_SYMBOL(printk);
1486#else 1598#else
1487 1599
1488#define LOG_LINE_MAX 0 1600#define LOG_LINE_MAX 0
1601static struct cont {
1602 size_t len;
1603 size_t cons;
1604 u8 level;
1605 bool flushed:1;
1606} cont;
1489static struct log *log_from_idx(u32 idx) { return NULL; } 1607static struct log *log_from_idx(u32 idx) { return NULL; }
1490static u32 log_next(u32 idx) { return 0; } 1608static u32 log_next(u32 idx) { return 0; }
1491static void call_console_drivers(int level, const char *text, size_t len) {} 1609static void call_console_drivers(int level, const char *text, size_t len) {}
1492static size_t msg_print_text(const struct log *msg, bool syslog, 1610static size_t msg_print_text(const struct log *msg, bool syslog,
1493 char *buf, size_t size) { return 0; } 1611 char *buf, size_t size) { return 0; }
1612static size_t cont_print_text(char *text, size_t size) { return 0; }
1494 1613
1495#endif /* CONFIG_PRINTK */ 1614#endif /* CONFIG_PRINTK */
1496 1615
@@ -1782,6 +1901,7 @@ static u32 console_idx;
1782 */ 1901 */
1783void console_unlock(void) 1902void console_unlock(void)
1784{ 1903{
1904 static char text[LOG_LINE_MAX];
1785 static u64 seen_seq; 1905 static u64 seen_seq;
1786 unsigned long flags; 1906 unsigned long flags;
1787 bool wake_klogd = false; 1907 bool wake_klogd = false;
@@ -1794,10 +1914,23 @@ void console_unlock(void)
1794 1914
1795 console_may_schedule = 0; 1915 console_may_schedule = 0;
1796 1916
1917 /* flush buffered message fragment immediately to console */
1918 raw_spin_lock_irqsave(&logbuf_lock, flags);
1919 if (cont.len && (cont.cons < cont.len || cont.flushed)) {
1920 size_t len;
1921
1922 len = cont_print_text(text, sizeof(text));
1923 raw_spin_unlock(&logbuf_lock);
1924 stop_critical_timings();
1925 call_console_drivers(cont.level, text, len);
1926 start_critical_timings();
1927 local_irq_restore(flags);
1928 } else
1929 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1930
1797again: 1931again:
1798 for (;;) { 1932 for (;;) {
1799 struct log *msg; 1933 struct log *msg;
1800 static char text[LOG_LINE_MAX];
1801 size_t len; 1934 size_t len;
1802 int level; 1935 int level;
1803 1936
@@ -1812,13 +1945,22 @@ again:
1812 console_seq = log_first_seq; 1945 console_seq = log_first_seq;
1813 console_idx = log_first_idx; 1946 console_idx = log_first_idx;
1814 } 1947 }
1815 1948skip:
1816 if (console_seq == log_next_seq) 1949 if (console_seq == log_next_seq)
1817 break; 1950 break;
1818 1951
1819 msg = log_from_idx(console_idx); 1952 msg = log_from_idx(console_idx);
1820 level = msg->level & 7; 1953 if (msg->flags & LOG_NOCONS) {
1954 /*
1955 * Skip record we have buffered and already printed
1956 * directly to the console when we received it.
1957 */
1958 console_idx = log_next(console_idx);
1959 console_seq++;
1960 goto skip;
1961 }
1821 1962
1963 level = msg->level;
1822 len = msg_print_text(msg, false, text, sizeof(text)); 1964 len = msg_print_text(msg, false, text, sizeof(text));
1823 1965
1824 console_idx = log_next(console_idx); 1966 console_idx = log_next(console_idx);
@@ -2300,48 +2442,210 @@ module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);
2300 * kmsg_dump - dump kernel log to kernel message dumpers. 2442 * kmsg_dump - dump kernel log to kernel message dumpers.
2301 * @reason: the reason (oops, panic etc) for dumping 2443 * @reason: the reason (oops, panic etc) for dumping
2302 * 2444 *
 2303 * Iterate through each of the dump devices and call the oops/panic 2445 * Call each registered dumper's dump() callback, which can
2304 * callbacks with the log buffer. 2446 * retrieve the kmsg records with kmsg_dump_get_line() or
2447 * kmsg_dump_get_buffer().
2305 */ 2448 */
2306void kmsg_dump(enum kmsg_dump_reason reason) 2449void kmsg_dump(enum kmsg_dump_reason reason)
2307{ 2450{
2308 u64 idx;
2309 struct kmsg_dumper *dumper; 2451 struct kmsg_dumper *dumper;
2310 const char *s1, *s2;
2311 unsigned long l1, l2;
2312 unsigned long flags; 2452 unsigned long flags;
2313 2453
2314 if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) 2454 if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump)
2315 return; 2455 return;
2316 2456
2317 /* Theoretically, the log could move on after we do this, but 2457 rcu_read_lock();
2318 there's not a lot we can do about that. The new messages 2458 list_for_each_entry_rcu(dumper, &dump_list, list) {
2319 will overwrite the start of what we dump. */ 2459 if (dumper->max_reason && reason > dumper->max_reason)
2460 continue;
2461
2462 /* initialize iterator with data about the stored records */
2463 dumper->active = true;
2464
2465 raw_spin_lock_irqsave(&logbuf_lock, flags);
2466 dumper->cur_seq = clear_seq;
2467 dumper->cur_idx = clear_idx;
2468 dumper->next_seq = log_next_seq;
2469 dumper->next_idx = log_next_idx;
2470 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2471
2472 /* invoke dumper which will iterate over records */
2473 dumper->dump(dumper, reason);
2474
2475 /* reset iterator */
2476 dumper->active = false;
2477 }
2478 rcu_read_unlock();
2479}
2480
2481/**
2482 * kmsg_dump_get_line - retrieve one kmsg log line
2483 * @dumper: registered kmsg dumper
2484 * @syslog: include the "<4>" prefixes
2485 * @line: buffer to copy the line to
2486 * @size: maximum size of the buffer
2487 * @len: length of line placed into buffer
2488 *
2489 * Start at the beginning of the kmsg buffer, with the oldest kmsg
2490 * record, and copy one record into the provided buffer.
2491 *
2492 * Consecutive calls will return the next available record moving
2493 * towards the end of the buffer with the youngest messages.
2494 *
2495 * A return value of FALSE indicates that there are no more records to
2496 * read.
2497 */
2498bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
2499 char *line, size_t size, size_t *len)
2500{
2501 unsigned long flags;
2502 struct log *msg;
2503 size_t l = 0;
2504 bool ret = false;
2505
2506 if (!dumper->active)
2507 goto out;
2320 2508
2321 raw_spin_lock_irqsave(&logbuf_lock, flags); 2509 raw_spin_lock_irqsave(&logbuf_lock, flags);
2322 if (syslog_seq < log_first_seq) 2510 if (dumper->cur_seq < log_first_seq) {
2323 idx = syslog_idx; 2511 /* messages are gone, move to first available one */
2324 else 2512 dumper->cur_seq = log_first_seq;
2325 idx = log_first_idx; 2513 dumper->cur_idx = log_first_idx;
2514 }
2326 2515
2327 if (idx > log_next_idx) { 2516 /* last entry */
2328 s1 = log_buf; 2517 if (dumper->cur_seq >= log_next_seq) {
2329 l1 = log_next_idx; 2518 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2519 goto out;
2520 }
2330 2521
2331 s2 = log_buf + idx; 2522 msg = log_from_idx(dumper->cur_idx);
2332 l2 = log_buf_len - idx; 2523 l = msg_print_text(msg, syslog,
2333 } else { 2524 line, size);
2334 s1 = "";
2335 l1 = 0;
2336 2525
2337 s2 = log_buf + idx; 2526 dumper->cur_idx = log_next(dumper->cur_idx);
2338 l2 = log_next_idx - idx; 2527 dumper->cur_seq++;
2528 ret = true;
2529 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2530out:
2531 if (len)
2532 *len = l;
2533 return ret;
2534}
2535EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
2536
2537/**
2538 * kmsg_dump_get_buffer - copy kmsg log lines
2539 * @dumper: registered kmsg dumper
2540 * @syslog: include the "<4>" prefixes
2541 * @buf: buffer to copy the line to
2542 * @size: maximum size of the buffer
2543 * @len: length of line placed into buffer
2544 *
2545 * Start at the end of the kmsg buffer and fill the provided buffer
 2546 * with as many of the *youngest* kmsg records as fit into it.
2547 * If the buffer is large enough, all available kmsg records will be
2548 * copied with a single call.
2549 *
2550 * Consecutive calls will fill the buffer with the next block of
2551 * available older records, not including the earlier retrieved ones.
2552 *
2553 * A return value of FALSE indicates that there are no more records to
2554 * read.
2555 */
2556bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
2557 char *buf, size_t size, size_t *len)
2558{
2559 unsigned long flags;
2560 u64 seq;
2561 u32 idx;
2562 u64 next_seq;
2563 u32 next_idx;
2564 size_t l = 0;
2565 bool ret = false;
2566
2567 if (!dumper->active)
2568 goto out;
2569
2570 raw_spin_lock_irqsave(&logbuf_lock, flags);
2571 if (dumper->cur_seq < log_first_seq) {
2572 /* messages are gone, move to first available one */
2573 dumper->cur_seq = log_first_seq;
2574 dumper->cur_idx = log_first_idx;
2339 } 2575 }
2576
2577 /* last entry */
2578 if (dumper->cur_seq >= dumper->next_seq) {
2579 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2580 goto out;
2581 }
2582
2583 /* calculate length of entire buffer */
2584 seq = dumper->cur_seq;
2585 idx = dumper->cur_idx;
2586 while (seq < dumper->next_seq) {
2587 struct log *msg = log_from_idx(idx);
2588
2589 l += msg_print_text(msg, true, NULL, 0);
2590 idx = log_next(idx);
2591 seq++;
2592 }
2593
2594 /* move first record forward until length fits into the buffer */
2595 seq = dumper->cur_seq;
2596 idx = dumper->cur_idx;
2597 while (l > size && seq < dumper->next_seq) {
2598 struct log *msg = log_from_idx(idx);
2599
2600 l -= msg_print_text(msg, true, NULL, 0);
2601 idx = log_next(idx);
2602 seq++;
2603 }
2604
 2605 /* last message in the next iteration */
2606 next_seq = seq;
2607 next_idx = idx;
2608
2609 l = 0;
2610 while (seq < dumper->next_seq) {
2611 struct log *msg = log_from_idx(idx);
2612
2613 l += msg_print_text(msg, syslog,
2614 buf + l, size - l);
2615
2616 idx = log_next(idx);
2617 seq++;
2618 }
2619
2620 dumper->next_seq = next_seq;
2621 dumper->next_idx = next_idx;
2622 ret = true;
2340 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 2623 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2624out:
2625 if (len)
2626 *len = l;
2627 return ret;
2628}
2629EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
2341 2630
2342 rcu_read_lock(); 2631/**
 2343 list_for_each_entry_rcu(dumper, &dump_list, list) 2632 * kmsg_dump_rewind - reset the iterator
2344 dumper->dump(dumper, reason, s1, l1, s2, l2); 2633 * @dumper: registered kmsg dumper
2345 rcu_read_unlock(); 2634 *
2635 * Reset the dumper's iterator so that kmsg_dump_get_line() and
2636 * kmsg_dump_get_buffer() can be called again and used multiple
2637 * times within the same dumper.dump() callback.
2638 */
2639void kmsg_dump_rewind(struct kmsg_dumper *dumper)
2640{
2641 unsigned long flags;
2642
2643 raw_spin_lock_irqsave(&logbuf_lock, flags);
2644 dumper->cur_seq = clear_seq;
2645 dumper->cur_idx = clear_idx;
2646 dumper->next_seq = log_next_seq;
2647 dumper->next_idx = log_next_idx;
2648 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2346} 2649}
2650EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
2347#endif 2651#endif
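The printk.c rework replaces the old kmsg_dump() scheme, which handed every dumper two raw chunks of the ring buffer, with a per-dumper iterator: kmsg_dump() seeds cur_seq/cur_idx and next_seq/next_idx, and the dumper walks the records itself via kmsg_dump_get_line() or kmsg_dump_get_buffer(). A sketch of a dumper written against the new interface; the module and its flash back end are placeholders, not an existing driver:

#include <linux/kmsg_dump.h>
#include <linux/module.h>

static void my_write_to_flash(const char *buf, size_t len)
{
        /* placeholder back end */
}

static void my_dump(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason)
{
        static char line[1024];
        size_t len;

        /* oldest to youngest, one record per call */
        while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len))
                my_write_to_flash(line, len);
}

static struct kmsg_dumper my_dumper = {
        .dump       = my_dump,
        .max_reason = KMSG_DUMP_PANIC,  /* filter honoured by the new kmsg_dump() loop above */
};

static int __init my_init(void)
{
        return kmsg_dump_register(&my_dumper);
}
module_init(my_init);
MODULE_LICENSE("GPL");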
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 0da7b88d92d0..38ecdda3f55f 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1397,6 +1397,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1397 rdp->qlen_lazy += rsp->qlen_lazy; 1397 rdp->qlen_lazy += rsp->qlen_lazy;
1398 rdp->qlen += rsp->qlen; 1398 rdp->qlen += rsp->qlen;
1399 rdp->n_cbs_adopted += rsp->qlen; 1399 rdp->n_cbs_adopted += rsp->qlen;
1400 if (rsp->qlen_lazy != rsp->qlen)
1401 rcu_idle_count_callbacks_posted();
1400 rsp->qlen_lazy = 0; 1402 rsp->qlen_lazy = 0;
1401 rsp->qlen = 0; 1403 rsp->qlen = 0;
1402 1404
@@ -1528,7 +1530,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1528{ 1530{
1529 unsigned long flags; 1531 unsigned long flags;
1530 struct rcu_head *next, *list, **tail; 1532 struct rcu_head *next, *list, **tail;
1531 int bl, count, count_lazy; 1533 int bl, count, count_lazy, i;
1532 1534
1533 /* If no callbacks are ready, just return.*/ 1535 /* If no callbacks are ready, just return.*/
1534 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 1536 if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
@@ -1551,9 +1553,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1551 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; 1553 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
1552 *rdp->nxttail[RCU_DONE_TAIL] = NULL; 1554 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
1553 tail = rdp->nxttail[RCU_DONE_TAIL]; 1555 tail = rdp->nxttail[RCU_DONE_TAIL];
1554 for (count = RCU_NEXT_SIZE - 1; count >= 0; count--) 1556 for (i = RCU_NEXT_SIZE - 1; i >= 0; i--)
1555 if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL]) 1557 if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
1556 rdp->nxttail[count] = &rdp->nxtlist; 1558 rdp->nxttail[i] = &rdp->nxtlist;
1557 local_irq_restore(flags); 1559 local_irq_restore(flags);
1558 1560
1559 /* Invoke callbacks. */ 1561 /* Invoke callbacks. */
@@ -1581,9 +1583,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1581 if (list != NULL) { 1583 if (list != NULL) {
1582 *tail = rdp->nxtlist; 1584 *tail = rdp->nxtlist;
1583 rdp->nxtlist = list; 1585 rdp->nxtlist = list;
1584 for (count = 0; count < RCU_NEXT_SIZE; count++) 1586 for (i = 0; i < RCU_NEXT_SIZE; i++)
1585 if (&rdp->nxtlist == rdp->nxttail[count]) 1587 if (&rdp->nxtlist == rdp->nxttail[i])
1586 rdp->nxttail[count] = tail; 1588 rdp->nxttail[i] = tail;
1587 else 1589 else
1588 break; 1590 break;
1589 } 1591 }
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 7f5d138dedf5..ea056495783e 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -84,6 +84,20 @@ struct rcu_dynticks {
84 /* Process level is worth LLONG_MAX/2. */ 84 /* Process level is worth LLONG_MAX/2. */
85 int dynticks_nmi_nesting; /* Track NMI nesting level. */ 85 int dynticks_nmi_nesting; /* Track NMI nesting level. */
86 atomic_t dynticks; /* Even value for idle, else odd. */ 86 atomic_t dynticks; /* Even value for idle, else odd. */
87#ifdef CONFIG_RCU_FAST_NO_HZ
88 int dyntick_drain; /* Prepare-for-idle state variable. */
89 unsigned long dyntick_holdoff;
90 /* No retries for the jiffy of failure. */
91 struct timer_list idle_gp_timer;
92 /* Wake up CPU sleeping with callbacks. */
93 unsigned long idle_gp_timer_expires;
94 /* When to wake up CPU (for repost). */
95 bool idle_first_pass; /* First pass of attempt to go idle? */
96 unsigned long nonlazy_posted;
97 /* # times non-lazy CBs posted to CPU. */
98 unsigned long nonlazy_posted_snap;
99 /* idle-period nonlazy_posted snapshot. */
100#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
87}; 101};
88 102
89/* RCU's kthread states for tracing. */ 103/* RCU's kthread states for tracing. */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 2411000d9869..5271a020887e 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1886,8 +1886,9 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
 1886 * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs 1886 * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs
1887 * any flavor of RCU. 1887 * any flavor of RCU.
1888 */ 1888 */
1889int rcu_needs_cpu(int cpu) 1889int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
1890{ 1890{
1891 *delta_jiffies = ULONG_MAX;
1891 return rcu_cpu_has_callbacks(cpu); 1892 return rcu_cpu_has_callbacks(cpu);
1892} 1893}
1893 1894
@@ -1962,41 +1963,6 @@ static void rcu_idle_count_callbacks_posted(void)
1962#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ 1963#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */
1963#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ 1964#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
1964 1965
1965/* Loop counter for rcu_prepare_for_idle(). */
1966static DEFINE_PER_CPU(int, rcu_dyntick_drain);
1967/* If rcu_dyntick_holdoff==jiffies, don't try to enter dyntick-idle mode. */
1968static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
1969/* Timer to awaken the CPU if it enters dyntick-idle mode with callbacks. */
1970static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer);
1971/* Scheduled expiry time for rcu_idle_gp_timer to allow reposting. */
1972static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires);
1973/* Enable special processing on first attempt to enter dyntick-idle mode. */
1974static DEFINE_PER_CPU(bool, rcu_idle_first_pass);
1975/* Running count of non-lazy callbacks posted, never decremented. */
1976static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted);
1977/* Snapshot of rcu_nonlazy_posted to detect meaningful exits from idle. */
1978static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap);
1979
1980/*
1981 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
1982 * callbacks on this CPU, (2) this CPU has not yet attempted to enter
1983 * dyntick-idle mode, or (3) this CPU is in the process of attempting to
1984 * enter dyntick-idle mode. Otherwise, if we have recently tried and failed
1985 * to enter dyntick-idle mode, we refuse to try to enter it. After all,
1986 * it is better to incur scheduling-clock interrupts than to spin
1987 * continuously for the same time duration!
1988 */
1989int rcu_needs_cpu(int cpu)
1990{
1991 /* Flag a new idle sojourn to the idle-entry state machine. */
1992 per_cpu(rcu_idle_first_pass, cpu) = 1;
1993 /* If no callbacks, RCU doesn't need the CPU. */
1994 if (!rcu_cpu_has_callbacks(cpu))
1995 return 0;
1996 /* Otherwise, RCU needs the CPU only if it recently tried and failed. */
1997 return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies;
1998}
1999
2000/* 1966/*
2001 * Does the specified flavor of RCU have non-lazy callbacks pending on 1967 * Does the specified flavor of RCU have non-lazy callbacks pending on
2002 * the specified CPU? Both RCU flavor and CPU are specified by the 1968 * the specified CPU? Both RCU flavor and CPU are specified by the
@@ -2040,6 +2006,47 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu)
2040} 2006}
2041 2007
2042/* 2008/*
2009 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
2010 * callbacks on this CPU, (2) this CPU has not yet attempted to enter
2011 * dyntick-idle mode, or (3) this CPU is in the process of attempting to
2012 * enter dyntick-idle mode. Otherwise, if we have recently tried and failed
2013 * to enter dyntick-idle mode, we refuse to try to enter it. After all,
2014 * it is better to incur scheduling-clock interrupts than to spin
2015 * continuously for the same time duration!
2016 *
2017 * The delta_jiffies argument is used to store the time when RCU is
2018 * going to need the CPU again if it still has callbacks. The reason
2019 * for this is that rcu_prepare_for_idle() might need to post a timer,
2020 * but if so, it will do so after tick_nohz_stop_sched_tick() has set
2021 * the wakeup time for this CPU. This means that RCU's timer can be
2022 * delayed until the wakeup time, which defeats the purpose of posting
2023 * a timer.
2024 */
2025int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
2026{
2027 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2028
2029 /* Flag a new idle sojourn to the idle-entry state machine. */
2030 rdtp->idle_first_pass = 1;
2031 /* If no callbacks, RCU doesn't need the CPU. */
2032 if (!rcu_cpu_has_callbacks(cpu)) {
2033 *delta_jiffies = ULONG_MAX;
2034 return 0;
2035 }
2036 if (rdtp->dyntick_holdoff == jiffies) {
2037 /* RCU recently tried and failed, so don't try again. */
2038 *delta_jiffies = 1;
2039 return 1;
2040 }
2041 /* Set up for the possibility that RCU will post a timer. */
2042 if (rcu_cpu_has_nonlazy_callbacks(cpu))
2043 *delta_jiffies = RCU_IDLE_GP_DELAY;
2044 else
2045 *delta_jiffies = RCU_IDLE_LAZY_GP_DELAY;
2046 return 0;
2047}
2048
2049/*
2043 * Handler for smp_call_function_single(). The only point of this 2050 * Handler for smp_call_function_single(). The only point of this
2044 * handler is to wake the CPU up, so the handler does only tracing. 2051 * handler is to wake the CPU up, so the handler does only tracing.
2045 */ 2052 */
@@ -2075,21 +2082,24 @@ static void rcu_idle_gp_timer_func(unsigned long cpu_in)
2075 */ 2082 */
2076static void rcu_prepare_for_idle_init(int cpu) 2083static void rcu_prepare_for_idle_init(int cpu)
2077{ 2084{
2078 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 2085 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2079 setup_timer(&per_cpu(rcu_idle_gp_timer, cpu), 2086
2080 rcu_idle_gp_timer_func, cpu); 2087 rdtp->dyntick_holdoff = jiffies - 1;
2081 per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies - 1; 2088 setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu);
2082 per_cpu(rcu_idle_first_pass, cpu) = 1; 2089 rdtp->idle_gp_timer_expires = jiffies - 1;
2090 rdtp->idle_first_pass = 1;
2083} 2091}
2084 2092
2085/* 2093/*
2086 * Clean up for exit from idle. Because we are exiting from idle, there 2094 * Clean up for exit from idle. Because we are exiting from idle, there
2087 * is no longer any point to rcu_idle_gp_timer, so cancel it. This will 2095 * is no longer any point to ->idle_gp_timer, so cancel it. This will
2088 * do nothing if this timer is not active, so just cancel it unconditionally. 2096 * do nothing if this timer is not active, so just cancel it unconditionally.
2089 */ 2097 */
2090static void rcu_cleanup_after_idle(int cpu) 2098static void rcu_cleanup_after_idle(int cpu)
2091{ 2099{
2092 del_timer(&per_cpu(rcu_idle_gp_timer, cpu)); 2100 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2101
2102 del_timer(&rdtp->idle_gp_timer);
2093 trace_rcu_prep_idle("Cleanup after idle"); 2103 trace_rcu_prep_idle("Cleanup after idle");
2094} 2104}
2095 2105
@@ -2108,42 +2118,41 @@ static void rcu_cleanup_after_idle(int cpu)
2108 * Because it is not legal to invoke rcu_process_callbacks() with irqs 2118 * Because it is not legal to invoke rcu_process_callbacks() with irqs
2109 * disabled, we do one pass of force_quiescent_state(), then do a 2119 * disabled, we do one pass of force_quiescent_state(), then do a
2110 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked 2120 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
2111 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. 2121 * later. The ->dyntick_drain field controls the sequencing.
2112 * 2122 *
2113 * The caller must have disabled interrupts. 2123 * The caller must have disabled interrupts.
2114 */ 2124 */
2115static void rcu_prepare_for_idle(int cpu) 2125static void rcu_prepare_for_idle(int cpu)
2116{ 2126{
2117 struct timer_list *tp; 2127 struct timer_list *tp;
2128 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2118 2129
2119 /* 2130 /*
2120 * If this is an idle re-entry, for example, due to use of 2131 * If this is an idle re-entry, for example, due to use of
2121 * RCU_NONIDLE() or the new idle-loop tracing API within the idle 2132 * RCU_NONIDLE() or the new idle-loop tracing API within the idle
2122 * loop, then don't take any state-machine actions, unless the 2133 * loop, then don't take any state-machine actions, unless the
2123 * momentary exit from idle queued additional non-lazy callbacks. 2134 * momentary exit from idle queued additional non-lazy callbacks.
2124 * Instead, repost the rcu_idle_gp_timer if this CPU has callbacks 2135 * Instead, repost the ->idle_gp_timer if this CPU has callbacks
2125 * pending. 2136 * pending.
2126 */ 2137 */
2127 if (!per_cpu(rcu_idle_first_pass, cpu) && 2138 if (!rdtp->idle_first_pass &&
2128 (per_cpu(rcu_nonlazy_posted, cpu) == 2139 (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) {
2129 per_cpu(rcu_nonlazy_posted_snap, cpu))) {
2130 if (rcu_cpu_has_callbacks(cpu)) { 2140 if (rcu_cpu_has_callbacks(cpu)) {
2131 tp = &per_cpu(rcu_idle_gp_timer, cpu); 2141 tp = &rdtp->idle_gp_timer;
2132 mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); 2142 mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
2133 } 2143 }
2134 return; 2144 return;
2135 } 2145 }
2136 per_cpu(rcu_idle_first_pass, cpu) = 0; 2146 rdtp->idle_first_pass = 0;
2137 per_cpu(rcu_nonlazy_posted_snap, cpu) = 2147 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1;
2138 per_cpu(rcu_nonlazy_posted, cpu) - 1;
2139 2148
2140 /* 2149 /*
2141 * If there are no callbacks on this CPU, enter dyntick-idle mode. 2150 * If there are no callbacks on this CPU, enter dyntick-idle mode.
2142 * Also reset state to avoid prejudicing later attempts. 2151 * Also reset state to avoid prejudicing later attempts.
2143 */ 2152 */
2144 if (!rcu_cpu_has_callbacks(cpu)) { 2153 if (!rcu_cpu_has_callbacks(cpu)) {
2145 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 2154 rdtp->dyntick_holdoff = jiffies - 1;
2146 per_cpu(rcu_dyntick_drain, cpu) = 0; 2155 rdtp->dyntick_drain = 0;
2147 trace_rcu_prep_idle("No callbacks"); 2156 trace_rcu_prep_idle("No callbacks");
2148 return; 2157 return;
2149 } 2158 }
@@ -2152,36 +2161,37 @@ static void rcu_prepare_for_idle(int cpu)
2152 * If in holdoff mode, just return. We will presumably have 2161 * If in holdoff mode, just return. We will presumably have
2153 * refrained from disabling the scheduling-clock tick. 2162 * refrained from disabling the scheduling-clock tick.
2154 */ 2163 */
2155 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { 2164 if (rdtp->dyntick_holdoff == jiffies) {
2156 trace_rcu_prep_idle("In holdoff"); 2165 trace_rcu_prep_idle("In holdoff");
2157 return; 2166 return;
2158 } 2167 }
2159 2168
2160 /* Check and update the rcu_dyntick_drain sequencing. */ 2169 /* Check and update the ->dyntick_drain sequencing. */
2161 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2170 if (rdtp->dyntick_drain <= 0) {
2162 /* First time through, initialize the counter. */ 2171 /* First time through, initialize the counter. */
2163 per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES; 2172 rdtp->dyntick_drain = RCU_IDLE_FLUSHES;
2164 } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES && 2173 } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES &&
2165 !rcu_pending(cpu) && 2174 !rcu_pending(cpu) &&
2166 !local_softirq_pending()) { 2175 !local_softirq_pending()) {
2167 /* Can we go dyntick-idle despite still having callbacks? */ 2176 /* Can we go dyntick-idle despite still having callbacks? */
2168 trace_rcu_prep_idle("Dyntick with callbacks"); 2177 rdtp->dyntick_drain = 0;
2169 per_cpu(rcu_dyntick_drain, cpu) = 0; 2178 rdtp->dyntick_holdoff = jiffies;
2170 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; 2179 if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
2171 if (rcu_cpu_has_nonlazy_callbacks(cpu)) 2180 trace_rcu_prep_idle("Dyntick with callbacks");
2172 per_cpu(rcu_idle_gp_timer_expires, cpu) = 2181 rdtp->idle_gp_timer_expires =
2173 jiffies + RCU_IDLE_GP_DELAY; 2182 jiffies + RCU_IDLE_GP_DELAY;
2174 else 2183 } else {
2175 per_cpu(rcu_idle_gp_timer_expires, cpu) = 2184 rdtp->idle_gp_timer_expires =
2176 jiffies + RCU_IDLE_LAZY_GP_DELAY; 2185 jiffies + RCU_IDLE_LAZY_GP_DELAY;
2177 tp = &per_cpu(rcu_idle_gp_timer, cpu); 2186 trace_rcu_prep_idle("Dyntick with lazy callbacks");
2178 mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); 2187 }
2179 per_cpu(rcu_nonlazy_posted_snap, cpu) = 2188 tp = &rdtp->idle_gp_timer;
2180 per_cpu(rcu_nonlazy_posted, cpu); 2189 mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
2190 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
2181 return; /* Nothing more to do immediately. */ 2191 return; /* Nothing more to do immediately. */
2182 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2192 } else if (--(rdtp->dyntick_drain) <= 0) {
2183 /* We have hit the limit, so time to give up. */ 2193 /* We have hit the limit, so time to give up. */
2184 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; 2194 rdtp->dyntick_holdoff = jiffies;
2185 trace_rcu_prep_idle("Begin holdoff"); 2195 trace_rcu_prep_idle("Begin holdoff");
2186 invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ 2196 invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */
2187 return; 2197 return;
@@ -2227,7 +2237,7 @@ static void rcu_prepare_for_idle(int cpu)
2227 */ 2237 */
2228static void rcu_idle_count_callbacks_posted(void) 2238static void rcu_idle_count_callbacks_posted(void)
2229{ 2239{
2230 __this_cpu_add(rcu_nonlazy_posted, 1); 2240 __this_cpu_add(rcu_dynticks.nonlazy_posted, 1);
2231} 2241}
2232 2242
2233#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 2243#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
@@ -2238,11 +2248,12 @@ static void rcu_idle_count_callbacks_posted(void)
2238 2248
2239static void print_cpu_stall_fast_no_hz(char *cp, int cpu) 2249static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2240{ 2250{
2241 struct timer_list *tltp = &per_cpu(rcu_idle_gp_timer, cpu); 2251 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2252 struct timer_list *tltp = &rdtp->idle_gp_timer;
2242 2253
2243 sprintf(cp, "drain=%d %c timer=%lu", 2254 sprintf(cp, "drain=%d %c timer=%lu",
2244 per_cpu(rcu_dyntick_drain, cpu), 2255 rdtp->dyntick_drain,
2245 per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', 2256 rdtp->dyntick_holdoff == jiffies ? 'H' : '.',
2246 timer_pending(tltp) ? tltp->expires - jiffies : -1); 2257 timer_pending(tltp) ? tltp->expires - jiffies : -1);
2247} 2258}
2248 2259
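The rcutree_plugin.h hunks above fold the old rcu_idle_* / rcu_dyntick_* per-CPU variables into fields of struct rcu_dynticks, so rcu_prepare_for_idle() drives its whole drain/holdoff state machine from a single per-CPU structure. Below is a minimal, standalone userspace sketch of that sequencing under simplifying assumptions: the struct fields and the FLUSHES/holdoff idea mirror the patch, but the constants, the enum of outcomes and prepare_for_idle() itself are illustrative stand-ins, and the re-entry (idle_first_pass) and timer-reposting paths are omitted.

#include <stdbool.h>
#include <stdio.h>

#define IDLE_FLUSHES      5   /* stand-in for RCU_IDLE_FLUSHES */
#define IDLE_OPT_FLUSHES  4   /* stand-in for RCU_IDLE_OPT_FLUSHES */

struct dynticks {                     /* models the relevant rcu_dynticks fields */
	int dyntick_drain;
	unsigned long dyntick_holdoff;
};

enum idle_action { GO_DYNTICK, ARM_GP_TIMER, KICK_CORE, HOLDING_OFF, DRAIN_PASS };

/* One pass of the prepare-for-idle decision, simplified from the patched code. */
static enum idle_action prepare_for_idle(struct dynticks *d,
					 unsigned long jiffies,
					 bool has_callbacks, bool quiescent)
{
	if (!has_callbacks) {
		d->dyntick_holdoff = jiffies - 1;   /* reset state, go dyntick-idle */
		d->dyntick_drain = 0;
		return GO_DYNTICK;
	}
	if (d->dyntick_holdoff == jiffies)
		return HOLDING_OFF;                 /* already gave up this jiffy */
	if (d->dyntick_drain <= 0) {
		d->dyntick_drain = IDLE_FLUSHES;    /* first pass: init the counter */
	} else if (d->dyntick_drain <= IDLE_OPT_FLUSHES && quiescent) {
		d->dyntick_drain = 0;               /* dyntick-idle despite callbacks */
		d->dyntick_holdoff = jiffies;
		return ARM_GP_TIMER;                /* the patch arms ->idle_gp_timer here */
	} else if (--d->dyntick_drain <= 0) {
		d->dyntick_holdoff = jiffies;       /* hit the limit: begin holdoff */
		return KICK_CORE;
	}
	return DRAIN_PASS;                          /* keep draining on later passes */
}

int main(void)
{
	struct dynticks d = { 0 };
	unsigned long jiffies = 1000;

	for (int pass = 0; pass < 5; pass++) {
		enum idle_action a = prepare_for_idle(&d, jiffies, true, true);

		printf("pass %d -> action %d, drain=%d\n", pass, a, d.dyntick_drain);
	}
	return 0;
}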
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 39eb6011bc38..d5594a4268d4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -142,9 +142,8 @@ const_debug unsigned int sysctl_sched_features =
142#define SCHED_FEAT(name, enabled) \ 142#define SCHED_FEAT(name, enabled) \
143 #name , 143 #name ,
144 144
145static __read_mostly char *sched_feat_names[] = { 145static const char * const sched_feat_names[] = {
146#include "features.h" 146#include "features.h"
147 NULL
148}; 147};
149 148
150#undef SCHED_FEAT 149#undef SCHED_FEAT
@@ -2517,25 +2516,32 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
2517 sched_avg_update(this_rq); 2516 sched_avg_update(this_rq);
2518} 2517}
2519 2518
2519#ifdef CONFIG_NO_HZ
2520/*
2521 * There is no sane way to deal with nohz on smp when using jiffies because the
2522 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
2523 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
2524 *
2525 * Therefore we cannot use the delta approach from the regular tick since that
2526 * would seriously skew the load calculation. However we'll make do for those
2527 * updates happening while idle (nohz_idle_balance) or coming out of idle
2528 * (tick_nohz_idle_exit).
2529 *
2530 * This means we might still be one tick off for nohz periods.
2531 */
2532
2520/* 2533/*
2521 * Called from nohz_idle_balance() to update the load ratings before doing the 2534 * Called from nohz_idle_balance() to update the load ratings before doing the
2522 * idle balance. 2535 * idle balance.
2523 */ 2536 */
2524void update_idle_cpu_load(struct rq *this_rq) 2537void update_idle_cpu_load(struct rq *this_rq)
2525{ 2538{
2526 unsigned long curr_jiffies = jiffies; 2539 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2527 unsigned long load = this_rq->load.weight; 2540 unsigned long load = this_rq->load.weight;
2528 unsigned long pending_updates; 2541 unsigned long pending_updates;
2529 2542
2530 /* 2543 /*
2531 * Bloody broken means of dealing with nohz, but better than nothing.. 2544 * bail if there's load or we're actually up-to-date.
2532 * jiffies is updated by one cpu, another cpu can drift wrt the jiffy
2533 * update and see 0 difference the one time and 2 the next, even though
2534 * we ticked at roughly the same rate.
2535 *
2536 * Hence we only use this from nohz_idle_balance() and skip this
2537 * nonsense when called from the scheduler_tick() since that's
2538 * guaranteed a stable rate.
2539 */ 2545 */
2540 if (load || curr_jiffies == this_rq->last_load_update_tick) 2546 if (load || curr_jiffies == this_rq->last_load_update_tick)
2541 return; 2547 return;
@@ -2547,12 +2553,38 @@ void update_idle_cpu_load(struct rq *this_rq)
2547} 2553}
2548 2554
2549/* 2555/*
2556 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
2557 */
2558void update_cpu_load_nohz(void)
2559{
2560 struct rq *this_rq = this_rq();
2561 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2562 unsigned long pending_updates;
2563
2564 if (curr_jiffies == this_rq->last_load_update_tick)
2565 return;
2566
2567 raw_spin_lock(&this_rq->lock);
2568 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2569 if (pending_updates) {
2570 this_rq->last_load_update_tick = curr_jiffies;
2571 /*
2572 * We were idle, this means load 0, the current load might be
2573 * !0 due to remote wakeups and the sort.
2574 */
2575 __update_cpu_load(this_rq, 0, pending_updates);
2576 }
2577 raw_spin_unlock(&this_rq->lock);
2578}
2579#endif /* CONFIG_NO_HZ */
2580
2581/*
2550 * Called from scheduler_tick() 2582 * Called from scheduler_tick()
2551 */ 2583 */
2552static void update_cpu_load_active(struct rq *this_rq) 2584static void update_cpu_load_active(struct rq *this_rq)
2553{ 2585{
2554 /* 2586 /*
2555 * See the mess in update_idle_cpu_load(). 2587 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
2556 */ 2588 */
2557 this_rq->last_load_update_tick = jiffies; 2589 this_rq->last_load_update_tick = jiffies;
2558 __update_cpu_load(this_rq, this_rq->load.weight, 1); 2590 __update_cpu_load(this_rq, this_rq->load.weight, 1);
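Both the removed "bloody broken" comment and the new update_cpu_load_nohz() revolve around one idea: sample jiffies exactly once (ACCESS_ONCE) and fold every tick missed while idle into a single __update_cpu_load() call with load 0. A minimal userspace sketch of that pattern follows; the volatile counter is a crude stand-in for jiffies, and rq_model / fold_missed_ticks are illustrative names rather than kernel code.

#include <stdio.h>

static volatile unsigned long jiffies_counter;  /* stands in for jiffies */

struct rq_model {
	unsigned long last_update_tick;
	unsigned long load;
};

/* Fold every tick missed while idle into a single load update. */
static void fold_missed_ticks(struct rq_model *rq)
{
	/* read the counter exactly once so both uses below agree */
	unsigned long now = jiffies_counter;
	unsigned long pending = now - rq->last_update_tick;

	if (!pending)
		return;                         /* already up to date */

	rq->last_update_tick = now;
	/* the CPU was idle, so its contribution over 'pending' ticks is 0 */
	printf("decaying load %lu over %lu missed ticks\n", rq->load, pending);
	rq->load = 0;
}

int main(void)
{
	struct rq_model rq = { .last_update_tick = 0, .load = 42 };

	jiffies_counter = 7;                    /* pretend 7 ticks passed while idle */
	fold_missed_ticks(&rq);
	fold_missed_ticks(&rq);                 /* nothing pending the second time */
	return 0;
}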
@@ -4982,7 +5014,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4982 p->sched_class->set_cpus_allowed(p, new_mask); 5014 p->sched_class->set_cpus_allowed(p, new_mask);
4983 5015
4984 cpumask_copy(&p->cpus_allowed, new_mask); 5016 cpumask_copy(&p->cpus_allowed, new_mask);
4985 p->rt.nr_cpus_allowed = cpumask_weight(new_mask); 5017 p->nr_cpus_allowed = cpumask_weight(new_mask);
4986} 5018}
4987 5019
4988/* 5020/*
@@ -5524,15 +5556,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
5524 5556
5525#ifdef CONFIG_SCHED_DEBUG 5557#ifdef CONFIG_SCHED_DEBUG
5526 5558
5527static __read_mostly int sched_domain_debug_enabled; 5559static __read_mostly int sched_debug_enabled;
5528 5560
5529static int __init sched_domain_debug_setup(char *str) 5561static int __init sched_debug_setup(char *str)
5530{ 5562{
5531 sched_domain_debug_enabled = 1; 5563 sched_debug_enabled = 1;
5532 5564
5533 return 0; 5565 return 0;
5534} 5566}
5535early_param("sched_debug", sched_domain_debug_setup); 5567early_param("sched_debug", sched_debug_setup);
5568
5569static inline bool sched_debug(void)
5570{
5571 return sched_debug_enabled;
5572}
5536 5573
5537static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 5574static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5538 struct cpumask *groupmask) 5575 struct cpumask *groupmask)
@@ -5572,7 +5609,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5572 break; 5609 break;
5573 } 5610 }
5574 5611
5575 if (!group->sgp->power) { 5612 /*
5613 * Even though we initialize ->power to something semi-sane,
5614 * we leave power_orig unset. This allows us to detect if
5615 * domain iteration is still funny without causing /0 traps.
5616 */
5617 if (!group->sgp->power_orig) {
5576 printk(KERN_CONT "\n"); 5618 printk(KERN_CONT "\n");
5577 printk(KERN_ERR "ERROR: domain->cpu_power not " 5619 printk(KERN_ERR "ERROR: domain->cpu_power not "
5578 "set\n"); 5620 "set\n");
@@ -5620,7 +5662,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5620{ 5662{
5621 int level = 0; 5663 int level = 0;
5622 5664
5623 if (!sched_domain_debug_enabled) 5665 if (!sched_debug_enabled)
5624 return; 5666 return;
5625 5667
5626 if (!sd) { 5668 if (!sd) {
@@ -5641,6 +5683,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5641} 5683}
5642#else /* !CONFIG_SCHED_DEBUG */ 5684#else /* !CONFIG_SCHED_DEBUG */
5643# define sched_domain_debug(sd, cpu) do { } while (0) 5685# define sched_domain_debug(sd, cpu) do { } while (0)
5686static inline bool sched_debug(void)
5687{
5688 return false;
5689}
5644#endif /* CONFIG_SCHED_DEBUG */ 5690#endif /* CONFIG_SCHED_DEBUG */
5645 5691
5646static int sd_degenerate(struct sched_domain *sd) 5692static int sd_degenerate(struct sched_domain *sd)
@@ -5962,6 +6008,44 @@ struct sched_domain_topology_level {
5962 struct sd_data data; 6008 struct sd_data data;
5963}; 6009};
5964 6010
6011/*
6012 * Build an iteration mask that can exclude certain CPUs from the upwards
6013 * domain traversal.
6014 *
6015 * Asymmetric node setups can result in situations where the domain tree is of
6016 * unequal depth; make sure to skip domains that already cover the entire
6017 * range.
6018 *
6019 * In that case build_sched_domains() will have terminated the iteration early
6020 * and our sibling sd spans will be empty. Domains should always include the
6021 * cpu they're built on, so check that.
6022 *
6023 */
6024static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
6025{
6026 const struct cpumask *span = sched_domain_span(sd);
6027 struct sd_data *sdd = sd->private;
6028 struct sched_domain *sibling;
6029 int i;
6030
6031 for_each_cpu(i, span) {
6032 sibling = *per_cpu_ptr(sdd->sd, i);
6033 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
6034 continue;
6035
6036 cpumask_set_cpu(i, sched_group_mask(sg));
6037 }
6038}
6039
6040/*
6041 * Return the canonical balance cpu for this group; this is the first cpu
6042 * of this group that's also in the iteration mask.
6043 */
6044int group_balance_cpu(struct sched_group *sg)
6045{
6046 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
6047}
6048
5965static int 6049static int
5966build_overlap_sched_groups(struct sched_domain *sd, int cpu) 6050build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5967{ 6051{
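build_group_mask() and group_balance_cpu() above give each overlapping sched_group an iteration mask so that exactly one CPU per group, the first one present in both the group span and the mask, does the balancing. With plain 64-bit bitmaps the same selection looks like the sketch below; find_first_common is an illustrative stand-in for cpumask_first_and(), and the mask values are invented.

#include <stdint.h>
#include <stdio.h>

/* first set bit common to both masks, or -1 if the intersection is empty */
static int find_first_common(uint64_t span, uint64_t balance_mask)
{
	uint64_t both = span & balance_mask;

	return both ? __builtin_ctzll(both) : -1;
}

int main(void)
{
	uint64_t group_span   = 0xf0;   /* group covers CPUs 4-7 */
	uint64_t balance_mask = 0xcc;   /* CPUs 2,3,6,7 may balance here */

	/* CPU 6 is the canonical balance cpu for this group */
	printf("balance cpu = %d\n", find_first_common(group_span, balance_mask));
	return 0;
}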
@@ -5980,6 +6064,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5980 if (cpumask_test_cpu(i, covered)) 6064 if (cpumask_test_cpu(i, covered))
5981 continue; 6065 continue;
5982 6066
6067 child = *per_cpu_ptr(sdd->sd, i);
6068
6069 /* See the comment near build_group_mask(). */
6070 if (!cpumask_test_cpu(i, sched_domain_span(child)))
6071 continue;
6072
5983 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 6073 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
5984 GFP_KERNEL, cpu_to_node(cpu)); 6074 GFP_KERNEL, cpu_to_node(cpu));
5985 6075
@@ -5987,8 +6077,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5987 goto fail; 6077 goto fail;
5988 6078
5989 sg_span = sched_group_cpus(sg); 6079 sg_span = sched_group_cpus(sg);
5990
5991 child = *per_cpu_ptr(sdd->sd, i);
5992 if (child->child) { 6080 if (child->child) {
5993 child = child->child; 6081 child = child->child;
5994 cpumask_copy(sg_span, sched_domain_span(child)); 6082 cpumask_copy(sg_span, sched_domain_span(child));
@@ -5997,10 +6085,24 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5997 6085
5998 cpumask_or(covered, covered, sg_span); 6086 cpumask_or(covered, covered, sg_span);
5999 6087
6000 sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); 6088 sg->sgp = *per_cpu_ptr(sdd->sgp, i);
6001 atomic_inc(&sg->sgp->ref); 6089 if (atomic_inc_return(&sg->sgp->ref) == 1)
6090 build_group_mask(sd, sg);
6002 6091
6003 if (cpumask_test_cpu(cpu, sg_span)) 6092 /*
6093 * Initialize sgp->power such that even if we mess up the
6094 * domains and no possible iteration will get us here, we won't
6095 * die on a /0 trap.
6096 */
6097 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
6098
6099 /*
6100 * Make sure the first group of this domain contains the
6101 * canonical balance cpu. Otherwise the sched_domain iteration
6102 * breaks. See update_sg_lb_stats().
6103 */
6104 if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
6105 group_balance_cpu(sg) == cpu)
6004 groups = sg; 6106 groups = sg;
6005 6107
6006 if (!first) 6108 if (!first)
@@ -6074,6 +6176,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
6074 6176
6075 cpumask_clear(sched_group_cpus(sg)); 6177 cpumask_clear(sched_group_cpus(sg));
6076 sg->sgp->power = 0; 6178 sg->sgp->power = 0;
6179 cpumask_setall(sched_group_mask(sg));
6077 6180
6078 for_each_cpu(j, span) { 6181 for_each_cpu(j, span) {
6079 if (get_group(j, sdd, NULL) != group) 6182 if (get_group(j, sdd, NULL) != group)
@@ -6115,7 +6218,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6115 sg = sg->next; 6218 sg = sg->next;
6116 } while (sg != sd->groups); 6219 } while (sg != sd->groups);
6117 6220
6118 if (cpu != group_first_cpu(sg)) 6221 if (cpu != group_balance_cpu(sg))
6119 return; 6222 return;
6120 6223
6121 update_group_power(sd, cpu); 6224 update_group_power(sd, cpu);
@@ -6165,11 +6268,8 @@ int sched_domain_level_max;
6165 6268
6166static int __init setup_relax_domain_level(char *str) 6269static int __init setup_relax_domain_level(char *str)
6167{ 6270{
6168 unsigned long val; 6271 if (kstrtoint(str, 0, &default_relax_domain_level))
6169 6272 pr_warn("Unable to set relax_domain_level\n");
6170 val = simple_strtoul(str, NULL, 0);
6171 if (val < sched_domain_level_max)
6172 default_relax_domain_level = val;
6173 6273
6174 return 1; 6274 return 1;
6175} 6275}
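The setup_relax_domain_level() hunk above replaces an unchecked simple_strtoul() with kstrtoint(), which fails on malformed input instead of silently accepting a numeric prefix. A userspace equivalent built on strtol() is sketched below; parse_int is an illustrative helper, not a kernel function.

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/* strict integer parse: the whole string must be a valid int */
static int parse_int(const char *s, int *out)
{
	char *end;
	long val;

	errno = 0;
	val = strtol(s, &end, 0);
	if (errno || end == s || *end != '\0' || val < INT_MIN || val > INT_MAX)
		return -1;
	*out = (int)val;
	return 0;
}

int main(void)
{
	int level;

	if (parse_int("2", &level))
		fprintf(stderr, "Unable to set relax_domain_level\n");
	else
		printf("relax_domain_level = %d\n", level);

	if (parse_int("2bogus", &level))        /* rejected, unlike strtoul */
		fprintf(stderr, "Unable to set relax_domain_level\n");
	return 0;
}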
@@ -6279,14 +6379,13 @@ static struct sched_domain_topology_level *sched_domain_topology = default_topol
6279#ifdef CONFIG_NUMA 6379#ifdef CONFIG_NUMA
6280 6380
6281static int sched_domains_numa_levels; 6381static int sched_domains_numa_levels;
6282static int sched_domains_numa_scale;
6283static int *sched_domains_numa_distance; 6382static int *sched_domains_numa_distance;
6284static struct cpumask ***sched_domains_numa_masks; 6383static struct cpumask ***sched_domains_numa_masks;
6285static int sched_domains_curr_level; 6384static int sched_domains_curr_level;
6286 6385
6287static inline int sd_local_flags(int level) 6386static inline int sd_local_flags(int level)
6288{ 6387{
6289 if (sched_domains_numa_distance[level] > REMOTE_DISTANCE) 6388 if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
6290 return 0; 6389 return 0;
6291 6390
6292 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; 6391 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
@@ -6344,6 +6443,42 @@ static const struct cpumask *sd_numa_mask(int cpu)
6344 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; 6443 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
6345} 6444}
6346 6445
6446static void sched_numa_warn(const char *str)
6447{
6448 static int done = false;
6449 int i,j;
6450
6451 if (done)
6452 return;
6453
6454 done = true;
6455
6456 printk(KERN_WARNING "ERROR: %s\n\n", str);
6457
6458 for (i = 0; i < nr_node_ids; i++) {
6459 printk(KERN_WARNING " ");
6460 for (j = 0; j < nr_node_ids; j++)
6461 printk(KERN_CONT "%02d ", node_distance(i,j));
6462 printk(KERN_CONT "\n");
6463 }
6464 printk(KERN_WARNING "\n");
6465}
6466
6467static bool find_numa_distance(int distance)
6468{
6469 int i;
6470
6471 if (distance == node_distance(0, 0))
6472 return true;
6473
6474 for (i = 0; i < sched_domains_numa_levels; i++) {
6475 if (sched_domains_numa_distance[i] == distance)
6476 return true;
6477 }
6478
6479 return false;
6480}
6481
6347static void sched_init_numa(void) 6482static void sched_init_numa(void)
6348{ 6483{
6349 int next_distance, curr_distance = node_distance(0, 0); 6484 int next_distance, curr_distance = node_distance(0, 0);
@@ -6351,7 +6486,6 @@ static void sched_init_numa(void)
6351 int level = 0; 6486 int level = 0;
6352 int i, j, k; 6487 int i, j, k;
6353 6488
6354 sched_domains_numa_scale = curr_distance;
6355 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); 6489 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
6356 if (!sched_domains_numa_distance) 6490 if (!sched_domains_numa_distance)
6357 return; 6491 return;
@@ -6362,23 +6496,41 @@ static void sched_init_numa(void)
6362 * 6496 *
6363 * Assumes node_distance(0,j) includes all distances in 6497 * Assumes node_distance(0,j) includes all distances in
6364 * node_distance(i,j) in order to avoid cubic time. 6498 * node_distance(i,j) in order to avoid cubic time.
6365 *
6366 * XXX: could be optimized to O(n log n) by using sort()
6367 */ 6499 */
6368 next_distance = curr_distance; 6500 next_distance = curr_distance;
6369 for (i = 0; i < nr_node_ids; i++) { 6501 for (i = 0; i < nr_node_ids; i++) {
6370 for (j = 0; j < nr_node_ids; j++) { 6502 for (j = 0; j < nr_node_ids; j++) {
6371 int distance = node_distance(0, j); 6503 for (k = 0; k < nr_node_ids; k++) {
6372 if (distance > curr_distance && 6504 int distance = node_distance(i, k);
6373 (distance < next_distance || 6505
6374 next_distance == curr_distance)) 6506 if (distance > curr_distance &&
6375 next_distance = distance; 6507 (distance < next_distance ||
6508 next_distance == curr_distance))
6509 next_distance = distance;
6510
6511 /*
6512 * While not a strong assumption, it would be nice to know
6513 * about cases where node A is connected to B but B is not
6514 * equally connected to A.
6515 */
6516 if (sched_debug() && node_distance(k, i) != distance)
6517 sched_numa_warn("Node-distance not symmetric");
6518
6519 if (sched_debug() && i && !find_numa_distance(distance))
6520 sched_numa_warn("Node-0 not representative");
6521 }
6522 if (next_distance != curr_distance) {
6523 sched_domains_numa_distance[level++] = next_distance;
6524 sched_domains_numa_levels = level;
6525 curr_distance = next_distance;
6526 } else break;
6376 } 6527 }
6377 if (next_distance != curr_distance) { 6528
6378 sched_domains_numa_distance[level++] = next_distance; 6529 /*
6379 sched_domains_numa_levels = level; 6530 * In case of sched_debug() we verify the above assumption.
6380 curr_distance = next_distance; 6531 */
6381 } else break; 6532 if (!sched_debug())
6533 break;
6382 } 6534 }
6383 /* 6535 /*
6384 * 'level' contains the number of unique distances, excluding the 6536 * 'level' contains the number of unique distances, excluding the
@@ -6403,7 +6555,7 @@ static void sched_init_numa(void)
6403 return; 6555 return;
6404 6556
6405 for (j = 0; j < nr_node_ids; j++) { 6557 for (j = 0; j < nr_node_ids; j++) {
6406 struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j); 6558 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
6407 if (!mask) 6559 if (!mask)
6408 return; 6560 return;
6409 6561
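The sched_init_numa() hunks above derive the NUMA topology levels by repeatedly searching the whole node_distance() matrix (the new k loop) for the next-larger distance, warning when the matrix is asymmetric or node 0 is not representative. The standalone program below runs the same next-larger-distance extraction, restructured as its own loop, over a hard-coded 4-node table; the table values are invented for illustration.

#include <stdio.h>

#define NR_NODES 4

/* illustrative SLIT-style table: 10 = local, larger = farther */
static const int node_distance[NR_NODES][NR_NODES] = {
	{ 10, 20, 20, 30 },
	{ 20, 10, 30, 20 },
	{ 20, 30, 10, 20 },
	{ 30, 20, 20, 10 },
};

int main(void)
{
	int levels[NR_NODES * NR_NODES];
	int nlevels = 0;
	int curr = node_distance[0][0];

	for (;;) {
		int next = curr;

		/* find the smallest distance strictly larger than curr */
		for (int i = 0; i < NR_NODES; i++)
			for (int k = 0; k < NR_NODES; k++) {
				int d = node_distance[i][k];

				if (d > curr && (d < next || next == curr))
					next = d;
			}
		if (next == curr)
			break;                  /* no larger distance left */
		levels[nlevels++] = next;
		curr = next;
	}

	printf("%d unique non-local distances:", nlevels);
	for (int i = 0; i < nlevels; i++)
		printf(" %d", levels[i]);
	printf("\n");
	return 0;
}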
@@ -6490,7 +6642,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
6490 6642
6491 *per_cpu_ptr(sdd->sg, j) = sg; 6643 *per_cpu_ptr(sdd->sg, j) = sg;
6492 6644
6493 sgp = kzalloc_node(sizeof(struct sched_group_power), 6645 sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
6494 GFP_KERNEL, cpu_to_node(j)); 6646 GFP_KERNEL, cpu_to_node(j));
6495 if (!sgp) 6647 if (!sgp)
6496 return -ENOMEM; 6648 return -ENOMEM;
@@ -6543,7 +6695,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6543 if (!sd) 6695 if (!sd)
6544 return child; 6696 return child;
6545 6697
6546 set_domain_attribute(sd, attr);
6547 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); 6698 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6548 if (child) { 6699 if (child) {
6549 sd->level = child->level + 1; 6700 sd->level = child->level + 1;
@@ -6551,6 +6702,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6551 child->parent = sd; 6702 child->parent = sd;
6552 } 6703 }
6553 sd->child = child; 6704 sd->child = child;
6705 set_domain_attribute(sd, attr);
6554 6706
6555 return sd; 6707 return sd;
6556} 6708}
@@ -6691,7 +6843,6 @@ static int init_sched_domains(const struct cpumask *cpu_map)
6691 if (!doms_cur) 6843 if (!doms_cur)
6692 doms_cur = &fallback_doms; 6844 doms_cur = &fallback_doms;
6693 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 6845 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
6694 dattr_cur = NULL;
6695 err = build_sched_domains(doms_cur[0], NULL); 6846 err = build_sched_domains(doms_cur[0], NULL);
6696 register_sched_domain_sysctl(); 6847 register_sched_domain_sysctl();
6697 6848
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 940e6d17cf96..c099cc6eebe3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2703,7 +2703,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2703 int want_sd = 1; 2703 int want_sd = 1;
2704 int sync = wake_flags & WF_SYNC; 2704 int sync = wake_flags & WF_SYNC;
2705 2705
2706 if (p->rt.nr_cpus_allowed == 1) 2706 if (p->nr_cpus_allowed == 1)
2707 return prev_cpu; 2707 return prev_cpu;
2708 2708
2709 if (sd_flag & SD_BALANCE_WAKE) { 2709 if (sd_flag & SD_BALANCE_WAKE) {
@@ -3503,15 +3503,22 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3503unsigned long scale_rt_power(int cpu) 3503unsigned long scale_rt_power(int cpu)
3504{ 3504{
3505 struct rq *rq = cpu_rq(cpu); 3505 struct rq *rq = cpu_rq(cpu);
3506 u64 total, available; 3506 u64 total, available, age_stamp, avg;
3507 3507
3508 total = sched_avg_period() + (rq->clock - rq->age_stamp); 3508 /*
3509 * Since we're reading these variables without serialization make sure
3510 * we read them once before doing sanity checks on them.
3511 */
3512 age_stamp = ACCESS_ONCE(rq->age_stamp);
3513 avg = ACCESS_ONCE(rq->rt_avg);
3514
3515 total = sched_avg_period() + (rq->clock - age_stamp);
3509 3516
3510 if (unlikely(total < rq->rt_avg)) { 3517 if (unlikely(total < avg)) {
3511 /* Ensures that power won't end up being negative */ 3518 /* Ensures that power won't end up being negative */
3512 available = 0; 3519 available = 0;
3513 } else { 3520 } else {
3514 available = total - rq->rt_avg; 3521 available = total - avg;
3515 } 3522 }
3516 3523
3517 if (unlikely((s64)total < SCHED_POWER_SCALE)) 3524 if (unlikely((s64)total < SCHED_POWER_SCALE))
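scale_rt_power() above now snapshots rq->age_stamp and rq->rt_avg once before using them, so the unsigned subtraction cannot be fed values taken at two different moments. The underflow guard on its own, with made-up numbers; available_power is an illustrative name, not the kernel function.

#include <stdint.h>
#include <stdio.h>

/* compute "available = total - avg" without letting the unsigned math wrap */
static uint64_t available_power(uint64_t period, uint64_t clock,
				uint64_t age_stamp, uint64_t avg)
{
	uint64_t total = period + (clock - age_stamp);

	/* guard before subtracting, as the patch does */
	return total < avg ? 0 : total - avg;
}

int main(void)
{
	/* a stale avg larger than total would otherwise wrap to ~2^64 */
	printf("%llu\n", (unsigned long long)available_power(1000, 50, 40, 2000));
	printf("%llu\n", (unsigned long long)available_power(1000, 50, 40, 300));
	return 0;
}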
@@ -3574,13 +3581,28 @@ void update_group_power(struct sched_domain *sd, int cpu)
3574 3581
3575 power = 0; 3582 power = 0;
3576 3583
3577 group = child->groups; 3584 if (child->flags & SD_OVERLAP) {
3578 do { 3585 /*
3579 power += group->sgp->power; 3586 * SD_OVERLAP domains cannot assume that child groups
3580 group = group->next; 3587 * span the current group.
3581 } while (group != child->groups); 3588 */
3582 3589
3583 sdg->sgp->power = power; 3590 for_each_cpu(cpu, sched_group_cpus(sdg))
3591 power += power_of(cpu);
3592 } else {
3593 /*
3594 * !SD_OVERLAP domains can assume that child groups
3595 * span the current group.
3596 */
3597
3598 group = child->groups;
3599 do {
3600 power += group->sgp->power;
3601 group = group->next;
3602 } while (group != child->groups);
3603 }
3604
3605 sdg->sgp->power_orig = sdg->sgp->power = power;
3584} 3606}
3585 3607
3586/* 3608/*
@@ -3610,7 +3632,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
3610 3632
3611/** 3633/**
3612 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 3634 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3613 * @sd: The sched_domain whose statistics are to be updated. 3635 * @env: The load balancing environment.
3614 * @group: sched_group whose statistics are to be updated. 3636 * @group: sched_group whose statistics are to be updated.
3615 * @load_idx: Load index of sched_domain of this_cpu for load calc. 3637 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3616 * @local_group: Does group contain this_cpu. 3638 * @local_group: Does group contain this_cpu.
@@ -3630,7 +3652,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
3630 int i; 3652 int i;
3631 3653
3632 if (local_group) 3654 if (local_group)
3633 balance_cpu = group_first_cpu(group); 3655 balance_cpu = group_balance_cpu(group);
3634 3656
3635 /* Tally up the load of all CPUs in the group */ 3657 /* Tally up the load of all CPUs in the group */
3636 max_cpu_load = 0; 3658 max_cpu_load = 0;
@@ -3645,7 +3667,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
3645 3667
3646 /* Bias balancing toward cpus of our domain */ 3668 /* Bias balancing toward cpus of our domain */
3647 if (local_group) { 3669 if (local_group) {
3648 if (idle_cpu(i) && !first_idle_cpu) { 3670 if (idle_cpu(i) && !first_idle_cpu &&
3671 cpumask_test_cpu(i, sched_group_mask(group))) {
3649 first_idle_cpu = 1; 3672 first_idle_cpu = 1;
3650 balance_cpu = i; 3673 balance_cpu = i;
3651 } 3674 }
@@ -3719,11 +3742,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
3719 3742
3720/** 3743/**
3721 * update_sd_pick_busiest - return 1 on busiest group 3744 * update_sd_pick_busiest - return 1 on busiest group
3722 * @sd: sched_domain whose statistics are to be checked 3745 * @env: The load balancing environment.
3723 * @sds: sched_domain statistics 3746 * @sds: sched_domain statistics
3724 * @sg: sched_group candidate to be checked for being the busiest 3747 * @sg: sched_group candidate to be checked for being the busiest
3725 * @sgs: sched_group statistics 3748 * @sgs: sched_group statistics
3726 * @this_cpu: the current cpu
3727 * 3749 *
3728 * Determine if @sg is a busier group than the previously selected 3750 * Determine if @sg is a busier group than the previously selected
3729 * busiest group. 3751 * busiest group.
@@ -3761,9 +3783,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
3761 3783
3762/** 3784/**
3763 * update_sd_lb_stats - Update sched_domain's statistics for load balancing. 3785 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
3764 * @sd: sched_domain whose statistics are to be updated. 3786 * @env: The load balancing environment.
3765 * @this_cpu: Cpu for which load balance is currently performed.
3766 * @idle: Idle status of this_cpu
3767 * @cpus: Set of cpus considered for load balancing. 3787 * @cpus: Set of cpus considered for load balancing.
3768 * @balance: Should we balance. 3788 * @balance: Should we balance.
3769 * @sds: variable to hold the statistics for this sched_domain. 3789 * @sds: variable to hold the statistics for this sched_domain.
@@ -3852,10 +3872,8 @@ static inline void update_sd_lb_stats(struct lb_env *env,
3852 * Returns 1 when packing is required and a task should be moved to 3872 * Returns 1 when packing is required and a task should be moved to
3853 * this CPU. The amount of the imbalance is returned in *imbalance. 3873 * this CPU. The amount of the imbalance is returned in *imbalance.
3854 * 3874 *
3855 * @sd: The sched_domain whose packing is to be checked. 3875 * @env: The load balancing environment.
3856 * @sds: Statistics of the sched_domain which is to be packed 3876 * @sds: Statistics of the sched_domain which is to be packed
3857 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3858 * @imbalance: returns amount of imbalanced due to packing.
3859 */ 3877 */
3860static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) 3878static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
3861{ 3879{
@@ -3881,9 +3899,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
3881 * fix_small_imbalance - Calculate the minor imbalance that exists 3899 * fix_small_imbalance - Calculate the minor imbalance that exists
3882 * amongst the groups of a sched_domain, during 3900 * amongst the groups of a sched_domain, during
3883 * load balancing. 3901 * load balancing.
3902 * @env: The load balancing environment.
3884 * @sds: Statistics of the sched_domain whose imbalance is to be calculated. 3903 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3885 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3886 * @imbalance: Variable to store the imbalance.
3887 */ 3904 */
3888static inline 3905static inline
3889void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) 3906void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
@@ -4026,11 +4043,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4026 * Also calculates the amount of weighted load which should be moved 4043 * Also calculates the amount of weighted load which should be moved
4027 * to restore balance. 4044 * to restore balance.
4028 * 4045 *
4029 * @sd: The sched_domain whose busiest group is to be returned. 4046 * @env: The load balancing environment.
4030 * @this_cpu: The cpu for which load balancing is currently being performed.
4031 * @imbalance: Variable which stores amount of weighted load which should
4032 * be moved to restore balance/put a group to idle.
4033 * @idle: The idle status of this_cpu.
4034 * @cpus: The set of CPUs under consideration for load-balancing. 4047 * @cpus: The set of CPUs under consideration for load-balancing.
4035 * @balance: Pointer to a variable indicating if this_cpu 4048 * @balance: Pointer to a variable indicating if this_cpu
4036 * is the appropriate cpu to perform load balancing at this_level. 4049 * is the appropriate cpu to perform load balancing at this_level.
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index c5565c3c515f..573e1ca01102 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -274,13 +274,16 @@ static void update_rt_migration(struct rt_rq *rt_rq)
274 274
275static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 275static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
276{ 276{
277 struct task_struct *p;
278
277 if (!rt_entity_is_task(rt_se)) 279 if (!rt_entity_is_task(rt_se))
278 return; 280 return;
279 281
282 p = rt_task_of(rt_se);
280 rt_rq = &rq_of_rt_rq(rt_rq)->rt; 283 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
281 284
282 rt_rq->rt_nr_total++; 285 rt_rq->rt_nr_total++;
283 if (rt_se->nr_cpus_allowed > 1) 286 if (p->nr_cpus_allowed > 1)
284 rt_rq->rt_nr_migratory++; 287 rt_rq->rt_nr_migratory++;
285 288
286 update_rt_migration(rt_rq); 289 update_rt_migration(rt_rq);
@@ -288,13 +291,16 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
288 291
289static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 292static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
290{ 293{
294 struct task_struct *p;
295
291 if (!rt_entity_is_task(rt_se)) 296 if (!rt_entity_is_task(rt_se))
292 return; 297 return;
293 298
299 p = rt_task_of(rt_se);
294 rt_rq = &rq_of_rt_rq(rt_rq)->rt; 300 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
295 301
296 rt_rq->rt_nr_total--; 302 rt_rq->rt_nr_total--;
297 if (rt_se->nr_cpus_allowed > 1) 303 if (p->nr_cpus_allowed > 1)
298 rt_rq->rt_nr_migratory--; 304 rt_rq->rt_nr_migratory--;
299 305
300 update_rt_migration(rt_rq); 306 update_rt_migration(rt_rq);
@@ -1161,7 +1167,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1161 1167
1162 enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); 1168 enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
1163 1169
1164 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 1170 if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1165 enqueue_pushable_task(rq, p); 1171 enqueue_pushable_task(rq, p);
1166 1172
1167 inc_nr_running(rq); 1173 inc_nr_running(rq);
@@ -1225,7 +1231,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1225 1231
1226 cpu = task_cpu(p); 1232 cpu = task_cpu(p);
1227 1233
1228 if (p->rt.nr_cpus_allowed == 1) 1234 if (p->nr_cpus_allowed == 1)
1229 goto out; 1235 goto out;
1230 1236
1231 /* For anything but wake ups, just return the task_cpu */ 1237 /* For anything but wake ups, just return the task_cpu */
@@ -1260,9 +1266,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1260 * will have to sort it out. 1266 * will have to sort it out.
1261 */ 1267 */
1262 if (curr && unlikely(rt_task(curr)) && 1268 if (curr && unlikely(rt_task(curr)) &&
1263 (curr->rt.nr_cpus_allowed < 2 || 1269 (curr->nr_cpus_allowed < 2 ||
1264 curr->prio <= p->prio) && 1270 curr->prio <= p->prio) &&
1265 (p->rt.nr_cpus_allowed > 1)) { 1271 (p->nr_cpus_allowed > 1)) {
1266 int target = find_lowest_rq(p); 1272 int target = find_lowest_rq(p);
1267 1273
1268 if (target != -1) 1274 if (target != -1)
@@ -1276,10 +1282,10 @@ out:
1276 1282
1277static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 1283static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1278{ 1284{
1279 if (rq->curr->rt.nr_cpus_allowed == 1) 1285 if (rq->curr->nr_cpus_allowed == 1)
1280 return; 1286 return;
1281 1287
1282 if (p->rt.nr_cpus_allowed != 1 1288 if (p->nr_cpus_allowed != 1
1283 && cpupri_find(&rq->rd->cpupri, p, NULL)) 1289 && cpupri_find(&rq->rd->cpupri, p, NULL))
1284 return; 1290 return;
1285 1291
@@ -1395,7 +1401,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1395 * The previous task needs to be made eligible for pushing 1401 * The previous task needs to be made eligible for pushing
1396 * if it is still active 1402 * if it is still active
1397 */ 1403 */
1398 if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) 1404 if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
1399 enqueue_pushable_task(rq, p); 1405 enqueue_pushable_task(rq, p);
1400} 1406}
1401 1407
@@ -1408,7 +1414,7 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1408{ 1414{
1409 if (!task_running(rq, p) && 1415 if (!task_running(rq, p) &&
1410 (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && 1416 (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
1411 (p->rt.nr_cpus_allowed > 1)) 1417 (p->nr_cpus_allowed > 1))
1412 return 1; 1418 return 1;
1413 return 0; 1419 return 0;
1414} 1420}
@@ -1464,7 +1470,7 @@ static int find_lowest_rq(struct task_struct *task)
1464 if (unlikely(!lowest_mask)) 1470 if (unlikely(!lowest_mask))
1465 return -1; 1471 return -1;
1466 1472
1467 if (task->rt.nr_cpus_allowed == 1) 1473 if (task->nr_cpus_allowed == 1)
1468 return -1; /* No other targets possible */ 1474 return -1; /* No other targets possible */
1469 1475
1470 if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) 1476 if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
@@ -1556,7 +1562,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1556 task_running(rq, task) || 1562 task_running(rq, task) ||
1557 !task->on_rq)) { 1563 !task->on_rq)) {
1558 1564
1559 raw_spin_unlock(&lowest_rq->lock); 1565 double_unlock_balance(rq, lowest_rq);
1560 lowest_rq = NULL; 1566 lowest_rq = NULL;
1561 break; 1567 break;
1562 } 1568 }
@@ -1586,7 +1592,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
1586 1592
1587 BUG_ON(rq->cpu != task_cpu(p)); 1593 BUG_ON(rq->cpu != task_cpu(p));
1588 BUG_ON(task_current(rq, p)); 1594 BUG_ON(task_current(rq, p));
1589 BUG_ON(p->rt.nr_cpus_allowed <= 1); 1595 BUG_ON(p->nr_cpus_allowed <= 1);
1590 1596
1591 BUG_ON(!p->on_rq); 1597 BUG_ON(!p->on_rq);
1592 BUG_ON(!rt_task(p)); 1598 BUG_ON(!rt_task(p));
@@ -1793,9 +1799,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1793 if (!task_running(rq, p) && 1799 if (!task_running(rq, p) &&
1794 !test_tsk_need_resched(rq->curr) && 1800 !test_tsk_need_resched(rq->curr) &&
1795 has_pushable_tasks(rq) && 1801 has_pushable_tasks(rq) &&
1796 p->rt.nr_cpus_allowed > 1 && 1802 p->nr_cpus_allowed > 1 &&
1797 rt_task(rq->curr) && 1803 rt_task(rq->curr) &&
1798 (rq->curr->rt.nr_cpus_allowed < 2 || 1804 (rq->curr->nr_cpus_allowed < 2 ||
1799 rq->curr->prio <= p->prio)) 1805 rq->curr->prio <= p->prio))
1800 push_rt_tasks(rq); 1806 push_rt_tasks(rq);
1801} 1807}
@@ -1817,7 +1823,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1817 * Only update if the process changes its state from whether it 1823 * Only update if the process changes its state from whether it
1818 * can migrate or not. 1824 * can migrate or not.
1819 */ 1825 */
1820 if ((p->rt.nr_cpus_allowed > 1) == (weight > 1)) 1826 if ((p->nr_cpus_allowed > 1) == (weight > 1))
1821 return; 1827 return;
1822 1828
1823 rq = task_rq(p); 1829 rq = task_rq(p);
@@ -1979,6 +1985,8 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1979 1985
1980static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) 1986static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1981{ 1987{
1988 struct sched_rt_entity *rt_se = &p->rt;
1989
1982 update_curr_rt(rq); 1990 update_curr_rt(rq);
1983 1991
1984 watchdog(rq, p); 1992 watchdog(rq, p);
@@ -1996,12 +2004,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1996 p->rt.time_slice = RR_TIMESLICE; 2004 p->rt.time_slice = RR_TIMESLICE;
1997 2005
1998 /* 2006 /*
1999 * Requeue to the end of queue if we are not the only element 2007 * Requeue to the end of queue if we (and all of our ancestors) are not
2000 * on the queue: 2008 * the only element on the queue
2001 */ 2009 */
2002 if (p->rt.run_list.prev != p->rt.run_list.next) { 2010 for_each_sched_rt_entity(rt_se) {
2003 requeue_task_rt(rq, p, 0); 2011 if (rt_se->run_list.prev != rt_se->run_list.next) {
2004 set_tsk_need_resched(p); 2012 requeue_task_rt(rq, p, 0);
2013 set_tsk_need_resched(p);
2014 return;
2015 }
2005 } 2016 }
2006} 2017}
2007 2018
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ba9dccfd24ce..6d52cea7f33d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -526,6 +526,8 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
526DECLARE_PER_CPU(struct sched_domain *, sd_llc); 526DECLARE_PER_CPU(struct sched_domain *, sd_llc);
527DECLARE_PER_CPU(int, sd_llc_id); 527DECLARE_PER_CPU(int, sd_llc_id);
528 528
529extern int group_balance_cpu(struct sched_group *sg);
530
529#endif /* CONFIG_SMP */ 531#endif /* CONFIG_SMP */
530 532
531#include "stats.h" 533#include "stats.h"
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index e1a797e028a3..98f60c5caa1b 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -31,6 +31,12 @@ void __init idle_thread_set_boot_cpu(void)
31 per_cpu(idle_threads, smp_processor_id()) = current; 31 per_cpu(idle_threads, smp_processor_id()) = current;
32} 32}
33 33
34/**
35 * idle_init - Initialize the idle thread for a cpu
36 * @cpu: The cpu for which the idle thread should be initialized
37 *
38 * Creates the thread if it does not exist.
39 */
34static inline void idle_init(unsigned int cpu) 40static inline void idle_init(unsigned int cpu)
35{ 41{
36 struct task_struct *tsk = per_cpu(idle_threads, cpu); 42 struct task_struct *tsk = per_cpu(idle_threads, cpu);
@@ -45,17 +51,16 @@ static inline void idle_init(unsigned int cpu)
45} 51}
46 52
47/** 53/**
48 * idle_thread_init - Initialize the idle thread for a cpu 54 * idle_threads_init - Initialize idle threads for all cpus
49 * @cpu: The cpu for which the idle thread should be initialized
50 *
51 * Creates the thread if it does not exist.
52 */ 55 */
53void __init idle_threads_init(void) 56void __init idle_threads_init(void)
54{ 57{
55 unsigned int cpu; 58 unsigned int cpu, boot_cpu;
59
60 boot_cpu = smp_processor_id();
56 61
57 for_each_possible_cpu(cpu) { 62 for_each_possible_cpu(cpu) {
58 if (cpu != smp_processor_id()) 63 if (cpu != boot_cpu)
59 idle_init(cpu); 64 idle_init(cpu);
60 } 65 }
61} 66}
diff --git a/kernel/sys.c b/kernel/sys.c
index 9ff89cb9657a..e0c8ffc50d7f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1786,27 +1786,13 @@ SYSCALL_DEFINE1(umask, int, mask)
1786} 1786}
1787 1787
1788#ifdef CONFIG_CHECKPOINT_RESTORE 1788#ifdef CONFIG_CHECKPOINT_RESTORE
1789static bool vma_flags_mismatch(struct vm_area_struct *vma,
1790 unsigned long required,
1791 unsigned long banned)
1792{
1793 return (vma->vm_flags & required) != required ||
1794 (vma->vm_flags & banned);
1795}
1796
1797static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) 1789static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1798{ 1790{
1791 struct vm_area_struct *vma;
1799 struct file *exe_file; 1792 struct file *exe_file;
1800 struct dentry *dentry; 1793 struct dentry *dentry;
1801 int err; 1794 int err;
1802 1795
1803 /*
1804 * Setting new mm::exe_file is only allowed when no VM_EXECUTABLE vma's
1805 * remain. So perform a quick test first.
1806 */
1807 if (mm->num_exe_file_vmas)
1808 return -EBUSY;
1809
1810 exe_file = fget(fd); 1796 exe_file = fget(fd);
1811 if (!exe_file) 1797 if (!exe_file)
1812 return -EBADF; 1798 return -EBADF;
@@ -1827,17 +1813,30 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1827 if (err) 1813 if (err)
1828 goto exit; 1814 goto exit;
1829 1815
1816 down_write(&mm->mmap_sem);
1817
1818 /*
1819 * Forbid mm->exe_file change if there are mapped other files.
1820 */
1821 err = -EBUSY;
1822 for (vma = mm->mmap; vma; vma = vma->vm_next) {
1823 if (vma->vm_file && !path_equal(&vma->vm_file->f_path,
1824 &exe_file->f_path))
1825 goto exit_unlock;
1826 }
1827
1830 /* 1828 /*
1831 * The symlink can be changed only once, just to disallow arbitrary 1829 * The symlink can be changed only once, just to disallow arbitrary
1832 * transitions malicious software might bring in. This means one 1830 * transitions malicious software might bring in. This means one
1833 * could make a snapshot over all processes running and monitor 1831 * could make a snapshot over all processes running and monitor
1834 * /proc/pid/exe changes to notice unusual activity if needed. 1832 * /proc/pid/exe changes to notice unusual activity if needed.
1835 */ 1833 */
1836 down_write(&mm->mmap_sem); 1834 err = -EPERM;
1837 if (likely(!mm->exe_file)) 1835 if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
1838 set_mm_exe_file(mm, exe_file); 1836 goto exit_unlock;
1839 else 1837
1840 err = -EBUSY; 1838 set_mm_exe_file(mm, exe_file);
1839exit_unlock:
1841 up_write(&mm->mmap_sem); 1840 up_write(&mm->mmap_sem);
1842 1841
1843exit: 1842exit:
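prctl_set_mm_exe_file() above now checks, under mmap_sem, that no other file is mapped and that MMF_EXE_FILE_CHANGED has never been set, using test_and_set_bit() as a one-shot gate so /proc/pid/exe can be rewritten at most once. The one-shot part maps directly onto a C11 atomic flag; change_exe_symlink below is an illustrative name, not the kernel function.

#include <stdatomic.h>
#include <stdio.h>

static atomic_flag exe_file_changed = ATOMIC_FLAG_INIT;

/* allow the symlink to be changed exactly once; later callers are refused */
static int change_exe_symlink(const char *path)
{
	if (atomic_flag_test_and_set(&exe_file_changed))
		return -1;                      /* already changed once */

	printf("exe symlink now points at %s\n", path);
	return 0;
}

int main(void)
{
	printf("first:  %d\n", change_exe_symlink("/usr/bin/new-exe"));
	printf("second: %d\n", change_exe_symlink("/usr/bin/other"));  /* refused */
	return 0;
}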
@@ -1862,7 +1861,7 @@ static int prctl_set_mm(int opt, unsigned long addr,
1862 if (opt == PR_SET_MM_EXE_FILE) 1861 if (opt == PR_SET_MM_EXE_FILE)
1863 return prctl_set_mm_exe_file(mm, (unsigned int)addr); 1862 return prctl_set_mm_exe_file(mm, (unsigned int)addr);
1864 1863
1865 if (addr >= TASK_SIZE) 1864 if (addr >= TASK_SIZE || addr < mmap_min_addr)
1866 return -EINVAL; 1865 return -EINVAL;
1867 1866
1868 error = -EINVAL; 1867 error = -EINVAL;
@@ -1924,12 +1923,6 @@ static int prctl_set_mm(int opt, unsigned long addr,
1924 error = -EFAULT; 1923 error = -EFAULT;
1925 goto out; 1924 goto out;
1926 } 1925 }
1927#ifdef CONFIG_STACK_GROWSUP
1928 if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSUP, 0))
1929#else
1930 if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSDOWN, 0))
1931#endif
1932 goto out;
1933 if (opt == PR_SET_MM_START_STACK) 1926 if (opt == PR_SET_MM_START_STACK)
1934 mm->start_stack = addr; 1927 mm->start_stack = addr;
1935 else if (opt == PR_SET_MM_ARG_START) 1928 else if (opt == PR_SET_MM_ARG_START)
@@ -1981,12 +1974,22 @@ out:
1981 up_read(&mm->mmap_sem); 1974 up_read(&mm->mmap_sem);
1982 return error; 1975 return error;
1983} 1976}
1977
1978static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
1979{
1980 return put_user(me->clear_child_tid, tid_addr);
1981}
1982
1984#else /* CONFIG_CHECKPOINT_RESTORE */ 1983#else /* CONFIG_CHECKPOINT_RESTORE */
1985static int prctl_set_mm(int opt, unsigned long addr, 1984static int prctl_set_mm(int opt, unsigned long addr,
1986 unsigned long arg4, unsigned long arg5) 1985 unsigned long arg4, unsigned long arg5)
1987{ 1986{
1988 return -EINVAL; 1987 return -EINVAL;
1989} 1988}
1989static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
1990{
1991 return -EINVAL;
1992}
1990#endif 1993#endif
1991 1994
1992SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, 1995SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
@@ -2141,6 +2144,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2141 case PR_SET_MM: 2144 case PR_SET_MM:
2142 error = prctl_set_mm(arg2, arg3, arg4, arg5); 2145 error = prctl_set_mm(arg2, arg3, arg4, arg5);
2143 break; 2146 break;
2147 case PR_GET_TID_ADDRESS:
2148 error = prctl_get_tid_address(me, (int __user **)arg2);
2149 break;
2144 case PR_SET_CHILD_SUBREAPER: 2150 case PR_SET_CHILD_SUBREAPER:
2145 me->signal->is_child_subreaper = !!arg2; 2151 me->signal->is_child_subreaper = !!arg2;
2146 error = 0; 2152 error = 0;
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 9cd928f7a7c6..7e1ce012a851 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -297,8 +297,7 @@ void clockevents_register_device(struct clock_event_device *dev)
297} 297}
298EXPORT_SYMBOL_GPL(clockevents_register_device); 298EXPORT_SYMBOL_GPL(clockevents_register_device);
299 299
300static void clockevents_config(struct clock_event_device *dev, 300void clockevents_config(struct clock_event_device *dev, u32 freq)
301 u32 freq)
302{ 301{
303 u64 sec; 302 u64 sec;
304 303
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 6a3a5b9ff561..869997833928 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -274,6 +274,7 @@ EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
274static void tick_nohz_stop_sched_tick(struct tick_sched *ts) 274static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
275{ 275{
276 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; 276 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
277 unsigned long rcu_delta_jiffies;
277 ktime_t last_update, expires, now; 278 ktime_t last_update, expires, now;
278 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 279 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
279 u64 time_delta; 280 u64 time_delta;
@@ -322,7 +323,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
322 time_delta = timekeeping_max_deferment(); 323 time_delta = timekeeping_max_deferment();
323 } while (read_seqretry(&xtime_lock, seq)); 324 } while (read_seqretry(&xtime_lock, seq));
324 325
325 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || 326 if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) ||
326 arch_needs_cpu(cpu)) { 327 arch_needs_cpu(cpu)) {
327 next_jiffies = last_jiffies + 1; 328 next_jiffies = last_jiffies + 1;
328 delta_jiffies = 1; 329 delta_jiffies = 1;
@@ -330,6 +331,10 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
330 /* Get the next timer wheel timer */ 331 /* Get the next timer wheel timer */
331 next_jiffies = get_next_timer_interrupt(last_jiffies); 332 next_jiffies = get_next_timer_interrupt(last_jiffies);
332 delta_jiffies = next_jiffies - last_jiffies; 333 delta_jiffies = next_jiffies - last_jiffies;
334 if (rcu_delta_jiffies < delta_jiffies) {
335 next_jiffies = last_jiffies + rcu_delta_jiffies;
336 delta_jiffies = rcu_delta_jiffies;
337 }
333 } 338 }
334 /* 339 /*
335 * Do not stop the tick, if we are only one off 340 * Do not stop the tick, if we are only one off
@@ -576,6 +581,7 @@ void tick_nohz_idle_exit(void)
576 /* Update jiffies first */ 581 /* Update jiffies first */
577 select_nohz_load_balancer(0); 582 select_nohz_load_balancer(0);
578 tick_do_update_jiffies64(now); 583 tick_do_update_jiffies64(now);
584 update_cpu_load_nohz();
579 585
580#ifndef CONFIG_VIRT_CPU_ACCOUNTING 586#ifndef CONFIG_VIRT_CPU_ACCOUNTING
581 /* 587 /*
@@ -814,6 +820,16 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
814 return HRTIMER_RESTART; 820 return HRTIMER_RESTART;
815} 821}
816 822
823static int sched_skew_tick;
824
825static int __init skew_tick(char *str)
826{
827 get_option(&str, &sched_skew_tick);
828
829 return 0;
830}
831early_param("skew_tick", skew_tick);
832
817/** 833/**
818 * tick_setup_sched_timer - setup the tick emulation timer 834 * tick_setup_sched_timer - setup the tick emulation timer
819 */ 835 */
@@ -831,6 +847,14 @@ void tick_setup_sched_timer(void)
831 /* Get the next period (per cpu) */ 847 /* Get the next period (per cpu) */
832 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); 848 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
833 849
850 /* Offset the tick to avert xtime_lock contention. */
851 if (sched_skew_tick) {
852 u64 offset = ktime_to_ns(tick_period) >> 1;
853 do_div(offset, num_possible_cpus());
854 offset *= smp_processor_id();
855 hrtimer_add_expires_ns(&ts->sched_timer, offset);
856 }
857
834 for (;;) { 858 for (;;) {
835 hrtimer_forward(&ts->sched_timer, now, tick_period); 859 hrtimer_forward(&ts->sched_timer, now, tick_period);
836 hrtimer_start_expires(&ts->sched_timer, 860 hrtimer_start_expires(&ts->sched_timer,
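The skew_tick boot parameter above offsets each CPU's tick by (tick_period / 2) / num_possible_cpus() * cpu so the per-CPU tick timers do not all contend on xtime_lock at the same instant. The arithmetic on its own, assuming a 1 ms tick (HZ=1000) and 8 CPUs, both of which are made-up values for illustration:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t tick_period_ns = 1000000;   /* assumes HZ=1000 */
	const unsigned int num_cpus = 8;

	for (unsigned int cpu = 0; cpu < num_cpus; cpu++) {
		/* same arithmetic as the patch: half a period spread over all CPUs */
		uint64_t offset = (tick_period_ns >> 1) / num_cpus * cpu;

		printf("cpu%u tick offset: %llu ns\n", cpu,
		       (unsigned long long)offset);
	}
	return 0;
}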
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 6e46cacf5969..6f46a00a1e8a 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -962,6 +962,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
962 timekeeper.xtime.tv_sec++; 962 timekeeper.xtime.tv_sec++;
963 leap = second_overflow(timekeeper.xtime.tv_sec); 963 leap = second_overflow(timekeeper.xtime.tv_sec);
964 timekeeper.xtime.tv_sec += leap; 964 timekeeper.xtime.tv_sec += leap;
965 timekeeper.wall_to_monotonic.tv_sec -= leap;
965 } 966 }
966 967
967 /* Accumulate raw time */ 968 /* Accumulate raw time */
@@ -1077,6 +1078,7 @@ static void update_wall_time(void)
1077 timekeeper.xtime.tv_sec++; 1078 timekeeper.xtime.tv_sec++;
1078 leap = second_overflow(timekeeper.xtime.tv_sec); 1079 leap = second_overflow(timekeeper.xtime.tv_sec);
1079 timekeeper.xtime.tv_sec += leap; 1080 timekeeper.xtime.tv_sec += leap;
1081 timekeeper.wall_to_monotonic.tv_sec -= leap;
1080 } 1082 }
1081 1083
1082 timekeeping_update(false); 1084 timekeeping_update(false);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 288488082224..a7fa0702be1c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -371,7 +371,7 @@ EXPORT_SYMBOL_GPL(tracing_on);
371void tracing_off(void) 371void tracing_off(void)
372{ 372{
373 if (global_trace.buffer) 373 if (global_trace.buffer)
374 ring_buffer_record_on(global_trace.buffer); 374 ring_buffer_record_off(global_trace.buffer);
375 /* 375 /*
376 * This flag is only looked at when buffers haven't been 376 * This flag is only looked at when buffers haven't been
377 * allocated yet. We don't really care about the race 377 * allocated yet. We don't really care about the race
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index e5e1d85b8c7c..4b1dfba70f7c 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -372,6 +372,13 @@ static int watchdog(void *unused)
372 372
373 373
374#ifdef CONFIG_HARDLOCKUP_DETECTOR 374#ifdef CONFIG_HARDLOCKUP_DETECTOR
375/*
376 * People like the simple clean cpu node info on boot.
377 * Reduce the watchdog noise by only printing messages
378 * that are different from what cpu0 displayed.
379 */
380static unsigned long cpu0_err;
381
375static int watchdog_nmi_enable(int cpu) 382static int watchdog_nmi_enable(int cpu)
376{ 383{
377 struct perf_event_attr *wd_attr; 384 struct perf_event_attr *wd_attr;
@@ -390,11 +397,21 @@ static int watchdog_nmi_enable(int cpu)
390 397
391 /* Try to register using hardware perf events */ 398 /* Try to register using hardware perf events */
392 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); 399 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
400
401 /* save cpu0 error for future comparison */
402 if (cpu == 0 && IS_ERR(event))
403 cpu0_err = PTR_ERR(event);
404
393 if (!IS_ERR(event)) { 405 if (!IS_ERR(event)) {
394 pr_info("enabled, takes one hw-pmu counter.\n"); 406 /* only print for cpu0 or different than cpu0 */
407 if (cpu == 0 || cpu0_err)
408 pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
395 goto out_save; 409 goto out_save;
396 } 410 }
397 411
412 /* skip displaying the same error again */
413 if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
414 return PTR_ERR(event);
398 415
399 /* vary the KERN level based on the returned errno */ 416 /* vary the KERN level based on the returned errno */
400 if (PTR_ERR(event) == -EOPNOTSUPP) 417 if (PTR_ERR(event) == -EOPNOTSUPP)
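The watchdog_nmi_enable() hunk above (it continues past the end of this excerpt) records the perf-counter error seen on cpu0 and suppresses the identical error from every later CPU, so an unsupported PMU produces one message instead of one per CPU. A userspace model of that dedup logic; report_probe_result and the -95 (EOPNOTSUPP-like) value are illustrative.

#include <stdio.h>

static long cpu0_err;   /* error recorded while probing cpu0, 0 if it worked */

static void report_probe_result(int cpu, long err)
{
	if (cpu == 0)
		cpu0_err = err;                    /* save for later comparison */

	if (!err) {
		/* success: announce once, on cpu0 or on the first CPU that
		 * succeeds after cpu0 failed */
		if (cpu == 0 || cpu0_err)
			printf("watchdog enabled, takes one hw-pmu counter\n");
		return;
	}

	if (cpu > 0 && err == cpu0_err)
		return;                            /* same failure as cpu0: stay quiet */

	fprintf(stderr, "cpu%d: watchdog probe failed (%ld)\n", cpu, err);
}

int main(void)
{
	for (int cpu = 0; cpu < 4; cpu++)
		report_probe_result(cpu, -95);     /* pretend EOPNOTSUPP everywhere */
	return 0;
}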