author     Thomas Gleixner <tglx@linutronix.de>   2012-07-15 04:24:53 -0400
committer  Thomas Gleixner <tglx@linutronix.de>   2012-07-15 04:24:53 -0400
commit     e8b9dd7e2471b1274e3be719fcc385e0a710e46f
tree       030d7ce20e8f8767d9423f78c102aba089eec372 /kernel
parent     924412f66fd9d21212e560a93792b0b607d46c6e
parent     6b1859dba01c7d512b72d77e3fd7da8354235189
Merge branch 'timers/urgent' into timers/core
Reason: Update to upstream changes to avoid further conflicts.

Fixup a trivial merge conflict in kernel/time/tick-sched.c

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup.c  36
-rw-r--r--  kernel/events/core.c  10
-rw-r--r--  kernel/exit.c  19
-rw-r--r--  kernel/fork.c  11
-rw-r--r--  kernel/hrtimer.c  53
-rw-r--r--  kernel/panic.c  6
-rw-r--r--  kernel/pid_namespace.c  20
-rw-r--r--  kernel/printk.c  670
-rw-r--r--  kernel/rcutree.c  17
-rw-r--r--  kernel/rcutree.h  15
-rw-r--r--  kernel/rcutree_plugin.h  179
-rw-r--r--  kernel/relay.c  5
-rw-r--r--  kernel/sched/core.c  276
-rw-r--r--  kernel/sched/idle_task.c  1
-rw-r--r--  kernel/sched/sched.h  2
-rw-r--r--  kernel/sys.c  22
-rw-r--r--  kernel/time/ntp.c  8
-rw-r--r--  kernel/time/tick-sched.c  9
-rw-r--r--  kernel/time/timekeeping.c  63
-rw-r--r--  kernel/trace/ring_buffer.c  6
-rw-r--r--  kernel/trace/trace.c  8
-rw-r--r--  kernel/watchdog.c  19
22 files changed, 1061 insertions, 394 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 72fcd3069a90..b303dfc7dce0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -255,12 +255,17 @@ int cgroup_lock_is_held(void)
255 255
256EXPORT_SYMBOL_GPL(cgroup_lock_is_held); 256EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
257 257
258static int css_unbias_refcnt(int refcnt)
259{
260 return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
261}
262
258/* the current nr of refs, always >= 0 whether @css is deactivated or not */ 263/* the current nr of refs, always >= 0 whether @css is deactivated or not */
259static int css_refcnt(struct cgroup_subsys_state *css) 264static int css_refcnt(struct cgroup_subsys_state *css)
260{ 265{
261 int v = atomic_read(&css->refcnt); 266 int v = atomic_read(&css->refcnt);
262 267
263 return v >= 0 ? v : v - CSS_DEACT_BIAS; 268 return css_unbias_refcnt(v);
264} 269}
265 270
266/* convenient tests for these bits */ 271/* convenient tests for these bits */
@@ -896,13 +901,10 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
896 mutex_unlock(&cgroup_mutex); 901 mutex_unlock(&cgroup_mutex);
897 902
898 /* 903 /*
899 * We want to drop the active superblock reference from the 904 * Drop the active superblock reference that we took when we
900 * cgroup creation after all the dentry refs are gone - 905 * created the cgroup
901 * kill_sb gets mighty unhappy otherwise. Mark
902 * dentry->d_fsdata with cgroup_diput() to tell
903 * cgroup_d_release() to call deactivate_super().
904 */ 906 */
905 dentry->d_fsdata = cgroup_diput; 907 deactivate_super(cgrp->root->sb);
906 908
907 /* 909 /*
908 * if we're getting rid of the cgroup, refcount should ensure 910 * if we're getting rid of the cgroup, refcount should ensure
@@ -928,13 +930,6 @@ static int cgroup_delete(const struct dentry *d)
928 return 1; 930 return 1;
929} 931}
930 932
931static void cgroup_d_release(struct dentry *dentry)
932{
933 /* did cgroup_diput() tell me to deactivate super? */
934 if (dentry->d_fsdata == cgroup_diput)
935 deactivate_super(dentry->d_sb);
936}
937
938static void remove_dir(struct dentry *d) 933static void remove_dir(struct dentry *d)
939{ 934{
940 struct dentry *parent = dget(d->d_parent); 935 struct dentry *parent = dget(d->d_parent);
@@ -1542,7 +1537,6 @@ static int cgroup_get_rootdir(struct super_block *sb)
1542 static const struct dentry_operations cgroup_dops = { 1537 static const struct dentry_operations cgroup_dops = {
1543 .d_iput = cgroup_diput, 1538 .d_iput = cgroup_diput,
1544 .d_delete = cgroup_delete, 1539 .d_delete = cgroup_delete,
1545 .d_release = cgroup_d_release,
1546 }; 1540 };
1547 1541
1548 struct inode *inode = 1542 struct inode *inode =
@@ -3889,8 +3883,12 @@ static void css_dput_fn(struct work_struct *work)
3889{ 3883{
3890 struct cgroup_subsys_state *css = 3884 struct cgroup_subsys_state *css =
3891 container_of(work, struct cgroup_subsys_state, dput_work); 3885 container_of(work, struct cgroup_subsys_state, dput_work);
3886 struct dentry *dentry = css->cgroup->dentry;
3887 struct super_block *sb = dentry->d_sb;
3892 3888
3893 dput(css->cgroup->dentry); 3889 atomic_inc(&sb->s_active);
3890 dput(dentry);
3891 deactivate_super(sb);
3894} 3892}
3895 3893
3896static void init_cgroup_css(struct cgroup_subsys_state *css, 3894static void init_cgroup_css(struct cgroup_subsys_state *css,
@@ -4982,10 +4980,12 @@ EXPORT_SYMBOL_GPL(__css_tryget);
4982void __css_put(struct cgroup_subsys_state *css) 4980void __css_put(struct cgroup_subsys_state *css)
4983{ 4981{
4984 struct cgroup *cgrp = css->cgroup; 4982 struct cgroup *cgrp = css->cgroup;
4983 int v;
4985 4984
4986 rcu_read_lock(); 4985 rcu_read_lock();
4987 atomic_dec(&css->refcnt); 4986 v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
4988 switch (css_refcnt(css)) { 4987
4988 switch (v) {
4989 case 1: 4989 case 1:
4990 if (notify_on_release(cgrp)) { 4990 if (notify_on_release(cgrp)) {
4991 set_bit(CGRP_RELEASABLE, &cgrp->flags); 4991 set_bit(CGRP_RELEASABLE, &cgrp->flags);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f85c0154b333..d7d71d6ec972 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -253,9 +253,9 @@ perf_cgroup_match(struct perf_event *event)
253 return !event->cgrp || event->cgrp == cpuctx->cgrp; 253 return !event->cgrp || event->cgrp == cpuctx->cgrp;
254} 254}
255 255
256static inline void perf_get_cgroup(struct perf_event *event) 256static inline bool perf_tryget_cgroup(struct perf_event *event)
257{ 257{
258 css_get(&event->cgrp->css); 258 return css_tryget(&event->cgrp->css);
259} 259}
260 260
261static inline void perf_put_cgroup(struct perf_event *event) 261static inline void perf_put_cgroup(struct perf_event *event)
@@ -484,7 +484,11 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
484 event->cgrp = cgrp; 484 event->cgrp = cgrp;
485 485
486 /* must be done before we fput() the file */ 486 /* must be done before we fput() the file */
487 perf_get_cgroup(event); 487 if (!perf_tryget_cgroup(event)) {
488 event->cgrp = NULL;
489 ret = -ENOENT;
490 goto out;
491 }
488 492
489 /* 493 /*
490 * all events in a group must monitor 494 * all events in a group must monitor
diff --git a/kernel/exit.c b/kernel/exit.c
index 34867cc5b42a..2f59cc334516 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -72,6 +72,18 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
72 list_del_rcu(&p->tasks); 72 list_del_rcu(&p->tasks);
73 list_del_init(&p->sibling); 73 list_del_init(&p->sibling);
74 __this_cpu_dec(process_counts); 74 __this_cpu_dec(process_counts);
75 /*
76 * If we are the last child process in a pid namespace to be
77 * reaped, notify the reaper sleeping zap_pid_ns_processes().
78 */
79 if (IS_ENABLED(CONFIG_PID_NS)) {
80 struct task_struct *parent = p->real_parent;
81
82 if ((task_active_pid_ns(parent)->child_reaper == parent) &&
83 list_empty(&parent->children) &&
84 (parent->flags & PF_EXITING))
85 wake_up_process(parent);
86 }
75 } 87 }
76 list_del_rcu(&p->thread_group); 88 list_del_rcu(&p->thread_group);
77} 89}
@@ -643,6 +655,7 @@ static void exit_mm(struct task_struct * tsk)
643 mm_release(tsk, mm); 655 mm_release(tsk, mm);
644 if (!mm) 656 if (!mm)
645 return; 657 return;
658 sync_mm_rss(mm);
646 /* 659 /*
647 * Serialize with any possible pending coredump. 660 * Serialize with any possible pending coredump.
648 * We must hold mmap_sem around checking core_state 661 * We must hold mmap_sem around checking core_state
@@ -719,12 +732,6 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
719 732
720 zap_pid_ns_processes(pid_ns); 733 zap_pid_ns_processes(pid_ns);
721 write_lock_irq(&tasklist_lock); 734 write_lock_irq(&tasklist_lock);
722 /*
723 * We can not clear ->child_reaper or leave it alone.
724 * There may by stealth EXIT_DEAD tasks on ->children,
725 * forget_original_parent() must move them somewhere.
726 */
727 pid_ns->child_reaper = init_pid_ns.child_reaper;
728 } else if (father->signal->has_child_subreaper) { 735 } else if (father->signal->has_child_subreaper) {
729 struct task_struct *reaper; 736 struct task_struct *reaper;
730 737
diff --git a/kernel/fork.c b/kernel/fork.c
index ab5211b9e622..f00e319d8376 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -304,12 +304,17 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
304 } 304 }
305 305
306 err = arch_dup_task_struct(tsk, orig); 306 err = arch_dup_task_struct(tsk, orig);
307 if (err)
308 goto out;
309 307
308 /*
309 * We defer looking at err, because we will need this setup
310 * for the clean up path to work correctly.
311 */
310 tsk->stack = ti; 312 tsk->stack = ti;
311
312 setup_thread_stack(tsk, orig); 313 setup_thread_stack(tsk, orig);
314
315 if (err)
316 goto out;
317
313 clear_user_return_notifier(tsk); 318 clear_user_return_notifier(tsk);
314 clear_tsk_need_resched(tsk); 319 clear_tsk_need_resched(tsk);
315 stackend = end_of_stack(tsk); 320 stackend = end_of_stack(tsk);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index ae34bf51682b..6db7a5ed52b5 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -657,6 +657,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
657 return 0; 657 return 0;
658} 658}
659 659
660static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
661{
662 ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
663 ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
664
665 return ktime_get_update_offsets(offs_real, offs_boot);
666}
667
660/* 668/*
661 * Retrigger next event is called after clock was set 669 * Retrigger next event is called after clock was set
662 * 670 *
@@ -665,22 +673,12 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
665static void retrigger_next_event(void *arg) 673static void retrigger_next_event(void *arg)
666{ 674{
667 struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); 675 struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
668 struct timespec realtime_offset, xtim, wtm, sleep;
669 676
670 if (!hrtimer_hres_active()) 677 if (!hrtimer_hres_active())
671 return; 678 return;
672 679
673 /* Optimized out for !HIGH_RES */
674 get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep);
675 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
676
677 /* Adjust CLOCK_REALTIME offset */
678 raw_spin_lock(&base->lock); 680 raw_spin_lock(&base->lock);
679 base->clock_base[HRTIMER_BASE_REALTIME].offset = 681 hrtimer_update_base(base);
680 timespec_to_ktime(realtime_offset);
681 base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
682 timespec_to_ktime(sleep);
683
684 hrtimer_force_reprogram(base, 0); 682 hrtimer_force_reprogram(base, 0);
685 raw_spin_unlock(&base->lock); 683 raw_spin_unlock(&base->lock);
686} 684}
@@ -710,13 +708,25 @@ static int hrtimer_switch_to_hres(void)
710 base->clock_base[i].resolution = KTIME_HIGH_RES; 708 base->clock_base[i].resolution = KTIME_HIGH_RES;
711 709
712 tick_setup_sched_timer(); 710 tick_setup_sched_timer();
713
714 /* "Retrigger" the interrupt to get things going */ 711 /* "Retrigger" the interrupt to get things going */
715 retrigger_next_event(NULL); 712 retrigger_next_event(NULL);
716 local_irq_restore(flags); 713 local_irq_restore(flags);
717 return 1; 714 return 1;
718} 715}
719 716
717/*
718 * Called from timekeeping code to reprogramm the hrtimer interrupt
719 * device. If called from the timer interrupt context we defer it to
720 * softirq context.
721 */
722void clock_was_set_delayed(void)
723{
724 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
725
726 cpu_base->clock_was_set = 1;
727 __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
728}
729
720#else 730#else
721 731
722static inline int hrtimer_hres_active(void) { return 0; } 732static inline int hrtimer_hres_active(void) { return 0; }
@@ -1250,11 +1260,10 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1250 cpu_base->nr_events++; 1260 cpu_base->nr_events++;
1251 dev->next_event.tv64 = KTIME_MAX; 1261 dev->next_event.tv64 = KTIME_MAX;
1252 1262
1253 entry_time = now = ktime_get(); 1263 raw_spin_lock(&cpu_base->lock);
1264 entry_time = now = hrtimer_update_base(cpu_base);
1254retry: 1265retry:
1255 expires_next.tv64 = KTIME_MAX; 1266 expires_next.tv64 = KTIME_MAX;
1256
1257 raw_spin_lock(&cpu_base->lock);
1258 /* 1267 /*
1259 * We set expires_next to KTIME_MAX here with cpu_base->lock 1268 * We set expires_next to KTIME_MAX here with cpu_base->lock
1260 * held to prevent that a timer is enqueued in our queue via 1269 * held to prevent that a timer is enqueued in our queue via
@@ -1330,8 +1339,12 @@ retry:
1330 * We need to prevent that we loop forever in the hrtimer 1339 * We need to prevent that we loop forever in the hrtimer
1331 * interrupt routine. We give it 3 attempts to avoid 1340 * interrupt routine. We give it 3 attempts to avoid
1332 * overreacting on some spurious event. 1341 * overreacting on some spurious event.
1342 *
1343 * Acquire base lock for updating the offsets and retrieving
1344 * the current time.
1333 */ 1345 */
1334 now = ktime_get(); 1346 raw_spin_lock(&cpu_base->lock);
1347 now = hrtimer_update_base(cpu_base);
1335 cpu_base->nr_retries++; 1348 cpu_base->nr_retries++;
1336 if (++retries < 3) 1349 if (++retries < 3)
1337 goto retry; 1350 goto retry;
@@ -1343,6 +1356,7 @@ retry:
1343 */ 1356 */
1344 cpu_base->nr_hangs++; 1357 cpu_base->nr_hangs++;
1345 cpu_base->hang_detected = 1; 1358 cpu_base->hang_detected = 1;
1359 raw_spin_unlock(&cpu_base->lock);
1346 delta = ktime_sub(now, entry_time); 1360 delta = ktime_sub(now, entry_time);
1347 if (delta.tv64 > cpu_base->max_hang_time.tv64) 1361 if (delta.tv64 > cpu_base->max_hang_time.tv64)
1348 cpu_base->max_hang_time = delta; 1362 cpu_base->max_hang_time = delta;
@@ -1395,6 +1409,13 @@ void hrtimer_peek_ahead_timers(void)
1395 1409
1396static void run_hrtimer_softirq(struct softirq_action *h) 1410static void run_hrtimer_softirq(struct softirq_action *h)
1397{ 1411{
1412 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1413
1414 if (cpu_base->clock_was_set) {
1415 cpu_base->clock_was_set = 0;
1416 clock_was_set();
1417 }
1418
1398 hrtimer_peek_ahead_timers(); 1419 hrtimer_peek_ahead_timers();
1399} 1420}
1400 1421
diff --git a/kernel/panic.c b/kernel/panic.c
index 8ed89a175d79..d2a5f4ecc6dd 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -27,7 +27,7 @@
27#define PANIC_TIMER_STEP 100 27#define PANIC_TIMER_STEP 100
28#define PANIC_BLINK_SPD 18 28#define PANIC_BLINK_SPD 18
29 29
30int panic_on_oops; 30int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE;
31static unsigned long tainted_mask; 31static unsigned long tainted_mask;
32static int pause_on_oops; 32static int pause_on_oops;
33static int pause_on_oops_flag; 33static int pause_on_oops_flag;
@@ -108,8 +108,6 @@ void panic(const char *fmt, ...)
108 */ 108 */
109 crash_kexec(NULL); 109 crash_kexec(NULL);
110 110
111 kmsg_dump(KMSG_DUMP_PANIC);
112
113 /* 111 /*
114 * Note smp_send_stop is the usual smp shutdown function, which 112 * Note smp_send_stop is the usual smp shutdown function, which
115 * unfortunately means it may not be hardened to work in a panic 113 * unfortunately means it may not be hardened to work in a panic
@@ -117,6 +115,8 @@ void panic(const char *fmt, ...)
117 */ 115 */
118 smp_send_stop(); 116 smp_send_stop();
119 117
118 kmsg_dump(KMSG_DUMP_PANIC);
119
120 atomic_notifier_call_chain(&panic_notifier_list, 0, buf); 120 atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
121 121
122 bust_spinlocks(0); 122 bust_spinlocks(0);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 16b20e38c4a1..b3c7fd554250 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -184,11 +184,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
184 } 184 }
185 read_unlock(&tasklist_lock); 185 read_unlock(&tasklist_lock);
186 186
187 /* Firstly reap the EXIT_ZOMBIE children we may have. */
187 do { 188 do {
188 clear_thread_flag(TIF_SIGPENDING); 189 clear_thread_flag(TIF_SIGPENDING);
189 rc = sys_wait4(-1, NULL, __WALL, NULL); 190 rc = sys_wait4(-1, NULL, __WALL, NULL);
190 } while (rc != -ECHILD); 191 } while (rc != -ECHILD);
191 192
193 /*
194 * sys_wait4() above can't reap the TASK_DEAD children.
195 * Make sure they all go away, see __unhash_process().
196 */
197 for (;;) {
198 bool need_wait = false;
199
200 read_lock(&tasklist_lock);
201 if (!list_empty(&current->children)) {
202 __set_current_state(TASK_UNINTERRUPTIBLE);
203 need_wait = true;
204 }
205 read_unlock(&tasklist_lock);
206
207 if (!need_wait)
208 break;
209 schedule();
210 }
211
192 if (pid_ns->reboot) 212 if (pid_ns->reboot)
193 current->signal->group_exit_code = pid_ns->reboot; 213 current->signal->group_exit_code = pid_ns->reboot;
194 214
diff --git a/kernel/printk.c b/kernel/printk.c
index 32462d2b364a..177fa49357a5 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -193,12 +193,21 @@ static int console_may_schedule;
193 * separated by ',', and find the message after the ';' character. 193 * separated by ',', and find the message after the ';' character.
194 */ 194 */
195 195
196enum log_flags {
197 LOG_NOCONS = 1, /* already flushed, do not print to console */
198 LOG_NEWLINE = 2, /* text ended with a newline */
199 LOG_PREFIX = 4, /* text started with a prefix */
200 LOG_CONT = 8, /* text is a fragment of a continuation line */
201};
202
196struct log { 203struct log {
197 u64 ts_nsec; /* timestamp in nanoseconds */ 204 u64 ts_nsec; /* timestamp in nanoseconds */
198 u16 len; /* length of entire record */ 205 u16 len; /* length of entire record */
199 u16 text_len; /* length of text buffer */ 206 u16 text_len; /* length of text buffer */
200 u16 dict_len; /* length of dictionary buffer */ 207 u16 dict_len; /* length of dictionary buffer */
201 u16 level; /* syslog level + facility */ 208 u8 facility; /* syslog facility */
209 u8 flags:5; /* internal record flags */
210 u8 level:3; /* syslog level */
202}; 211};
203 212
204/* 213/*
@@ -210,6 +219,8 @@ static DEFINE_RAW_SPINLOCK(logbuf_lock);
210/* the next printk record to read by syslog(READ) or /proc/kmsg */ 219/* the next printk record to read by syslog(READ) or /proc/kmsg */
211static u64 syslog_seq; 220static u64 syslog_seq;
212static u32 syslog_idx; 221static u32 syslog_idx;
222static enum log_flags syslog_prev;
223static size_t syslog_partial;
213 224
214/* index and sequence number of the first record stored in the buffer */ 225/* index and sequence number of the first record stored in the buffer */
215static u64 log_first_seq; 226static u64 log_first_seq;
@@ -227,10 +238,10 @@ static u32 clear_idx;
227#define LOG_LINE_MAX 1024 238#define LOG_LINE_MAX 1024
228 239
229/* record buffer */ 240/* record buffer */
230#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) 241#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
231#define LOG_ALIGN 4 242#define LOG_ALIGN 4
232#else 243#else
233#define LOG_ALIGN 8 244#define LOG_ALIGN __alignof__(struct log)
234#endif 245#endif
235#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) 246#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
236static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); 247static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
@@ -286,6 +297,7 @@ static u32 log_next(u32 idx)
286 297
287/* insert record into the buffer, discard old ones, update heads */ 298/* insert record into the buffer, discard old ones, update heads */
288static void log_store(int facility, int level, 299static void log_store(int facility, int level,
300 enum log_flags flags, u64 ts_nsec,
289 const char *dict, u16 dict_len, 301 const char *dict, u16 dict_len,
290 const char *text, u16 text_len) 302 const char *text, u16 text_len)
291{ 303{
@@ -329,8 +341,13 @@ static void log_store(int facility, int level,
329 msg->text_len = text_len; 341 msg->text_len = text_len;
330 memcpy(log_dict(msg), dict, dict_len); 342 memcpy(log_dict(msg), dict, dict_len);
331 msg->dict_len = dict_len; 343 msg->dict_len = dict_len;
332 msg->level = (facility << 3) | (level & 7); 344 msg->facility = facility;
333 msg->ts_nsec = local_clock(); 345 msg->level = level & 7;
346 msg->flags = flags & 0x1f;
347 if (ts_nsec > 0)
348 msg->ts_nsec = ts_nsec;
349 else
350 msg->ts_nsec = local_clock();
334 memset(log_dict(msg) + dict_len, 0, pad_len); 351 memset(log_dict(msg) + dict_len, 0, pad_len);
335 msg->len = sizeof(struct log) + text_len + dict_len + pad_len; 352 msg->len = sizeof(struct log) + text_len + dict_len + pad_len;
336 353
@@ -414,21 +431,23 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
414 if (!user) 431 if (!user)
415 return -EBADF; 432 return -EBADF;
416 433
417 mutex_lock(&user->lock); 434 ret = mutex_lock_interruptible(&user->lock);
418 raw_spin_lock(&logbuf_lock); 435 if (ret)
436 return ret;
437 raw_spin_lock_irq(&logbuf_lock);
419 while (user->seq == log_next_seq) { 438 while (user->seq == log_next_seq) {
420 if (file->f_flags & O_NONBLOCK) { 439 if (file->f_flags & O_NONBLOCK) {
421 ret = -EAGAIN; 440 ret = -EAGAIN;
422 raw_spin_unlock(&logbuf_lock); 441 raw_spin_unlock_irq(&logbuf_lock);
423 goto out; 442 goto out;
424 } 443 }
425 444
426 raw_spin_unlock(&logbuf_lock); 445 raw_spin_unlock_irq(&logbuf_lock);
427 ret = wait_event_interruptible(log_wait, 446 ret = wait_event_interruptible(log_wait,
428 user->seq != log_next_seq); 447 user->seq != log_next_seq);
429 if (ret) 448 if (ret)
430 goto out; 449 goto out;
431 raw_spin_lock(&logbuf_lock); 450 raw_spin_lock_irq(&logbuf_lock);
432 } 451 }
433 452
434 if (user->seq < log_first_seq) { 453 if (user->seq < log_first_seq) {
@@ -436,7 +455,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
436 user->idx = log_first_idx; 455 user->idx = log_first_idx;
437 user->seq = log_first_seq; 456 user->seq = log_first_seq;
438 ret = -EPIPE; 457 ret = -EPIPE;
439 raw_spin_unlock(&logbuf_lock); 458 raw_spin_unlock_irq(&logbuf_lock);
440 goto out; 459 goto out;
441 } 460 }
442 461
@@ -444,13 +463,13 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
444 ts_usec = msg->ts_nsec; 463 ts_usec = msg->ts_nsec;
445 do_div(ts_usec, 1000); 464 do_div(ts_usec, 1000);
446 len = sprintf(user->buf, "%u,%llu,%llu;", 465 len = sprintf(user->buf, "%u,%llu,%llu;",
447 msg->level, user->seq, ts_usec); 466 (msg->facility << 3) | msg->level, user->seq, ts_usec);
448 467
449 /* escape non-printable characters */ 468 /* escape non-printable characters */
450 for (i = 0; i < msg->text_len; i++) { 469 for (i = 0; i < msg->text_len; i++) {
451 unsigned char c = log_text(msg)[i]; 470 unsigned char c = log_text(msg)[i];
452 471
453 if (c < ' ' || c >= 128) 472 if (c < ' ' || c >= 127 || c == '\\')
454 len += sprintf(user->buf + len, "\\x%02x", c); 473 len += sprintf(user->buf + len, "\\x%02x", c);
455 else 474 else
456 user->buf[len++] = c; 475 user->buf[len++] = c;
@@ -474,7 +493,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
474 continue; 493 continue;
475 } 494 }
476 495
477 if (c < ' ' || c >= 128) { 496 if (c < ' ' || c >= 127 || c == '\\') {
478 len += sprintf(user->buf + len, "\\x%02x", c); 497 len += sprintf(user->buf + len, "\\x%02x", c);
479 continue; 498 continue;
480 } 499 }
@@ -486,7 +505,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
486 505
487 user->idx = log_next(user->idx); 506 user->idx = log_next(user->idx);
488 user->seq++; 507 user->seq++;
489 raw_spin_unlock(&logbuf_lock); 508 raw_spin_unlock_irq(&logbuf_lock);
490 509
491 if (len > count) { 510 if (len > count) {
492 ret = -EINVAL; 511 ret = -EINVAL;
@@ -513,7 +532,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
513 if (offset) 532 if (offset)
514 return -ESPIPE; 533 return -ESPIPE;
515 534
516 raw_spin_lock(&logbuf_lock); 535 raw_spin_lock_irq(&logbuf_lock);
517 switch (whence) { 536 switch (whence) {
518 case SEEK_SET: 537 case SEEK_SET:
519 /* the first record */ 538 /* the first record */
@@ -537,7 +556,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
537 default: 556 default:
538 ret = -EINVAL; 557 ret = -EINVAL;
539 } 558 }
540 raw_spin_unlock(&logbuf_lock); 559 raw_spin_unlock_irq(&logbuf_lock);
541 return ret; 560 return ret;
542} 561}
543 562
@@ -551,14 +570,14 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
551 570
552 poll_wait(file, &log_wait, wait); 571 poll_wait(file, &log_wait, wait);
553 572
554 raw_spin_lock(&logbuf_lock); 573 raw_spin_lock_irq(&logbuf_lock);
555 if (user->seq < log_next_seq) { 574 if (user->seq < log_next_seq) {
556 /* return error when data has vanished underneath us */ 575 /* return error when data has vanished underneath us */
557 if (user->seq < log_first_seq) 576 if (user->seq < log_first_seq)
558 ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; 577 ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI;
559 ret = POLLIN|POLLRDNORM; 578 ret = POLLIN|POLLRDNORM;
560 } 579 }
561 raw_spin_unlock(&logbuf_lock); 580 raw_spin_unlock_irq(&logbuf_lock);
562 581
563 return ret; 582 return ret;
564} 583}
@@ -582,10 +601,10 @@ static int devkmsg_open(struct inode *inode, struct file *file)
582 601
583 mutex_init(&user->lock); 602 mutex_init(&user->lock);
584 603
585 raw_spin_lock(&logbuf_lock); 604 raw_spin_lock_irq(&logbuf_lock);
586 user->idx = log_first_idx; 605 user->idx = log_first_idx;
587 user->seq = log_first_seq; 606 user->seq = log_first_seq;
588 raw_spin_unlock(&logbuf_lock); 607 raw_spin_unlock_irq(&logbuf_lock);
589 608
590 file->private_data = user; 609 file->private_data = user;
591 return 0; 610 return 0;
@@ -785,44 +804,64 @@ static bool printk_time;
785#endif 804#endif
786module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); 805module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
787 806
807static size_t print_time(u64 ts, char *buf)
808{
809 unsigned long rem_nsec;
810
811 if (!printk_time)
812 return 0;
813
814 if (!buf)
815 return 15;
816
817 rem_nsec = do_div(ts, 1000000000);
818 return sprintf(buf, "[%5lu.%06lu] ",
819 (unsigned long)ts, rem_nsec / 1000);
820}
821
788static size_t print_prefix(const struct log *msg, bool syslog, char *buf) 822static size_t print_prefix(const struct log *msg, bool syslog, char *buf)
789{ 823{
790 size_t len = 0; 824 size_t len = 0;
825 unsigned int prefix = (msg->facility << 3) | msg->level;
791 826
792 if (syslog) { 827 if (syslog) {
793 if (buf) { 828 if (buf) {
794 len += sprintf(buf, "<%u>", msg->level); 829 len += sprintf(buf, "<%u>", prefix);
795 } else { 830 } else {
796 len += 3; 831 len += 3;
797 if (msg->level > 9) 832 if (prefix > 999)
798 len++; 833 len += 3;
799 if (msg->level > 99) 834 else if (prefix > 99)
835 len += 2;
836 else if (prefix > 9)
800 len++; 837 len++;
801 } 838 }
802 } 839 }
803 840
804 if (printk_time) { 841 len += print_time(msg->ts_nsec, buf ? buf + len : NULL);
805 if (buf) {
806 unsigned long long ts = msg->ts_nsec;
807 unsigned long rem_nsec = do_div(ts, 1000000000);
808
809 len += sprintf(buf + len, "[%5lu.%06lu] ",
810 (unsigned long) ts, rem_nsec / 1000);
811 } else {
812 len += 15;
813 }
814 }
815
816 return len; 842 return len;
817} 843}
818 844
819static size_t msg_print_text(const struct log *msg, bool syslog, 845static size_t msg_print_text(const struct log *msg, enum log_flags prev,
820 char *buf, size_t size) 846 bool syslog, char *buf, size_t size)
821{ 847{
822 const char *text = log_text(msg); 848 const char *text = log_text(msg);
823 size_t text_size = msg->text_len; 849 size_t text_size = msg->text_len;
850 bool prefix = true;
851 bool newline = true;
824 size_t len = 0; 852 size_t len = 0;
825 853
854 if ((prev & LOG_CONT) && !(msg->flags & LOG_PREFIX))
855 prefix = false;
856
857 if (msg->flags & LOG_CONT) {
858 if ((prev & LOG_CONT) && !(prev & LOG_NEWLINE))
859 prefix = false;
860
861 if (!(msg->flags & LOG_NEWLINE))
862 newline = false;
863 }
864
826 do { 865 do {
827 const char *next = memchr(text, '\n', text_size); 866 const char *next = memchr(text, '\n', text_size);
828 size_t text_len; 867 size_t text_len;
@@ -840,16 +879,22 @@ static size_t msg_print_text(const struct log *msg, bool syslog,
840 text_len + 1>= size - len) 879 text_len + 1>= size - len)
841 break; 880 break;
842 881
843 len += print_prefix(msg, syslog, buf + len); 882 if (prefix)
883 len += print_prefix(msg, syslog, buf + len);
844 memcpy(buf + len, text, text_len); 884 memcpy(buf + len, text, text_len);
845 len += text_len; 885 len += text_len;
846 buf[len++] = '\n'; 886 if (next || newline)
887 buf[len++] = '\n';
847 } else { 888 } else {
848 /* SYSLOG_ACTION_* buffer size only calculation */ 889 /* SYSLOG_ACTION_* buffer size only calculation */
849 len += print_prefix(msg, syslog, NULL); 890 if (prefix)
850 len += text_len + 1; 891 len += print_prefix(msg, syslog, NULL);
892 len += text_len;
893 if (next || newline)
894 len++;
851 } 895 }
852 896
897 prefix = true;
853 text = next; 898 text = next;
854 } while (text); 899 } while (text);
855 900
@@ -860,26 +905,60 @@ static int syslog_print(char __user *buf, int size)
860{ 905{
861 char *text; 906 char *text;
862 struct log *msg; 907 struct log *msg;
863 int len; 908 int len = 0;
864 909
865 text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); 910 text = kmalloc(LOG_LINE_MAX, GFP_KERNEL);
866 if (!text) 911 if (!text)
867 return -ENOMEM; 912 return -ENOMEM;
868 913
869 raw_spin_lock_irq(&logbuf_lock); 914 while (size > 0) {
870 if (syslog_seq < log_first_seq) { 915 size_t n;
871 /* messages are gone, move to first one */ 916 size_t skip;
872 syslog_seq = log_first_seq; 917
873 syslog_idx = log_first_idx; 918 raw_spin_lock_irq(&logbuf_lock);
874 } 919 if (syslog_seq < log_first_seq) {
875 msg = log_from_idx(syslog_idx); 920 /* messages are gone, move to first one */
876 len = msg_print_text(msg, true, text, LOG_LINE_MAX); 921 syslog_seq = log_first_seq;
877 syslog_idx = log_next(syslog_idx); 922 syslog_idx = log_first_idx;
878 syslog_seq++; 923 syslog_prev = 0;
879 raw_spin_unlock_irq(&logbuf_lock); 924 syslog_partial = 0;
925 }
926 if (syslog_seq == log_next_seq) {
927 raw_spin_unlock_irq(&logbuf_lock);
928 break;
929 }
930
931 skip = syslog_partial;
932 msg = log_from_idx(syslog_idx);
933 n = msg_print_text(msg, syslog_prev, true, text, LOG_LINE_MAX);
934 if (n - syslog_partial <= size) {
935 /* message fits into buffer, move forward */
936 syslog_idx = log_next(syslog_idx);
937 syslog_seq++;
938 syslog_prev = msg->flags;
939 n -= syslog_partial;
940 syslog_partial = 0;
941 } else if (!len){
942 /* partial read(), remember position */
943 n = size;
944 syslog_partial += n;
945 } else
946 n = 0;
947 raw_spin_unlock_irq(&logbuf_lock);
948
949 if (!n)
950 break;
951
952 if (copy_to_user(buf, text + skip, n)) {
953 if (!len)
954 len = -EFAULT;
955 break;
956 }
880 957
881 if (len > 0 && copy_to_user(buf, text, len)) 958 len += n;
882 len = -EFAULT; 959 size -= n;
960 buf += n;
961 }
883 962
884 kfree(text); 963 kfree(text);
885 return len; 964 return len;
@@ -899,6 +978,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
899 u64 next_seq; 978 u64 next_seq;
900 u64 seq; 979 u64 seq;
901 u32 idx; 980 u32 idx;
981 enum log_flags prev;
902 982
903 if (clear_seq < log_first_seq) { 983 if (clear_seq < log_first_seq) {
904 /* messages are gone, move to first available one */ 984 /* messages are gone, move to first available one */
@@ -909,41 +989,47 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
909 /* 989 /*
910 * Find first record that fits, including all following records, 990 * Find first record that fits, including all following records,
911 * into the user-provided buffer for this dump. 991 * into the user-provided buffer for this dump.
912 */ 992 */
913 seq = clear_seq; 993 seq = clear_seq;
914 idx = clear_idx; 994 idx = clear_idx;
995 prev = 0;
915 while (seq < log_next_seq) { 996 while (seq < log_next_seq) {
916 struct log *msg = log_from_idx(idx); 997 struct log *msg = log_from_idx(idx);
917 998
918 len += msg_print_text(msg, true, NULL, 0); 999 len += msg_print_text(msg, prev, true, NULL, 0);
919 idx = log_next(idx); 1000 idx = log_next(idx);
920 seq++; 1001 seq++;
921 } 1002 }
1003
1004 /* move first record forward until length fits into the buffer */
922 seq = clear_seq; 1005 seq = clear_seq;
923 idx = clear_idx; 1006 idx = clear_idx;
1007 prev = 0;
924 while (len > size && seq < log_next_seq) { 1008 while (len > size && seq < log_next_seq) {
925 struct log *msg = log_from_idx(idx); 1009 struct log *msg = log_from_idx(idx);
926 1010
927 len -= msg_print_text(msg, true, NULL, 0); 1011 len -= msg_print_text(msg, prev, true, NULL, 0);
928 idx = log_next(idx); 1012 idx = log_next(idx);
929 seq++; 1013 seq++;
930 } 1014 }
931 1015
932 /* last message in this dump */ 1016 /* last message fitting into this dump */
933 next_seq = log_next_seq; 1017 next_seq = log_next_seq;
934 1018
935 len = 0; 1019 len = 0;
1020 prev = 0;
936 while (len >= 0 && seq < next_seq) { 1021 while (len >= 0 && seq < next_seq) {
937 struct log *msg = log_from_idx(idx); 1022 struct log *msg = log_from_idx(idx);
938 int textlen; 1023 int textlen;
939 1024
940 textlen = msg_print_text(msg, true, text, LOG_LINE_MAX); 1025 textlen = msg_print_text(msg, prev, true, text, LOG_LINE_MAX);
941 if (textlen < 0) { 1026 if (textlen < 0) {
942 len = textlen; 1027 len = textlen;
943 break; 1028 break;
944 } 1029 }
945 idx = log_next(idx); 1030 idx = log_next(idx);
946 seq++; 1031 seq++;
1032 prev = msg->flags;
947 1033
948 raw_spin_unlock_irq(&logbuf_lock); 1034 raw_spin_unlock_irq(&logbuf_lock);
949 if (copy_to_user(buf + len, text, textlen)) 1035 if (copy_to_user(buf + len, text, textlen))
@@ -956,6 +1042,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
956 /* messages are gone, move to next one */ 1042 /* messages are gone, move to next one */
957 seq = log_first_seq; 1043 seq = log_first_seq;
958 idx = log_first_idx; 1044 idx = log_first_idx;
1045 prev = 0;
959 } 1046 }
960 } 1047 }
961 } 1048 }
@@ -1027,6 +1114,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1027 /* Clear ring buffer */ 1114 /* Clear ring buffer */
1028 case SYSLOG_ACTION_CLEAR: 1115 case SYSLOG_ACTION_CLEAR:
1029 syslog_print_all(NULL, 0, true); 1116 syslog_print_all(NULL, 0, true);
1117 break;
1030 /* Disable logging to console */ 1118 /* Disable logging to console */
1031 case SYSLOG_ACTION_CONSOLE_OFF: 1119 case SYSLOG_ACTION_CONSOLE_OFF:
1032 if (saved_console_loglevel == -1) 1120 if (saved_console_loglevel == -1)
@@ -1059,6 +1147,8 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1059 /* messages are gone, move to first one */ 1147 /* messages are gone, move to first one */
1060 syslog_seq = log_first_seq; 1148 syslog_seq = log_first_seq;
1061 syslog_idx = log_first_idx; 1149 syslog_idx = log_first_idx;
1150 syslog_prev = 0;
1151 syslog_partial = 0;
1062 } 1152 }
1063 if (from_file) { 1153 if (from_file) {
1064 /* 1154 /*
@@ -1068,19 +1158,20 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1068 */ 1158 */
1069 error = log_next_idx - syslog_idx; 1159 error = log_next_idx - syslog_idx;
1070 } else { 1160 } else {
1071 u64 seq; 1161 u64 seq = syslog_seq;
1072 u32 idx; 1162 u32 idx = syslog_idx;
1163 enum log_flags prev = syslog_prev;
1073 1164
1074 error = 0; 1165 error = 0;
1075 seq = syslog_seq;
1076 idx = syslog_idx;
1077 while (seq < log_next_seq) { 1166 while (seq < log_next_seq) {
1078 struct log *msg = log_from_idx(idx); 1167 struct log *msg = log_from_idx(idx);
1079 1168
1080 error += msg_print_text(msg, true, NULL, 0); 1169 error += msg_print_text(msg, prev, true, NULL, 0);
1081 idx = log_next(idx); 1170 idx = log_next(idx);
1082 seq++; 1171 seq++;
1172 prev = msg->flags;
1083 } 1173 }
1174 error -= syslog_partial;
1084 } 1175 }
1085 raw_spin_unlock_irq(&logbuf_lock); 1176 raw_spin_unlock_irq(&logbuf_lock);
1086 break; 1177 break;
@@ -1259,22 +1350,98 @@ static inline void printk_delay(void)
1259 } 1350 }
1260} 1351}
1261 1352
1353/*
1354 * Continuation lines are buffered, and not committed to the record buffer
1355 * until the line is complete, or a race forces it. The line fragments
1356 * though, are printed immediately to the consoles to ensure everything has
1357 * reached the console in case of a kernel crash.
1358 */
1359static struct cont {
1360 char buf[LOG_LINE_MAX];
1361 size_t len; /* length == 0 means unused buffer */
1362 size_t cons; /* bytes written to console */
1363 struct task_struct *owner; /* task of first print*/
1364 u64 ts_nsec; /* time of first print */
1365 u8 level; /* log level of first message */
1366 u8 facility; /* log level of first message */
1367 bool flushed:1; /* buffer sealed and committed */
1368} cont;
1369
1370static void cont_flush(void)
1371{
1372 if (cont.flushed)
1373 return;
1374 if (cont.len == 0)
1375 return;
1376
1377 log_store(cont.facility, cont.level, LOG_NOCONS, cont.ts_nsec,
1378 NULL, 0, cont.buf, cont.len);
1379
1380 cont.flushed = true;
1381}
1382
1383static bool cont_add(int facility, int level, const char *text, size_t len)
1384{
1385 if (cont.len && cont.flushed)
1386 return false;
1387
1388 if (cont.len + len > sizeof(cont.buf)) {
1389 cont_flush();
1390 return false;
1391 }
1392
1393 if (!cont.len) {
1394 cont.facility = facility;
1395 cont.level = level;
1396 cont.owner = current;
1397 cont.ts_nsec = local_clock();
1398 cont.cons = 0;
1399 cont.flushed = false;
1400 }
1401
1402 memcpy(cont.buf + cont.len, text, len);
1403 cont.len += len;
1404 return true;
1405}
1406
1407static size_t cont_print_text(char *text, size_t size)
1408{
1409 size_t textlen = 0;
1410 size_t len;
1411
1412 if (cont.cons == 0) {
1413 textlen += print_time(cont.ts_nsec, text);
1414 size -= textlen;
1415 }
1416
1417 len = cont.len - cont.cons;
1418 if (len > 0) {
1419 if (len+1 > size)
1420 len = size-1;
1421 memcpy(text + textlen, cont.buf + cont.cons, len);
1422 textlen += len;
1423 cont.cons = cont.len;
1424 }
1425
1426 if (cont.flushed) {
1427 text[textlen++] = '\n';
1428 /* got everything, release buffer */
1429 cont.len = 0;
1430 }
1431 return textlen;
1432}
1433
1262asmlinkage int vprintk_emit(int facility, int level, 1434asmlinkage int vprintk_emit(int facility, int level,
1263 const char *dict, size_t dictlen, 1435 const char *dict, size_t dictlen,
1264 const char *fmt, va_list args) 1436 const char *fmt, va_list args)
1265{ 1437{
1266 static int recursion_bug; 1438 static int recursion_bug;
1267 static char cont_buf[LOG_LINE_MAX];
1268 static size_t cont_len;
1269 static int cont_level;
1270 static struct task_struct *cont_task;
1271 static char textbuf[LOG_LINE_MAX]; 1439 static char textbuf[LOG_LINE_MAX];
1272 char *text = textbuf; 1440 char *text = textbuf;
1273 size_t text_len; 1441 size_t text_len;
1442 enum log_flags lflags = 0;
1274 unsigned long flags; 1443 unsigned long flags;
1275 int this_cpu; 1444 int this_cpu;
1276 bool newline = false;
1277 bool prefix = false;
1278 int printed_len = 0; 1445 int printed_len = 0;
1279 1446
1280 boot_delay_msec(); 1447 boot_delay_msec();
@@ -1313,7 +1480,8 @@ asmlinkage int vprintk_emit(int facility, int level,
1313 recursion_bug = 0; 1480 recursion_bug = 0;
1314 printed_len += strlen(recursion_msg); 1481 printed_len += strlen(recursion_msg);
1315 /* emit KERN_CRIT message */ 1482 /* emit KERN_CRIT message */
1316 log_store(0, 2, NULL, 0, recursion_msg, printed_len); 1483 log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
1484 NULL, 0, recursion_msg, printed_len);
1317 } 1485 }
1318 1486
1319 /* 1487 /*
@@ -1325,7 +1493,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1325 /* mark and strip a trailing newline */ 1493 /* mark and strip a trailing newline */
1326 if (text_len && text[text_len-1] == '\n') { 1494 if (text_len && text[text_len-1] == '\n') {
1327 text_len--; 1495 text_len--;
1328 newline = true; 1496 lflags |= LOG_NEWLINE;
1329 } 1497 }
1330 1498
1331 /* strip syslog prefix and extract log level or control flags */ 1499 /* strip syslog prefix and extract log level or control flags */
@@ -1335,7 +1503,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1335 if (level == -1) 1503 if (level == -1)
1336 level = text[1] - '0'; 1504 level = text[1] - '0';
1337 case 'd': /* KERN_DEFAULT */ 1505 case 'd': /* KERN_DEFAULT */
1338 prefix = true; 1506 lflags |= LOG_PREFIX;
1339 case 'c': /* KERN_CONT */ 1507 case 'c': /* KERN_CONT */
1340 text += 3; 1508 text += 3;
1341 text_len -= 3; 1509 text_len -= 3;
@@ -1345,61 +1513,41 @@ asmlinkage int vprintk_emit(int facility, int level,
1345 if (level == -1) 1513 if (level == -1)
1346 level = default_message_loglevel; 1514 level = default_message_loglevel;
1347 1515
1348 if (dict) { 1516 if (dict)
1349 prefix = true; 1517 lflags |= LOG_PREFIX|LOG_NEWLINE;
1350 newline = true;
1351 }
1352 1518
1353 if (!newline) { 1519 if (!(lflags & LOG_NEWLINE)) {
1354 if (cont_len && (prefix || cont_task != current)) { 1520 /*
1355 /* 1521 * Flush the conflicting buffer. An earlier newline was missing,
1356 * Flush earlier buffer, which is either from a 1522 * or another task also prints continuation lines.
1357 * different thread, or when we got a new prefix. 1523 */
1358 */ 1524 if (cont.len && (lflags & LOG_PREFIX || cont.owner != current))
1359 log_store(facility, cont_level, NULL, 0, cont_buf, cont_len); 1525 cont_flush();
1360 cont_len = 0;
1361 }
1362
1363 if (!cont_len) {
1364 cont_level = level;
1365 cont_task = current;
1366 }
1367 1526
1368 /* buffer or append to earlier buffer from the same thread */ 1527 /* buffer line if possible, otherwise store it right away */
1369 if (cont_len + text_len > sizeof(cont_buf)) 1528 if (!cont_add(facility, level, text, text_len))
1370 text_len = sizeof(cont_buf) - cont_len; 1529 log_store(facility, level, lflags | LOG_CONT, 0,
1371 memcpy(cont_buf + cont_len, text, text_len); 1530 dict, dictlen, text, text_len);
1372 cont_len += text_len;
1373 } else { 1531 } else {
1374 if (cont_len && cont_task == current) { 1532 bool stored = false;
1375 if (prefix) {
1376 /*
1377 * New prefix from the same thread; flush. We
1378 * either got no earlier newline, or we race
1379 * with an interrupt.
1380 */
1381 log_store(facility, cont_level,
1382 NULL, 0, cont_buf, cont_len);
1383 cont_len = 0;
1384 }
1385 1533
1386 /* append to the earlier buffer and flush */ 1534 /*
1387 if (cont_len + text_len > sizeof(cont_buf)) 1535 * If an earlier newline was missing and it was the same task,
1388 text_len = sizeof(cont_buf) - cont_len; 1536 * either merge it with the current buffer and flush, or if
1389 memcpy(cont_buf + cont_len, text, text_len); 1537 * there was a race with interrupts (prefix == true) then just
1390 cont_len += text_len; 1538 * flush it out and store this line separately.
1391 log_store(facility, cont_level, 1539 */
1392 NULL, 0, cont_buf, cont_len); 1540 if (cont.len && cont.owner == current) {
1393 cont_len = 0; 1541 if (!(lflags & LOG_PREFIX))
1394 cont_task = NULL; 1542 stored = cont_add(facility, level, text, text_len);
1395 printed_len = cont_len; 1543 cont_flush();
1396 } else {
1397 /* ordinary single and terminated line */
1398 log_store(facility, level,
1399 dict, dictlen, text, text_len);
1400 printed_len = text_len;
1401 } 1544 }
1545
1546 if (!stored)
1547 log_store(facility, level, lflags, 0,
1548 dict, dictlen, text, text_len);
1402 } 1549 }
1550 printed_len += text_len;
1403 1551
1404 /* 1552 /*
1405 * Try to acquire and then immediately release the console semaphore. 1553 * Try to acquire and then immediately release the console semaphore.
@@ -1486,11 +1634,18 @@ EXPORT_SYMBOL(printk);
1486#else 1634#else
1487 1635
1488#define LOG_LINE_MAX 0 1636#define LOG_LINE_MAX 0
1637static struct cont {
1638 size_t len;
1639 size_t cons;
1640 u8 level;
1641 bool flushed:1;
1642} cont;
1489static struct log *log_from_idx(u32 idx) { return NULL; } 1643static struct log *log_from_idx(u32 idx) { return NULL; }
1490static u32 log_next(u32 idx) { return 0; } 1644static u32 log_next(u32 idx) { return 0; }
1491static void call_console_drivers(int level, const char *text, size_t len) {} 1645static void call_console_drivers(int level, const char *text, size_t len) {}
1492static size_t msg_print_text(const struct log *msg, bool syslog, 1646static size_t msg_print_text(const struct log *msg, enum log_flags prev,
1493 char *buf, size_t size) { return 0; } 1647 bool syslog, char *buf, size_t size) { return 0; }
1648static size_t cont_print_text(char *text, size_t size) { return 0; }
1494 1649
1495#endif /* CONFIG_PRINTK */ 1650#endif /* CONFIG_PRINTK */
1496 1651
@@ -1765,6 +1920,7 @@ void wake_up_klogd(void)
1765/* the next printk record to write to the console */ 1920/* the next printk record to write to the console */
1766static u64 console_seq; 1921static u64 console_seq;
1767static u32 console_idx; 1922static u32 console_idx;
1923static enum log_flags console_prev;
1768 1924
1769/** 1925/**
1770 * console_unlock - unlock the console system 1926 * console_unlock - unlock the console system
@@ -1782,6 +1938,7 @@ static u32 console_idx;
1782 */ 1938 */
1783void console_unlock(void) 1939void console_unlock(void)
1784{ 1940{
1941 static char text[LOG_LINE_MAX];
1785 static u64 seen_seq; 1942 static u64 seen_seq;
1786 unsigned long flags; 1943 unsigned long flags;
1787 bool wake_klogd = false; 1944 bool wake_klogd = false;
@@ -1794,10 +1951,23 @@ void console_unlock(void)
1794 1951
1795 console_may_schedule = 0; 1952 console_may_schedule = 0;
1796 1953
1954 /* flush buffered message fragment immediately to console */
1955 raw_spin_lock_irqsave(&logbuf_lock, flags);
1956 if (cont.len && (cont.cons < cont.len || cont.flushed)) {
1957 size_t len;
1958
1959 len = cont_print_text(text, sizeof(text));
1960 raw_spin_unlock(&logbuf_lock);
1961 stop_critical_timings();
1962 call_console_drivers(cont.level, text, len);
1963 start_critical_timings();
1964 local_irq_restore(flags);
1965 } else
1966 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1967
1797again: 1968again:
1798 for (;;) { 1969 for (;;) {
1799 struct log *msg; 1970 struct log *msg;
1800 static char text[LOG_LINE_MAX];
1801 size_t len; 1971 size_t len;
1802 int level; 1972 int level;
1803 1973
@@ -1811,18 +1981,35 @@ again:
1811 /* messages are gone, move to first one */ 1981 /* messages are gone, move to first one */
1812 console_seq = log_first_seq; 1982 console_seq = log_first_seq;
1813 console_idx = log_first_idx; 1983 console_idx = log_first_idx;
1984 console_prev = 0;
1814 } 1985 }
1815 1986skip:
1816 if (console_seq == log_next_seq) 1987 if (console_seq == log_next_seq)
1817 break; 1988 break;
1818 1989
1819 msg = log_from_idx(console_idx); 1990 msg = log_from_idx(console_idx);
1820 level = msg->level & 7; 1991 if (msg->flags & LOG_NOCONS) {
1821 1992 /*
1822 len = msg_print_text(msg, false, text, sizeof(text)); 1993 * Skip record we have buffered and already printed
1994 * directly to the console when we received it.
1995 */
1996 console_idx = log_next(console_idx);
1997 console_seq++;
1998 /*
1999 * We will get here again when we register a new
2000 * CON_PRINTBUFFER console. Clear the flag so we
2001 * will properly dump everything later.
2002 */
2003 msg->flags &= ~LOG_NOCONS;
2004 goto skip;
2005 }
1823 2006
2007 level = msg->level;
2008 len = msg_print_text(msg, console_prev, false,
2009 text, sizeof(text));
1824 console_idx = log_next(console_idx); 2010 console_idx = log_next(console_idx);
1825 console_seq++; 2011 console_seq++;
2012 console_prev = msg->flags;
1826 raw_spin_unlock(&logbuf_lock); 2013 raw_spin_unlock(&logbuf_lock);
1827 2014
1828 stop_critical_timings(); /* don't trace print latency */ 2015 stop_critical_timings(); /* don't trace print latency */
@@ -2085,6 +2272,7 @@ void register_console(struct console *newcon)
2085 raw_spin_lock_irqsave(&logbuf_lock, flags); 2272 raw_spin_lock_irqsave(&logbuf_lock, flags);
2086 console_seq = syslog_seq; 2273 console_seq = syslog_seq;
2087 console_idx = syslog_idx; 2274 console_idx = syslog_idx;
2275 console_prev = syslog_prev;
2088 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 2276 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2089 /* 2277 /*
2090 * We're about to replay the log buffer. Only do this to the 2278 * We're about to replay the log buffer. Only do this to the
@@ -2300,48 +2488,214 @@ module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);
2300 * kmsg_dump - dump kernel log to kernel message dumpers. 2488 * kmsg_dump - dump kernel log to kernel message dumpers.
2301 * @reason: the reason (oops, panic etc) for dumping 2489 * @reason: the reason (oops, panic etc) for dumping
2302 * 2490 *
2303 * Iterate through each of the dump devices and call the oops/panic 2491 * Call each of the registered dumper's dump() callback, which can
2304 * callbacks with the log buffer. 2492 * retrieve the kmsg records with kmsg_dump_get_line() or
2493 * kmsg_dump_get_buffer().
2305 */ 2494 */
2306void kmsg_dump(enum kmsg_dump_reason reason) 2495void kmsg_dump(enum kmsg_dump_reason reason)
2307{ 2496{
2308 u64 idx;
2309 struct kmsg_dumper *dumper; 2497 struct kmsg_dumper *dumper;
2310 const char *s1, *s2;
2311 unsigned long l1, l2;
2312 unsigned long flags; 2498 unsigned long flags;
2313 2499
2314 if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) 2500 if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump)
2315 return; 2501 return;
2316 2502
2317 /* Theoretically, the log could move on after we do this, but 2503 rcu_read_lock();
2318 there's not a lot we can do about that. The new messages 2504 list_for_each_entry_rcu(dumper, &dump_list, list) {
2319 will overwrite the start of what we dump. */ 2505 if (dumper->max_reason && reason > dumper->max_reason)
2506 continue;
2507
2508 /* initialize iterator with data about the stored records */
2509 dumper->active = true;
2510
2511 raw_spin_lock_irqsave(&logbuf_lock, flags);
2512 dumper->cur_seq = clear_seq;
2513 dumper->cur_idx = clear_idx;
2514 dumper->next_seq = log_next_seq;
2515 dumper->next_idx = log_next_idx;
2516 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2517
2518 /* invoke dumper which will iterate over records */
2519 dumper->dump(dumper, reason);
2520
2521 /* reset iterator */
2522 dumper->active = false;
2523 }
2524 rcu_read_unlock();
2525}
2526
2527/**
2528 * kmsg_dump_get_line - retrieve one kmsg log line
2529 * @dumper: registered kmsg dumper
2530 * @syslog: include the "<4>" prefixes
2531 * @line: buffer to copy the line to
2532 * @size: maximum size of the buffer
2533 * @len: length of line placed into buffer
2534 *
2535 * Start at the beginning of the kmsg buffer, with the oldest kmsg
2536 * record, and copy one record into the provided buffer.
2537 *
2538 * Consecutive calls will return the next available record moving
2539 * towards the end of the buffer with the youngest messages.
2540 *
2541 * A return value of FALSE indicates that there are no more records to
2542 * read.
2543 */
2544bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
2545 char *line, size_t size, size_t *len)
2546{
2547 unsigned long flags;
2548 struct log *msg;
2549 size_t l = 0;
2550 bool ret = false;
2551
2552 if (!dumper->active)
2553 goto out;
2320 2554
2321 raw_spin_lock_irqsave(&logbuf_lock, flags); 2555 raw_spin_lock_irqsave(&logbuf_lock, flags);
2322 if (syslog_seq < log_first_seq) 2556 if (dumper->cur_seq < log_first_seq) {
2323 idx = syslog_idx; 2557 /* messages are gone, move to first available one */
2324 else 2558 dumper->cur_seq = log_first_seq;
2325 idx = log_first_idx; 2559 dumper->cur_idx = log_first_idx;
2560 }
2326 2561
2327 if (idx > log_next_idx) { 2562 /* last entry */
2328 s1 = log_buf; 2563 if (dumper->cur_seq >= log_next_seq) {
2329 l1 = log_next_idx; 2564 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2565 goto out;
2566 }
2330 2567
2331 s2 = log_buf + idx; 2568 msg = log_from_idx(dumper->cur_idx);
2332 l2 = log_buf_len - idx; 2569 l = msg_print_text(msg, 0, syslog, line, size);
2333 } else { 2570
2334 s1 = ""; 2571 dumper->cur_idx = log_next(dumper->cur_idx);
2335 l1 = 0; 2572 dumper->cur_seq++;
2573 ret = true;
2574 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2575out:
2576 if (len)
2577 *len = l;
2578 return ret;
2579}
2580EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
2581
2582/**
2583 * kmsg_dump_get_buffer - copy kmsg log lines
2584 * @dumper: registered kmsg dumper
2585 * @syslog: include the "<4>" prefixes
2586 * @buf: buffer to copy the line to
2587 * @size: maximum size of the buffer
2588 * @len: length of line placed into buffer
2589 *
2590 * Start at the end of the kmsg buffer and fill the provided buffer
2591 * with as many of the the *youngest* kmsg records that fit into it.
2592 * If the buffer is large enough, all available kmsg records will be
2593 * copied with a single call.
2594 *
2595 * Consecutive calls will fill the buffer with the next block of
2596 * available older records, not including the earlier retrieved ones.
2597 *
2598 * A return value of FALSE indicates that there are no more records to
2599 * read.
2600 */
2601bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
2602 char *buf, size_t size, size_t *len)
2603{
2604 unsigned long flags;
2605 u64 seq;
2606 u32 idx;
2607 u64 next_seq;
2608 u32 next_idx;
2609 enum log_flags prev;
2610 size_t l = 0;
2611 bool ret = false;
2612
2613 if (!dumper->active)
2614 goto out;
2615
2616 raw_spin_lock_irqsave(&logbuf_lock, flags);
2617 if (dumper->cur_seq < log_first_seq) {
2618 /* messages are gone, move to first available one */
2619 dumper->cur_seq = log_first_seq;
2620 dumper->cur_idx = log_first_idx;
2621 }
2622
2623 /* last entry */
2624 if (dumper->cur_seq >= dumper->next_seq) {
2625 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2626 goto out;
2627 }
2628
2629 /* calculate length of entire buffer */
2630 seq = dumper->cur_seq;
2631 idx = dumper->cur_idx;
2632 prev = 0;
2633 while (seq < dumper->next_seq) {
2634 struct log *msg = log_from_idx(idx);
2635
2636 l += msg_print_text(msg, prev, true, NULL, 0);
2637 idx = log_next(idx);
2638 seq++;
2639 prev = msg->flags;
2640 }
2336 2641
2337 s2 = log_buf + idx; 2642 /* move first record forward until length fits into the buffer */
2338 l2 = log_next_idx - idx; 2643 seq = dumper->cur_seq;
2644 idx = dumper->cur_idx;
2645 prev = 0;
2646 while (l > size && seq < dumper->next_seq) {
2647 struct log *msg = log_from_idx(idx);
2648
2649 l -= msg_print_text(msg, prev, true, NULL, 0);
2650 idx = log_next(idx);
2651 seq++;
2652 prev = msg->flags;
2339 } 2653 }
2654
2655 /* last message in next interation */
2656 next_seq = seq;
2657 next_idx = idx;
2658
2659 l = 0;
2660 prev = 0;
2661 while (seq < dumper->next_seq) {
2662 struct log *msg = log_from_idx(idx);
2663
2664 l += msg_print_text(msg, prev, syslog, buf + l, size - l);
2665 idx = log_next(idx);
2666 seq++;
2667 prev = msg->flags;
2668 }
2669
2670 dumper->next_seq = next_seq;
2671 dumper->next_idx = next_idx;
2672 ret = true;
2340 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 2673 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2674out:
2675 if (len)
2676 *len = l;
2677 return ret;
2678}
2679EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
2341 2680
2342 rcu_read_lock(); 2681/**
2343 list_for_each_entry_rcu(dumper, &dump_list, list) 2682 * kmsg_dump_rewind - reset the interator
2344 dumper->dump(dumper, reason, s1, l1, s2, l2); 2683 * @dumper: registered kmsg dumper
2345 rcu_read_unlock(); 2684 *
2685 * Reset the dumper's iterator so that kmsg_dump_get_line() and
2686 * kmsg_dump_get_buffer() can be called again and used multiple
2687 * times within the same dumper.dump() callback.
2688 */
2689void kmsg_dump_rewind(struct kmsg_dumper *dumper)
2690{
2691 unsigned long flags;
2692
2693 raw_spin_lock_irqsave(&logbuf_lock, flags);
2694 dumper->cur_seq = clear_seq;
2695 dumper->cur_idx = clear_idx;
2696 dumper->next_seq = log_next_seq;
2697 dumper->next_idx = log_next_idx;
2698 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2346} 2699}
2700EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
2347#endif 2701#endif
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 0da7b88d92d0..4b97bba7396e 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -201,6 +201,7 @@ void rcu_note_context_switch(int cpu)
201{ 201{
202 trace_rcu_utilization("Start context switch"); 202 trace_rcu_utilization("Start context switch");
203 rcu_sched_qs(cpu); 203 rcu_sched_qs(cpu);
204 rcu_preempt_note_context_switch(cpu);
204 trace_rcu_utilization("End context switch"); 205 trace_rcu_utilization("End context switch");
205} 206}
206EXPORT_SYMBOL_GPL(rcu_note_context_switch); 207EXPORT_SYMBOL_GPL(rcu_note_context_switch);
@@ -1397,6 +1398,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1397 rdp->qlen_lazy += rsp->qlen_lazy; 1398 rdp->qlen_lazy += rsp->qlen_lazy;
1398 rdp->qlen += rsp->qlen; 1399 rdp->qlen += rsp->qlen;
1399 rdp->n_cbs_adopted += rsp->qlen; 1400 rdp->n_cbs_adopted += rsp->qlen;
1401 if (rsp->qlen_lazy != rsp->qlen)
1402 rcu_idle_count_callbacks_posted();
1400 rsp->qlen_lazy = 0; 1403 rsp->qlen_lazy = 0;
1401 rsp->qlen = 0; 1404 rsp->qlen = 0;
1402 1405
@@ -1528,7 +1531,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1528{ 1531{
1529 unsigned long flags; 1532 unsigned long flags;
1530 struct rcu_head *next, *list, **tail; 1533 struct rcu_head *next, *list, **tail;
1531 int bl, count, count_lazy; 1534 int bl, count, count_lazy, i;
1532 1535
1533 /* If no callbacks are ready, just return.*/ 1536 /* If no callbacks are ready, just return.*/
1534 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 1537 if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
@@ -1551,9 +1554,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1551 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; 1554 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
1552 *rdp->nxttail[RCU_DONE_TAIL] = NULL; 1555 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
1553 tail = rdp->nxttail[RCU_DONE_TAIL]; 1556 tail = rdp->nxttail[RCU_DONE_TAIL];
1554 for (count = RCU_NEXT_SIZE - 1; count >= 0; count--) 1557 for (i = RCU_NEXT_SIZE - 1; i >= 0; i--)
1555 if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL]) 1558 if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
1556 rdp->nxttail[count] = &rdp->nxtlist; 1559 rdp->nxttail[i] = &rdp->nxtlist;
1557 local_irq_restore(flags); 1560 local_irq_restore(flags);
1558 1561
1559 /* Invoke callbacks. */ 1562 /* Invoke callbacks. */
@@ -1581,9 +1584,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1581 if (list != NULL) { 1584 if (list != NULL) {
1582 *tail = rdp->nxtlist; 1585 *tail = rdp->nxtlist;
1583 rdp->nxtlist = list; 1586 rdp->nxtlist = list;
1584 for (count = 0; count < RCU_NEXT_SIZE; count++) 1587 for (i = 0; i < RCU_NEXT_SIZE; i++)
1585 if (&rdp->nxtlist == rdp->nxttail[count]) 1588 if (&rdp->nxtlist == rdp->nxttail[i])
1586 rdp->nxttail[count] = tail; 1589 rdp->nxttail[i] = tail;
1587 else 1590 else
1588 break; 1591 break;
1589 } 1592 }
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 7f5d138dedf5..19b61ac1079f 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -84,6 +84,20 @@ struct rcu_dynticks {
84 /* Process level is worth LLONG_MAX/2. */ 84 /* Process level is worth LLONG_MAX/2. */
85 int dynticks_nmi_nesting; /* Track NMI nesting level. */ 85 int dynticks_nmi_nesting; /* Track NMI nesting level. */
86 atomic_t dynticks; /* Even value for idle, else odd. */ 86 atomic_t dynticks; /* Even value for idle, else odd. */
87#ifdef CONFIG_RCU_FAST_NO_HZ
88 int dyntick_drain; /* Prepare-for-idle state variable. */
89 unsigned long dyntick_holdoff;
90 /* No retries for the jiffy of failure. */
91 struct timer_list idle_gp_timer;
92 /* Wake up CPU sleeping with callbacks. */
93 unsigned long idle_gp_timer_expires;
94 /* When to wake up CPU (for repost). */
95 bool idle_first_pass; /* First pass of attempt to go idle? */
96 unsigned long nonlazy_posted;
97 /* # times non-lazy CBs posted to CPU. */
98 unsigned long nonlazy_posted_snap;
99 /* idle-period nonlazy_posted snapshot. */
100#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
87}; 101};
88 102
89/* RCU's kthread states for tracing. */ 103/* RCU's kthread states for tracing. */
@@ -430,6 +444,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
430/* Forward declarations for rcutree_plugin.h */ 444/* Forward declarations for rcutree_plugin.h */
431static void rcu_bootup_announce(void); 445static void rcu_bootup_announce(void);
432long rcu_batches_completed(void); 446long rcu_batches_completed(void);
447static void rcu_preempt_note_context_switch(int cpu);
433static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); 448static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
434#ifdef CONFIG_HOTPLUG_CPU 449#ifdef CONFIG_HOTPLUG_CPU
435static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, 450static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 2411000d9869..3e4899459f3d 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -153,7 +153,7 @@ static void rcu_preempt_qs(int cpu)
153 * 153 *
154 * Caller must disable preemption. 154 * Caller must disable preemption.
155 */ 155 */
156void rcu_preempt_note_context_switch(void) 156static void rcu_preempt_note_context_switch(int cpu)
157{ 157{
158 struct task_struct *t = current; 158 struct task_struct *t = current;
159 unsigned long flags; 159 unsigned long flags;
@@ -164,7 +164,7 @@ void rcu_preempt_note_context_switch(void)
164 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { 164 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
165 165
166 /* Possibly blocking in an RCU read-side critical section. */ 166 /* Possibly blocking in an RCU read-side critical section. */
167 rdp = __this_cpu_ptr(rcu_preempt_state.rda); 167 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
168 rnp = rdp->mynode; 168 rnp = rdp->mynode;
169 raw_spin_lock_irqsave(&rnp->lock, flags); 169 raw_spin_lock_irqsave(&rnp->lock, flags);
170 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 170 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
@@ -228,7 +228,7 @@ void rcu_preempt_note_context_switch(void)
228 * means that we continue to block the current grace period. 228 * means that we continue to block the current grace period.
229 */ 229 */
230 local_irq_save(flags); 230 local_irq_save(flags);
231 rcu_preempt_qs(smp_processor_id()); 231 rcu_preempt_qs(cpu);
232 local_irq_restore(flags); 232 local_irq_restore(flags);
233} 233}
234 234
@@ -1002,6 +1002,14 @@ void rcu_force_quiescent_state(void)
1002EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 1002EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
1003 1003
1004/* 1004/*
1005 * Because preemptible RCU does not exist, we never have to check for
1006 * CPUs being in quiescent states.
1007 */
1008static void rcu_preempt_note_context_switch(int cpu)
1009{
1010}
1011
1012/*
1005 * Because preemptible RCU does not exist, there are never any preempted 1013 * Because preemptible RCU does not exist, there are never any preempted
1006 * RCU readers. 1014 * RCU readers.
1007 */ 1015 */
@@ -1886,8 +1894,9 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
1886 * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs 1894 * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs
1887 * any flavor of RCU. 1895 * any flavor of RCU.
1888 */ 1896 */
1889int rcu_needs_cpu(int cpu) 1897int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
1890{ 1898{
1899 *delta_jiffies = ULONG_MAX;
1891 return rcu_cpu_has_callbacks(cpu); 1900 return rcu_cpu_has_callbacks(cpu);
1892} 1901}
1893 1902
@@ -1962,41 +1971,6 @@ static void rcu_idle_count_callbacks_posted(void)
1962#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ 1971#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */
1963#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ 1972#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
1964 1973
1965/* Loop counter for rcu_prepare_for_idle(). */
1966static DEFINE_PER_CPU(int, rcu_dyntick_drain);
1967/* If rcu_dyntick_holdoff==jiffies, don't try to enter dyntick-idle mode. */
1968static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
1969/* Timer to awaken the CPU if it enters dyntick-idle mode with callbacks. */
1970static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer);
1971/* Scheduled expiry time for rcu_idle_gp_timer to allow reposting. */
1972static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires);
1973/* Enable special processing on first attempt to enter dyntick-idle mode. */
1974static DEFINE_PER_CPU(bool, rcu_idle_first_pass);
1975/* Running count of non-lazy callbacks posted, never decremented. */
1976static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted);
1977/* Snapshot of rcu_nonlazy_posted to detect meaningful exits from idle. */
1978static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap);
1979
1980/*
1981 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
1982 * callbacks on this CPU, (2) this CPU has not yet attempted to enter
1983 * dyntick-idle mode, or (3) this CPU is in the process of attempting to
1984 * enter dyntick-idle mode. Otherwise, if we have recently tried and failed
1985 * to enter dyntick-idle mode, we refuse to try to enter it. After all,
1986 * it is better to incur scheduling-clock interrupts than to spin
1987 * continuously for the same time duration!
1988 */
1989int rcu_needs_cpu(int cpu)
1990{
1991 /* Flag a new idle sojourn to the idle-entry state machine. */
1992 per_cpu(rcu_idle_first_pass, cpu) = 1;
1993 /* If no callbacks, RCU doesn't need the CPU. */
1994 if (!rcu_cpu_has_callbacks(cpu))
1995 return 0;
1996 /* Otherwise, RCU needs the CPU only if it recently tried and failed. */
1997 return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies;
1998}
1999
2000/* 1974/*
2001 * Does the specified flavor of RCU have non-lazy callbacks pending on 1975 * Does the specified flavor of RCU have non-lazy callbacks pending on
2002 * the specified CPU? Both RCU flavor and CPU are specified by the 1976 * the specified CPU? Both RCU flavor and CPU are specified by the
@@ -2040,6 +2014,47 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu)
2040} 2014}
2041 2015
2042/* 2016/*
2017 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
2018 * callbacks on this CPU, (2) this CPU has not yet attempted to enter
2019 * dyntick-idle mode, or (3) this CPU is in the process of attempting to
2020 * enter dyntick-idle mode. Otherwise, if we have recently tried and failed
2021 * to enter dyntick-idle mode, we refuse to try to enter it. After all,
2022 * it is better to incur scheduling-clock interrupts than to spin
2023 * continuously for the same time duration!
2024 *
2025 * The delta_jiffies argument is used to store the time when RCU is
2026 * going to need the CPU again if it still has callbacks. The reason
2027 * for this is that rcu_prepare_for_idle() might need to post a timer,
2028 * but if so, it will do so after tick_nohz_stop_sched_tick() has set
2029 * the wakeup time for this CPU. This means that RCU's timer can be
2030 * delayed until the wakeup time, which defeats the purpose of posting
2031 * a timer.
2032 */
2033int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
2034{
2035 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2036
2037 /* Flag a new idle sojourn to the idle-entry state machine. */
2038 rdtp->idle_first_pass = 1;
2039 /* If no callbacks, RCU doesn't need the CPU. */
2040 if (!rcu_cpu_has_callbacks(cpu)) {
2041 *delta_jiffies = ULONG_MAX;
2042 return 0;
2043 }
2044 if (rdtp->dyntick_holdoff == jiffies) {
2045 /* RCU recently tried and failed, so don't try again. */
2046 *delta_jiffies = 1;
2047 return 1;
2048 }
2049 /* Set up for the possibility that RCU will post a timer. */
2050 if (rcu_cpu_has_nonlazy_callbacks(cpu))
2051 *delta_jiffies = RCU_IDLE_GP_DELAY;
2052 else
2053 *delta_jiffies = RCU_IDLE_LAZY_GP_DELAY;
2054 return 0;
2055}
2056
2057/*
2043 * Handler for smp_call_function_single(). The only point of this 2058 * Handler for smp_call_function_single(). The only point of this
2044 * handler is to wake the CPU up, so the handler does only tracing. 2059 * handler is to wake the CPU up, so the handler does only tracing.
2045 */ 2060 */
@@ -2075,21 +2090,24 @@ static void rcu_idle_gp_timer_func(unsigned long cpu_in)
2075 */ 2090 */
2076static void rcu_prepare_for_idle_init(int cpu) 2091static void rcu_prepare_for_idle_init(int cpu)
2077{ 2092{
2078 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 2093 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2079 setup_timer(&per_cpu(rcu_idle_gp_timer, cpu), 2094
2080 rcu_idle_gp_timer_func, cpu); 2095 rdtp->dyntick_holdoff = jiffies - 1;
2081 per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies - 1; 2096 setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu);
2082 per_cpu(rcu_idle_first_pass, cpu) = 1; 2097 rdtp->idle_gp_timer_expires = jiffies - 1;
2098 rdtp->idle_first_pass = 1;
2083} 2099}
2084 2100
2085/* 2101/*
2086 * Clean up for exit from idle. Because we are exiting from idle, there 2102 * Clean up for exit from idle. Because we are exiting from idle, there
2087 * is no longer any point to rcu_idle_gp_timer, so cancel it. This will 2103 * is no longer any point to ->idle_gp_timer, so cancel it. This will
2088 * do nothing if this timer is not active, so just cancel it unconditionally. 2104 * do nothing if this timer is not active, so just cancel it unconditionally.
2089 */ 2105 */
2090static void rcu_cleanup_after_idle(int cpu) 2106static void rcu_cleanup_after_idle(int cpu)
2091{ 2107{
2092 del_timer(&per_cpu(rcu_idle_gp_timer, cpu)); 2108 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2109
2110 del_timer(&rdtp->idle_gp_timer);
2093 trace_rcu_prep_idle("Cleanup after idle"); 2111 trace_rcu_prep_idle("Cleanup after idle");
2094} 2112}
2095 2113
@@ -2108,42 +2126,41 @@ static void rcu_cleanup_after_idle(int cpu)
2108 * Because it is not legal to invoke rcu_process_callbacks() with irqs 2126 * Because it is not legal to invoke rcu_process_callbacks() with irqs
2109 * disabled, we do one pass of force_quiescent_state(), then do a 2127 * disabled, we do one pass of force_quiescent_state(), then do a
2110 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked 2128 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
2111 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. 2129 * later. The ->dyntick_drain field controls the sequencing.
2112 * 2130 *
2113 * The caller must have disabled interrupts. 2131 * The caller must have disabled interrupts.
2114 */ 2132 */
2115static void rcu_prepare_for_idle(int cpu) 2133static void rcu_prepare_for_idle(int cpu)
2116{ 2134{
2117 struct timer_list *tp; 2135 struct timer_list *tp;
2136 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2118 2137
2119 /* 2138 /*
2120 * If this is an idle re-entry, for example, due to use of 2139 * If this is an idle re-entry, for example, due to use of
2121 * RCU_NONIDLE() or the new idle-loop tracing API within the idle 2140 * RCU_NONIDLE() or the new idle-loop tracing API within the idle
2122 * loop, then don't take any state-machine actions, unless the 2141 * loop, then don't take any state-machine actions, unless the
2123 * momentary exit from idle queued additional non-lazy callbacks. 2142 * momentary exit from idle queued additional non-lazy callbacks.
2124 * Instead, repost the rcu_idle_gp_timer if this CPU has callbacks 2143 * Instead, repost the ->idle_gp_timer if this CPU has callbacks
2125 * pending. 2144 * pending.
2126 */ 2145 */
2127 if (!per_cpu(rcu_idle_first_pass, cpu) && 2146 if (!rdtp->idle_first_pass &&
2128 (per_cpu(rcu_nonlazy_posted, cpu) == 2147 (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) {
2129 per_cpu(rcu_nonlazy_posted_snap, cpu))) {
2130 if (rcu_cpu_has_callbacks(cpu)) { 2148 if (rcu_cpu_has_callbacks(cpu)) {
2131 tp = &per_cpu(rcu_idle_gp_timer, cpu); 2149 tp = &rdtp->idle_gp_timer;
2132 mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); 2150 mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
2133 } 2151 }
2134 return; 2152 return;
2135 } 2153 }
2136 per_cpu(rcu_idle_first_pass, cpu) = 0; 2154 rdtp->idle_first_pass = 0;
2137 per_cpu(rcu_nonlazy_posted_snap, cpu) = 2155 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1;
2138 per_cpu(rcu_nonlazy_posted, cpu) - 1;
2139 2156
2140 /* 2157 /*
2141 * If there are no callbacks on this CPU, enter dyntick-idle mode. 2158 * If there are no callbacks on this CPU, enter dyntick-idle mode.
2142 * Also reset state to avoid prejudicing later attempts. 2159 * Also reset state to avoid prejudicing later attempts.
2143 */ 2160 */
2144 if (!rcu_cpu_has_callbacks(cpu)) { 2161 if (!rcu_cpu_has_callbacks(cpu)) {
2145 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 2162 rdtp->dyntick_holdoff = jiffies - 1;
2146 per_cpu(rcu_dyntick_drain, cpu) = 0; 2163 rdtp->dyntick_drain = 0;
2147 trace_rcu_prep_idle("No callbacks"); 2164 trace_rcu_prep_idle("No callbacks");
2148 return; 2165 return;
2149 } 2166 }
@@ -2152,36 +2169,37 @@ static void rcu_prepare_for_idle(int cpu)
2152 * If in holdoff mode, just return. We will presumably have 2169 * If in holdoff mode, just return. We will presumably have
2153 * refrained from disabling the scheduling-clock tick. 2170 * refrained from disabling the scheduling-clock tick.
2154 */ 2171 */
2155 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { 2172 if (rdtp->dyntick_holdoff == jiffies) {
2156 trace_rcu_prep_idle("In holdoff"); 2173 trace_rcu_prep_idle("In holdoff");
2157 return; 2174 return;
2158 } 2175 }
2159 2176
2160 /* Check and update the rcu_dyntick_drain sequencing. */ 2177 /* Check and update the ->dyntick_drain sequencing. */
2161 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2178 if (rdtp->dyntick_drain <= 0) {
2162 /* First time through, initialize the counter. */ 2179 /* First time through, initialize the counter. */
2163 per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES; 2180 rdtp->dyntick_drain = RCU_IDLE_FLUSHES;
2164 } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES && 2181 } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES &&
2165 !rcu_pending(cpu) && 2182 !rcu_pending(cpu) &&
2166 !local_softirq_pending()) { 2183 !local_softirq_pending()) {
2167 /* Can we go dyntick-idle despite still having callbacks? */ 2184 /* Can we go dyntick-idle despite still having callbacks? */
2168 trace_rcu_prep_idle("Dyntick with callbacks"); 2185 rdtp->dyntick_drain = 0;
2169 per_cpu(rcu_dyntick_drain, cpu) = 0; 2186 rdtp->dyntick_holdoff = jiffies;
2170 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; 2187 if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
2171 if (rcu_cpu_has_nonlazy_callbacks(cpu)) 2188 trace_rcu_prep_idle("Dyntick with callbacks");
2172 per_cpu(rcu_idle_gp_timer_expires, cpu) = 2189 rdtp->idle_gp_timer_expires =
2173 jiffies + RCU_IDLE_GP_DELAY; 2190 jiffies + RCU_IDLE_GP_DELAY;
2174 else 2191 } else {
2175 per_cpu(rcu_idle_gp_timer_expires, cpu) = 2192 rdtp->idle_gp_timer_expires =
2176 jiffies + RCU_IDLE_LAZY_GP_DELAY; 2193 jiffies + RCU_IDLE_LAZY_GP_DELAY;
2177 tp = &per_cpu(rcu_idle_gp_timer, cpu); 2194 trace_rcu_prep_idle("Dyntick with lazy callbacks");
2178 mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); 2195 }
2179 per_cpu(rcu_nonlazy_posted_snap, cpu) = 2196 tp = &rdtp->idle_gp_timer;
2180 per_cpu(rcu_nonlazy_posted, cpu); 2197 mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
2198 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
2181 return; /* Nothing more to do immediately. */ 2199 return; /* Nothing more to do immediately. */
2182 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2200 } else if (--(rdtp->dyntick_drain) <= 0) {
2183 /* We have hit the limit, so time to give up. */ 2201 /* We have hit the limit, so time to give up. */
2184 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; 2202 rdtp->dyntick_holdoff = jiffies;
2185 trace_rcu_prep_idle("Begin holdoff"); 2203 trace_rcu_prep_idle("Begin holdoff");
2186 invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ 2204 invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */
2187 return; 2205 return;
@@ -2227,7 +2245,7 @@ static void rcu_prepare_for_idle(int cpu)
2227 */ 2245 */
2228static void rcu_idle_count_callbacks_posted(void) 2246static void rcu_idle_count_callbacks_posted(void)
2229{ 2247{
2230 __this_cpu_add(rcu_nonlazy_posted, 1); 2248 __this_cpu_add(rcu_dynticks.nonlazy_posted, 1);
2231} 2249}
2232 2250
2233#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 2251#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
@@ -2238,11 +2256,12 @@ static void rcu_idle_count_callbacks_posted(void)
2238 2256
2239static void print_cpu_stall_fast_no_hz(char *cp, int cpu) 2257static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2240{ 2258{
2241 struct timer_list *tltp = &per_cpu(rcu_idle_gp_timer, cpu); 2259 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2260 struct timer_list *tltp = &rdtp->idle_gp_timer;
2242 2261
2243 sprintf(cp, "drain=%d %c timer=%lu", 2262 sprintf(cp, "drain=%d %c timer=%lu",
2244 per_cpu(rcu_dyntick_drain, cpu), 2263 rdtp->dyntick_drain,
2245 per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', 2264 rdtp->dyntick_holdoff == jiffies ? 'H' : '.',
2246 timer_pending(tltp) ? tltp->expires - jiffies : -1); 2265 timer_pending(tltp) ? tltp->expires - jiffies : -1);
2247} 2266}
2248 2267
diff --git a/kernel/relay.c b/kernel/relay.c
index ab56a1764d4d..e8cd2027abbd 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1235,6 +1235,7 @@ static ssize_t subbuf_splice_actor(struct file *in,
1235 struct splice_pipe_desc spd = { 1235 struct splice_pipe_desc spd = {
1236 .pages = pages, 1236 .pages = pages,
1237 .nr_pages = 0, 1237 .nr_pages = 0,
1238 .nr_pages_max = PIPE_DEF_BUFFERS,
1238 .partial = partial, 1239 .partial = partial,
1239 .flags = flags, 1240 .flags = flags,
1240 .ops = &relay_pipe_buf_ops, 1241 .ops = &relay_pipe_buf_ops,
@@ -1302,8 +1303,8 @@ static ssize_t subbuf_splice_actor(struct file *in,
1302 ret += padding; 1303 ret += padding;
1303 1304
1304out: 1305out:
1305 splice_shrink_spd(pipe, &spd); 1306 splice_shrink_spd(&spd);
1306 return ret; 1307 return ret;
1307} 1308}
1308 1309
1309static ssize_t relay_file_splice_read(struct file *in, 1310static ssize_t relay_file_splice_read(struct file *in,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d5594a4268d4..468bdd44c1ba 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2081,7 +2081,6 @@ context_switch(struct rq *rq, struct task_struct *prev,
2081#endif 2081#endif
2082 2082
2083 /* Here we just switch the register state and the stack. */ 2083 /* Here we just switch the register state and the stack. */
2084 rcu_switch_from(prev);
2085 switch_to(prev, next, prev); 2084 switch_to(prev, next, prev);
2086 2085
2087 barrier(); 2086 barrier();
@@ -2161,11 +2160,73 @@ unsigned long this_cpu_load(void)
2161} 2160}
2162 2161
2163 2162
2163/*
2164 * Global load-average calculations
2165 *
2166 * We take a distributed and async approach to calculating the global load-avg
2167 * in order to minimize overhead.
2168 *
2169 * The global load average is an exponentially decaying average of nr_running +
2170 * nr_uninterruptible.
2171 *
2172 * Once every LOAD_FREQ:
2173 *
2174 * nr_active = 0;
2175 * for_each_possible_cpu(cpu)
2176 * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
2177 *
2178 * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
2179 *
2180 * Due to a number of reasons the above turns in the mess below:
2181 *
2182 * - for_each_possible_cpu() is prohibitively expensive on machines with
2183 * serious number of cpus, therefore we need to take a distributed approach
2184 * to calculating nr_active.
2185 *
2186 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
2187 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
2188 *
2189 * So assuming nr_active := 0 when we start out -- true per definition, we
2190 * can simply take per-cpu deltas and fold those into a global accumulate
2191 * to obtain the same result. See calc_load_fold_active().
2192 *
2193 * Furthermore, in order to avoid synchronizing all per-cpu delta folding
2194 * across the machine, we assume 10 ticks is sufficient time for every
2195 * cpu to have completed this task.
2196 *
2197 * This places an upper-bound on the IRQ-off latency of the machine. Then
 2198 * again, being late doesn't lose the delta, just wrecks the sample.
2199 *
2200 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
2201 * this would add another cross-cpu cacheline miss and atomic operation
2202 * to the wakeup path. Instead we increment on whatever cpu the task ran
2203 * when it went into uninterruptible state and decrement on whatever cpu
2204 * did the wakeup. This means that only the sum of nr_uninterruptible over
2205 * all cpus yields the correct result.
2206 *
2207 * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
2208 */
2209
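
The fixed-point update behind the avenrun[] formula above is easy to poke at outside the kernel. The following stand-alone user-space sketch re-implements one step of the decaying average using the familiar FSHIFT/FIXED_1/EXP_1 constants; it is an illustration of the arithmetic only, not the scheduler code itself, and the three-task sample fed into it is arbitrary.

#include <stdio.h>

#define FSHIFT   11                     /* bits of fixed-point precision */
#define FIXED_1  (1 << FSHIFT)          /* 1.0 in fixed point */
#define EXP_1    1884                   /* ~ FIXED_1 / exp(5sec/1min) */

/* One LOAD_FREQ window of the decaying average: a1 = a0*e + active*(1 - e). */
static unsigned long calc_load(unsigned long load, unsigned long exp,
                               unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        return load >> FSHIFT;
}

int main(void)
{
        unsigned long load1 = 0;
        unsigned long nr_active = 3 * FIXED_1;  /* pretend 3 tasks are runnable */
        int i;

        /* Feed the same sample for a minute's worth of 5-second windows. */
        for (i = 0; i < 12; i++) {
                load1 = calc_load(load1, EXP_1, nr_active);
                printf("window %2d: load1 = %lu.%02lu\n", i + 1,
                       load1 >> FSHIFT,
                       ((load1 & (FIXED_1 - 1)) * 100) >> FSHIFT);
        }
        return 0;
}

After these twelve windows (one minute) load1 has closed roughly 1 - 1/e of the gap to 3.00, which is the point of the exponential form: the average chases nr_active with a one-minute time constant instead of jumping to it.
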
2164/* Variables and functions for calc_load */ 2210/* Variables and functions for calc_load */
2165static atomic_long_t calc_load_tasks; 2211static atomic_long_t calc_load_tasks;
2166static unsigned long calc_load_update; 2212static unsigned long calc_load_update;
2167unsigned long avenrun[3]; 2213unsigned long avenrun[3];
2168EXPORT_SYMBOL(avenrun); 2214EXPORT_SYMBOL(avenrun); /* should be removed */
2215
2216/**
2217 * get_avenrun - get the load average array
2218 * @loads: pointer to dest load array
2219 * @offset: offset to add
2220 * @shift: shift count to shift the result left
2221 *
2222 * These values are estimates at best, so no need for locking.
2223 */
2224void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2225{
2226 loads[0] = (avenrun[0] + offset) << shift;
2227 loads[1] = (avenrun[1] + offset) << shift;
2228 loads[2] = (avenrun[2] + offset) << shift;
2229}
2169 2230
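
As a consumer-side illustration of the offset and shift parameters documented above, here is a small user-space sketch of the conventional /proc/loadavg-style formatting. The LOAD_INT/LOAD_FRAC helpers and the sample values are defined locally for the example; only the FIXED_1/200 rounding offset mirrors how such readers typically call get_avenrun().

#include <stdio.h>

#define FSHIFT   11
#define FIXED_1  (1 << FSHIFT)

/* Conventional helpers for turning a fixed-point sample into "X.YY". */
#define LOAD_INT(x)  ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

int main(void)
{
        unsigned long avnrun[3] = { 1536, 1024, 512 }; /* made-up avenrun values */
        unsigned long loads[3];
        int i;

        /* Stands in for get_avenrun(avnrun, FIXED_1/200, 0): the offset rounds
         * the second decimal digit instead of truncating it, the shift is 0. */
        for (i = 0; i < 3; i++)
                loads[i] = (avnrun[i] + FIXED_1 / 200) << 0;

        printf("%lu.%02lu %lu.%02lu %lu.%02lu\n",
               LOAD_INT(loads[0]), LOAD_FRAC(loads[0]),
               LOAD_INT(loads[1]), LOAD_FRAC(loads[1]),
               LOAD_INT(loads[2]), LOAD_FRAC(loads[2]));
        return 0;
}

With the made-up inputs this prints "0.75 0.50 0.25".
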
2170static long calc_load_fold_active(struct rq *this_rq) 2231static long calc_load_fold_active(struct rq *this_rq)
2171{ 2232{
@@ -2182,6 +2243,9 @@ static long calc_load_fold_active(struct rq *this_rq)
2182 return delta; 2243 return delta;
2183} 2244}
2184 2245
2246/*
2247 * a1 = a0 * e + a * (1 - e)
2248 */
2185static unsigned long 2249static unsigned long
2186calc_load(unsigned long load, unsigned long exp, unsigned long active) 2250calc_load(unsigned long load, unsigned long exp, unsigned long active)
2187{ 2251{
@@ -2193,30 +2257,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
2193 2257
2194#ifdef CONFIG_NO_HZ 2258#ifdef CONFIG_NO_HZ
2195/* 2259/*
2196 * For NO_HZ we delay the active fold to the next LOAD_FREQ update. 2260 * Handle NO_HZ for the global load-average.
2261 *
2262 * Since the above described distributed algorithm to compute the global
2263 * load-average relies on per-cpu sampling from the tick, it is affected by
2264 * NO_HZ.
2265 *
2266 * The basic idea is to fold the nr_active delta into a global idle-delta upon
2267 * entering NO_HZ state such that we can include this as an 'extra' cpu delta
2268 * when we read the global state.
2269 *
2270 * Obviously reality has to ruin such a delightfully simple scheme:
2271 *
2272 * - When we go NO_HZ idle during the window, we can negate our sample
2273 * contribution, causing under-accounting.
2274 *
2275 * We avoid this by keeping two idle-delta counters and flipping them
2276 * when the window starts, thus separating old and new NO_HZ load.
2277 *
2278 * The only trick is the slight shift in index flip for read vs write.
2279 *
2280 * 0s 5s 10s 15s
2281 * +10 +10 +10 +10
2282 * |-|-----------|-|-----------|-|-----------|-|
2283 * r:0 0 1 1 0 0 1 1 0
2284 * w:0 1 1 0 0 1 1 0 0
2285 *
2286 * This ensures we'll fold the old idle contribution in this window while
 2287 * accumulating the new one.
2288 *
2289 * - When we wake up from NO_HZ idle during the window, we push up our
2290 * contribution, since we effectively move our sample point to a known
2291 * busy state.
2292 *
2293 * This is solved by pushing the window forward, and thus skipping the
2294 * sample, for this cpu (effectively using the idle-delta for this cpu which
2295 * was in effect at the time the window opened). This also solves the issue
2296 * of having to deal with a cpu having been in NOHZ idle for multiple
2297 * LOAD_FREQ intervals.
2197 * 2298 *
2198 * When making the ILB scale, we should try to pull this in as well. 2299 * When making the ILB scale, we should try to pull this in as well.
2199 */ 2300 */
2200static atomic_long_t calc_load_tasks_idle; 2301static atomic_long_t calc_load_idle[2];
2302static int calc_load_idx;
2201 2303
2202void calc_load_account_idle(struct rq *this_rq) 2304static inline int calc_load_write_idx(void)
2203{ 2305{
2306 int idx = calc_load_idx;
2307
2308 /*
2309 * See calc_global_nohz(), if we observe the new index, we also
2310 * need to observe the new update time.
2311 */
2312 smp_rmb();
2313
2314 /*
2315 * If the folding window started, make sure we start writing in the
2316 * next idle-delta.
2317 */
2318 if (!time_before(jiffies, calc_load_update))
2319 idx++;
2320
2321 return idx & 1;
2322}
2323
2324static inline int calc_load_read_idx(void)
2325{
2326 return calc_load_idx & 1;
2327}
2328
2329void calc_load_enter_idle(void)
2330{
2331 struct rq *this_rq = this_rq();
2204 long delta; 2332 long delta;
2205 2333
2334 /*
2335 * We're going into NOHZ mode, if there's any pending delta, fold it
2336 * into the pending idle delta.
2337 */
2206 delta = calc_load_fold_active(this_rq); 2338 delta = calc_load_fold_active(this_rq);
2207 if (delta) 2339 if (delta) {
2208 atomic_long_add(delta, &calc_load_tasks_idle); 2340 int idx = calc_load_write_idx();
2341 atomic_long_add(delta, &calc_load_idle[idx]);
2342 }
2209} 2343}
2210 2344
2211static long calc_load_fold_idle(void) 2345void calc_load_exit_idle(void)
2212{ 2346{
2213 long delta = 0; 2347 struct rq *this_rq = this_rq();
2348
2349 /*
2350 * If we're still before the sample window, we're done.
2351 */
2352 if (time_before(jiffies, this_rq->calc_load_update))
2353 return;
2214 2354
2215 /* 2355 /*
2216 * Its got a race, we don't care... 2356 * We woke inside or after the sample window, this means we're already
2357 * accounted through the nohz accounting, so skip the entire deal and
2358 * sync up for the next window.
2217 */ 2359 */
2218 if (atomic_long_read(&calc_load_tasks_idle)) 2360 this_rq->calc_load_update = calc_load_update;
2219 delta = atomic_long_xchg(&calc_load_tasks_idle, 0); 2361 if (time_before(jiffies, this_rq->calc_load_update + 10))
2362 this_rq->calc_load_update += LOAD_FREQ;
2363}
2364
2365static long calc_load_fold_idle(void)
2366{
2367 int idx = calc_load_read_idx();
2368 long delta = 0;
2369
2370 if (atomic_long_read(&calc_load_idle[idx]))
2371 delta = atomic_long_xchg(&calc_load_idle[idx], 0);
2220 2372
2221 return delta; 2373 return delta;
2222} 2374}
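
The read/write index shift sketched in the timeline above can be easier to see in a stand-alone toy model. The sketch below mirrors the slot-selection logic only; window_open stands in for the !time_before(jiffies, calc_load_update) test, everything is single-threaded, and none of it is kernel code.

#include <stdio.h>
#include <stdbool.h>

static long idle_delta[2];      /* toy counterpart of calc_load_idle[] */
static int fold_idx;            /* toy counterpart of calc_load_idx */
static bool window_open;        /* has the current LOAD_FREQ window started? */

static int write_idx(void)
{
        int idx = fold_idx;

        if (window_open)        /* late idler: stay out of the slot being folded */
                idx++;
        return idx & 1;
}

static int read_idx(void)
{
        return fold_idx & 1;
}

int main(void)
{
        idle_delta[write_idx()] += 2;   /* a CPU goes idle before the window */
        window_open = true;
        idle_delta[write_idx()] += 5;   /* another goes idle after it opened */

        printf("fold now: %ld, deferred to next window: %ld\n",
               idle_delta[read_idx()], idle_delta[read_idx() ^ 1]);

        fold_idx++;                     /* flip, as calc_global_nohz() does */
        return 0;
}

The delta contributed after the window opened lands in the other slot, so the fold in progress only sees the old idle contribution, which is exactly the separation the comment above is after.
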
@@ -2302,66 +2454,39 @@ static void calc_global_nohz(void)
2302{ 2454{
2303 long delta, active, n; 2455 long delta, active, n;
2304 2456
2305 /* 2457 if (!time_before(jiffies, calc_load_update + 10)) {
2306 * If we crossed a calc_load_update boundary, make sure to fold 2458 /*
2307 * any pending idle changes, the respective CPUs might have 2459 * Catch-up, fold however many we are behind still
2308 * missed the tick driven calc_load_account_active() update 2460 */
2309 * due to NO_HZ. 2461 delta = jiffies - calc_load_update - 10;
2310 */ 2462 n = 1 + (delta / LOAD_FREQ);
2311 delta = calc_load_fold_idle();
2312 if (delta)
2313 atomic_long_add(delta, &calc_load_tasks);
2314
2315 /*
2316 * It could be the one fold was all it took, we done!
2317 */
2318 if (time_before(jiffies, calc_load_update + 10))
2319 return;
2320
2321 /*
2322 * Catch-up, fold however many we are behind still
2323 */
2324 delta = jiffies - calc_load_update - 10;
2325 n = 1 + (delta / LOAD_FREQ);
2326 2463
2327 active = atomic_long_read(&calc_load_tasks); 2464 active = atomic_long_read(&calc_load_tasks);
2328 active = active > 0 ? active * FIXED_1 : 0; 2465 active = active > 0 ? active * FIXED_1 : 0;
2329 2466
2330 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); 2467 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
2331 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); 2468 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
2332 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); 2469 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
2333 2470
2334 calc_load_update += n * LOAD_FREQ; 2471 calc_load_update += n * LOAD_FREQ;
2335} 2472 }
2336#else
2337void calc_load_account_idle(struct rq *this_rq)
2338{
2339}
2340 2473
2341static inline long calc_load_fold_idle(void) 2474 /*
2342{ 2475 * Flip the idle index...
2343 return 0; 2476 *
2477 * Make sure we first write the new time then flip the index, so that
2478 * calc_load_write_idx() will see the new time when it reads the new
2479 * index, this avoids a double flip messing things up.
2480 */
2481 smp_wmb();
2482 calc_load_idx++;
2344} 2483}
2484#else /* !CONFIG_NO_HZ */
2345 2485
2346static void calc_global_nohz(void) 2486static inline long calc_load_fold_idle(void) { return 0; }
2347{ 2487static inline void calc_global_nohz(void) { }
2348}
2349#endif
2350 2488
2351/** 2489#endif /* CONFIG_NO_HZ */
2352 * get_avenrun - get the load average array
2353 * @loads: pointer to dest load array
2354 * @offset: offset to add
2355 * @shift: shift count to shift the result left
2356 *
2357 * These values are estimates at best, so no need for locking.
2358 */
2359void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2360{
2361 loads[0] = (avenrun[0] + offset) << shift;
2362 loads[1] = (avenrun[1] + offset) << shift;
2363 loads[2] = (avenrun[2] + offset) << shift;
2364}
2365 2490
2366/* 2491/*
2367 * calc_load - update the avenrun load estimates 10 ticks after the 2492 * calc_load - update the avenrun load estimates 10 ticks after the
@@ -2369,11 +2494,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2369 */ 2494 */
2370void calc_global_load(unsigned long ticks) 2495void calc_global_load(unsigned long ticks)
2371{ 2496{
2372 long active; 2497 long active, delta;
2373 2498
2374 if (time_before(jiffies, calc_load_update + 10)) 2499 if (time_before(jiffies, calc_load_update + 10))
2375 return; 2500 return;
2376 2501
2502 /*
2503 * Fold the 'old' idle-delta to include all NO_HZ cpus.
2504 */
2505 delta = calc_load_fold_idle();
2506 if (delta)
2507 atomic_long_add(delta, &calc_load_tasks);
2508
2377 active = atomic_long_read(&calc_load_tasks); 2509 active = atomic_long_read(&calc_load_tasks);
2378 active = active > 0 ? active * FIXED_1 : 0; 2510 active = active > 0 ? active * FIXED_1 : 0;
2379 2511
@@ -2384,12 +2516,7 @@ void calc_global_load(unsigned long ticks)
2384 calc_load_update += LOAD_FREQ; 2516 calc_load_update += LOAD_FREQ;
2385 2517
2386 /* 2518 /*
2387 * Account one period with whatever state we found before 2519 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
2388 * folding in the nohz state and ageing the entire idle period.
2389 *
2390 * This avoids loosing a sample when we go idle between
2391 * calc_load_account_active() (10 ticks ago) and now and thus
2392 * under-accounting.
2393 */ 2520 */
2394 calc_global_nohz(); 2521 calc_global_nohz();
2395} 2522}
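
Spelled out, the bulk catch-up referenced in the comment above ("catch up in bulk") ages the averages over all of the missed windows in one step. With delta = jiffies - calc_load_update - 10 and n = 1 + delta / LOAD_FREQ, each average is advanced as

        avenrun[i] = avenrun[i] * exp_i^n  +  nr_active * (1 - exp_i^n)

which is the ordinary per-window update applied n times, so waking after several quiet LOAD_FREQ intervals decays the averages as if every intermediate sample had seen the current nr_active.
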
@@ -2406,7 +2533,6 @@ static void calc_load_account_active(struct rq *this_rq)
2406 return; 2533 return;
2407 2534
2408 delta = calc_load_fold_active(this_rq); 2535 delta = calc_load_fold_active(this_rq);
2409 delta += calc_load_fold_idle();
2410 if (delta) 2536 if (delta)
2411 atomic_long_add(delta, &calc_load_tasks); 2537 atomic_long_add(delta, &calc_load_tasks);
2412 2538
@@ -2414,6 +2540,10 @@ static void calc_load_account_active(struct rq *this_rq)
2414} 2540}
2415 2541
2416/* 2542/*
2543 * End of global load-average stuff
2544 */
2545
2546/*
2417 * The exact cpuload at various idx values, calculated at every tick would be 2547 * The exact cpuload at various idx values, calculated at every tick would be
2418 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load 2548 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
2419 * 2549 *
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index b44d604b35d1..b6baf370cae9 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -25,7 +25,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
25static struct task_struct *pick_next_task_idle(struct rq *rq) 25static struct task_struct *pick_next_task_idle(struct rq *rq)
26{ 26{
27 schedstat_inc(rq, sched_goidle); 27 schedstat_inc(rq, sched_goidle);
28 calc_load_account_idle(rq);
29 return rq->idle; 28 return rq->idle;
30} 29}
31 30
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6d52cea7f33d..55844f24435a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -942,8 +942,6 @@ static inline u64 sched_avg_period(void)
942 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; 942 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
943} 943}
944 944
945void calc_load_account_idle(struct rq *this_rq);
946
947#ifdef CONFIG_SCHED_HRTICK 945#ifdef CONFIG_SCHED_HRTICK
948 946
949/* 947/*
diff --git a/kernel/sys.c b/kernel/sys.c
index f0ec44dcd415..2d39a84cd857 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1788,7 +1788,6 @@ SYSCALL_DEFINE1(umask, int, mask)
1788#ifdef CONFIG_CHECKPOINT_RESTORE 1788#ifdef CONFIG_CHECKPOINT_RESTORE
1789static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) 1789static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1790{ 1790{
1791 struct vm_area_struct *vma;
1792 struct file *exe_file; 1791 struct file *exe_file;
1793 struct dentry *dentry; 1792 struct dentry *dentry;
1794 int err; 1793 int err;
@@ -1816,13 +1815,17 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1816 down_write(&mm->mmap_sem); 1815 down_write(&mm->mmap_sem);
1817 1816
1818 /* 1817 /*
1819 * Forbid mm->exe_file change if there are mapped other files. 1818 * Forbid mm->exe_file change if old file still mapped.
1820 */ 1819 */
1821 err = -EBUSY; 1820 err = -EBUSY;
1822 for (vma = mm->mmap; vma; vma = vma->vm_next) { 1821 if (mm->exe_file) {
1823 if (vma->vm_file && !path_equal(&vma->vm_file->f_path, 1822 struct vm_area_struct *vma;
1824 &exe_file->f_path)) 1823
1825 goto exit_unlock; 1824 for (vma = mm->mmap; vma; vma = vma->vm_next)
1825 if (vma->vm_file &&
1826 path_equal(&vma->vm_file->f_path,
1827 &mm->exe_file->f_path))
1828 goto exit_unlock;
1826 } 1829 }
1827 1830
1828 /* 1831 /*
@@ -1835,6 +1838,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1835 if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) 1838 if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
1836 goto exit_unlock; 1839 goto exit_unlock;
1837 1840
1841 err = 0;
1838 set_mm_exe_file(mm, exe_file); 1842 set_mm_exe_file(mm, exe_file);
1839exit_unlock: 1843exit_unlock:
1840 up_write(&mm->mmap_sem); 1844 up_write(&mm->mmap_sem);
@@ -2127,9 +2131,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2127 else 2131 else
2128 return -EINVAL; 2132 return -EINVAL;
2129 break; 2133 break;
2130 case PR_GET_TID_ADDRESS:
2131 error = prctl_get_tid_address(me, (int __user **)arg2);
2132 break;
2133 default: 2134 default:
2134 return -EINVAL; 2135 return -EINVAL;
2135 } 2136 }
@@ -2147,6 +2148,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2147 case PR_SET_MM: 2148 case PR_SET_MM:
2148 error = prctl_set_mm(arg2, arg3, arg4, arg5); 2149 error = prctl_set_mm(arg2, arg3, arg4, arg5);
2149 break; 2150 break;
2151 case PR_GET_TID_ADDRESS:
2152 error = prctl_get_tid_address(me, (int __user **)arg2);
2153 break;
2150 case PR_SET_CHILD_SUBREAPER: 2154 case PR_SET_CHILD_SUBREAPER:
2151 me->signal->is_child_subreaper = !!arg2; 2155 me->signal->is_child_subreaper = !!arg2;
2152 error = 0; 2156 error = 0;
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 70b33abcc7bb..b7fbadc5c973 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -409,7 +409,9 @@ int second_overflow(unsigned long secs)
409 time_state = TIME_DEL; 409 time_state = TIME_DEL;
410 break; 410 break;
411 case TIME_INS: 411 case TIME_INS:
412 if (secs % 86400 == 0) { 412 if (!(time_status & STA_INS))
413 time_state = TIME_OK;
414 else if (secs % 86400 == 0) {
413 leap = -1; 415 leap = -1;
414 time_state = TIME_OOP; 416 time_state = TIME_OOP;
415 time_tai++; 417 time_tai++;
@@ -418,7 +420,9 @@ int second_overflow(unsigned long secs)
418 } 420 }
419 break; 421 break;
420 case TIME_DEL: 422 case TIME_DEL:
421 if ((secs + 1) % 86400 == 0) { 423 if (!(time_status & STA_DEL))
424 time_state = TIME_OK;
425 else if ((secs + 1) % 86400 == 0) {
422 leap = 1; 426 leap = 1;
423 time_tai--; 427 time_tai--;
424 time_state = TIME_WAIT; 428 time_state = TIME_WAIT;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 60c9c60e9108..41be02250e08 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -276,10 +276,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
276{ 276{
277 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; 277 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
278 ktime_t last_update, expires, ret = { .tv64 = 0 }; 278 ktime_t last_update, expires, ret = { .tv64 = 0 };
279 unsigned long rcu_delta_jiffies;
279 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 280 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
280 u64 time_delta; 281 u64 time_delta;
281 282
282
283 /* Read jiffies and the time when jiffies were updated last */ 283 /* Read jiffies and the time when jiffies were updated last */
284 do { 284 do {
285 seq = read_seqbegin(&xtime_lock); 285 seq = read_seqbegin(&xtime_lock);
@@ -288,7 +288,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
288 time_delta = timekeeping_max_deferment(); 288 time_delta = timekeeping_max_deferment();
289 } while (read_seqretry(&xtime_lock, seq)); 289 } while (read_seqretry(&xtime_lock, seq));
290 290
291 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || 291 if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) ||
292 arch_needs_cpu(cpu)) { 292 arch_needs_cpu(cpu)) {
293 next_jiffies = last_jiffies + 1; 293 next_jiffies = last_jiffies + 1;
294 delta_jiffies = 1; 294 delta_jiffies = 1;
@@ -296,6 +296,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
296 /* Get the next timer wheel timer */ 296 /* Get the next timer wheel timer */
297 next_jiffies = get_next_timer_interrupt(last_jiffies); 297 next_jiffies = get_next_timer_interrupt(last_jiffies);
298 delta_jiffies = next_jiffies - last_jiffies; 298 delta_jiffies = next_jiffies - last_jiffies;
299 if (rcu_delta_jiffies < delta_jiffies) {
300 next_jiffies = last_jiffies + rcu_delta_jiffies;
301 delta_jiffies = rcu_delta_jiffies;
302 }
299 } 303 }
300 /* 304 /*
301 * Do not stop the tick, if we are only one off 305 * Do not stop the tick, if we are only one off
@@ -369,6 +373,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
369 */ 373 */
370 if (!ts->tick_stopped) { 374 if (!ts->tick_stopped) {
371 select_nohz_load_balancer(1); 375 select_nohz_load_balancer(1);
376 calc_load_enter_idle();
372 377
373 ts->last_tick = hrtimer_get_expires(&ts->sched_timer); 378 ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
374 ts->tick_stopped = 1; 379 ts->tick_stopped = 1;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 6f46a00a1e8a..269b1fe5f2ae 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -70,6 +70,12 @@ struct timekeeper {
70 /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ 70 /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */
71 struct timespec raw_time; 71 struct timespec raw_time;
72 72
73 /* Offset clock monotonic -> clock realtime */
74 ktime_t offs_real;
75
76 /* Offset clock monotonic -> clock boottime */
77 ktime_t offs_boot;
78
73 /* Seqlock for all timekeeper values */ 79 /* Seqlock for all timekeeper values */
74 seqlock_t lock; 80 seqlock_t lock;
75}; 81};
@@ -172,6 +178,14 @@ static inline s64 timekeeping_get_ns_raw(void)
172 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 178 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
173} 179}
174 180
181static void update_rt_offset(void)
182{
183 struct timespec tmp, *wtm = &timekeeper.wall_to_monotonic;
184
185 set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec);
186 timekeeper.offs_real = timespec_to_ktime(tmp);
187}
188
175/* must hold write on timekeeper.lock */ 189/* must hold write on timekeeper.lock */
176static void timekeeping_update(bool clearntp) 190static void timekeeping_update(bool clearntp)
177{ 191{
@@ -179,6 +193,7 @@ static void timekeeping_update(bool clearntp)
179 timekeeper.ntp_error = 0; 193 timekeeper.ntp_error = 0;
180 ntp_clear(); 194 ntp_clear();
181 } 195 }
196 update_rt_offset();
182 update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, 197 update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic,
183 timekeeper.clock, timekeeper.mult); 198 timekeeper.clock, timekeeper.mult);
184} 199}
@@ -604,6 +619,7 @@ void __init timekeeping_init(void)
604 } 619 }
605 set_normalized_timespec(&timekeeper.wall_to_monotonic, 620 set_normalized_timespec(&timekeeper.wall_to_monotonic,
606 -boot.tv_sec, -boot.tv_nsec); 621 -boot.tv_sec, -boot.tv_nsec);
622 update_rt_offset();
607 timekeeper.total_sleep_time.tv_sec = 0; 623 timekeeper.total_sleep_time.tv_sec = 0;
608 timekeeper.total_sleep_time.tv_nsec = 0; 624 timekeeper.total_sleep_time.tv_nsec = 0;
609 write_sequnlock_irqrestore(&timekeeper.lock, flags); 625 write_sequnlock_irqrestore(&timekeeper.lock, flags);
@@ -612,6 +628,12 @@ void __init timekeeping_init(void)
612/* time in seconds when suspend began */ 628/* time in seconds when suspend began */
613static struct timespec timekeeping_suspend_time; 629static struct timespec timekeeping_suspend_time;
614 630
631static void update_sleep_time(struct timespec t)
632{
633 timekeeper.total_sleep_time = t;
634 timekeeper.offs_boot = timespec_to_ktime(t);
635}
636
615/** 637/**
616 * __timekeeping_inject_sleeptime - Internal function to add sleep interval 638 * __timekeeping_inject_sleeptime - Internal function to add sleep interval
617 * @delta: pointer to a timespec delta value 639 * @delta: pointer to a timespec delta value
@@ -630,8 +652,7 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta)
630 timekeeper.xtime = timespec_add(timekeeper.xtime, *delta); 652 timekeeper.xtime = timespec_add(timekeeper.xtime, *delta);
631 timekeeper.wall_to_monotonic = 653 timekeeper.wall_to_monotonic =
632 timespec_sub(timekeeper.wall_to_monotonic, *delta); 654 timespec_sub(timekeeper.wall_to_monotonic, *delta);
633 timekeeper.total_sleep_time = timespec_add( 655 update_sleep_time(timespec_add(timekeeper.total_sleep_time, *delta));
634 timekeeper.total_sleep_time, *delta);
635} 656}
636 657
637 658
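
Restated in equation form, the two cached offsets introduced above encode the invariants that update_rt_offset() and update_sleep_time() maintain (this is a restatement, not new code):

        offs_real = -wall_to_monotonic   =>  CLOCK_REALTIME = CLOCK_MONOTONIC + offs_real
        offs_boot = total_sleep_time     =>  CLOCK_BOOTTIME = CLOCK_MONOTONIC + offs_boot

so any code that already holds a monotonic timestamp plus these two ktime_t values can derive the other clocks without touching the timekeeper lock again.
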
@@ -963,6 +984,8 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
963 leap = second_overflow(timekeeper.xtime.tv_sec); 984 leap = second_overflow(timekeeper.xtime.tv_sec);
964 timekeeper.xtime.tv_sec += leap; 985 timekeeper.xtime.tv_sec += leap;
965 timekeeper.wall_to_monotonic.tv_sec -= leap; 986 timekeeper.wall_to_monotonic.tv_sec -= leap;
987 if (leap)
988 clock_was_set_delayed();
966 } 989 }
967 990
968 /* Accumulate raw time */ 991 /* Accumulate raw time */
@@ -1079,6 +1102,8 @@ static void update_wall_time(void)
1079 leap = second_overflow(timekeeper.xtime.tv_sec); 1102 leap = second_overflow(timekeeper.xtime.tv_sec);
1080 timekeeper.xtime.tv_sec += leap; 1103 timekeeper.xtime.tv_sec += leap;
1081 timekeeper.wall_to_monotonic.tv_sec -= leap; 1104 timekeeper.wall_to_monotonic.tv_sec -= leap;
1105 if (leap)
1106 clock_was_set_delayed();
1082 } 1107 }
1083 1108
1084 timekeeping_update(false); 1109 timekeeping_update(false);
@@ -1246,6 +1271,40 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1246 } while (read_seqretry(&timekeeper.lock, seq)); 1271 } while (read_seqretry(&timekeeper.lock, seq));
1247} 1272}
1248 1273
1274#ifdef CONFIG_HIGH_RES_TIMERS
1275/**
1276 * ktime_get_update_offsets - hrtimer helper
1277 * @offs_real: pointer to storage for monotonic -> realtime offset
1278 * @offs_boot: pointer to storage for monotonic -> boottime offset
1279 *
1280 * Returns current monotonic time and updates the offsets
 1281 * Called from hrtimer_interrupt() or retrigger_next_event()
1282 */
1283ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot)
1284{
1285 ktime_t now;
1286 unsigned int seq;
1287 u64 secs, nsecs;
1288
1289 do {
1290 seq = read_seqbegin(&timekeeper.lock);
1291
1292 secs = timekeeper.xtime.tv_sec;
1293 nsecs = timekeeper.xtime.tv_nsec;
1294 nsecs += timekeeping_get_ns();
1295 /* If arch requires, add in gettimeoffset() */
1296 nsecs += arch_gettimeoffset();
1297
1298 *offs_real = timekeeper.offs_real;
1299 *offs_boot = timekeeper.offs_boot;
1300 } while (read_seqretry(&timekeeper.lock, seq));
1301
1302 now = ktime_add_ns(ktime_set(secs, 0), nsecs);
1303 now = ktime_sub(now, *offs_real);
1304 return now;
1305}
1306#endif
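
To show the intended consumer side, here is a hedged sketch of how hrtimer code could use the new helper to refresh its per-clock base offsets in one consistent read. The hrtimer_cpu_base layout, the clock_base[].offset field and the HRTIMER_BASE_* indices are assumptions for the sake of the example; the real wiring lives in kernel/hrtimer.c, which this merge also touches but which is not shown here.

#include <linux/hrtimer.h>
#include <linux/ktime.h>

/* Sketch only: fetch the monotonic "now" and both offsets under one
 * seqlock-consistent read, caching the offsets in the clock bases. */
static ktime_t refresh_clock_base_offsets(struct hrtimer_cpu_base *cpu_base)
{
        ktime_t *offs_real = &cpu_base->clock_base[HRTIMER_BASE_REALTIME].offset;
        ktime_t *offs_boot = &cpu_base->clock_base[HRTIMER_BASE_BOOTTIME].offset;

        return ktime_get_update_offsets(offs_real, offs_boot);
}

Because the time and the offsets come out of a single read_seqbegin/read_seqretry pass inside the helper, the caller cannot pair a realtime offset from before a leap-second or settimeofday() update with a monotonic timestamp taken after it.
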
1307
1249/** 1308/**
1250 * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format 1309 * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format
1251 */ 1310 */
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1d0f6a8a0e5e..f765465bffe4 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1075,6 +1075,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
1075 rb_init_page(bpage->page); 1075 rb_init_page(bpage->page);
1076 1076
1077 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 1077 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
1078 INIT_LIST_HEAD(&cpu_buffer->new_pages);
1078 1079
1079 ret = rb_allocate_pages(cpu_buffer, nr_pages); 1080 ret = rb_allocate_pages(cpu_buffer, nr_pages);
1080 if (ret < 0) 1081 if (ret < 0)
@@ -1346,10 +1347,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages)
1346 * If something was added to this page, it was full 1347 * If something was added to this page, it was full
1347 * since it is not the tail page. So we deduct the 1348 * since it is not the tail page. So we deduct the
1348 * bytes consumed in ring buffer from here. 1349 * bytes consumed in ring buffer from here.
1349 * No need to update overruns, since this page is 1350 * Increment overrun to account for the lost events.
1350 * deleted from ring buffer and its entries are
1351 * already accounted for.
1352 */ 1351 */
1352 local_add(page_entries, &cpu_buffer->overrun);
1353 local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); 1353 local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
1354 } 1354 }
1355 1355
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 68032c6177db..a7fa0702be1c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -371,7 +371,7 @@ EXPORT_SYMBOL_GPL(tracing_on);
371void tracing_off(void) 371void tracing_off(void)
372{ 372{
373 if (global_trace.buffer) 373 if (global_trace.buffer)
374 ring_buffer_record_on(global_trace.buffer); 374 ring_buffer_record_off(global_trace.buffer);
375 /* 375 /*
376 * This flag is only looked at when buffers haven't been 376 * This flag is only looked at when buffers haven't been
377 * allocated yet. We don't really care about the race 377 * allocated yet. We don't really care about the race
@@ -3609,6 +3609,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3609 .pages = pages_def, 3609 .pages = pages_def,
3610 .partial = partial_def, 3610 .partial = partial_def,
3611 .nr_pages = 0, /* This gets updated below. */ 3611 .nr_pages = 0, /* This gets updated below. */
3612 .nr_pages_max = PIPE_DEF_BUFFERS,
3612 .flags = flags, 3613 .flags = flags,
3613 .ops = &tracing_pipe_buf_ops, 3614 .ops = &tracing_pipe_buf_ops,
3614 .spd_release = tracing_spd_release_pipe, 3615 .spd_release = tracing_spd_release_pipe,
@@ -3680,7 +3681,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3680 3681
3681 ret = splice_to_pipe(pipe, &spd); 3682 ret = splice_to_pipe(pipe, &spd);
3682out: 3683out:
3683 splice_shrink_spd(pipe, &spd); 3684 splice_shrink_spd(&spd);
3684 return ret; 3685 return ret;
3685 3686
3686out_err: 3687out_err:
@@ -4231,6 +4232,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4231 struct splice_pipe_desc spd = { 4232 struct splice_pipe_desc spd = {
4232 .pages = pages_def, 4233 .pages = pages_def,
4233 .partial = partial_def, 4234 .partial = partial_def,
4235 .nr_pages_max = PIPE_DEF_BUFFERS,
4234 .flags = flags, 4236 .flags = flags,
4235 .ops = &buffer_pipe_buf_ops, 4237 .ops = &buffer_pipe_buf_ops,
4236 .spd_release = buffer_spd_release, 4238 .spd_release = buffer_spd_release,
@@ -4318,7 +4320,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4318 } 4320 }
4319 4321
4320 ret = splice_to_pipe(pipe, &spd); 4322 ret = splice_to_pipe(pipe, &spd);
4321 splice_shrink_spd(pipe, &spd); 4323 splice_shrink_spd(&spd);
4322out: 4324out:
4323 return ret; 4325 return ret;
4324} 4326}
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index e5e1d85b8c7c..4b1dfba70f7c 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -372,6 +372,13 @@ static int watchdog(void *unused)
372 372
373 373
374#ifdef CONFIG_HARDLOCKUP_DETECTOR 374#ifdef CONFIG_HARDLOCKUP_DETECTOR
375/*
376 * People like the simple clean cpu node info on boot.
377 * Reduce the watchdog noise by only printing messages
378 * that are different from what cpu0 displayed.
379 */
380static unsigned long cpu0_err;
381
375static int watchdog_nmi_enable(int cpu) 382static int watchdog_nmi_enable(int cpu)
376{ 383{
377 struct perf_event_attr *wd_attr; 384 struct perf_event_attr *wd_attr;
@@ -390,11 +397,21 @@ static int watchdog_nmi_enable(int cpu)
390 397
391 /* Try to register using hardware perf events */ 398 /* Try to register using hardware perf events */
392 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); 399 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
400
 401 /* save cpu0 error for future comparison */
402 if (cpu == 0 && IS_ERR(event))
403 cpu0_err = PTR_ERR(event);
404
393 if (!IS_ERR(event)) { 405 if (!IS_ERR(event)) {
394 pr_info("enabled, takes one hw-pmu counter.\n"); 406 /* only print for cpu0 or different than cpu0 */
407 if (cpu == 0 || cpu0_err)
408 pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
395 goto out_save; 409 goto out_save;
396 } 410 }
397 411
412 /* skip displaying the same error again */
413 if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
414 return PTR_ERR(event);
398 415
399 /* vary the KERN level based on the returned errno */ 416 /* vary the KERN level based on the returned errno */
400 if (PTR_ERR(event) == -EOPNOTSUPP) 417 if (PTR_ERR(event) == -EOPNOTSUPP)