authorAnton Vorontsov <anton.vorontsov@linaro.org>2012-07-31 07:59:42 -0400
committerAnton Vorontsov <anton.vorontsov@linaro.org>2012-07-31 08:16:47 -0400
commite6db06a53b1dcf4e9da4aba143e2eb4d63418abb (patch)
tree10adcecb71c95ce4393c39fa7911d091bcadfe09 /kernel
parentecc2edd56c49fa31a0a9ed15a7bf810ae79d3b85 (diff)
parentc56f5c0342dfee11a1a13d2f5bb7618de5b17590 (diff)
Merge with upstream to accommodate the thermal changes
This merge is performed to take commit c56f5c0342dfee11a1 ("Thermal: Make
Thermal trip points writeable") out of Linus' tree and then fix up the power
supply class. This is needed since the thermal code added a new argument to
thermal_zone_device_register():

  CC      drivers/power/power_supply_core.o
drivers/power/power_supply_core.c: In function ‘psy_register_thermal’:
drivers/power/power_supply_core.c:204:6: warning: passing argument 3 of ‘thermal_zone_device_register’ makes integer from pointer without a cast [enabled by default]
include/linux/thermal.h:154:29: note: expected ‘int’ but argument is of type ‘struct power_supply *’
drivers/power/power_supply_core.c:204:6: error: too few arguments to function ‘thermal_zone_device_register’
include/linux/thermal.h:154:29: note: declared here
make[1]: *** [drivers/power/power_supply_core.o] Error 1
make: *** [drivers/power/] Error 2

Signed-off-by: Anton Vorontsov <anton.vorontsov@linaro.org>
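For reference, a minimal sketch of the kind of fixup the power supply class needs after this merge: commit c56f5c0342 makes trip points writeable by adding an integer mask as the third parameter of thermal_zone_device_register(), so the registration call in drivers/power/power_supply_core.c has to pass one extra argument. The sketch below is illustrative only, not a hunk from this merge; the zero argument values, the cast, and the psy_tzd_ops and tzd names are assumptions used for the example.

/* Illustrative sketch, not the exact power_supply_core.c change. */
#include <linux/err.h>
#include <linux/power_supply.h>
#include <linux/thermal.h>

/* thermal callbacks (.get_temp, ...); left empty here, filled in by the real driver */
static struct thermal_zone_device_ops psy_tzd_ops;

static int psy_register_thermal(struct power_supply *psy)
{
        /*
         * Arguments: name, trip count, writeable-trip mask (the new third
         * argument), devdata, ops, tc1, tc2, passive delay, polling delay.
         * Without the mask, 'psy' lands in an int slot and the call has too
         * few arguments, as the build log above shows.
         */
        psy->tzd = thermal_zone_device_register((char *)psy->name, 0, 0, psy,
                                                &psy_tzd_ops, 0, 0, 0, 0);
        if (IS_ERR(psy->tzd))
                return PTR_ERR(psy->tzd);
        return 0;
}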
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup.c                  36
-rw-r--r--  kernel/debug/kdb/kdb_main.c      91
-rw-r--r--  kernel/debug/kdb/kdb_private.h    1
-rw-r--r--  kernel/events/core.c             10
-rw-r--r--  kernel/exit.c                    19
-rw-r--r--  kernel/fork.c                    11
-rw-r--r--  kernel/hrtimer.c                 53
-rw-r--r--  kernel/pid_namespace.c           20
-rw-r--r--  kernel/power/hibernate.c          8
-rw-r--r--  kernel/power/user.c               2
-rw-r--r--  kernel/printk.c                 727
-rw-r--r--  kernel/rcutree.c                 15
-rw-r--r--  kernel/rcutree.h                  1
-rw-r--r--  kernel/rcutree_plugin.h          14
-rw-r--r--  kernel/relay.c                    5
-rw-r--r--  kernel/sched/core.c             276
-rw-r--r--  kernel/sched/idle_task.c          1
-rw-r--r--  kernel/sched/sched.h              2
-rw-r--r--  kernel/sys.c                     22
-rw-r--r--  kernel/time/ntp.c                 8
-rw-r--r--  kernel/time/tick-sched.c          2
-rw-r--r--  kernel/time/timekeeping.c        64
-rw-r--r--  kernel/trace/ring_buffer.c        6
-rw-r--r--  kernel/trace/trace.c              6
24 files changed, 1006 insertions, 394 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 72fcd3069a90..b303dfc7dce0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -255,12 +255,17 @@ int cgroup_lock_is_held(void)
255 255
256EXPORT_SYMBOL_GPL(cgroup_lock_is_held); 256EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
257 257
258static int css_unbias_refcnt(int refcnt)
259{
260 return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
261}
262
258/* the current nr of refs, always >= 0 whether @css is deactivated or not */ 263/* the current nr of refs, always >= 0 whether @css is deactivated or not */
259static int css_refcnt(struct cgroup_subsys_state *css) 264static int css_refcnt(struct cgroup_subsys_state *css)
260{ 265{
261 int v = atomic_read(&css->refcnt); 266 int v = atomic_read(&css->refcnt);
262 267
263 return v >= 0 ? v : v - CSS_DEACT_BIAS; 268 return css_unbias_refcnt(v);
264} 269}
265 270
266/* convenient tests for these bits */ 271/* convenient tests for these bits */
@@ -896,13 +901,10 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
896 mutex_unlock(&cgroup_mutex); 901 mutex_unlock(&cgroup_mutex);
897 902
898 /* 903 /*
899 * We want to drop the active superblock reference from the 904 * Drop the active superblock reference that we took when we
900 * cgroup creation after all the dentry refs are gone - 905 * created the cgroup
901 * kill_sb gets mighty unhappy otherwise. Mark
902 * dentry->d_fsdata with cgroup_diput() to tell
903 * cgroup_d_release() to call deactivate_super().
904 */ 906 */
905 dentry->d_fsdata = cgroup_diput; 907 deactivate_super(cgrp->root->sb);
906 908
907 /* 909 /*
908 * if we're getting rid of the cgroup, refcount should ensure 910 * if we're getting rid of the cgroup, refcount should ensure
@@ -928,13 +930,6 @@ static int cgroup_delete(const struct dentry *d)
928 return 1; 930 return 1;
929} 931}
930 932
931static void cgroup_d_release(struct dentry *dentry)
932{
933 /* did cgroup_diput() tell me to deactivate super? */
934 if (dentry->d_fsdata == cgroup_diput)
935 deactivate_super(dentry->d_sb);
936}
937
938static void remove_dir(struct dentry *d) 933static void remove_dir(struct dentry *d)
939{ 934{
940 struct dentry *parent = dget(d->d_parent); 935 struct dentry *parent = dget(d->d_parent);
@@ -1542,7 +1537,6 @@ static int cgroup_get_rootdir(struct super_block *sb)
1542 static const struct dentry_operations cgroup_dops = { 1537 static const struct dentry_operations cgroup_dops = {
1543 .d_iput = cgroup_diput, 1538 .d_iput = cgroup_diput,
1544 .d_delete = cgroup_delete, 1539 .d_delete = cgroup_delete,
1545 .d_release = cgroup_d_release,
1546 }; 1540 };
1547 1541
1548 struct inode *inode = 1542 struct inode *inode =
@@ -3889,8 +3883,12 @@ static void css_dput_fn(struct work_struct *work)
3889{ 3883{
3890 struct cgroup_subsys_state *css = 3884 struct cgroup_subsys_state *css =
3891 container_of(work, struct cgroup_subsys_state, dput_work); 3885 container_of(work, struct cgroup_subsys_state, dput_work);
3886 struct dentry *dentry = css->cgroup->dentry;
3887 struct super_block *sb = dentry->d_sb;
3892 3888
3893 dput(css->cgroup->dentry); 3889 atomic_inc(&sb->s_active);
3890 dput(dentry);
3891 deactivate_super(sb);
3894} 3892}
3895 3893
3896static void init_cgroup_css(struct cgroup_subsys_state *css, 3894static void init_cgroup_css(struct cgroup_subsys_state *css,
@@ -4982,10 +4980,12 @@ EXPORT_SYMBOL_GPL(__css_tryget);
4982void __css_put(struct cgroup_subsys_state *css) 4980void __css_put(struct cgroup_subsys_state *css)
4983{ 4981{
4984 struct cgroup *cgrp = css->cgroup; 4982 struct cgroup *cgrp = css->cgroup;
4983 int v;
4985 4984
4986 rcu_read_lock(); 4985 rcu_read_lock();
4987 atomic_dec(&css->refcnt); 4986 v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
4988 switch (css_refcnt(css)) { 4987
4988 switch (v) {
4989 case 1: 4989 case 1:
4990 if (notify_on_release(cgrp)) { 4990 if (notify_on_release(cgrp)) {
4991 set_bit(CGRP_RELEASABLE, &cgrp->flags); 4991 set_bit(CGRP_RELEASABLE, &cgrp->flags);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 67b847dfa2bb..1f91413edb87 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -14,6 +14,7 @@
14#include <linux/ctype.h> 14#include <linux/ctype.h>
15#include <linux/string.h> 15#include <linux/string.h>
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/kmsg_dump.h>
17#include <linux/reboot.h> 18#include <linux/reboot.h>
18#include <linux/sched.h> 19#include <linux/sched.h>
19#include <linux/sysrq.h> 20#include <linux/sysrq.h>
@@ -2040,8 +2041,15 @@ static int kdb_env(int argc, const char **argv)
2040 */ 2041 */
2041static int kdb_dmesg(int argc, const char **argv) 2042static int kdb_dmesg(int argc, const char **argv)
2042{ 2043{
2043 char *syslog_data[4], *start, *end, c = '\0', *p; 2044 int diag;
2044 int diag, logging, logsize, lines = 0, adjust = 0, n; 2045 int logging;
2046 int lines = 0;
2047 int adjust = 0;
2048 int n = 0;
2049 int skip = 0;
2050 struct kmsg_dumper dumper = { .active = 1 };
2051 size_t len;
2052 char buf[201];
2045 2053
2046 if (argc > 2) 2054 if (argc > 2)
2047 return KDB_ARGCOUNT; 2055 return KDB_ARGCOUNT;
@@ -2064,22 +2072,10 @@ static int kdb_dmesg(int argc, const char **argv)
2064 kdb_set(2, setargs); 2072 kdb_set(2, setargs);
2065 } 2073 }
2066 2074
2067 /* syslog_data[0,1] physical start, end+1. syslog_data[2,3] 2075 kmsg_dump_rewind_nolock(&dumper);
2068 * logical start, end+1. */ 2076 while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL))
2069 kdb_syslog_data(syslog_data); 2077 n++;
2070 if (syslog_data[2] == syslog_data[3]) 2078
2071 return 0;
2072 logsize = syslog_data[1] - syslog_data[0];
2073 start = syslog_data[2];
2074 end = syslog_data[3];
2075#define KDB_WRAP(p) (((p - syslog_data[0]) % logsize) + syslog_data[0])
2076 for (n = 0, p = start; p < end; ++p) {
2077 c = *KDB_WRAP(p);
2078 if (c == '\n')
2079 ++n;
2080 }
2081 if (c != '\n')
2082 ++n;
2083 if (lines < 0) { 2079 if (lines < 0) {
2084 if (adjust >= n) 2080 if (adjust >= n)
2085 kdb_printf("buffer only contains %d lines, nothing " 2081 kdb_printf("buffer only contains %d lines, nothing "
@@ -2087,21 +2083,11 @@ static int kdb_dmesg(int argc, const char **argv)
2087 else if (adjust - lines >= n) 2083 else if (adjust - lines >= n)
2088 kdb_printf("buffer only contains %d lines, last %d " 2084 kdb_printf("buffer only contains %d lines, last %d "
2089 "lines printed\n", n, n - adjust); 2085 "lines printed\n", n, n - adjust);
2090 if (adjust) { 2086 skip = adjust;
2091 for (; start < end && adjust; ++start) { 2087 lines = abs(lines);
2092 if (*KDB_WRAP(start) == '\n')
2093 --adjust;
2094 }
2095 if (start < end)
2096 ++start;
2097 }
2098 for (p = start; p < end && lines; ++p) {
2099 if (*KDB_WRAP(p) == '\n')
2100 ++lines;
2101 }
2102 end = p;
2103 } else if (lines > 0) { 2088 } else if (lines > 0) {
2104 int skip = n - (adjust + lines); 2089 skip = n - lines - adjust;
2090 lines = abs(lines);
2105 if (adjust >= n) { 2091 if (adjust >= n) {
2106 kdb_printf("buffer only contains %d lines, " 2092 kdb_printf("buffer only contains %d lines, "
2107 "nothing printed\n", n); 2093 "nothing printed\n", n);
@@ -2112,35 +2098,24 @@ static int kdb_dmesg(int argc, const char **argv)
2112 kdb_printf("buffer only contains %d lines, first " 2098 kdb_printf("buffer only contains %d lines, first "
2113 "%d lines printed\n", n, lines); 2099 "%d lines printed\n", n, lines);
2114 } 2100 }
2115 for (; start < end && skip; ++start) { 2101 } else {
2116 if (*KDB_WRAP(start) == '\n') 2102 lines = n;
2117 --skip;
2118 }
2119 for (p = start; p < end && lines; ++p) {
2120 if (*KDB_WRAP(p) == '\n')
2121 --lines;
2122 }
2123 end = p;
2124 } 2103 }
2125 /* Do a line at a time (max 200 chars) to reduce protocol overhead */ 2104
2126 c = '\n'; 2105 if (skip >= n || skip < 0)
2127 while (start != end) { 2106 return 0;
2128 char buf[201]; 2107
2129 p = buf; 2108 kmsg_dump_rewind_nolock(&dumper);
2130 if (KDB_FLAG(CMD_INTERRUPT)) 2109 while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) {
2131 return 0; 2110 if (skip) {
2132 while (start < end && (c = *KDB_WRAP(start)) && 2111 skip--;
2133 (p - buf) < sizeof(buf)-1) { 2112 continue;
2134 ++start;
2135 *p++ = c;
2136 if (c == '\n')
2137 break;
2138 } 2113 }
2139 *p = '\0'; 2114 if (!lines--)
2140 kdb_printf("%s", buf); 2115 break;
2116
2117 kdb_printf("%.*s\n", (int)len - 1, buf);
2141 } 2118 }
2142 if (c != '\n')
2143 kdb_printf("\n");
2144 2119
2145 return 0; 2120 return 0;
2146} 2121}
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 47c4e56e513b..392ec6a25844 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -205,7 +205,6 @@ extern char kdb_grep_string[];
205extern int kdb_grep_leading; 205extern int kdb_grep_leading;
206extern int kdb_grep_trailing; 206extern int kdb_grep_trailing;
207extern char *kdb_cmds[]; 207extern char *kdb_cmds[];
208extern void kdb_syslog_data(char *syslog_data[]);
209extern unsigned long kdb_task_state_string(const char *); 208extern unsigned long kdb_task_state_string(const char *);
210extern char kdb_task_state_char (const struct task_struct *); 209extern char kdb_task_state_char (const struct task_struct *);
211extern unsigned long kdb_task_state(const struct task_struct *p, 210extern unsigned long kdb_task_state(const struct task_struct *p,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f85c0154b333..d7d71d6ec972 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -253,9 +253,9 @@ perf_cgroup_match(struct perf_event *event)
253 return !event->cgrp || event->cgrp == cpuctx->cgrp; 253 return !event->cgrp || event->cgrp == cpuctx->cgrp;
254} 254}
255 255
256static inline void perf_get_cgroup(struct perf_event *event) 256static inline bool perf_tryget_cgroup(struct perf_event *event)
257{ 257{
258 css_get(&event->cgrp->css); 258 return css_tryget(&event->cgrp->css);
259} 259}
260 260
261static inline void perf_put_cgroup(struct perf_event *event) 261static inline void perf_put_cgroup(struct perf_event *event)
@@ -484,7 +484,11 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
484 event->cgrp = cgrp; 484 event->cgrp = cgrp;
485 485
486 /* must be done before we fput() the file */ 486 /* must be done before we fput() the file */
487 perf_get_cgroup(event); 487 if (!perf_tryget_cgroup(event)) {
488 event->cgrp = NULL;
489 ret = -ENOENT;
490 goto out;
491 }
488 492
489 /* 493 /*
490 * all events in a group must monitor 494 * all events in a group must monitor
diff --git a/kernel/exit.c b/kernel/exit.c
index 34867cc5b42a..2f59cc334516 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -72,6 +72,18 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
72 list_del_rcu(&p->tasks); 72 list_del_rcu(&p->tasks);
73 list_del_init(&p->sibling); 73 list_del_init(&p->sibling);
74 __this_cpu_dec(process_counts); 74 __this_cpu_dec(process_counts);
75 /*
76 * If we are the last child process in a pid namespace to be
77 * reaped, notify the reaper sleeping zap_pid_ns_processes().
78 */
79 if (IS_ENABLED(CONFIG_PID_NS)) {
80 struct task_struct *parent = p->real_parent;
81
82 if ((task_active_pid_ns(parent)->child_reaper == parent) &&
83 list_empty(&parent->children) &&
84 (parent->flags & PF_EXITING))
85 wake_up_process(parent);
86 }
75 } 87 }
76 list_del_rcu(&p->thread_group); 88 list_del_rcu(&p->thread_group);
77} 89}
@@ -643,6 +655,7 @@ static void exit_mm(struct task_struct * tsk)
643 mm_release(tsk, mm); 655 mm_release(tsk, mm);
644 if (!mm) 656 if (!mm)
645 return; 657 return;
658 sync_mm_rss(mm);
646 /* 659 /*
647 * Serialize with any possible pending coredump. 660 * Serialize with any possible pending coredump.
648 * We must hold mmap_sem around checking core_state 661 * We must hold mmap_sem around checking core_state
@@ -719,12 +732,6 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
719 732
720 zap_pid_ns_processes(pid_ns); 733 zap_pid_ns_processes(pid_ns);
721 write_lock_irq(&tasklist_lock); 734 write_lock_irq(&tasklist_lock);
722 /*
723 * We can not clear ->child_reaper or leave it alone.
724 * There may by stealth EXIT_DEAD tasks on ->children,
725 * forget_original_parent() must move them somewhere.
726 */
727 pid_ns->child_reaper = init_pid_ns.child_reaper;
728 } else if (father->signal->has_child_subreaper) { 735 } else if (father->signal->has_child_subreaper) {
729 struct task_struct *reaper; 736 struct task_struct *reaper;
730 737
diff --git a/kernel/fork.c b/kernel/fork.c
index ab5211b9e622..f00e319d8376 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -304,12 +304,17 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
304 } 304 }
305 305
306 err = arch_dup_task_struct(tsk, orig); 306 err = arch_dup_task_struct(tsk, orig);
307 if (err)
308 goto out;
309 307
308 /*
309 * We defer looking at err, because we will need this setup
310 * for the clean up path to work correctly.
311 */
310 tsk->stack = ti; 312 tsk->stack = ti;
311
312 setup_thread_stack(tsk, orig); 313 setup_thread_stack(tsk, orig);
314
315 if (err)
316 goto out;
317
313 clear_user_return_notifier(tsk); 318 clear_user_return_notifier(tsk);
314 clear_tsk_need_resched(tsk); 319 clear_tsk_need_resched(tsk);
315 stackend = end_of_stack(tsk); 320 stackend = end_of_stack(tsk);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index ae34bf51682b..6db7a5ed52b5 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -657,6 +657,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
657 return 0; 657 return 0;
658} 658}
659 659
660static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
661{
662 ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
663 ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
664
665 return ktime_get_update_offsets(offs_real, offs_boot);
666}
667
660/* 668/*
661 * Retrigger next event is called after clock was set 669 * Retrigger next event is called after clock was set
662 * 670 *
@@ -665,22 +673,12 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
665static void retrigger_next_event(void *arg) 673static void retrigger_next_event(void *arg)
666{ 674{
667 struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); 675 struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
668 struct timespec realtime_offset, xtim, wtm, sleep;
669 676
670 if (!hrtimer_hres_active()) 677 if (!hrtimer_hres_active())
671 return; 678 return;
672 679
673 /* Optimized out for !HIGH_RES */
674 get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep);
675 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
676
677 /* Adjust CLOCK_REALTIME offset */
678 raw_spin_lock(&base->lock); 680 raw_spin_lock(&base->lock);
679 base->clock_base[HRTIMER_BASE_REALTIME].offset = 681 hrtimer_update_base(base);
680 timespec_to_ktime(realtime_offset);
681 base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
682 timespec_to_ktime(sleep);
683
684 hrtimer_force_reprogram(base, 0); 682 hrtimer_force_reprogram(base, 0);
685 raw_spin_unlock(&base->lock); 683 raw_spin_unlock(&base->lock);
686} 684}
@@ -710,13 +708,25 @@ static int hrtimer_switch_to_hres(void)
710 base->clock_base[i].resolution = KTIME_HIGH_RES; 708 base->clock_base[i].resolution = KTIME_HIGH_RES;
711 709
712 tick_setup_sched_timer(); 710 tick_setup_sched_timer();
713
714 /* "Retrigger" the interrupt to get things going */ 711 /* "Retrigger" the interrupt to get things going */
715 retrigger_next_event(NULL); 712 retrigger_next_event(NULL);
716 local_irq_restore(flags); 713 local_irq_restore(flags);
717 return 1; 714 return 1;
718} 715}
719 716
717/*
718 * Called from timekeeping code to reprogramm the hrtimer interrupt
719 * device. If called from the timer interrupt context we defer it to
720 * softirq context.
721 */
722void clock_was_set_delayed(void)
723{
724 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
725
726 cpu_base->clock_was_set = 1;
727 __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
728}
729
720#else 730#else
721 731
722static inline int hrtimer_hres_active(void) { return 0; } 732static inline int hrtimer_hres_active(void) { return 0; }
@@ -1250,11 +1260,10 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1250 cpu_base->nr_events++; 1260 cpu_base->nr_events++;
1251 dev->next_event.tv64 = KTIME_MAX; 1261 dev->next_event.tv64 = KTIME_MAX;
1252 1262
1253 entry_time = now = ktime_get(); 1263 raw_spin_lock(&cpu_base->lock);
1264 entry_time = now = hrtimer_update_base(cpu_base);
1254retry: 1265retry:
1255 expires_next.tv64 = KTIME_MAX; 1266 expires_next.tv64 = KTIME_MAX;
1256
1257 raw_spin_lock(&cpu_base->lock);
1258 /* 1267 /*
1259 * We set expires_next to KTIME_MAX here with cpu_base->lock 1268 * We set expires_next to KTIME_MAX here with cpu_base->lock
1260 * held to prevent that a timer is enqueued in our queue via 1269 * held to prevent that a timer is enqueued in our queue via
@@ -1330,8 +1339,12 @@ retry:
1330 * We need to prevent that we loop forever in the hrtimer 1339 * We need to prevent that we loop forever in the hrtimer
1331 * interrupt routine. We give it 3 attempts to avoid 1340 * interrupt routine. We give it 3 attempts to avoid
1332 * overreacting on some spurious event. 1341 * overreacting on some spurious event.
1342 *
1343 * Acquire base lock for updating the offsets and retrieving
1344 * the current time.
1333 */ 1345 */
1334 now = ktime_get(); 1346 raw_spin_lock(&cpu_base->lock);
1347 now = hrtimer_update_base(cpu_base);
1335 cpu_base->nr_retries++; 1348 cpu_base->nr_retries++;
1336 if (++retries < 3) 1349 if (++retries < 3)
1337 goto retry; 1350 goto retry;
@@ -1343,6 +1356,7 @@ retry:
1343 */ 1356 */
1344 cpu_base->nr_hangs++; 1357 cpu_base->nr_hangs++;
1345 cpu_base->hang_detected = 1; 1358 cpu_base->hang_detected = 1;
1359 raw_spin_unlock(&cpu_base->lock);
1346 delta = ktime_sub(now, entry_time); 1360 delta = ktime_sub(now, entry_time);
1347 if (delta.tv64 > cpu_base->max_hang_time.tv64) 1361 if (delta.tv64 > cpu_base->max_hang_time.tv64)
1348 cpu_base->max_hang_time = delta; 1362 cpu_base->max_hang_time = delta;
@@ -1395,6 +1409,13 @@ void hrtimer_peek_ahead_timers(void)
1395 1409
1396static void run_hrtimer_softirq(struct softirq_action *h) 1410static void run_hrtimer_softirq(struct softirq_action *h)
1397{ 1411{
1412 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1413
1414 if (cpu_base->clock_was_set) {
1415 cpu_base->clock_was_set = 0;
1416 clock_was_set();
1417 }
1418
1398 hrtimer_peek_ahead_timers(); 1419 hrtimer_peek_ahead_timers();
1399} 1420}
1400 1421
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 16b20e38c4a1..b3c7fd554250 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -184,11 +184,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
184 } 184 }
185 read_unlock(&tasklist_lock); 185 read_unlock(&tasklist_lock);
186 186
187 /* Firstly reap the EXIT_ZOMBIE children we may have. */
187 do { 188 do {
188 clear_thread_flag(TIF_SIGPENDING); 189 clear_thread_flag(TIF_SIGPENDING);
189 rc = sys_wait4(-1, NULL, __WALL, NULL); 190 rc = sys_wait4(-1, NULL, __WALL, NULL);
190 } while (rc != -ECHILD); 191 } while (rc != -ECHILD);
191 192
193 /*
194 * sys_wait4() above can't reap the TASK_DEAD children.
195 * Make sure they all go away, see __unhash_process().
196 */
197 for (;;) {
198 bool need_wait = false;
199
200 read_lock(&tasklist_lock);
201 if (!list_empty(&current->children)) {
202 __set_current_state(TASK_UNINTERRUPTIBLE);
203 need_wait = true;
204 }
205 read_unlock(&tasklist_lock);
206
207 if (!need_wait)
208 break;
209 schedule();
210 }
211
192 if (pid_ns->reboot) 212 if (pid_ns->reboot)
193 current->signal->group_exit_code = pid_ns->reboot; 213 current->signal->group_exit_code = pid_ns->reboot;
194 214
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 8b53db38a279..238025f5472e 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -27,7 +27,6 @@
27#include <linux/syscore_ops.h> 27#include <linux/syscore_ops.h>
28#include <linux/ctype.h> 28#include <linux/ctype.h>
29#include <linux/genhd.h> 29#include <linux/genhd.h>
30#include <scsi/scsi_scan.h>
31 30
32#include "power.h" 31#include "power.h"
33 32
@@ -748,13 +747,6 @@ static int software_resume(void)
748 async_synchronize_full(); 747 async_synchronize_full();
749 } 748 }
750 749
751 /*
752 * We can't depend on SCSI devices being available after loading
753 * one of their modules until scsi_complete_async_scans() is
754 * called and the resume device usually is a SCSI one.
755 */
756 scsi_complete_async_scans();
757
758 swsusp_resume_device = name_to_dev_t(resume_file); 750 swsusp_resume_device = name_to_dev_t(resume_file);
759 if (!swsusp_resume_device) { 751 if (!swsusp_resume_device) {
760 error = -ENODEV; 752 error = -ENODEV;
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 91b0fd021a95..4ed81e74f86f 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -24,7 +24,6 @@
24#include <linux/console.h> 24#include <linux/console.h>
25#include <linux/cpu.h> 25#include <linux/cpu.h>
26#include <linux/freezer.h> 26#include <linux/freezer.h>
27#include <scsi/scsi_scan.h>
28 27
29#include <asm/uaccess.h> 28#include <asm/uaccess.h>
30 29
@@ -84,7 +83,6 @@ static int snapshot_open(struct inode *inode, struct file *filp)
84 * appear. 83 * appear.
85 */ 84 */
86 wait_for_device_probe(); 85 wait_for_device_probe();
87 scsi_complete_async_scans();
88 86
89 data->swap = -1; 87 data->swap = -1;
90 data->mode = O_WRONLY; 88 data->mode = O_WRONLY;
diff --git a/kernel/printk.c b/kernel/printk.c
index 32462d2b364a..ac4bc9e79465 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -193,12 +193,21 @@ static int console_may_schedule;
193 * separated by ',', and find the message after the ';' character. 193 * separated by ',', and find the message after the ';' character.
194 */ 194 */
195 195
196enum log_flags {
197 LOG_NOCONS = 1, /* already flushed, do not print to console */
198 LOG_NEWLINE = 2, /* text ended with a newline */
199 LOG_PREFIX = 4, /* text started with a prefix */
200 LOG_CONT = 8, /* text is a fragment of a continuation line */
201};
202
196struct log { 203struct log {
197 u64 ts_nsec; /* timestamp in nanoseconds */ 204 u64 ts_nsec; /* timestamp in nanoseconds */
198 u16 len; /* length of entire record */ 205 u16 len; /* length of entire record */
199 u16 text_len; /* length of text buffer */ 206 u16 text_len; /* length of text buffer */
200 u16 dict_len; /* length of dictionary buffer */ 207 u16 dict_len; /* length of dictionary buffer */
201 u16 level; /* syslog level + facility */ 208 u8 facility; /* syslog facility */
209 u8 flags:5; /* internal record flags */
210 u8 level:3; /* syslog level */
202}; 211};
203 212
204/* 213/*
@@ -210,6 +219,8 @@ static DEFINE_RAW_SPINLOCK(logbuf_lock);
210/* the next printk record to read by syslog(READ) or /proc/kmsg */ 219/* the next printk record to read by syslog(READ) or /proc/kmsg */
211static u64 syslog_seq; 220static u64 syslog_seq;
212static u32 syslog_idx; 221static u32 syslog_idx;
222static enum log_flags syslog_prev;
223static size_t syslog_partial;
213 224
214/* index and sequence number of the first record stored in the buffer */ 225/* index and sequence number of the first record stored in the buffer */
215static u64 log_first_seq; 226static u64 log_first_seq;
@@ -227,10 +238,10 @@ static u32 clear_idx;
227#define LOG_LINE_MAX 1024 238#define LOG_LINE_MAX 1024
228 239
229/* record buffer */ 240/* record buffer */
230#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) 241#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
231#define LOG_ALIGN 4 242#define LOG_ALIGN 4
232#else 243#else
233#define LOG_ALIGN 8 244#define LOG_ALIGN __alignof__(struct log)
234#endif 245#endif
235#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) 246#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
236static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); 247static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
@@ -286,6 +297,7 @@ static u32 log_next(u32 idx)
286 297
287/* insert record into the buffer, discard old ones, update heads */ 298/* insert record into the buffer, discard old ones, update heads */
288static void log_store(int facility, int level, 299static void log_store(int facility, int level,
300 enum log_flags flags, u64 ts_nsec,
289 const char *dict, u16 dict_len, 301 const char *dict, u16 dict_len,
290 const char *text, u16 text_len) 302 const char *text, u16 text_len)
291{ 303{
@@ -329,8 +341,13 @@ static void log_store(int facility, int level,
329 msg->text_len = text_len; 341 msg->text_len = text_len;
330 memcpy(log_dict(msg), dict, dict_len); 342 memcpy(log_dict(msg), dict, dict_len);
331 msg->dict_len = dict_len; 343 msg->dict_len = dict_len;
332 msg->level = (facility << 3) | (level & 7); 344 msg->facility = facility;
333 msg->ts_nsec = local_clock(); 345 msg->level = level & 7;
346 msg->flags = flags & 0x1f;
347 if (ts_nsec > 0)
348 msg->ts_nsec = ts_nsec;
349 else
350 msg->ts_nsec = local_clock();
334 memset(log_dict(msg) + dict_len, 0, pad_len); 351 memset(log_dict(msg) + dict_len, 0, pad_len);
335 msg->len = sizeof(struct log) + text_len + dict_len + pad_len; 352 msg->len = sizeof(struct log) + text_len + dict_len + pad_len;
336 353
@@ -414,21 +431,23 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
414 if (!user) 431 if (!user)
415 return -EBADF; 432 return -EBADF;
416 433
417 mutex_lock(&user->lock); 434 ret = mutex_lock_interruptible(&user->lock);
418 raw_spin_lock(&logbuf_lock); 435 if (ret)
436 return ret;
437 raw_spin_lock_irq(&logbuf_lock);
419 while (user->seq == log_next_seq) { 438 while (user->seq == log_next_seq) {
420 if (file->f_flags & O_NONBLOCK) { 439 if (file->f_flags & O_NONBLOCK) {
421 ret = -EAGAIN; 440 ret = -EAGAIN;
422 raw_spin_unlock(&logbuf_lock); 441 raw_spin_unlock_irq(&logbuf_lock);
423 goto out; 442 goto out;
424 } 443 }
425 444
426 raw_spin_unlock(&logbuf_lock); 445 raw_spin_unlock_irq(&logbuf_lock);
427 ret = wait_event_interruptible(log_wait, 446 ret = wait_event_interruptible(log_wait,
428 user->seq != log_next_seq); 447 user->seq != log_next_seq);
429 if (ret) 448 if (ret)
430 goto out; 449 goto out;
431 raw_spin_lock(&logbuf_lock); 450 raw_spin_lock_irq(&logbuf_lock);
432 } 451 }
433 452
434 if (user->seq < log_first_seq) { 453 if (user->seq < log_first_seq) {
@@ -436,7 +455,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
436 user->idx = log_first_idx; 455 user->idx = log_first_idx;
437 user->seq = log_first_seq; 456 user->seq = log_first_seq;
438 ret = -EPIPE; 457 ret = -EPIPE;
439 raw_spin_unlock(&logbuf_lock); 458 raw_spin_unlock_irq(&logbuf_lock);
440 goto out; 459 goto out;
441 } 460 }
442 461
@@ -444,13 +463,13 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
444 ts_usec = msg->ts_nsec; 463 ts_usec = msg->ts_nsec;
445 do_div(ts_usec, 1000); 464 do_div(ts_usec, 1000);
446 len = sprintf(user->buf, "%u,%llu,%llu;", 465 len = sprintf(user->buf, "%u,%llu,%llu;",
447 msg->level, user->seq, ts_usec); 466 (msg->facility << 3) | msg->level, user->seq, ts_usec);
448 467
449 /* escape non-printable characters */ 468 /* escape non-printable characters */
450 for (i = 0; i < msg->text_len; i++) { 469 for (i = 0; i < msg->text_len; i++) {
451 unsigned char c = log_text(msg)[i]; 470 unsigned char c = log_text(msg)[i];
452 471
453 if (c < ' ' || c >= 128) 472 if (c < ' ' || c >= 127 || c == '\\')
454 len += sprintf(user->buf + len, "\\x%02x", c); 473 len += sprintf(user->buf + len, "\\x%02x", c);
455 else 474 else
456 user->buf[len++] = c; 475 user->buf[len++] = c;
@@ -474,7 +493,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
474 continue; 493 continue;
475 } 494 }
476 495
477 if (c < ' ' || c >= 128) { 496 if (c < ' ' || c >= 127 || c == '\\') {
478 len += sprintf(user->buf + len, "\\x%02x", c); 497 len += sprintf(user->buf + len, "\\x%02x", c);
479 continue; 498 continue;
480 } 499 }
@@ -486,7 +505,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
486 505
487 user->idx = log_next(user->idx); 506 user->idx = log_next(user->idx);
488 user->seq++; 507 user->seq++;
489 raw_spin_unlock(&logbuf_lock); 508 raw_spin_unlock_irq(&logbuf_lock);
490 509
491 if (len > count) { 510 if (len > count) {
492 ret = -EINVAL; 511 ret = -EINVAL;
@@ -513,7 +532,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
513 if (offset) 532 if (offset)
514 return -ESPIPE; 533 return -ESPIPE;
515 534
516 raw_spin_lock(&logbuf_lock); 535 raw_spin_lock_irq(&logbuf_lock);
517 switch (whence) { 536 switch (whence) {
518 case SEEK_SET: 537 case SEEK_SET:
519 /* the first record */ 538 /* the first record */
@@ -537,7 +556,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
537 default: 556 default:
538 ret = -EINVAL; 557 ret = -EINVAL;
539 } 558 }
540 raw_spin_unlock(&logbuf_lock); 559 raw_spin_unlock_irq(&logbuf_lock);
541 return ret; 560 return ret;
542} 561}
543 562
@@ -551,14 +570,14 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
551 570
552 poll_wait(file, &log_wait, wait); 571 poll_wait(file, &log_wait, wait);
553 572
554 raw_spin_lock(&logbuf_lock); 573 raw_spin_lock_irq(&logbuf_lock);
555 if (user->seq < log_next_seq) { 574 if (user->seq < log_next_seq) {
556 /* return error when data has vanished underneath us */ 575 /* return error when data has vanished underneath us */
557 if (user->seq < log_first_seq) 576 if (user->seq < log_first_seq)
558 ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; 577 ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI;
559 ret = POLLIN|POLLRDNORM; 578 ret = POLLIN|POLLRDNORM;
560 } 579 }
561 raw_spin_unlock(&logbuf_lock); 580 raw_spin_unlock_irq(&logbuf_lock);
562 581
563 return ret; 582 return ret;
564} 583}
@@ -582,10 +601,10 @@ static int devkmsg_open(struct inode *inode, struct file *file)
582 601
583 mutex_init(&user->lock); 602 mutex_init(&user->lock);
584 603
585 raw_spin_lock(&logbuf_lock); 604 raw_spin_lock_irq(&logbuf_lock);
586 user->idx = log_first_idx; 605 user->idx = log_first_idx;
587 user->seq = log_first_seq; 606 user->seq = log_first_seq;
588 raw_spin_unlock(&logbuf_lock); 607 raw_spin_unlock_irq(&logbuf_lock);
589 608
590 file->private_data = user; 609 file->private_data = user;
591 return 0; 610 return 0;
@@ -785,44 +804,64 @@ static bool printk_time;
785#endif 804#endif
786module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); 805module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
787 806
807static size_t print_time(u64 ts, char *buf)
808{
809 unsigned long rem_nsec;
810
811 if (!printk_time)
812 return 0;
813
814 if (!buf)
815 return 15;
816
817 rem_nsec = do_div(ts, 1000000000);
818 return sprintf(buf, "[%5lu.%06lu] ",
819 (unsigned long)ts, rem_nsec / 1000);
820}
821
788static size_t print_prefix(const struct log *msg, bool syslog, char *buf) 822static size_t print_prefix(const struct log *msg, bool syslog, char *buf)
789{ 823{
790 size_t len = 0; 824 size_t len = 0;
825 unsigned int prefix = (msg->facility << 3) | msg->level;
791 826
792 if (syslog) { 827 if (syslog) {
793 if (buf) { 828 if (buf) {
794 len += sprintf(buf, "<%u>", msg->level); 829 len += sprintf(buf, "<%u>", prefix);
795 } else { 830 } else {
796 len += 3; 831 len += 3;
797 if (msg->level > 9) 832 if (prefix > 999)
798 len++; 833 len += 3;
799 if (msg->level > 99) 834 else if (prefix > 99)
835 len += 2;
836 else if (prefix > 9)
800 len++; 837 len++;
801 } 838 }
802 } 839 }
803 840
804 if (printk_time) { 841 len += print_time(msg->ts_nsec, buf ? buf + len : NULL);
805 if (buf) {
806 unsigned long long ts = msg->ts_nsec;
807 unsigned long rem_nsec = do_div(ts, 1000000000);
808
809 len += sprintf(buf + len, "[%5lu.%06lu] ",
810 (unsigned long) ts, rem_nsec / 1000);
811 } else {
812 len += 15;
813 }
814 }
815
816 return len; 842 return len;
817} 843}
818 844
819static size_t msg_print_text(const struct log *msg, bool syslog, 845static size_t msg_print_text(const struct log *msg, enum log_flags prev,
820 char *buf, size_t size) 846 bool syslog, char *buf, size_t size)
821{ 847{
822 const char *text = log_text(msg); 848 const char *text = log_text(msg);
823 size_t text_size = msg->text_len; 849 size_t text_size = msg->text_len;
850 bool prefix = true;
851 bool newline = true;
824 size_t len = 0; 852 size_t len = 0;
825 853
854 if ((prev & LOG_CONT) && !(msg->flags & LOG_PREFIX))
855 prefix = false;
856
857 if (msg->flags & LOG_CONT) {
858 if ((prev & LOG_CONT) && !(prev & LOG_NEWLINE))
859 prefix = false;
860
861 if (!(msg->flags & LOG_NEWLINE))
862 newline = false;
863 }
864
826 do { 865 do {
827 const char *next = memchr(text, '\n', text_size); 866 const char *next = memchr(text, '\n', text_size);
828 size_t text_len; 867 size_t text_len;
@@ -840,16 +879,22 @@ static size_t msg_print_text(const struct log *msg, bool syslog,
840 text_len + 1>= size - len) 879 text_len + 1>= size - len)
841 break; 880 break;
842 881
843 len += print_prefix(msg, syslog, buf + len); 882 if (prefix)
883 len += print_prefix(msg, syslog, buf + len);
844 memcpy(buf + len, text, text_len); 884 memcpy(buf + len, text, text_len);
845 len += text_len; 885 len += text_len;
846 buf[len++] = '\n'; 886 if (next || newline)
887 buf[len++] = '\n';
847 } else { 888 } else {
848 /* SYSLOG_ACTION_* buffer size only calculation */ 889 /* SYSLOG_ACTION_* buffer size only calculation */
849 len += print_prefix(msg, syslog, NULL); 890 if (prefix)
850 len += text_len + 1; 891 len += print_prefix(msg, syslog, NULL);
892 len += text_len;
893 if (next || newline)
894 len++;
851 } 895 }
852 896
897 prefix = true;
853 text = next; 898 text = next;
854 } while (text); 899 } while (text);
855 900
@@ -860,26 +905,60 @@ static int syslog_print(char __user *buf, int size)
860{ 905{
861 char *text; 906 char *text;
862 struct log *msg; 907 struct log *msg;
863 int len; 908 int len = 0;
864 909
865 text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); 910 text = kmalloc(LOG_LINE_MAX, GFP_KERNEL);
866 if (!text) 911 if (!text)
867 return -ENOMEM; 912 return -ENOMEM;
868 913
869 raw_spin_lock_irq(&logbuf_lock); 914 while (size > 0) {
870 if (syslog_seq < log_first_seq) { 915 size_t n;
871 /* messages are gone, move to first one */ 916 size_t skip;
872 syslog_seq = log_first_seq;
873 syslog_idx = log_first_idx;
874 }
875 msg = log_from_idx(syslog_idx);
876 len = msg_print_text(msg, true, text, LOG_LINE_MAX);
877 syslog_idx = log_next(syslog_idx);
878 syslog_seq++;
879 raw_spin_unlock_irq(&logbuf_lock);
880 917
881 if (len > 0 && copy_to_user(buf, text, len)) 918 raw_spin_lock_irq(&logbuf_lock);
882 len = -EFAULT; 919 if (syslog_seq < log_first_seq) {
920 /* messages are gone, move to first one */
921 syslog_seq = log_first_seq;
922 syslog_idx = log_first_idx;
923 syslog_prev = 0;
924 syslog_partial = 0;
925 }
926 if (syslog_seq == log_next_seq) {
927 raw_spin_unlock_irq(&logbuf_lock);
928 break;
929 }
930
931 skip = syslog_partial;
932 msg = log_from_idx(syslog_idx);
933 n = msg_print_text(msg, syslog_prev, true, text, LOG_LINE_MAX);
934 if (n - syslog_partial <= size) {
935 /* message fits into buffer, move forward */
936 syslog_idx = log_next(syslog_idx);
937 syslog_seq++;
938 syslog_prev = msg->flags;
939 n -= syslog_partial;
940 syslog_partial = 0;
941 } else if (!len){
942 /* partial read(), remember position */
943 n = size;
944 syslog_partial += n;
945 } else
946 n = 0;
947 raw_spin_unlock_irq(&logbuf_lock);
948
949 if (!n)
950 break;
951
952 if (copy_to_user(buf, text + skip, n)) {
953 if (!len)
954 len = -EFAULT;
955 break;
956 }
957
958 len += n;
959 size -= n;
960 buf += n;
961 }
883 962
884 kfree(text); 963 kfree(text);
885 return len; 964 return len;
@@ -899,6 +978,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
899 u64 next_seq; 978 u64 next_seq;
900 u64 seq; 979 u64 seq;
901 u32 idx; 980 u32 idx;
981 enum log_flags prev;
902 982
903 if (clear_seq < log_first_seq) { 983 if (clear_seq < log_first_seq) {
904 /* messages are gone, move to first available one */ 984 /* messages are gone, move to first available one */
@@ -909,41 +989,47 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
909 /* 989 /*
910 * Find first record that fits, including all following records, 990 * Find first record that fits, including all following records,
911 * into the user-provided buffer for this dump. 991 * into the user-provided buffer for this dump.
912 */ 992 */
913 seq = clear_seq; 993 seq = clear_seq;
914 idx = clear_idx; 994 idx = clear_idx;
995 prev = 0;
915 while (seq < log_next_seq) { 996 while (seq < log_next_seq) {
916 struct log *msg = log_from_idx(idx); 997 struct log *msg = log_from_idx(idx);
917 998
918 len += msg_print_text(msg, true, NULL, 0); 999 len += msg_print_text(msg, prev, true, NULL, 0);
919 idx = log_next(idx); 1000 idx = log_next(idx);
920 seq++; 1001 seq++;
921 } 1002 }
1003
1004 /* move first record forward until length fits into the buffer */
922 seq = clear_seq; 1005 seq = clear_seq;
923 idx = clear_idx; 1006 idx = clear_idx;
1007 prev = 0;
924 while (len > size && seq < log_next_seq) { 1008 while (len > size && seq < log_next_seq) {
925 struct log *msg = log_from_idx(idx); 1009 struct log *msg = log_from_idx(idx);
926 1010
927 len -= msg_print_text(msg, true, NULL, 0); 1011 len -= msg_print_text(msg, prev, true, NULL, 0);
928 idx = log_next(idx); 1012 idx = log_next(idx);
929 seq++; 1013 seq++;
930 } 1014 }
931 1015
932 /* last message in this dump */ 1016 /* last message fitting into this dump */
933 next_seq = log_next_seq; 1017 next_seq = log_next_seq;
934 1018
935 len = 0; 1019 len = 0;
1020 prev = 0;
936 while (len >= 0 && seq < next_seq) { 1021 while (len >= 0 && seq < next_seq) {
937 struct log *msg = log_from_idx(idx); 1022 struct log *msg = log_from_idx(idx);
938 int textlen; 1023 int textlen;
939 1024
940 textlen = msg_print_text(msg, true, text, LOG_LINE_MAX); 1025 textlen = msg_print_text(msg, prev, true, text, LOG_LINE_MAX);
941 if (textlen < 0) { 1026 if (textlen < 0) {
942 len = textlen; 1027 len = textlen;
943 break; 1028 break;
944 } 1029 }
945 idx = log_next(idx); 1030 idx = log_next(idx);
946 seq++; 1031 seq++;
1032 prev = msg->flags;
947 1033
948 raw_spin_unlock_irq(&logbuf_lock); 1034 raw_spin_unlock_irq(&logbuf_lock);
949 if (copy_to_user(buf + len, text, textlen)) 1035 if (copy_to_user(buf + len, text, textlen))
@@ -956,6 +1042,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
956 /* messages are gone, move to next one */ 1042 /* messages are gone, move to next one */
957 seq = log_first_seq; 1043 seq = log_first_seq;
958 idx = log_first_idx; 1044 idx = log_first_idx;
1045 prev = 0;
959 } 1046 }
960 } 1047 }
961 } 1048 }
@@ -1027,6 +1114,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1027 /* Clear ring buffer */ 1114 /* Clear ring buffer */
1028 case SYSLOG_ACTION_CLEAR: 1115 case SYSLOG_ACTION_CLEAR:
1029 syslog_print_all(NULL, 0, true); 1116 syslog_print_all(NULL, 0, true);
1117 break;
1030 /* Disable logging to console */ 1118 /* Disable logging to console */
1031 case SYSLOG_ACTION_CONSOLE_OFF: 1119 case SYSLOG_ACTION_CONSOLE_OFF:
1032 if (saved_console_loglevel == -1) 1120 if (saved_console_loglevel == -1)
@@ -1059,6 +1147,8 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1059 /* messages are gone, move to first one */ 1147 /* messages are gone, move to first one */
1060 syslog_seq = log_first_seq; 1148 syslog_seq = log_first_seq;
1061 syslog_idx = log_first_idx; 1149 syslog_idx = log_first_idx;
1150 syslog_prev = 0;
1151 syslog_partial = 0;
1062 } 1152 }
1063 if (from_file) { 1153 if (from_file) {
1064 /* 1154 /*
@@ -1068,19 +1158,20 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1068 */ 1158 */
1069 error = log_next_idx - syslog_idx; 1159 error = log_next_idx - syslog_idx;
1070 } else { 1160 } else {
1071 u64 seq; 1161 u64 seq = syslog_seq;
1072 u32 idx; 1162 u32 idx = syslog_idx;
1163 enum log_flags prev = syslog_prev;
1073 1164
1074 error = 0; 1165 error = 0;
1075 seq = syslog_seq;
1076 idx = syslog_idx;
1077 while (seq < log_next_seq) { 1166 while (seq < log_next_seq) {
1078 struct log *msg = log_from_idx(idx); 1167 struct log *msg = log_from_idx(idx);
1079 1168
1080 error += msg_print_text(msg, true, NULL, 0); 1169 error += msg_print_text(msg, prev, true, NULL, 0);
1081 idx = log_next(idx); 1170 idx = log_next(idx);
1082 seq++; 1171 seq++;
1172 prev = msg->flags;
1083 } 1173 }
1174 error -= syslog_partial;
1084 } 1175 }
1085 raw_spin_unlock_irq(&logbuf_lock); 1176 raw_spin_unlock_irq(&logbuf_lock);
1086 break; 1177 break;
@@ -1101,21 +1192,6 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
1101 return do_syslog(type, buf, len, SYSLOG_FROM_CALL); 1192 return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
1102} 1193}
1103 1194
1104#ifdef CONFIG_KGDB_KDB
1105/* kdb dmesg command needs access to the syslog buffer. do_syslog()
1106 * uses locks so it cannot be used during debugging. Just tell kdb
1107 * where the start and end of the physical and logical logs are. This
1108 * is equivalent to do_syslog(3).
1109 */
1110void kdb_syslog_data(char *syslog_data[4])
1111{
1112 syslog_data[0] = log_buf;
1113 syslog_data[1] = log_buf + log_buf_len;
1114 syslog_data[2] = log_buf + log_first_idx;
1115 syslog_data[3] = log_buf + log_next_idx;
1116}
1117#endif /* CONFIG_KGDB_KDB */
1118
1119static bool __read_mostly ignore_loglevel; 1195static bool __read_mostly ignore_loglevel;
1120 1196
1121static int __init ignore_loglevel_setup(char *str) 1197static int __init ignore_loglevel_setup(char *str)
@@ -1259,22 +1335,98 @@ static inline void printk_delay(void)
1259 } 1335 }
1260} 1336}
1261 1337
1338/*
1339 * Continuation lines are buffered, and not committed to the record buffer
1340 * until the line is complete, or a race forces it. The line fragments
1341 * though, are printed immediately to the consoles to ensure everything has
1342 * reached the console in case of a kernel crash.
1343 */
1344static struct cont {
1345 char buf[LOG_LINE_MAX];
1346 size_t len; /* length == 0 means unused buffer */
1347 size_t cons; /* bytes written to console */
1348 struct task_struct *owner; /* task of first print*/
1349 u64 ts_nsec; /* time of first print */
1350 u8 level; /* log level of first message */
1351 u8 facility; /* log level of first message */
1352 bool flushed:1; /* buffer sealed and committed */
1353} cont;
1354
1355static void cont_flush(void)
1356{
1357 if (cont.flushed)
1358 return;
1359 if (cont.len == 0)
1360 return;
1361
1362 log_store(cont.facility, cont.level, LOG_NOCONS, cont.ts_nsec,
1363 NULL, 0, cont.buf, cont.len);
1364
1365 cont.flushed = true;
1366}
1367
1368static bool cont_add(int facility, int level, const char *text, size_t len)
1369{
1370 if (cont.len && cont.flushed)
1371 return false;
1372
1373 if (cont.len + len > sizeof(cont.buf)) {
1374 cont_flush();
1375 return false;
1376 }
1377
1378 if (!cont.len) {
1379 cont.facility = facility;
1380 cont.level = level;
1381 cont.owner = current;
1382 cont.ts_nsec = local_clock();
1383 cont.cons = 0;
1384 cont.flushed = false;
1385 }
1386
1387 memcpy(cont.buf + cont.len, text, len);
1388 cont.len += len;
1389 return true;
1390}
1391
1392static size_t cont_print_text(char *text, size_t size)
1393{
1394 size_t textlen = 0;
1395 size_t len;
1396
1397 if (cont.cons == 0) {
1398 textlen += print_time(cont.ts_nsec, text);
1399 size -= textlen;
1400 }
1401
1402 len = cont.len - cont.cons;
1403 if (len > 0) {
1404 if (len+1 > size)
1405 len = size-1;
1406 memcpy(text + textlen, cont.buf + cont.cons, len);
1407 textlen += len;
1408 cont.cons = cont.len;
1409 }
1410
1411 if (cont.flushed) {
1412 text[textlen++] = '\n';
1413 /* got everything, release buffer */
1414 cont.len = 0;
1415 }
1416 return textlen;
1417}
1418
1262asmlinkage int vprintk_emit(int facility, int level, 1419asmlinkage int vprintk_emit(int facility, int level,
1263 const char *dict, size_t dictlen, 1420 const char *dict, size_t dictlen,
1264 const char *fmt, va_list args) 1421 const char *fmt, va_list args)
1265{ 1422{
1266 static int recursion_bug; 1423 static int recursion_bug;
1267 static char cont_buf[LOG_LINE_MAX];
1268 static size_t cont_len;
1269 static int cont_level;
1270 static struct task_struct *cont_task;
1271 static char textbuf[LOG_LINE_MAX]; 1424 static char textbuf[LOG_LINE_MAX];
1272 char *text = textbuf; 1425 char *text = textbuf;
1273 size_t text_len; 1426 size_t text_len;
1427 enum log_flags lflags = 0;
1274 unsigned long flags; 1428 unsigned long flags;
1275 int this_cpu; 1429 int this_cpu;
1276 bool newline = false;
1277 bool prefix = false;
1278 int printed_len = 0; 1430 int printed_len = 0;
1279 1431
1280 boot_delay_msec(); 1432 boot_delay_msec();
@@ -1313,7 +1465,8 @@ asmlinkage int vprintk_emit(int facility, int level,
1313 recursion_bug = 0; 1465 recursion_bug = 0;
1314 printed_len += strlen(recursion_msg); 1466 printed_len += strlen(recursion_msg);
1315 /* emit KERN_CRIT message */ 1467 /* emit KERN_CRIT message */
1316 log_store(0, 2, NULL, 0, recursion_msg, printed_len); 1468 log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
1469 NULL, 0, recursion_msg, printed_len);
1317 } 1470 }
1318 1471
1319 /* 1472 /*
@@ -1325,7 +1478,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1325 /* mark and strip a trailing newline */ 1478 /* mark and strip a trailing newline */
1326 if (text_len && text[text_len-1] == '\n') { 1479 if (text_len && text[text_len-1] == '\n') {
1327 text_len--; 1480 text_len--;
1328 newline = true; 1481 lflags |= LOG_NEWLINE;
1329 } 1482 }
1330 1483
1331 /* strip syslog prefix and extract log level or control flags */ 1484 /* strip syslog prefix and extract log level or control flags */
@@ -1335,7 +1488,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1335 if (level == -1) 1488 if (level == -1)
1336 level = text[1] - '0'; 1489 level = text[1] - '0';
1337 case 'd': /* KERN_DEFAULT */ 1490 case 'd': /* KERN_DEFAULT */
1338 prefix = true; 1491 lflags |= LOG_PREFIX;
1339 case 'c': /* KERN_CONT */ 1492 case 'c': /* KERN_CONT */
1340 text += 3; 1493 text += 3;
1341 text_len -= 3; 1494 text_len -= 3;
@@ -1345,61 +1498,41 @@ asmlinkage int vprintk_emit(int facility, int level,
1345 if (level == -1) 1498 if (level == -1)
1346 level = default_message_loglevel; 1499 level = default_message_loglevel;
1347 1500
1348 if (dict) { 1501 if (dict)
1349 prefix = true; 1502 lflags |= LOG_PREFIX|LOG_NEWLINE;
1350 newline = true;
1351 }
1352
1353 if (!newline) {
1354 if (cont_len && (prefix || cont_task != current)) {
1355 /*
1356 * Flush earlier buffer, which is either from a
1357 * different thread, or when we got a new prefix.
1358 */
1359 log_store(facility, cont_level, NULL, 0, cont_buf, cont_len);
1360 cont_len = 0;
1361 }
1362 1503
1363 if (!cont_len) { 1504 if (!(lflags & LOG_NEWLINE)) {
1364 cont_level = level; 1505 /*
1365 cont_task = current; 1506 * Flush the conflicting buffer. An earlier newline was missing,
1366 } 1507 * or another task also prints continuation lines.
1508 */
1509 if (cont.len && (lflags & LOG_PREFIX || cont.owner != current))
1510 cont_flush();
1367 1511
1368 /* buffer or append to earlier buffer from the same thread */ 1512 /* buffer line if possible, otherwise store it right away */
1369 if (cont_len + text_len > sizeof(cont_buf)) 1513 if (!cont_add(facility, level, text, text_len))
1370 text_len = sizeof(cont_buf) - cont_len; 1514 log_store(facility, level, lflags | LOG_CONT, 0,
1371 memcpy(cont_buf + cont_len, text, text_len); 1515 dict, dictlen, text, text_len);
1372 cont_len += text_len;
1373 } else { 1516 } else {
1374 if (cont_len && cont_task == current) { 1517 bool stored = false;
1375 if (prefix) {
1376 /*
1377 * New prefix from the same thread; flush. We
1378 * either got no earlier newline, or we race
1379 * with an interrupt.
1380 */
1381 log_store(facility, cont_level,
1382 NULL, 0, cont_buf, cont_len);
1383 cont_len = 0;
1384 }
1385 1518
1386 /* append to the earlier buffer and flush */ 1519 /*
1387 if (cont_len + text_len > sizeof(cont_buf)) 1520 * If an earlier newline was missing and it was the same task,
1388 text_len = sizeof(cont_buf) - cont_len; 1521 * either merge it with the current buffer and flush, or if
1389 memcpy(cont_buf + cont_len, text, text_len); 1522 * there was a race with interrupts (prefix == true) then just
1390 cont_len += text_len; 1523 * flush it out and store this line separately.
1391 log_store(facility, cont_level, 1524 */
1392 NULL, 0, cont_buf, cont_len); 1525 if (cont.len && cont.owner == current) {
1393 cont_len = 0; 1526 if (!(lflags & LOG_PREFIX))
1394 cont_task = NULL; 1527 stored = cont_add(facility, level, text, text_len);
1395 printed_len = cont_len; 1528 cont_flush();
1396 } else {
1397 /* ordinary single and terminated line */
1398 log_store(facility, level,
1399 dict, dictlen, text, text_len);
1400 printed_len = text_len;
1401 } 1529 }
1530
1531 if (!stored)
1532 log_store(facility, level, lflags, 0,
1533 dict, dictlen, text, text_len);
1402 } 1534 }
1535 printed_len += text_len;
1403 1536
1404 /* 1537 /*
1405 * Try to acquire and then immediately release the console semaphore. 1538 * Try to acquire and then immediately release the console semaphore.
@@ -1486,11 +1619,18 @@ EXPORT_SYMBOL(printk);
1486#else 1619#else
1487 1620
1488#define LOG_LINE_MAX 0 1621#define LOG_LINE_MAX 0
1622static struct cont {
1623 size_t len;
1624 size_t cons;
1625 u8 level;
1626 bool flushed:1;
1627} cont;
1489static struct log *log_from_idx(u32 idx) { return NULL; } 1628static struct log *log_from_idx(u32 idx) { return NULL; }
1490static u32 log_next(u32 idx) { return 0; } 1629static u32 log_next(u32 idx) { return 0; }
1491static void call_console_drivers(int level, const char *text, size_t len) {} 1630static void call_console_drivers(int level, const char *text, size_t len) {}
1492static size_t msg_print_text(const struct log *msg, bool syslog, 1631static size_t msg_print_text(const struct log *msg, enum log_flags prev,
1493 char *buf, size_t size) { return 0; } 1632 bool syslog, char *buf, size_t size) { return 0; }
1633static size_t cont_print_text(char *text, size_t size) { return 0; }
1494 1634
1495#endif /* CONFIG_PRINTK */ 1635#endif /* CONFIG_PRINTK */
1496 1636
@@ -1765,6 +1905,7 @@ void wake_up_klogd(void)
1765/* the next printk record to write to the console */ 1905/* the next printk record to write to the console */
1766static u64 console_seq; 1906static u64 console_seq;
1767static u32 console_idx; 1907static u32 console_idx;
1908static enum log_flags console_prev;
1768 1909
1769/** 1910/**
1770 * console_unlock - unlock the console system 1911 * console_unlock - unlock the console system
@@ -1782,6 +1923,7 @@ static u32 console_idx;
1782 */ 1923 */
1783void console_unlock(void) 1924void console_unlock(void)
1784{ 1925{
1926 static char text[LOG_LINE_MAX];
1785 static u64 seen_seq; 1927 static u64 seen_seq;
1786 unsigned long flags; 1928 unsigned long flags;
1787 bool wake_klogd = false; 1929 bool wake_klogd = false;
@@ -1794,10 +1936,23 @@ void console_unlock(void)
1794 1936
1795 console_may_schedule = 0; 1937 console_may_schedule = 0;
1796 1938
1939 /* flush buffered message fragment immediately to console */
1940 raw_spin_lock_irqsave(&logbuf_lock, flags);
1941 if (cont.len && (cont.cons < cont.len || cont.flushed)) {
1942 size_t len;
1943
1944 len = cont_print_text(text, sizeof(text));
1945 raw_spin_unlock(&logbuf_lock);
1946 stop_critical_timings();
1947 call_console_drivers(cont.level, text, len);
1948 start_critical_timings();
1949 local_irq_restore(flags);
1950 } else
1951 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1952
1797again: 1953again:
1798 for (;;) { 1954 for (;;) {
1799 struct log *msg; 1955 struct log *msg;
1800 static char text[LOG_LINE_MAX];
1801 size_t len; 1956 size_t len;
1802 int level; 1957 int level;
1803 1958
@@ -1811,18 +1966,35 @@ again:
1811 /* messages are gone, move to first one */ 1966 /* messages are gone, move to first one */
1812 console_seq = log_first_seq; 1967 console_seq = log_first_seq;
1813 console_idx = log_first_idx; 1968 console_idx = log_first_idx;
1969 console_prev = 0;
1814 } 1970 }
1815 1971skip:
1816 if (console_seq == log_next_seq) 1972 if (console_seq == log_next_seq)
1817 break; 1973 break;
1818 1974
1819 msg = log_from_idx(console_idx); 1975 msg = log_from_idx(console_idx);
1820 level = msg->level & 7; 1976 if (msg->flags & LOG_NOCONS) {
1821 1977 /*
1822 len = msg_print_text(msg, false, text, sizeof(text)); 1978 * Skip record we have buffered and already printed
1979 * directly to the console when we received it.
1980 */
1981 console_idx = log_next(console_idx);
1982 console_seq++;
1983 /*
1984 * We will get here again when we register a new
1985 * CON_PRINTBUFFER console. Clear the flag so we
1986 * will properly dump everything later.
1987 */
1988 msg->flags &= ~LOG_NOCONS;
1989 goto skip;
1990 }
1823 1991
1992 level = msg->level;
1993 len = msg_print_text(msg, console_prev, false,
1994 text, sizeof(text));
1824 console_idx = log_next(console_idx); 1995 console_idx = log_next(console_idx);
1825 console_seq++; 1996 console_seq++;
1997 console_prev = msg->flags;
1826 raw_spin_unlock(&logbuf_lock); 1998 raw_spin_unlock(&logbuf_lock);
1827 1999
1828 stop_critical_timings(); /* don't trace print latency */ 2000 stop_critical_timings(); /* don't trace print latency */
@@ -2085,6 +2257,7 @@ void register_console(struct console *newcon)
2085 raw_spin_lock_irqsave(&logbuf_lock, flags); 2257 raw_spin_lock_irqsave(&logbuf_lock, flags);
2086 console_seq = syslog_seq; 2258 console_seq = syslog_seq;
2087 console_idx = syslog_idx; 2259 console_idx = syslog_idx;
2260 console_prev = syslog_prev;
2088 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 2261 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2089 /* 2262 /*
2090 * We're about to replay the log buffer. Only do this to the 2263 * We're about to replay the log buffer. Only do this to the
@@ -2300,48 +2473,256 @@ module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);
2300 * kmsg_dump - dump kernel log to kernel message dumpers. 2473 * kmsg_dump - dump kernel log to kernel message dumpers.
2301 * @reason: the reason (oops, panic etc) for dumping 2474 * @reason: the reason (oops, panic etc) for dumping
2302 * 2475 *
2303 * Iterate through each of the dump devices and call the oops/panic 2476 * Call each of the registered dumper's dump() callback, which can
2304 * callbacks with the log buffer. 2477 * retrieve the kmsg records with kmsg_dump_get_line() or
2478 * kmsg_dump_get_buffer().
2305 */ 2479 */
2306void kmsg_dump(enum kmsg_dump_reason reason) 2480void kmsg_dump(enum kmsg_dump_reason reason)
2307{ 2481{
2308 u64 idx;
2309 struct kmsg_dumper *dumper; 2482 struct kmsg_dumper *dumper;
2310 const char *s1, *s2;
2311 unsigned long l1, l2;
2312 unsigned long flags; 2483 unsigned long flags;
2313 2484
2314 if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) 2485 if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump)
2315 return; 2486 return;
2316 2487
2317 /* Theoretically, the log could move on after we do this, but 2488 rcu_read_lock();
2318 there's not a lot we can do about that. The new messages 2489 list_for_each_entry_rcu(dumper, &dump_list, list) {
2319 will overwrite the start of what we dump. */ 2490 if (dumper->max_reason && reason > dumper->max_reason)
2491 continue;
2492
2493 /* initialize iterator with data about the stored records */
2494 dumper->active = true;
2495
2496 raw_spin_lock_irqsave(&logbuf_lock, flags);
2497 dumper->cur_seq = clear_seq;
2498 dumper->cur_idx = clear_idx;
2499 dumper->next_seq = log_next_seq;
2500 dumper->next_idx = log_next_idx;
2501 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2502
2503 /* invoke dumper which will iterate over records */
2504 dumper->dump(dumper, reason);
2505
2506 /* reset iterator */
2507 dumper->active = false;
2508 }
2509 rcu_read_unlock();
2510}
2511
2512/**
2513 * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version)
2514 * @dumper: registered kmsg dumper
2515 * @syslog: include the "<4>" prefixes
2516 * @line: buffer to copy the line to
2517 * @size: maximum size of the buffer
2518 * @len: length of line placed into buffer
2519 *
2520 * Start at the beginning of the kmsg buffer, with the oldest kmsg
2521 * record, and copy one record into the provided buffer.
2522 *
2523 * Consecutive calls will return the next available record moving
2524 * towards the end of the buffer with the youngest messages.
2525 *
2526 * A return value of FALSE indicates that there are no more records to
2527 * read.
2528 *
2529 * The function is similar to kmsg_dump_get_line(), but grabs no locks.
2530 */
2531bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
2532 char *line, size_t size, size_t *len)
2533{
2534 struct log *msg;
2535 size_t l = 0;
2536 bool ret = false;
2537
2538 if (!dumper->active)
2539 goto out;
2540
2541 if (dumper->cur_seq < log_first_seq) {
2542 /* messages are gone, move to first available one */
2543 dumper->cur_seq = log_first_seq;
2544 dumper->cur_idx = log_first_idx;
2545 }
2546
2547 /* last entry */
2548 if (dumper->cur_seq >= log_next_seq)
2549 goto out;
2550
2551 msg = log_from_idx(dumper->cur_idx);
2552 l = msg_print_text(msg, 0, syslog, line, size);
2553
2554 dumper->cur_idx = log_next(dumper->cur_idx);
2555 dumper->cur_seq++;
2556 ret = true;
2557out:
2558 if (len)
2559 *len = l;
2560 return ret;
2561}
2562
2563/**
2564 * kmsg_dump_get_line - retrieve one kmsg log line
2565 * @dumper: registered kmsg dumper
2566 * @syslog: include the "<4>" prefixes
2567 * @line: buffer to copy the line to
2568 * @size: maximum size of the buffer
2569 * @len: length of line placed into buffer
2570 *
2571 * Start at the beginning of the kmsg buffer, with the oldest kmsg
2572 * record, and copy one record into the provided buffer.
2573 *
2574 * Consecutive calls will return the next available record moving
2575 * towards the end of the buffer with the youngest messages.
2576 *
2577 * A return value of FALSE indicates that there are no more records to
2578 * read.
2579 */
2580bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
2581 char *line, size_t size, size_t *len)
2582{
2583 unsigned long flags;
2584 bool ret;
2585
2586 raw_spin_lock_irqsave(&logbuf_lock, flags);
2587 ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len);
2588 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2589
2590 return ret;
2591}
2592EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
2593
2594/**
2595 * kmsg_dump_get_buffer - copy kmsg log lines
2596 * @dumper: registered kmsg dumper
2597 * @syslog: include the "<4>" prefixes
2598 * @buf: buffer to copy the line to
2599 * @size: maximum size of the buffer
2600 * @len: length of line placed into buffer
2601 *
2602 * Start at the end of the kmsg buffer and fill the provided buffer
 2603 * with as many of the *youngest* kmsg records as fit into it.
2604 * If the buffer is large enough, all available kmsg records will be
2605 * copied with a single call.
2606 *
2607 * Consecutive calls will fill the buffer with the next block of
2608 * available older records, not including the earlier retrieved ones.
2609 *
2610 * A return value of FALSE indicates that there are no more records to
2611 * read.
2612 */
2613bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
2614 char *buf, size_t size, size_t *len)
2615{
2616 unsigned long flags;
2617 u64 seq;
2618 u32 idx;
2619 u64 next_seq;
2620 u32 next_idx;
2621 enum log_flags prev;
2622 size_t l = 0;
2623 bool ret = false;
2624
2625 if (!dumper->active)
2626 goto out;
2320 2627
2321 raw_spin_lock_irqsave(&logbuf_lock, flags); 2628 raw_spin_lock_irqsave(&logbuf_lock, flags);
2322 if (syslog_seq < log_first_seq) 2629 if (dumper->cur_seq < log_first_seq) {
2323 idx = syslog_idx; 2630 /* messages are gone, move to first available one */
2324 else 2631 dumper->cur_seq = log_first_seq;
2325 idx = log_first_idx; 2632 dumper->cur_idx = log_first_idx;
2633 }
2634
2635 /* last entry */
2636 if (dumper->cur_seq >= dumper->next_seq) {
2637 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2638 goto out;
2639 }
2326 2640
2327 if (idx > log_next_idx) { 2641 /* calculate length of entire buffer */
2328 s1 = log_buf; 2642 seq = dumper->cur_seq;
2329 l1 = log_next_idx; 2643 idx = dumper->cur_idx;
2644 prev = 0;
2645 while (seq < dumper->next_seq) {
2646 struct log *msg = log_from_idx(idx);
2647
2648 l += msg_print_text(msg, prev, true, NULL, 0);
2649 idx = log_next(idx);
2650 seq++;
2651 prev = msg->flags;
2652 }
2330 2653
2331 s2 = log_buf + idx; 2654 /* move first record forward until length fits into the buffer */
2332 l2 = log_buf_len - idx; 2655 seq = dumper->cur_seq;
2333 } else { 2656 idx = dumper->cur_idx;
2334 s1 = ""; 2657 prev = 0;
2335 l1 = 0; 2658 while (l > size && seq < dumper->next_seq) {
2659 struct log *msg = log_from_idx(idx);
2660
2661 l -= msg_print_text(msg, prev, true, NULL, 0);
2662 idx = log_next(idx);
2663 seq++;
2664 prev = msg->flags;
2665 }
2666
 2667 /* last message in next iteration */
2668 next_seq = seq;
2669 next_idx = idx;
2670
2671 l = 0;
2672 prev = 0;
2673 while (seq < dumper->next_seq) {
2674 struct log *msg = log_from_idx(idx);
2336 2675
2337 s2 = log_buf + idx; 2676 l += msg_print_text(msg, prev, syslog, buf + l, size - l);
2338 l2 = log_next_idx - idx; 2677 idx = log_next(idx);
2678 seq++;
2679 prev = msg->flags;
2339 } 2680 }
2681
2682 dumper->next_seq = next_seq;
2683 dumper->next_idx = next_idx;
2684 ret = true;
2340 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 2685 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2686out:
2687 if (len)
2688 *len = l;
2689 return ret;
2690}
2691EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
2341 2692
2342 rcu_read_lock(); 2693/**
2343 list_for_each_entry_rcu(dumper, &dump_list, list) 2694 * kmsg_dump_rewind_nolock - reset the iterator (unlocked version)
2344 dumper->dump(dumper, reason, s1, l1, s2, l2); 2695 * @dumper: registered kmsg dumper
2345 rcu_read_unlock(); 2696 *
2697 * Reset the dumper's iterator so that kmsg_dump_get_line() and
2698 * kmsg_dump_get_buffer() can be called again and used multiple
2699 * times within the same dumper.dump() callback.
2700 *
2701 * The function is similar to kmsg_dump_rewind(), but grabs no locks.
2702 */
2703void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper)
2704{
2705 dumper->cur_seq = clear_seq;
2706 dumper->cur_idx = clear_idx;
2707 dumper->next_seq = log_next_seq;
2708 dumper->next_idx = log_next_idx;
2709}
2710
2711/**
 2712 * kmsg_dump_rewind - reset the iterator
2713 * @dumper: registered kmsg dumper
2714 *
2715 * Reset the dumper's iterator so that kmsg_dump_get_line() and
2716 * kmsg_dump_get_buffer() can be called again and used multiple
2717 * times within the same dumper.dump() callback.
2718 */
2719void kmsg_dump_rewind(struct kmsg_dumper *dumper)
2720{
2721 unsigned long flags;
2722
2723 raw_spin_lock_irqsave(&logbuf_lock, flags);
2724 kmsg_dump_rewind_nolock(dumper);
2725 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2346} 2726}
2727EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
2347#endif 2728#endif
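The kmsg_dump_get_line()/kmsg_dump_get_buffer()/kmsg_dump_rewind() helpers introduced above replace the old scheme of handing raw s1/l1/s2/l2 buffer pointers straight to dumper->dump(). A minimal sketch of how a dumper might walk the records with the new iterator API follows; it is not part of this patch, and the dumper name, the 1 KiB line buffer and the byte counting are invented for illustration:

	/* Sketch only: a kmsg dumper using the iterator API added above. */
	#include <linux/kernel.h>
	#include <linux/module.h>
	#include <linux/kmsg_dump.h>

	static void example_dump(struct kmsg_dumper *dumper,
				 enum kmsg_dump_reason reason)
	{
		static char line[1024];		/* illustrative line buffer */
		size_t len, total = 0;

		/* Oldest to newest; returns false once no records remain. */
		while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len))
			total += len;	/* a real dumper would persist 'line' */

		/* Rewind so the records could be walked again if needed. */
		kmsg_dump_rewind(dumper);

		pr_info("example dumper: %zu bytes for reason %d\n", total, reason);
	}

	static struct kmsg_dumper example_dumper = {
		.dump		= example_dump,
		.max_reason	= KMSG_DUMP_OOPS,	/* only oops and panic */
	};

	static int __init example_dumper_init(void)
	{
		return kmsg_dump_register(&example_dumper);
	}

	static void __exit example_dumper_exit(void)
	{
		kmsg_dump_unregister(&example_dumper);
	}

	module_init(example_dumper_init);
	module_exit(example_dumper_exit);
	MODULE_LICENSE("GPL");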
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 3b0f1337f75b..4b97bba7396e 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -201,6 +201,7 @@ void rcu_note_context_switch(int cpu)
201{ 201{
202 trace_rcu_utilization("Start context switch"); 202 trace_rcu_utilization("Start context switch");
203 rcu_sched_qs(cpu); 203 rcu_sched_qs(cpu);
204 rcu_preempt_note_context_switch(cpu);
204 trace_rcu_utilization("End context switch"); 205 trace_rcu_utilization("End context switch");
205} 206}
206EXPORT_SYMBOL_GPL(rcu_note_context_switch); 207EXPORT_SYMBOL_GPL(rcu_note_context_switch);
@@ -1530,7 +1531,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1530{ 1531{
1531 unsigned long flags; 1532 unsigned long flags;
1532 struct rcu_head *next, *list, **tail; 1533 struct rcu_head *next, *list, **tail;
1533 int bl, count, count_lazy; 1534 int bl, count, count_lazy, i;
1534 1535
1535 /* If no callbacks are ready, just return.*/ 1536 /* If no callbacks are ready, just return.*/
1536 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 1537 if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
@@ -1553,9 +1554,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1553 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; 1554 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
1554 *rdp->nxttail[RCU_DONE_TAIL] = NULL; 1555 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
1555 tail = rdp->nxttail[RCU_DONE_TAIL]; 1556 tail = rdp->nxttail[RCU_DONE_TAIL];
1556 for (count = RCU_NEXT_SIZE - 1; count >= 0; count--) 1557 for (i = RCU_NEXT_SIZE - 1; i >= 0; i--)
1557 if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL]) 1558 if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
1558 rdp->nxttail[count] = &rdp->nxtlist; 1559 rdp->nxttail[i] = &rdp->nxtlist;
1559 local_irq_restore(flags); 1560 local_irq_restore(flags);
1560 1561
1561 /* Invoke callbacks. */ 1562 /* Invoke callbacks. */
@@ -1583,9 +1584,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1583 if (list != NULL) { 1584 if (list != NULL) {
1584 *tail = rdp->nxtlist; 1585 *tail = rdp->nxtlist;
1585 rdp->nxtlist = list; 1586 rdp->nxtlist = list;
1586 for (count = 0; count < RCU_NEXT_SIZE; count++) 1587 for (i = 0; i < RCU_NEXT_SIZE; i++)
1587 if (&rdp->nxtlist == rdp->nxttail[count]) 1588 if (&rdp->nxtlist == rdp->nxttail[i])
1588 rdp->nxttail[count] = tail; 1589 rdp->nxttail[i] = tail;
1589 else 1590 else
1590 break; 1591 break;
1591 } 1592 }
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index ea056495783e..19b61ac1079f 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -444,6 +444,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
444/* Forward declarations for rcutree_plugin.h */ 444/* Forward declarations for rcutree_plugin.h */
445static void rcu_bootup_announce(void); 445static void rcu_bootup_announce(void);
446long rcu_batches_completed(void); 446long rcu_batches_completed(void);
447static void rcu_preempt_note_context_switch(int cpu);
447static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); 448static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
448#ifdef CONFIG_HOTPLUG_CPU 449#ifdef CONFIG_HOTPLUG_CPU
449static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, 450static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 5271a020887e..3e4899459f3d 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -153,7 +153,7 @@ static void rcu_preempt_qs(int cpu)
153 * 153 *
154 * Caller must disable preemption. 154 * Caller must disable preemption.
155 */ 155 */
156void rcu_preempt_note_context_switch(void) 156static void rcu_preempt_note_context_switch(int cpu)
157{ 157{
158 struct task_struct *t = current; 158 struct task_struct *t = current;
159 unsigned long flags; 159 unsigned long flags;
@@ -164,7 +164,7 @@ void rcu_preempt_note_context_switch(void)
164 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { 164 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
165 165
166 /* Possibly blocking in an RCU read-side critical section. */ 166 /* Possibly blocking in an RCU read-side critical section. */
167 rdp = __this_cpu_ptr(rcu_preempt_state.rda); 167 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
168 rnp = rdp->mynode; 168 rnp = rdp->mynode;
169 raw_spin_lock_irqsave(&rnp->lock, flags); 169 raw_spin_lock_irqsave(&rnp->lock, flags);
170 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 170 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
@@ -228,7 +228,7 @@ void rcu_preempt_note_context_switch(void)
228 * means that we continue to block the current grace period. 228 * means that we continue to block the current grace period.
229 */ 229 */
230 local_irq_save(flags); 230 local_irq_save(flags);
231 rcu_preempt_qs(smp_processor_id()); 231 rcu_preempt_qs(cpu);
232 local_irq_restore(flags); 232 local_irq_restore(flags);
233} 233}
234 234
@@ -1002,6 +1002,14 @@ void rcu_force_quiescent_state(void)
1002EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 1002EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
1003 1003
1004/* 1004/*
1005 * Because preemptible RCU does not exist, we never have to check for
1006 * CPUs being in quiescent states.
1007 */
1008static void rcu_preempt_note_context_switch(int cpu)
1009{
1010}
1011
1012/*
1005 * Because preemptible RCU does not exist, there are never any preempted 1013 * Because preemptible RCU does not exist, there are never any preempted
1006 * RCU readers. 1014 * RCU readers.
1007 */ 1015 */
diff --git a/kernel/relay.c b/kernel/relay.c
index ab56a1764d4d..e8cd2027abbd 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1235,6 +1235,7 @@ static ssize_t subbuf_splice_actor(struct file *in,
1235 struct splice_pipe_desc spd = { 1235 struct splice_pipe_desc spd = {
1236 .pages = pages, 1236 .pages = pages,
1237 .nr_pages = 0, 1237 .nr_pages = 0,
1238 .nr_pages_max = PIPE_DEF_BUFFERS,
1238 .partial = partial, 1239 .partial = partial,
1239 .flags = flags, 1240 .flags = flags,
1240 .ops = &relay_pipe_buf_ops, 1241 .ops = &relay_pipe_buf_ops,
@@ -1302,8 +1303,8 @@ static ssize_t subbuf_splice_actor(struct file *in,
1302 ret += padding; 1303 ret += padding;
1303 1304
1304out: 1305out:
1305 splice_shrink_spd(pipe, &spd); 1306 splice_shrink_spd(&spd);
1306 return ret; 1307 return ret;
1307} 1308}
1308 1309
1309static ssize_t relay_file_splice_read(struct file *in, 1310static ssize_t relay_file_splice_read(struct file *in,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d5594a4268d4..468bdd44c1ba 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2081,7 +2081,6 @@ context_switch(struct rq *rq, struct task_struct *prev,
2081#endif 2081#endif
2082 2082
2083 /* Here we just switch the register state and the stack. */ 2083 /* Here we just switch the register state and the stack. */
2084 rcu_switch_from(prev);
2085 switch_to(prev, next, prev); 2084 switch_to(prev, next, prev);
2086 2085
2087 barrier(); 2086 barrier();
@@ -2161,11 +2160,73 @@ unsigned long this_cpu_load(void)
2161} 2160}
2162 2161
2163 2162
2163/*
2164 * Global load-average calculations
2165 *
2166 * We take a distributed and async approach to calculating the global load-avg
2167 * in order to minimize overhead.
2168 *
2169 * The global load average is an exponentially decaying average of nr_running +
2170 * nr_uninterruptible.
2171 *
2172 * Once every LOAD_FREQ:
2173 *
2174 * nr_active = 0;
2175 * for_each_possible_cpu(cpu)
2176 * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
2177 *
2178 * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
2179 *
 2180 * Due to a number of reasons the above turns into the mess below:
2181 *
2182 * - for_each_possible_cpu() is prohibitively expensive on machines with
2183 * serious number of cpus, therefore we need to take a distributed approach
2184 * to calculating nr_active.
2185 *
2186 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
2187 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
2188 *
 2189 * So assuming nr_active := 0 when we start out -- true by definition, we
2190 * can simply take per-cpu deltas and fold those into a global accumulate
2191 * to obtain the same result. See calc_load_fold_active().
2192 *
2193 * Furthermore, in order to avoid synchronizing all per-cpu delta folding
2194 * across the machine, we assume 10 ticks is sufficient time for every
2195 * cpu to have completed this task.
2196 *
2197 * This places an upper-bound on the IRQ-off latency of the machine. Then
 2198 * again, being late doesn't lose the delta, just wrecks the sample.
2199 *
2200 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
2201 * this would add another cross-cpu cacheline miss and atomic operation
2202 * to the wakeup path. Instead we increment on whatever cpu the task ran
2203 * when it went into uninterruptible state and decrement on whatever cpu
2204 * did the wakeup. This means that only the sum of nr_uninterruptible over
2205 * all cpus yields the correct result.
2206 *
 2207 * This covers the NO_HZ=n code; for extra headaches, see the comment below.
2208 */
2209
2164/* Variables and functions for calc_load */ 2210/* Variables and functions for calc_load */
2165static atomic_long_t calc_load_tasks; 2211static atomic_long_t calc_load_tasks;
2166static unsigned long calc_load_update; 2212static unsigned long calc_load_update;
2167unsigned long avenrun[3]; 2213unsigned long avenrun[3];
2168EXPORT_SYMBOL(avenrun); 2214EXPORT_SYMBOL(avenrun); /* should be removed */
2215
2216/**
2217 * get_avenrun - get the load average array
2218 * @loads: pointer to dest load array
2219 * @offset: offset to add
2220 * @shift: shift count to shift the result left
2221 *
2222 * These values are estimates at best, so no need for locking.
2223 */
2224void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2225{
2226 loads[0] = (avenrun[0] + offset) << shift;
2227 loads[1] = (avenrun[1] + offset) << shift;
2228 loads[2] = (avenrun[2] + offset) << shift;
2229}
2169 2230
2170static long calc_load_fold_active(struct rq *this_rq) 2231static long calc_load_fold_active(struct rq *this_rq)
2171{ 2232{
@@ -2182,6 +2243,9 @@ static long calc_load_fold_active(struct rq *this_rq)
2182 return delta; 2243 return delta;
2183} 2244}
2184 2245
2246/*
2247 * a1 = a0 * e + a * (1 - e)
2248 */
2185static unsigned long 2249static unsigned long
2186calc_load(unsigned long load, unsigned long exp, unsigned long active) 2250calc_load(unsigned long load, unsigned long exp, unsigned long active)
2187{ 2251{
@@ -2193,30 +2257,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
2193 2257
2194#ifdef CONFIG_NO_HZ 2258#ifdef CONFIG_NO_HZ
2195/* 2259/*
2196 * For NO_HZ we delay the active fold to the next LOAD_FREQ update. 2260 * Handle NO_HZ for the global load-average.
2261 *
2262 * Since the above described distributed algorithm to compute the global
2263 * load-average relies on per-cpu sampling from the tick, it is affected by
2264 * NO_HZ.
2265 *
2266 * The basic idea is to fold the nr_active delta into a global idle-delta upon
2267 * entering NO_HZ state such that we can include this as an 'extra' cpu delta
2268 * when we read the global state.
2269 *
2270 * Obviously reality has to ruin such a delightfully simple scheme:
2271 *
2272 * - When we go NO_HZ idle during the window, we can negate our sample
2273 * contribution, causing under-accounting.
2274 *
2275 * We avoid this by keeping two idle-delta counters and flipping them
2276 * when the window starts, thus separating old and new NO_HZ load.
2277 *
2278 * The only trick is the slight shift in index flip for read vs write.
2279 *
2280 * 0s 5s 10s 15s
2281 * +10 +10 +10 +10
2282 * |-|-----------|-|-----------|-|-----------|-|
2283 * r:0 0 1 1 0 0 1 1 0
2284 * w:0 1 1 0 0 1 1 0 0
2285 *
2286 * This ensures we'll fold the old idle contribution in this window while
 2287 * accumulating the new one.
2288 *
2289 * - When we wake up from NO_HZ idle during the window, we push up our
2290 * contribution, since we effectively move our sample point to a known
2291 * busy state.
2292 *
2293 * This is solved by pushing the window forward, and thus skipping the
2294 * sample, for this cpu (effectively using the idle-delta for this cpu which
2295 * was in effect at the time the window opened). This also solves the issue
2296 * of having to deal with a cpu having been in NOHZ idle for multiple
2297 * LOAD_FREQ intervals.
2197 * 2298 *
2198 * When making the ILB scale, we should try to pull this in as well. 2299 * When making the ILB scale, we should try to pull this in as well.
2199 */ 2300 */
2200static atomic_long_t calc_load_tasks_idle; 2301static atomic_long_t calc_load_idle[2];
2302static int calc_load_idx;
2201 2303
2202void calc_load_account_idle(struct rq *this_rq) 2304static inline int calc_load_write_idx(void)
2203{ 2305{
2306 int idx = calc_load_idx;
2307
2308 /*
2309 * See calc_global_nohz(), if we observe the new index, we also
2310 * need to observe the new update time.
2311 */
2312 smp_rmb();
2313
2314 /*
2315 * If the folding window started, make sure we start writing in the
2316 * next idle-delta.
2317 */
2318 if (!time_before(jiffies, calc_load_update))
2319 idx++;
2320
2321 return idx & 1;
2322}
2323
2324static inline int calc_load_read_idx(void)
2325{
2326 return calc_load_idx & 1;
2327}
2328
2329void calc_load_enter_idle(void)
2330{
2331 struct rq *this_rq = this_rq();
2204 long delta; 2332 long delta;
2205 2333
2334 /*
2335 * We're going into NOHZ mode, if there's any pending delta, fold it
2336 * into the pending idle delta.
2337 */
2206 delta = calc_load_fold_active(this_rq); 2338 delta = calc_load_fold_active(this_rq);
2207 if (delta) 2339 if (delta) {
2208 atomic_long_add(delta, &calc_load_tasks_idle); 2340 int idx = calc_load_write_idx();
2341 atomic_long_add(delta, &calc_load_idle[idx]);
2342 }
2209} 2343}
2210 2344
2211static long calc_load_fold_idle(void) 2345void calc_load_exit_idle(void)
2212{ 2346{
2213 long delta = 0; 2347 struct rq *this_rq = this_rq();
2348
2349 /*
2350 * If we're still before the sample window, we're done.
2351 */
2352 if (time_before(jiffies, this_rq->calc_load_update))
2353 return;
2214 2354
2215 /* 2355 /*
2216 * Its got a race, we don't care... 2356 * We woke inside or after the sample window, this means we're already
2357 * accounted through the nohz accounting, so skip the entire deal and
2358 * sync up for the next window.
2217 */ 2359 */
2218 if (atomic_long_read(&calc_load_tasks_idle)) 2360 this_rq->calc_load_update = calc_load_update;
2219 delta = atomic_long_xchg(&calc_load_tasks_idle, 0); 2361 if (time_before(jiffies, this_rq->calc_load_update + 10))
2362 this_rq->calc_load_update += LOAD_FREQ;
2363}
2364
2365static long calc_load_fold_idle(void)
2366{
2367 int idx = calc_load_read_idx();
2368 long delta = 0;
2369
2370 if (atomic_long_read(&calc_load_idle[idx]))
2371 delta = atomic_long_xchg(&calc_load_idle[idx], 0);
2220 2372
2221 return delta; 2373 return delta;
2222} 2374}
@@ -2302,66 +2454,39 @@ static void calc_global_nohz(void)
2302{ 2454{
2303 long delta, active, n; 2455 long delta, active, n;
2304 2456
2305 /* 2457 if (!time_before(jiffies, calc_load_update + 10)) {
2306 * If we crossed a calc_load_update boundary, make sure to fold 2458 /*
2307 * any pending idle changes, the respective CPUs might have 2459 * Catch-up, fold however many we are behind still
2308 * missed the tick driven calc_load_account_active() update 2460 */
2309 * due to NO_HZ. 2461 delta = jiffies - calc_load_update - 10;
2310 */ 2462 n = 1 + (delta / LOAD_FREQ);
2311 delta = calc_load_fold_idle();
2312 if (delta)
2313 atomic_long_add(delta, &calc_load_tasks);
2314
2315 /*
2316 * It could be the one fold was all it took, we done!
2317 */
2318 if (time_before(jiffies, calc_load_update + 10))
2319 return;
2320
2321 /*
2322 * Catch-up, fold however many we are behind still
2323 */
2324 delta = jiffies - calc_load_update - 10;
2325 n = 1 + (delta / LOAD_FREQ);
2326 2463
2327 active = atomic_long_read(&calc_load_tasks); 2464 active = atomic_long_read(&calc_load_tasks);
2328 active = active > 0 ? active * FIXED_1 : 0; 2465 active = active > 0 ? active * FIXED_1 : 0;
2329 2466
2330 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); 2467 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
2331 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); 2468 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
2332 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); 2469 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
2333 2470
2334 calc_load_update += n * LOAD_FREQ; 2471 calc_load_update += n * LOAD_FREQ;
2335} 2472 }
2336#else
2337void calc_load_account_idle(struct rq *this_rq)
2338{
2339}
2340 2473
2341static inline long calc_load_fold_idle(void) 2474 /*
2342{ 2475 * Flip the idle index...
2343 return 0; 2476 *
2477 * Make sure we first write the new time then flip the index, so that
2478 * calc_load_write_idx() will see the new time when it reads the new
2479 * index, this avoids a double flip messing things up.
2480 */
2481 smp_wmb();
2482 calc_load_idx++;
2344} 2483}
2484#else /* !CONFIG_NO_HZ */
2345 2485
2346static void calc_global_nohz(void) 2486static inline long calc_load_fold_idle(void) { return 0; }
2347{ 2487static inline void calc_global_nohz(void) { }
2348}
2349#endif
2350 2488
2351/** 2489#endif /* CONFIG_NO_HZ */
2352 * get_avenrun - get the load average array
2353 * @loads: pointer to dest load array
2354 * @offset: offset to add
2355 * @shift: shift count to shift the result left
2356 *
2357 * These values are estimates at best, so no need for locking.
2358 */
2359void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2360{
2361 loads[0] = (avenrun[0] + offset) << shift;
2362 loads[1] = (avenrun[1] + offset) << shift;
2363 loads[2] = (avenrun[2] + offset) << shift;
2364}
2365 2490
2366/* 2491/*
2367 * calc_load - update the avenrun load estimates 10 ticks after the 2492 * calc_load - update the avenrun load estimates 10 ticks after the
@@ -2369,11 +2494,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2369 */ 2494 */
2370void calc_global_load(unsigned long ticks) 2495void calc_global_load(unsigned long ticks)
2371{ 2496{
2372 long active; 2497 long active, delta;
2373 2498
2374 if (time_before(jiffies, calc_load_update + 10)) 2499 if (time_before(jiffies, calc_load_update + 10))
2375 return; 2500 return;
2376 2501
2502 /*
2503 * Fold the 'old' idle-delta to include all NO_HZ cpus.
2504 */
2505 delta = calc_load_fold_idle();
2506 if (delta)
2507 atomic_long_add(delta, &calc_load_tasks);
2508
2377 active = atomic_long_read(&calc_load_tasks); 2509 active = atomic_long_read(&calc_load_tasks);
2378 active = active > 0 ? active * FIXED_1 : 0; 2510 active = active > 0 ? active * FIXED_1 : 0;
2379 2511
@@ -2384,12 +2516,7 @@ void calc_global_load(unsigned long ticks)
2384 calc_load_update += LOAD_FREQ; 2516 calc_load_update += LOAD_FREQ;
2385 2517
2386 /* 2518 /*
2387 * Account one period with whatever state we found before 2519 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
2388 * folding in the nohz state and ageing the entire idle period.
2389 *
 2390 * This avoids losing a sample when we go idle between
2391 * calc_load_account_active() (10 ticks ago) and now and thus
2392 * under-accounting.
2393 */ 2520 */
2394 calc_global_nohz(); 2521 calc_global_nohz();
2395} 2522}
@@ -2406,7 +2533,6 @@ static void calc_load_account_active(struct rq *this_rq)
2406 return; 2533 return;
2407 2534
2408 delta = calc_load_fold_active(this_rq); 2535 delta = calc_load_fold_active(this_rq);
2409 delta += calc_load_fold_idle();
2410 if (delta) 2536 if (delta)
2411 atomic_long_add(delta, &calc_load_tasks); 2537 atomic_long_add(delta, &calc_load_tasks);
2412 2538
@@ -2414,6 +2540,10 @@ static void calc_load_account_active(struct rq *this_rq)
2414} 2540}
2415 2541
2416/* 2542/*
2543 * End of global load-average stuff
2544 */
2545
2546/*
2417 * The exact cpuload at various idx values, calculated at every tick would be 2547 * The exact cpuload at various idx values, calculated at every tick would be
2418 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load 2548 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
2419 * 2549 *
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index b44d604b35d1..b6baf370cae9 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -25,7 +25,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
25static struct task_struct *pick_next_task_idle(struct rq *rq) 25static struct task_struct *pick_next_task_idle(struct rq *rq)
26{ 26{
27 schedstat_inc(rq, sched_goidle); 27 schedstat_inc(rq, sched_goidle);
28 calc_load_account_idle(rq);
29 return rq->idle; 28 return rq->idle;
30} 29}
31 30
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6d52cea7f33d..55844f24435a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -942,8 +942,6 @@ static inline u64 sched_avg_period(void)
942 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; 942 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
943} 943}
944 944
945void calc_load_account_idle(struct rq *this_rq);
946
947#ifdef CONFIG_SCHED_HRTICK 945#ifdef CONFIG_SCHED_HRTICK
948 946
949/* 947/*
diff --git a/kernel/sys.c b/kernel/sys.c
index f0ec44dcd415..2d39a84cd857 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1788,7 +1788,6 @@ SYSCALL_DEFINE1(umask, int, mask)
1788#ifdef CONFIG_CHECKPOINT_RESTORE 1788#ifdef CONFIG_CHECKPOINT_RESTORE
1789static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) 1789static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1790{ 1790{
1791 struct vm_area_struct *vma;
1792 struct file *exe_file; 1791 struct file *exe_file;
1793 struct dentry *dentry; 1792 struct dentry *dentry;
1794 int err; 1793 int err;
@@ -1816,13 +1815,17 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1816 down_write(&mm->mmap_sem); 1815 down_write(&mm->mmap_sem);
1817 1816
1818 /* 1817 /*
1819 * Forbid mm->exe_file change if there are mapped other files. 1818 * Forbid mm->exe_file change if old file still mapped.
1820 */ 1819 */
1821 err = -EBUSY; 1820 err = -EBUSY;
1822 for (vma = mm->mmap; vma; vma = vma->vm_next) { 1821 if (mm->exe_file) {
1823 if (vma->vm_file && !path_equal(&vma->vm_file->f_path, 1822 struct vm_area_struct *vma;
1824 &exe_file->f_path)) 1823
1825 goto exit_unlock; 1824 for (vma = mm->mmap; vma; vma = vma->vm_next)
1825 if (vma->vm_file &&
1826 path_equal(&vma->vm_file->f_path,
1827 &mm->exe_file->f_path))
1828 goto exit_unlock;
1826 } 1829 }
1827 1830
1828 /* 1831 /*
@@ -1835,6 +1838,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1835 if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) 1838 if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
1836 goto exit_unlock; 1839 goto exit_unlock;
1837 1840
1841 err = 0;
1838 set_mm_exe_file(mm, exe_file); 1842 set_mm_exe_file(mm, exe_file);
1839exit_unlock: 1843exit_unlock:
1840 up_write(&mm->mmap_sem); 1844 up_write(&mm->mmap_sem);
@@ -2127,9 +2131,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2127 else 2131 else
2128 return -EINVAL; 2132 return -EINVAL;
2129 break; 2133 break;
2130 case PR_GET_TID_ADDRESS:
2131 error = prctl_get_tid_address(me, (int __user **)arg2);
2132 break;
2133 default: 2134 default:
2134 return -EINVAL; 2135 return -EINVAL;
2135 } 2136 }
@@ -2147,6 +2148,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2147 case PR_SET_MM: 2148 case PR_SET_MM:
2148 error = prctl_set_mm(arg2, arg3, arg4, arg5); 2149 error = prctl_set_mm(arg2, arg3, arg4, arg5);
2149 break; 2150 break;
2151 case PR_GET_TID_ADDRESS:
2152 error = prctl_get_tid_address(me, (int __user **)arg2);
2153 break;
2150 case PR_SET_CHILD_SUBREAPER: 2154 case PR_SET_CHILD_SUBREAPER:
2151 me->signal->is_child_subreaper = !!arg2; 2155 me->signal->is_child_subreaper = !!arg2;
2152 error = 0; 2156 error = 0;
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 70b33abcc7bb..b7fbadc5c973 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -409,7 +409,9 @@ int second_overflow(unsigned long secs)
409 time_state = TIME_DEL; 409 time_state = TIME_DEL;
410 break; 410 break;
411 case TIME_INS: 411 case TIME_INS:
412 if (secs % 86400 == 0) { 412 if (!(time_status & STA_INS))
413 time_state = TIME_OK;
414 else if (secs % 86400 == 0) {
413 leap = -1; 415 leap = -1;
414 time_state = TIME_OOP; 416 time_state = TIME_OOP;
415 time_tai++; 417 time_tai++;
@@ -418,7 +420,9 @@ int second_overflow(unsigned long secs)
418 } 420 }
419 break; 421 break;
420 case TIME_DEL: 422 case TIME_DEL:
421 if ((secs + 1) % 86400 == 0) { 423 if (!(time_status & STA_DEL))
424 time_state = TIME_OK;
425 else if ((secs + 1) % 86400 == 0) {
422 leap = 1; 426 leap = 1;
423 time_tai--; 427 time_tai--;
424 time_state = TIME_WAIT; 428 time_state = TIME_WAIT;
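With the STA_INS/STA_DEL checks added above, second_overflow() now falls back to TIME_OK unless userspace has actually armed a leap second. For context, arming an insertion goes through the standard adjtimex() interface, roughly as in the sketch below (requires CAP_SYS_TIME; this is ordinary NTP-daemon behaviour, not code from the patch):

	#include <stdio.h>
	#include <sys/timex.h>

	int main(void)
	{
		struct timex tx = { .modes = 0 };

		if (adjtimex(&tx) < 0) {	/* read the current status bits */
			perror("adjtimex");
			return 1;
		}

		tx.modes = ADJ_STATUS;		/* write them back with STA_INS set */
		tx.status |= STA_INS;
		if (adjtimex(&tx) < 0) {
			perror("adjtimex");
			return 1;
		}

		printf("time_status is now 0x%x\n", tx.status);
		return 0;
	}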
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 869997833928..4a08472c3ca7 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -406,6 +406,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
406 */ 406 */
407 if (!ts->tick_stopped) { 407 if (!ts->tick_stopped) {
408 select_nohz_load_balancer(1); 408 select_nohz_load_balancer(1);
409 calc_load_enter_idle();
409 410
410 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); 411 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
411 ts->tick_stopped = 1; 412 ts->tick_stopped = 1;
@@ -597,6 +598,7 @@ void tick_nohz_idle_exit(void)
597 account_idle_ticks(ticks); 598 account_idle_ticks(ticks);
598#endif 599#endif
599 600
601 calc_load_exit_idle();
600 touch_softlockup_watchdog(); 602 touch_softlockup_watchdog();
601 /* 603 /*
602 * Cancel the scheduled timer and restore the tick 604 * Cancel the scheduled timer and restore the tick
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 6f46a00a1e8a..3447cfaf11e7 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -70,6 +70,12 @@ struct timekeeper {
70 /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ 70 /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */
71 struct timespec raw_time; 71 struct timespec raw_time;
72 72
73 /* Offset clock monotonic -> clock realtime */
74 ktime_t offs_real;
75
76 /* Offset clock monotonic -> clock boottime */
77 ktime_t offs_boot;
78
73 /* Seqlock for all timekeeper values */ 79 /* Seqlock for all timekeeper values */
74 seqlock_t lock; 80 seqlock_t lock;
75}; 81};
@@ -172,6 +178,14 @@ static inline s64 timekeeping_get_ns_raw(void)
172 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 178 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
173} 179}
174 180
181static void update_rt_offset(void)
182{
183 struct timespec tmp, *wtm = &timekeeper.wall_to_monotonic;
184
185 set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec);
186 timekeeper.offs_real = timespec_to_ktime(tmp);
187}
188
175/* must hold write on timekeeper.lock */ 189/* must hold write on timekeeper.lock */
176static void timekeeping_update(bool clearntp) 190static void timekeeping_update(bool clearntp)
177{ 191{
@@ -179,6 +193,7 @@ static void timekeeping_update(bool clearntp)
179 timekeeper.ntp_error = 0; 193 timekeeper.ntp_error = 0;
180 ntp_clear(); 194 ntp_clear();
181 } 195 }
196 update_rt_offset();
182 update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, 197 update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic,
183 timekeeper.clock, timekeeper.mult); 198 timekeeper.clock, timekeeper.mult);
184} 199}
@@ -604,6 +619,7 @@ void __init timekeeping_init(void)
604 } 619 }
605 set_normalized_timespec(&timekeeper.wall_to_monotonic, 620 set_normalized_timespec(&timekeeper.wall_to_monotonic,
606 -boot.tv_sec, -boot.tv_nsec); 621 -boot.tv_sec, -boot.tv_nsec);
622 update_rt_offset();
607 timekeeper.total_sleep_time.tv_sec = 0; 623 timekeeper.total_sleep_time.tv_sec = 0;
608 timekeeper.total_sleep_time.tv_nsec = 0; 624 timekeeper.total_sleep_time.tv_nsec = 0;
609 write_sequnlock_irqrestore(&timekeeper.lock, flags); 625 write_sequnlock_irqrestore(&timekeeper.lock, flags);
@@ -612,6 +628,12 @@ void __init timekeeping_init(void)
612/* time in seconds when suspend began */ 628/* time in seconds when suspend began */
613static struct timespec timekeeping_suspend_time; 629static struct timespec timekeeping_suspend_time;
614 630
631static void update_sleep_time(struct timespec t)
632{
633 timekeeper.total_sleep_time = t;
634 timekeeper.offs_boot = timespec_to_ktime(t);
635}
636
615/** 637/**
616 * __timekeeping_inject_sleeptime - Internal function to add sleep interval 638 * __timekeeping_inject_sleeptime - Internal function to add sleep interval
617 * @delta: pointer to a timespec delta value 639 * @delta: pointer to a timespec delta value
@@ -630,8 +652,7 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta)
630 timekeeper.xtime = timespec_add(timekeeper.xtime, *delta); 652 timekeeper.xtime = timespec_add(timekeeper.xtime, *delta);
631 timekeeper.wall_to_monotonic = 653 timekeeper.wall_to_monotonic =
632 timespec_sub(timekeeper.wall_to_monotonic, *delta); 654 timespec_sub(timekeeper.wall_to_monotonic, *delta);
633 timekeeper.total_sleep_time = timespec_add( 655 update_sleep_time(timespec_add(timekeeper.total_sleep_time, *delta));
634 timekeeper.total_sleep_time, *delta);
635} 656}
636 657
637 658
@@ -696,6 +717,7 @@ static void timekeeping_resume(void)
696 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 717 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
697 timekeeper.ntp_error = 0; 718 timekeeper.ntp_error = 0;
698 timekeeping_suspended = 0; 719 timekeeping_suspended = 0;
720 timekeeping_update(false);
699 write_sequnlock_irqrestore(&timekeeper.lock, flags); 721 write_sequnlock_irqrestore(&timekeeper.lock, flags);
700 722
701 touch_softlockup_watchdog(); 723 touch_softlockup_watchdog();
@@ -963,6 +985,8 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
963 leap = second_overflow(timekeeper.xtime.tv_sec); 985 leap = second_overflow(timekeeper.xtime.tv_sec);
964 timekeeper.xtime.tv_sec += leap; 986 timekeeper.xtime.tv_sec += leap;
965 timekeeper.wall_to_monotonic.tv_sec -= leap; 987 timekeeper.wall_to_monotonic.tv_sec -= leap;
988 if (leap)
989 clock_was_set_delayed();
966 } 990 }
967 991
968 /* Accumulate raw time */ 992 /* Accumulate raw time */
@@ -1079,6 +1103,8 @@ static void update_wall_time(void)
1079 leap = second_overflow(timekeeper.xtime.tv_sec); 1103 leap = second_overflow(timekeeper.xtime.tv_sec);
1080 timekeeper.xtime.tv_sec += leap; 1104 timekeeper.xtime.tv_sec += leap;
1081 timekeeper.wall_to_monotonic.tv_sec -= leap; 1105 timekeeper.wall_to_monotonic.tv_sec -= leap;
1106 if (leap)
1107 clock_was_set_delayed();
1082 } 1108 }
1083 1109
1084 timekeeping_update(false); 1110 timekeeping_update(false);
@@ -1246,6 +1272,40 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1246 } while (read_seqretry(&timekeeper.lock, seq)); 1272 } while (read_seqretry(&timekeeper.lock, seq));
1247} 1273}
1248 1274
1275#ifdef CONFIG_HIGH_RES_TIMERS
1276/**
1277 * ktime_get_update_offsets - hrtimer helper
1278 * @offs_real: pointer to storage for monotonic -> realtime offset
1279 * @offs_boot: pointer to storage for monotonic -> boottime offset
1280 *
1281 * Returns current monotonic time and updates the offsets
 1282 * Called from hrtimer_interrupt() or retrigger_next_event()
1283 */
1284ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot)
1285{
1286 ktime_t now;
1287 unsigned int seq;
1288 u64 secs, nsecs;
1289
1290 do {
1291 seq = read_seqbegin(&timekeeper.lock);
1292
1293 secs = timekeeper.xtime.tv_sec;
1294 nsecs = timekeeper.xtime.tv_nsec;
1295 nsecs += timekeeping_get_ns();
1296 /* If arch requires, add in gettimeoffset() */
1297 nsecs += arch_gettimeoffset();
1298
1299 *offs_real = timekeeper.offs_real;
1300 *offs_boot = timekeeper.offs_boot;
1301 } while (read_seqretry(&timekeeper.lock, seq));
1302
1303 now = ktime_add_ns(ktime_set(secs, 0), nsecs);
1304 now = ktime_sub(now, *offs_real);
1305 return now;
1306}
1307#endif
1308
1249/** 1309/**
1250 * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format 1310 * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format
1251 */ 1311 */
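The offs_real/offs_boot fields and ktime_get_update_offsets() added above let hrtimers derive CLOCK_REALTIME and CLOCK_BOOTTIME bases from a single CLOCK_MONOTONIC read: offs_real is -wall_to_monotonic and offs_boot is total_sleep_time. A small user-space restatement of that arithmetic, with sample values invented purely for illustration:

	#include <stdio.h>

	/* realtime = monotonic + offs_real   (offs_real = -wall_to_monotonic)
	 * boottime = monotonic + offs_boot   (offs_boot = total_sleep_time)
	 * All values in nanoseconds; the numbers below are made up. */
	int main(void)
	{
		long long monotonic = 120000000000LL;	     /* 120s awake since boot */
		long long offs_real = 1340000000000000000LL; /* wall clock at boot (sample) */
		long long offs_boot = 30000000000LL;	     /* 30s spent suspended */

		printf("realtime = %lld ns\n", monotonic + offs_real);
		printf("boottime = %lld ns\n", monotonic + offs_boot);
		return 0;
	}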
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1d0f6a8a0e5e..f765465bffe4 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1075,6 +1075,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
1075 rb_init_page(bpage->page); 1075 rb_init_page(bpage->page);
1076 1076
1077 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 1077 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
1078 INIT_LIST_HEAD(&cpu_buffer->new_pages);
1078 1079
1079 ret = rb_allocate_pages(cpu_buffer, nr_pages); 1080 ret = rb_allocate_pages(cpu_buffer, nr_pages);
1080 if (ret < 0) 1081 if (ret < 0)
@@ -1346,10 +1347,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages)
1346 * If something was added to this page, it was full 1347 * If something was added to this page, it was full
1347 * since it is not the tail page. So we deduct the 1348 * since it is not the tail page. So we deduct the
1348 * bytes consumed in ring buffer from here. 1349 * bytes consumed in ring buffer from here.
1349 * No need to update overruns, since this page is 1350 * Increment overrun to account for the lost events.
1350 * deleted from ring buffer and its entries are
1351 * already accounted for.
1352 */ 1351 */
1352 local_add(page_entries, &cpu_buffer->overrun);
1353 local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); 1353 local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
1354 } 1354 }
1355 1355
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 49249c28690d..a7fa0702be1c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3609,6 +3609,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3609 .pages = pages_def, 3609 .pages = pages_def,
3610 .partial = partial_def, 3610 .partial = partial_def,
3611 .nr_pages = 0, /* This gets updated below. */ 3611 .nr_pages = 0, /* This gets updated below. */
3612 .nr_pages_max = PIPE_DEF_BUFFERS,
3612 .flags = flags, 3613 .flags = flags,
3613 .ops = &tracing_pipe_buf_ops, 3614 .ops = &tracing_pipe_buf_ops,
3614 .spd_release = tracing_spd_release_pipe, 3615 .spd_release = tracing_spd_release_pipe,
@@ -3680,7 +3681,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3680 3681
3681 ret = splice_to_pipe(pipe, &spd); 3682 ret = splice_to_pipe(pipe, &spd);
3682out: 3683out:
3683 splice_shrink_spd(pipe, &spd); 3684 splice_shrink_spd(&spd);
3684 return ret; 3685 return ret;
3685 3686
3686out_err: 3687out_err:
@@ -4231,6 +4232,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4231 struct splice_pipe_desc spd = { 4232 struct splice_pipe_desc spd = {
4232 .pages = pages_def, 4233 .pages = pages_def,
4233 .partial = partial_def, 4234 .partial = partial_def,
4235 .nr_pages_max = PIPE_DEF_BUFFERS,
4234 .flags = flags, 4236 .flags = flags,
4235 .ops = &buffer_pipe_buf_ops, 4237 .ops = &buffer_pipe_buf_ops,
4236 .spd_release = buffer_spd_release, 4238 .spd_release = buffer_spd_release,
@@ -4318,7 +4320,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4318 } 4320 }
4319 4321
4320 ret = splice_to_pipe(pipe, &spd); 4322 ret = splice_to_pipe(pipe, &spd);
4321 splice_shrink_spd(pipe, &spd); 4323 splice_shrink_spd(&spd);
4322out: 4324out:
4323 return ret; 4325 return ret;
4324} 4326}