Diffstat (limited to 'kernel')

 kernel/audit.c                    | 32
 kernel/auditsc.c                  | 24
 kernel/cgroup.c                   |  2
 kernel/fork.c                     | 16
 kernel/latencytop.c               |  2
 kernel/posix-cpu-timers.c         |  2
 kernel/power/swap.c               |  2
 kernel/relay.c                    |  7
 kernel/sched.c                    |  4
 kernel/sched_clock.c              |  6
 kernel/softlockup.c               |  2
 kernel/sysctl.c                   | 10
 kernel/time/timekeeping.c         | 22
 kernel/trace/Kconfig              | 13
 kernel/trace/ftrace.c             | 15
 kernel/trace/trace.c              | 40
 kernel/trace/trace_sched_switch.c |  1
 kernel/trace/trace_stack.c        | 49
 18 files changed, 183 insertions(+), 66 deletions(-)
diff --git a/kernel/audit.c b/kernel/audit.c
index 4414e93d8750..ce6d8ea3131e 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -61,8 +61,11 @@
 
 #include "audit.h"
 
-/* No auditing will take place until audit_initialized != 0.
+/* No auditing will take place until audit_initialized == AUDIT_INITIALIZED.
  * (Initialization happens after skb_init is called.) */
+#define AUDIT_DISABLED      -1
+#define AUDIT_UNINITIALIZED  0
+#define AUDIT_INITIALIZED    1
 static int audit_initialized;
 
 #define AUDIT_OFF 0
@@ -965,6 +968,9 @@ static int __init audit_init(void)
 {
     int i;
 
+    if (audit_initialized == AUDIT_DISABLED)
+        return 0;
+
     printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
            audit_default ? "enabled" : "disabled");
     audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0,
@@ -976,7 +982,7 @@ static int __init audit_init(void)
 
     skb_queue_head_init(&audit_skb_queue);
     skb_queue_head_init(&audit_skb_hold_queue);
-    audit_initialized = 1;
+    audit_initialized = AUDIT_INITIALIZED;
     audit_enabled = audit_default;
     audit_ever_enabled |= !!audit_default;
 
@@ -999,13 +1005,21 @@ __initcall(audit_init);
 static int __init audit_enable(char *str)
 {
     audit_default = !!simple_strtol(str, NULL, 0);
-    printk(KERN_INFO "audit: %s%s\n",
-           audit_default ? "enabled" : "disabled",
-           audit_initialized ? "" : " (after initialization)");
-    if (audit_initialized) {
+    if (!audit_default)
+        audit_initialized = AUDIT_DISABLED;
+
+    printk(KERN_INFO "audit: %s", audit_default ? "enabled" : "disabled");
+
+    if (audit_initialized == AUDIT_INITIALIZED) {
         audit_enabled = audit_default;
         audit_ever_enabled |= !!audit_default;
+    } else if (audit_initialized == AUDIT_UNINITIALIZED) {
+        printk(" (after initialization)");
+    } else {
+        printk(" (until reboot)");
     }
+    printk("\n");
+
     return 1;
 }
 
@@ -1107,9 +1121,7 @@ unsigned int audit_serial(void)
 static inline void audit_get_stamp(struct audit_context *ctx,
                                    struct timespec *t, unsigned int *serial)
 {
-    if (ctx)
-        auditsc_get_stamp(ctx, t, serial);
-    else {
+    if (!ctx || !auditsc_get_stamp(ctx, t, serial)) {
         *t = CURRENT_TIME;
         *serial = audit_serial();
     }
@@ -1146,7 +1158,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
     int reserve;
     unsigned long timeout_start = jiffies;
 
-    if (!audit_initialized)
+    if (audit_initialized != AUDIT_INITIALIZED)
         return NULL;
 
     if (unlikely(audit_filter_type(type)))
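
The audit.c hunks above turn audit_initialized into a tri-state so that audit=0 on the
boot command line disables auditing until reboot rather than merely deferring it. A
minimal userspace sketch of that flow (names mirror the patch; the harness and main()
are hypothetical):

    /* Standalone model of the tri-state introduced above. */
    #include <stdio.h>

    enum { AUDIT_DISABLED = -1, AUDIT_UNINITIALIZED = 0, AUDIT_INITIALIZED = 1 };

    static int audit_initialized = AUDIT_UNINITIALIZED;

    static void audit_enable(int audit_default)   /* models the audit= parameter */
    {
        if (!audit_default)
            audit_initialized = AUDIT_DISABLED;   /* sticks until reboot */
    }

    static int audit_init(void)                   /* models the later initcall */
    {
        if (audit_initialized == AUDIT_DISABLED)
            return 0;                             /* skip netlink setup entirely */
        audit_initialized = AUDIT_INITIALIZED;
        return 0;
    }

    int main(void)
    {
        audit_enable(0);
        audit_init();
        printf("state = %d\n", audit_initialized); /* prints -1: disabled for good */
        return 0;
    }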
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index cf5bc2f5f9c3..2a3f0afc4d2a 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1459,7 +1459,6 @@ void audit_free(struct task_struct *tsk)
 
 /**
  * audit_syscall_entry - fill in an audit record at syscall entry
- * @tsk: task being audited
  * @arch: architecture type
  * @major: major syscall type (function)
  * @a1: additional syscall register 1
@@ -1548,9 +1547,25 @@ void audit_syscall_entry(int arch, int major,
         context->ppid = 0;
 }
 
+void audit_finish_fork(struct task_struct *child)
+{
+    struct audit_context *ctx = current->audit_context;
+    struct audit_context *p = child->audit_context;
+    if (!p || !ctx || !ctx->auditable)
+        return;
+    p->arch = ctx->arch;
+    p->major = ctx->major;
+    memcpy(p->argv, ctx->argv, sizeof(ctx->argv));
+    p->ctime = ctx->ctime;
+    p->dummy = ctx->dummy;
+    p->auditable = ctx->auditable;
+    p->in_syscall = ctx->in_syscall;
+    p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL);
+    p->ppid = current->pid;
+}
+
 /**
  * audit_syscall_exit - deallocate audit context after a system call
- * @tsk: task being audited
  * @valid: success/failure flag
  * @return_code: syscall return value
  *
@@ -1942,15 +1957,18 @@ EXPORT_SYMBOL_GPL(__audit_inode_child);
  *
  * Also sets the context as auditable.
  */
-void auditsc_get_stamp(struct audit_context *ctx,
+int auditsc_get_stamp(struct audit_context *ctx,
                        struct timespec *t, unsigned int *serial)
 {
+    if (!ctx->in_syscall)
+        return 0;
     if (!ctx->serial)
         ctx->serial = audit_serial();
     t->tv_sec = ctx->ctime.tv_sec;
     t->tv_nsec = ctx->ctime.tv_nsec;
     *serial = ctx->serial;
     ctx->auditable = 1;
+    return 1;
 }
 
 /* global counter which is incremented every time something logs in */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index fe00b3b983a8..8185a0f09594 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -702,7 +702,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
      * any child cgroups exist. This is theoretically supportable
      * but involves complex error handling, so it's being left until
      * later */
-    if (!list_empty(&cgrp->children))
+    if (root->number_of_cgroups > 1)
         return -EBUSY;
 
     /* Process each subsystem */
diff --git a/kernel/fork.c b/kernel/fork.c
index 7407ab319875..7b93da72d4a2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -319,17 +319,20 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
         file = tmp->vm_file;
         if (file) {
             struct inode *inode = file->f_path.dentry->d_inode;
+            struct address_space *mapping = file->f_mapping;
+
             get_file(file);
             if (tmp->vm_flags & VM_DENYWRITE)
                 atomic_dec(&inode->i_writecount);
-
-            /* insert tmp into the share list, just after mpnt */
-            spin_lock(&file->f_mapping->i_mmap_lock);
+            spin_lock(&mapping->i_mmap_lock);
+            if (tmp->vm_flags & VM_SHARED)
+                mapping->i_mmap_writable++;
             tmp->vm_truncate_count = mpnt->vm_truncate_count;
-            flush_dcache_mmap_lock(file->f_mapping);
+            flush_dcache_mmap_lock(mapping);
+            /* insert tmp into the share list, just after mpnt */
             vma_prio_tree_add(tmp, mpnt);
-            flush_dcache_mmap_unlock(file->f_mapping);
-            spin_unlock(&file->f_mapping->i_mmap_lock);
+            flush_dcache_mmap_unlock(mapping);
+            spin_unlock(&mapping->i_mmap_lock);
         }
 
         /*
@@ -1406,6 +1409,7 @@ long do_fork(unsigned long clone_flags,
         init_completion(&vfork);
     }
 
+    audit_finish_fork(p);
     tracehook_report_clone(trace, regs, clone_flags, nr, p);
 
     /*
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 5e7b45c56923..449db466bdbc 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -191,7 +191,7 @@ static int lstats_show(struct seq_file *m, void *v)
                latency_record[i].time,
                latency_record[i].max);
         for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
-            char sym[KSYM_NAME_LEN];
+            char sym[KSYM_SYMBOL_LEN];
             char *c;
             if (!latency_record[i].backtrace[q])
                 break;
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 895337b16a24..4e5288a831de 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -311,7 +311,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
     struct task_cputime cputime;
 
     thread_group_cputime(p, &cputime);
-    switch (which_clock) {
+    switch (CPUCLOCK_WHICH(which_clock)) {
     default:
         return -EINVAL;
     case CPUCLOCK_PROF:
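
The posix-cpu-timers fix matters because a CPU clockid is a packed value, not a plain
enum: the PID lives in the high bits and the clock type in the low bits, so switching
on the raw clockid only matched the CPUCLOCK_* constants for the special PID-0
encoding. A sketch of the decoding, with the macros reproduced from memory of
include/linux/posix-timers.h of this era (treat their exact forms as assumptions):

    #include <stdio.h>
    #include <sys/types.h>                        /* pid_t, clockid_t */

    #define CPUCLOCK_CLOCK_MASK 3
    #define CPUCLOCK_PROF       0
    #define CPUCLOCK_VIRT       1
    #define CPUCLOCK_SCHED      2
    #define CPUCLOCK_PID(clock)   ((pid_t) ~((clock) >> 3))
    #define CPUCLOCK_WHICH(clock) ((clock) & (clockid_t) CPUCLOCK_CLOCK_MASK)

    int main(void)
    {
        /* pack a virtual-time clock for pid 42: high bits = ~pid, low bits = type */
        clockid_t clock = (~42) * 8 | CPUCLOCK_VIRT;

        /* raw value is negative and matches no CPUCLOCK_* case directly */
        printf("raw=%d pid=%d which=%d\n",
               (int)clock, (int)CPUCLOCK_PID(clock), (int)CPUCLOCK_WHICH(clock));
        return 0;                                 /* which=1, i.e. CPUCLOCK_VIRT */
    }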
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index b7713b53d07a..6da14358537c 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -633,7 +633,7 @@ void swsusp_close(fmode_t mode)
         return;
     }
 
-    blkdev_put(resume_bdev, mode); /* move up */
+    blkdev_put(resume_bdev, mode);
 }
 
 static int swsusp_header_init(void)
diff --git a/kernel/relay.c b/kernel/relay.c
index 32b0befdcb6a..09ac2008f77b 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1317,12 +1317,9 @@ static ssize_t relay_file_splice_read(struct file *in,
         if (ret < 0)
             break;
         else if (!ret) {
-            if (spliced)
-                break;
-            if (flags & SPLICE_F_NONBLOCK) {
+            if (flags & SPLICE_F_NONBLOCK)
                 ret = -EAGAIN;
-                break;
-            }
+            break;
         }
 
         *ppos += ret;
diff --git a/kernel/sched.c b/kernel/sched.c
index d377097572f9..ceda5799466e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2457,7 +2457,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
         p->sched_class->task_new(rq, p);
         inc_nr_running(rq);
     }
-    trace_sched_wakeup_new(rq, p);
+    trace_sched_wakeup_new(rq, p, 1);
     check_preempt_curr(rq, p, 0);
 #ifdef CONFIG_SMP
     if (p->sched_class->task_wake_up)
@@ -6595,7 +6595,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
             req = list_entry(rq->migration_queue.next,
                              struct migration_req, list);
             list_del_init(&req->list);
+            spin_unlock_irq(&rq->lock);
             complete(&req->done);
+            spin_lock_irq(&rq->lock);
         }
         spin_unlock_irq(&rq->lock);
         break;
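
The migration_call() hunk drops rq->lock around complete() and retakes it afterwards,
presumably because waking the completion's waiter can recurse into runqueue locking.
A pthread rendition of that drop/retake pattern (the request type, lock names, and
rationale comment are illustrative, not the kernel's):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;

    struct migration_req {
        pthread_mutex_t m;
        pthread_cond_t  c;
        int             done;
    };

    static void complete(struct migration_req *req)
    {
        pthread_mutex_lock(&req->m);
        req->done = 1;
        pthread_cond_signal(&req->c);             /* may wake a waiter that wants rq_lock */
        pthread_mutex_unlock(&req->m);
    }

    static void drain_one(struct migration_req *req)
    {
        pthread_mutex_lock(&rq_lock);
        /* ...dequeue req from the migration queue... */
        pthread_mutex_unlock(&rq_lock);           /* drop before waking the waiter */
        complete(req);
        pthread_mutex_lock(&rq_lock);             /* retake to keep draining */
        pthread_mutex_unlock(&rq_lock);
    }

    int main(void)
    {
        struct migration_req req = { PTHREAD_MUTEX_INITIALIZER,
                                     PTHREAD_COND_INITIALIZER, 0 };
        drain_one(&req);
        printf("done=%d\n", req.done);
        return 0;
    }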
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 81787248b60f..e8ab096ddfe3 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -118,13 +118,13 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
 
     /*
      * scd->clock = clamp(scd->tick_gtod + delta,
      *                    max(scd->tick_gtod, scd->clock),
-     *                    max(scd->clock, scd->tick_gtod + TICK_NSEC));
+     *                    scd->tick_gtod + TICK_NSEC);
      */
 
     clock = scd->tick_gtod + delta;
     min_clock = wrap_max(scd->tick_gtod, scd->clock);
-    max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC);
+    max_clock = scd->tick_gtod + TICK_NSEC;
 
     clock = wrap_max(clock, min_clock);
     clock = wrap_min(clock, max_clock);
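
To make the clamp concrete: a standalone sketch using wrap-safe min/max helpers written
to match kernel/sched_clock.c's wrap_max()/wrap_min() (their bodies are an assumption
here), showing the new hard upper bound of one tick past tick_gtod:

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t u64;
    typedef int64_t  s64;

    #define TICK_NSEC 1000000ULL                  /* illustrative: 1 ms tick */

    /* wrap-safe comparisons: correct even if the u64 counters wrap */
    static u64 wrap_max(u64 x, u64 y) { return (s64)(x - y) > 0 ? x : y; }
    static u64 wrap_min(u64 x, u64 y) { return (s64)(x - y) < 0 ? x : y; }

    int main(void)
    {
        u64 tick_gtod = 5000000, prev_clock = 5400000, delta = 2500000;
        u64 clock = tick_gtod + delta;            /* candidate: 7500000 */

        /* never go backwards, never run ahead of the next tick */
        u64 min_clock = wrap_max(tick_gtod, prev_clock);
        u64 max_clock = tick_gtod + TICK_NSEC;    /* the patch drops the old max() here */

        clock = wrap_max(clock, min_clock);
        clock = wrap_min(clock, max_clock);
        printf("clamped clock = %llu\n", (unsigned long long)clock); /* 6000000 */
        return 0;
    }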
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 3953e4aed733..dc0b3be6b7d5 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -188,7 +188,7 @@ static void check_hung_task(struct task_struct *t, unsigned long now)
     if ((long)(now - t->last_switch_timestamp) <
             sysctl_hung_task_timeout_secs)
         return;
-    if (sysctl_hung_task_warnings < 0)
+    if (!sysctl_hung_task_warnings)
         return;
     sysctl_hung_task_warnings--;
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c83f566e940a..6ac501a2dcc6 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -487,6 +487,16 @@ static struct ctl_table kern_table[] = {
         .proc_handler = &ftrace_enable_sysctl,
     },
 #endif
+#ifdef CONFIG_STACK_TRACER
+    {
+        .ctl_name     = CTL_UNNUMBERED,
+        .procname     = "stack_tracer_enabled",
+        .data         = &stack_tracer_enabled,
+        .maxlen       = sizeof(int),
+        .mode         = 0644,
+        .proc_handler = &stack_trace_sysctl,
+    },
+#endif
 #ifdef CONFIG_TRACING
     {
         .ctl_name     = CTL_UNNUMBERED,
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e7acfb482a68..fa05e88aa76f 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -518,6 +518,28 @@ void update_wall_time(void)
     /* correct the clock when NTP error is too big */
     clocksource_adjust(offset);
 
+    /*
+     * Since in the loop above we accumulate any amount of time
+     * in xtime_nsec over a second into xtime.tv_sec, it's possible for
+     * xtime_nsec to be fairly small after the loop. Further, if we're
+     * slightly speeding the clocksource up in clocksource_adjust(),
+     * it's possible the required corrective factor to xtime_nsec could
+     * cause it to underflow.
+     *
+     * Now, we cannot simply roll the accumulated second back, since
+     * the NTP subsystem has been notified via second_overflow. So
+     * instead we push xtime_nsec forward by the amount we underflowed,
+     * and add that amount into the error.
+     *
+     * We'll correct this error next time through this function, when
+     * xtime_nsec is not as small.
+     */
+    if (unlikely((s64)clock->xtime_nsec < 0)) {
+        s64 neg = -(s64)clock->xtime_nsec;
+        clock->xtime_nsec = 0;
+        clock->error += neg << (NTP_SCALE_SHIFT - clock->shift);
+    }
+
     /* store full nanoseconds into xtime after rounding it up and
      * add the remainder to the error difference.
      */
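
The underflow fix-up above is self-contained arithmetic and can be modeled directly.
A standalone sketch with a mock clock structure (field names follow the patch; the
shift and starting values are made up for illustration):

    #include <stdint.h>
    #include <stdio.h>

    typedef int64_t  s64;
    typedef uint64_t u64;

    #define NTP_SCALE_SHIFT 32                    /* as in the kernel of this era */

    struct clock_state {
        u64 xtime_nsec;                           /* shifted nanoseconds, may underflow */
        s64 error;                                /* NTP error accumulator */
        int shift;
    };

    int main(void)
    {
        /* pretend the corrective factor drove xtime_nsec 3 units below zero */
        struct clock_state clock = { .xtime_nsec = (u64)-3, .error = 0, .shift = 10 };

        if ((s64)clock.xtime_nsec < 0) {
            s64 neg = -(s64)clock.xtime_nsec;     /* amount we underflowed */
            clock.xtime_nsec = 0;                 /* push forward to zero... */
            clock.error += neg << (NTP_SCALE_SHIFT - clock.shift); /* ...book it as error */
        }
        printf("xtime_nsec=%llu error=%lld\n",
               (unsigned long long)clock.xtime_nsec, (long long)clock.error);
        return 0;
    }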
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d8bae6f4219e..e2a4ff6fc3a6 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -244,10 +244,15 @@ config STACK_TRACER
 
       This tracer works by hooking into every function call that the
       kernel executes, and keeping a maximum stack depth value and
-      stack-trace saved. Because this logic has to execute in every
-      kernel function, all the time, this option can slow down the
-      kernel measurably and is generally intended for kernel
-      developers only.
+      stack-trace saved. If this is configured with DYNAMIC_FTRACE
+      then it will not have any overhead while the stack tracer
+      is disabled.
+
+      To enable the stack tracer on bootup, pass in 'stacktrace'
+      on the kernel command line.
+
+      The stack tracer can also be enabled or disabled via the
+      sysctl kernel.stack_tracer_enabled
 
       Say N if unsure.
 
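
Given the stack_tracer_enabled entry registered in kernel/sysctl.c above, the tracer
should be reachable at /proc/sys/kernel/stack_tracer_enabled. A hedged userspace
sketch of toggling it at runtime (the path is derived from the procname, not taken
from the patch itself):

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        /* /proc/sys/kernel/ + procname "stack_tracer_enabled" (assumed mapping) */
        int fd = open("/proc/sys/kernel/stack_tracer_enabled", O_WRONLY);
        if (fd < 0) {
            perror("open");
            return 1;
        }
        if (write(fd, "1", 1) != 1)               /* "0" would unregister the tracer */
            perror("write");
        close(fd);
        return 0;
    }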
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a12f80efceaa..2f32969c09df 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1047,6 +1047,13 @@ ftrace_match(unsigned char *buff, int len, int enable)
     int type = MATCH_FULL;
     unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
     unsigned i, match = 0, search_len = 0;
+    int not = 0;
+
+    if (buff[0] == '!') {
+        not = 1;
+        buff++;
+        len--;
+    }
 
     for (i = 0; i < len; i++) {
         if (buff[i] == '*') {
@@ -1100,8 +1107,12 @@ ftrace_match(unsigned char *buff, int len, int enable)
                 matched = 1;
                 break;
             }
-            if (matched)
-                rec->flags |= flag;
+            if (matched) {
+                if (not)
+                    rec->flags &= ~flag;
+                else
+                    rec->flags |= flag;
+            }
         }
         pg = pg->next;
     }
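
The ftrace_match() change gives filter expressions a negation prefix: '!pattern' now
clears the flag instead of setting it, so a function can be removed from the filter
set. A reduced sketch of that parse-and-clear logic (the record type, flag value, and
exact-match loop are stand-ins for the kernel's):

    #include <stdio.h>
    #include <string.h>

    #define FL_FILTER 0x1

    struct rec { const char *name; unsigned long flags; };

    static void match(char *buff, struct rec *r)
    {
        int not = 0;

        if (buff[0] == '!') {                     /* negate: remove on match */
            not = 1;
            buff++;
        }
        if (strcmp(buff, r->name) == 0) {
            if (not)
                r->flags &= ~FL_FILTER;
            else
                r->flags |= FL_FILTER;
        }
    }

    int main(void)
    {
        struct rec r = { "schedule", 0 };
        char add[] = "schedule", del[] = "!schedule";

        match(add, &r);
        printf("after add: %lx\n", r.flags);      /* 1: filtered */
        match(del, &r);
        printf("after del: %lx\n", r.flags);      /* 0: cleared again */
        return 0;
    }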
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1a3d6b329782..0eb6d48347f7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1748,6 +1748,13 @@ lat_print_timestamp(struct trace_seq *s, u64 abs_usecs,
 
 static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
 
+static int task_state_char(unsigned long state)
+{
+    int bit = state ? __ffs(state) + 1 : 0;
+
+    return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
+}
+
 /*
  * The message is supposed to contain an ending newline.
  * If the printing stops prematurely, try to add a newline of our own.
@@ -1816,7 +1823,6 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
     char *comm;
     int S, T;
     int i;
-    unsigned state;
 
     if (entry->type == TRACE_CONT)
         return TRACE_TYPE_HANDLED;
@@ -1862,12 +1868,8 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 
         trace_assign_type(field, entry);
 
-        T = field->next_state < sizeof(state_to_char) ?
-            state_to_char[field->next_state] : 'X';
-
-        state = field->prev_state ?
-            __ffs(field->prev_state) + 1 : 0;
-        S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X';
+        T = task_state_char(field->next_state);
+        S = task_state_char(field->prev_state);
         comm = trace_find_cmdline(field->next_pid);
         trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
             field->prev_pid,
@@ -2008,10 +2010,8 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 
         trace_assign_type(field, entry);
 
-        S = field->prev_state < sizeof(state_to_char) ?
-            state_to_char[field->prev_state] : 'X';
-        T = field->next_state < sizeof(state_to_char) ?
-            state_to_char[field->next_state] : 'X';
+        T = task_state_char(field->next_state);
+        S = task_state_char(field->prev_state);
         ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n",
                                field->prev_pid,
                                field->prev_prio,
@@ -2141,12 +2141,9 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
 
         trace_assign_type(field, entry);
 
-        S = field->prev_state < sizeof(state_to_char) ?
-            state_to_char[field->prev_state] : 'X';
-        T = field->next_state < sizeof(state_to_char) ?
-            state_to_char[field->next_state] : 'X';
-        if (entry->type == TRACE_WAKE)
-            S = '+';
+        T = task_state_char(field->next_state);
+        S = entry->type == TRACE_WAKE ? '+' :
+            task_state_char(field->prev_state);
         ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n",
                                field->prev_pid,
                                field->prev_prio,
@@ -2233,12 +2230,9 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
 
         trace_assign_type(field, entry);
 
-        S = field->prev_state < sizeof(state_to_char) ?
-            state_to_char[field->prev_state] : 'X';
-        T = field->next_state < sizeof(state_to_char) ?
-            state_to_char[field->next_state] : 'X';
-        if (entry->type == TRACE_WAKE)
-            S = '+';
+        T = task_state_char(field->next_state);
+        S = entry->type == TRACE_WAKE ? '+' :
+            task_state_char(field->prev_state);
         SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
         SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio);
         SEQ_PUT_HEX_FIELD_RET(s, S);
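
task_state_char() above collapses the repeated state-to-letter logic into one helper:
the lowest set state bit indexes into TASK_STATE_TO_CHAR_STR, and anything out of
range maps to '?'. A userspace rendition, assuming that macro expands to "RSDTtZX"
(an assumption about its value in this kernel):

    #include <stdio.h>
    #include <strings.h>                          /* ffs() stands in for __ffs() + 1 */

    static const char state_to_char[] = "RSDTtZX"; /* assumed TASK_STATE_TO_CHAR_STR */

    static int task_state_char(unsigned long state)
    {
        int bit = state ? ffs((int)state) : 0;    /* ffs() is already 1-based */

        return bit < (int)sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
    }

    int main(void)
    {
        printf("%c %c %c\n",
               task_state_char(0),                /* R: running */
               task_state_char(1),                /* S: interruptible sleep */
               task_state_char(1UL << 6));        /* ?: beyond the table */
        return 0;
    }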
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 863390557b44..781d72ef873c 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -247,3 +247,4 @@ __init static int init_sched_switch_trace(void)
     return register_tracer(&sched_switch_trace);
 }
 device_initcall(init_sched_switch_trace);
+
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 0b863f2cbc8e..d0871bc0aca5 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -10,6 +10,7 @@
 #include <linux/debugfs.h>
 #include <linux/ftrace.h>
 #include <linux/module.h>
+#include <linux/sysctl.h>
 #include <linux/init.h>
 #include <linux/fs.h>
 #include "trace.h"
@@ -31,6 +32,10 @@ static raw_spinlock_t max_stack_lock =
 
 static int stack_trace_disabled __read_mostly;
 static DEFINE_PER_CPU(int, trace_active);
+static DEFINE_MUTEX(stack_sysctl_mutex);
+
+int stack_tracer_enabled;
+static int last_stack_tracer_enabled;
 
 static inline void check_stack(void)
 {
@@ -174,7 +179,7 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
     return count;
 }
 
-static struct file_operations stack_max_size_fops = {
+static const struct file_operations stack_max_size_fops = {
     .open  = tracing_open_generic,
     .read  = stack_max_size_read,
     .write = stack_max_size_write,
@@ -272,7 +277,7 @@ static int t_show(struct seq_file *m, void *v)
     return 0;
 }
 
-static struct seq_operations stack_trace_seq_ops = {
+static const struct seq_operations stack_trace_seq_ops = {
     .start = t_start,
     .next  = t_next,
     .stop  = t_stop,
@@ -288,12 +293,47 @@ static int stack_trace_open(struct inode *inode, struct file *file)
     return ret;
 }
 
-static struct file_operations stack_trace_fops = {
+static const struct file_operations stack_trace_fops = {
     .open   = stack_trace_open,
     .read   = seq_read,
     .llseek = seq_lseek,
 };
 
+int
+stack_trace_sysctl(struct ctl_table *table, int write,
+                   struct file *file, void __user *buffer, size_t *lenp,
+                   loff_t *ppos)
+{
+    int ret;
+
+    mutex_lock(&stack_sysctl_mutex);
+
+    ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
+
+    if (ret || !write ||
+        (last_stack_tracer_enabled == stack_tracer_enabled))
+        goto out;
+
+    last_stack_tracer_enabled = stack_tracer_enabled;
+
+    if (stack_tracer_enabled)
+        register_ftrace_function(&trace_ops);
+    else
+        unregister_ftrace_function(&trace_ops);
+
+ out:
+    mutex_unlock(&stack_sysctl_mutex);
+    return ret;
+}
+
+static __init int enable_stacktrace(char *str)
+{
+    stack_tracer_enabled = 1;
+    last_stack_tracer_enabled = 1;
+    return 1;
+}
+__setup("stacktrace", enable_stacktrace);
+
 static __init int stack_trace_init(void)
 {
     struct dentry *d_tracer;
@@ -311,7 +351,8 @@ static __init int stack_trace_init(void)
     if (!entry)
         pr_warning("Could not create debugfs 'stack_trace' entry\n");
 
-    register_ftrace_function(&trace_ops);
+    if (stack_tracer_enabled)
+        register_ftrace_function(&trace_ops);
 
     return 0;
 }