Diffstat (limited to 'kernel')

-rw-r--r--   kernel/cpu.c               |  10
-rw-r--r--   kernel/fork.c              |  15
-rw-r--r--   kernel/hw_breakpoint.c     |  56
-rw-r--r--   kernel/kgdb.c              |   3
-rw-r--r--   kernel/sched.c             |  39
-rw-r--r--   kernel/time/clocksource.c  |  18
-rw-r--r--   kernel/trace/Kconfig       |   4
-rw-r--r--   kernel/trace/ring_buffer.c |  24
-rw-r--r--   kernel/trace/trace.c       |   5

9 files changed, 123 insertions(+), 51 deletions(-)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 1c8ddd6ee940..677f25376a38 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -151,13 +151,13 @@ static inline void check_for_tasks(int cpu)
 
         write_lock_irq(&tasklist_lock);
         for_each_process(p) {
-                if (task_cpu(p) == cpu &&
+                if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
                     (!cputime_eq(p->utime, cputime_zero) ||
                      !cputime_eq(p->stime, cputime_zero)))
-                        printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\
-                                (state = %ld, flags = %x) \n",
+                        printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
+                                "(state = %ld, flags = %x)\n",
                                 p->comm, task_pid_nr(p), cpu,
                                 p->state, p->flags);
         }
         write_unlock_irq(&tasklist_lock);
 }
diff --git a/kernel/fork.c b/kernel/fork.c
index 5b2959b3ffc2..f88bd984df35 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1241,21 +1241,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
         /* Need tasklist lock for parent etc handling! */
         write_lock_irq(&tasklist_lock);
 
-        /*
-         * The task hasn't been attached yet, so its cpus_allowed mask will
-         * not be changed, nor will its assigned CPU.
-         *
-         * The cpus_allowed mask of the parent may have changed after it was
-         * copied first time - so re-copy it here, then check the child's CPU
-         * to ensure it is on a valid CPU (and if not, just force it back to
-         * parent's CPU). This avoids alot of nasty races.
-         */
-        p->cpus_allowed = current->cpus_allowed;
-        p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
-        if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
-                     !cpu_online(task_cpu(p))))
-                set_task_cpu(p, smp_processor_id());
-
         /* CLONE_PARENT re-uses the old parent */
         if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
                 p->real_parent = current->real_parent;
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 50dbd5999588..8a5c7d55ac9f 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -243,38 +243,70 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
  * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *))
  * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM
  */
-int reserve_bp_slot(struct perf_event *bp)
+static int __reserve_bp_slot(struct perf_event *bp)
 {
         struct bp_busy_slots slots = {0};
-        int ret = 0;
-
-        mutex_lock(&nr_bp_mutex);
 
         fetch_bp_busy_slots(&slots, bp);
 
         /* Flexible counters need to keep at least one slot */
-        if (slots.pinned + (!!slots.flexible) == HBP_NUM) {
-                ret = -ENOSPC;
-                goto end;
-        }
+        if (slots.pinned + (!!slots.flexible) == HBP_NUM)
+                return -ENOSPC;
 
         toggle_bp_slot(bp, true);
 
-end:
+        return 0;
+}
+
+int reserve_bp_slot(struct perf_event *bp)
+{
+        int ret;
+
+        mutex_lock(&nr_bp_mutex);
+
+        ret = __reserve_bp_slot(bp);
+
         mutex_unlock(&nr_bp_mutex);
 
         return ret;
 }
 
+static void __release_bp_slot(struct perf_event *bp)
+{
+        toggle_bp_slot(bp, false);
+}
+
 void release_bp_slot(struct perf_event *bp)
 {
         mutex_lock(&nr_bp_mutex);
 
-        toggle_bp_slot(bp, false);
+        __release_bp_slot(bp);
 
         mutex_unlock(&nr_bp_mutex);
 }
 
+/*
+ * Allow the kernel debugger to reserve breakpoint slots without
+ * taking a lock using the dbg_* variant of for the reserve and
+ * release breakpoint slots.
+ */
+int dbg_reserve_bp_slot(struct perf_event *bp)
+{
+        if (mutex_is_locked(&nr_bp_mutex))
+                return -1;
+
+        return __reserve_bp_slot(bp);
+}
+
+int dbg_release_bp_slot(struct perf_event *bp)
+{
+        if (mutex_is_locked(&nr_bp_mutex))
+                return -1;
+
+        __release_bp_slot(bp);
+
+        return 0;
+}
+
 
 int register_perf_hw_breakpoint(struct perf_event *bp)
 {
@@ -296,6 +328,10 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
         if (!bp->attr.disabled || !bp->overflow_handler)
                 ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
 
+        /* if arch_validate_hwbkpt_settings() fails then release bp slot */
+        if (ret)
+                release_bp_slot(bp);
+
         return ret;
 }
 
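The hw_breakpoint.c hunk above splits slot accounting into unlocked workers (__reserve_bp_slot()/__release_bp_slot()), locked wrappers for normal callers, and dbg_* variants that simply refuse when nr_bp_mutex is already held, because the kernel debugger runs with the other CPUs stopped and must never sleep on that mutex. Below is a minimal userspace sketch of the same locked-wrapper/unlocked-worker split; the names, the pthread mutex, and the use of trylock as a stand-in for mutex_is_locked() are illustrative assumptions, not the kernel API.

/* Userspace sketch (not the kernel API): one unlocked worker shared by a
 * sleeping wrapper and a "debugger" wrapper that backs off instead of
 * blocking. Build with: cc demo.c -pthread */
#include <pthread.h>
#include <stdio.h>

#define SLOT_MAX 4

static pthread_mutex_t slot_mutex = PTHREAD_MUTEX_INITIALIZER;
static int slots_used;

/* Worker: caller must hold slot_mutex, or know nobody else can run. */
static int __reserve_slot(void)
{
        if (slots_used == SLOT_MAX)
                return -1;              /* -ENOSPC in the kernel version */
        slots_used++;
        return 0;
}

/* Normal path: may sleep on the mutex. */
static int reserve_slot(void)
{
        int ret;

        pthread_mutex_lock(&slot_mutex);
        ret = __reserve_slot();
        pthread_mutex_unlock(&slot_mutex);
        return ret;
}

/* "Debugger" path: never blocks; trylock stands in for mutex_is_locked(). */
static int dbg_reserve_slot(void)
{
        int ret;

        if (pthread_mutex_trylock(&slot_mutex) != 0)
                return -1;              /* someone holds the lock: back off */
        ret = __reserve_slot();
        pthread_mutex_unlock(&slot_mutex);
        return ret;
}

int main(void)
{
        int a = reserve_slot();
        int b = dbg_reserve_slot();

        printf("normal: %d, debugger: %d, slots used: %d\n", a, b, slots_used);
        return 0;
}

The point of the split is that both paths reuse one worker, so the slot bookkeeping cannot diverge between the normal path and the debugger path.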
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 2eb517e23514..c7ade62e4ef0 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -583,6 +583,9 @@ static void kgdb_wait(struct pt_regs *regs)
         smp_wmb();
         atomic_set(&cpu_in_kgdb[cpu], 1);
 
+        /* Disable any cpu specific hw breakpoints */
+        kgdb_disable_hw_debug(regs);
+
         /* Wait till primary CPU is done with debugging */
         while (atomic_read(&passive_cpu_wait[cpu]))
                 cpu_relax();
diff --git a/kernel/sched.c b/kernel/sched.c
index 4508fe7048be..3a8fb30a91b1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2320,14 +2320,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 }
 
 /*
- * Called from:
+ * Gets called from 3 sites (exec, fork, wakeup), since it is called without
+ * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done
+ * by:
  *
- * - fork, @p is stable because it isn't on the tasklist yet
- *
- * - exec, @p is unstable, retry loop
- *
- * - wake-up, we serialize ->cpus_allowed against TASK_WAKING so
- *   we should be good.
+ * exec: is unstable, retry loop
+ * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
  */
 static inline
 int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
@@ -2620,9 +2618,6 @@ void sched_fork(struct task_struct *p, int clone_flags)
         if (p->sched_class->task_fork)
                 p->sched_class->task_fork(p);
 
-#ifdef CONFIG_SMP
-        cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
-#endif
         set_task_cpu(p, cpu);
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
@@ -2652,6 +2647,21 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 {
         unsigned long flags;
         struct rq *rq;
+        int cpu = get_cpu();
+
+#ifdef CONFIG_SMP
+        /*
+         * Fork balancing, do it here and not earlier because:
+         * - cpus_allowed can change in the fork path
+         * - any previously selected cpu might disappear through hotplug
+         *
+         * We still have TASK_WAKING but PF_STARTING is gone now, meaning
+         * ->cpus_allowed is stable, we have preemption disabled, meaning
+         * cpu_online_mask is stable.
+         */
+        cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
+        set_task_cpu(p, cpu);
+#endif
 
         rq = task_rq_lock(p, &flags);
         BUG_ON(p->state != TASK_WAKING);
@@ -2665,6 +2675,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
                 p->sched_class->task_woken(rq, p);
 #endif
         task_rq_unlock(rq, &flags);
+        put_cpu();
 }
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -7139,14 +7150,18 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
          * the ->cpus_allowed mask from under waking tasks, which would be
          * possible when we change rq->lock in ttwu(), so synchronize against
          * TASK_WAKING to avoid that.
+         *
+         * Make an exception for freshly cloned tasks, since cpuset namespaces
+         * might move the task about, we have to validate the target in
+         * wake_up_new_task() anyway since the cpu might have gone away.
          */
 again:
-        while (p->state == TASK_WAKING)
+        while (p->state == TASK_WAKING && !(p->flags & PF_STARTING))
                 cpu_relax();
 
         rq = task_rq_lock(p, &flags);
 
-        if (p->state == TASK_WAKING) {
+        if (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) {
                 task_rq_unlock(rq, &flags);
                 goto again;
         }
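The wake_up_new_task() change above deliberately defers select_task_rq() until ->cpus_allowed can no longer change, and wraps the selection in get_cpu()/put_cpu() so preemption stays off and cpu_online_mask cannot change while the target CPU is chosen. A rough userspace analogue of that rule, pick a target only inside a region where the candidate set is pinned, is sketched below; the rwlock standing in for preemption-disable versus hotplug, and every name in it, are illustrative assumptions rather than kernel code.

/* Sketch (not kernel code): choose a "CPU" only while the online set is
 * pinned. The read lock plays the role of get_cpu()/put_cpu(); the write
 * lock plays the role of CPU hotplug taking a core away. */
#include <pthread.h>
#include <stdio.h>

#define NR_CPUS 4

static pthread_rwlock_t online_lock = PTHREAD_RWLOCK_INITIALIZER;
static int cpu_online[NR_CPUS] = { 1, 1, 1, 1 };

static int pick_cpu(void)
{
        int cpu, target = -1;

        pthread_rwlock_rdlock(&online_lock);    /* "preemption off" */
        for (cpu = 0; cpu < NR_CPUS; cpu++) {
                if (cpu_online[cpu]) {
                        target = cpu;
                        break;
                }
        }
        /* ...the task would be committed to 'target' before unlocking... */
        pthread_rwlock_unlock(&online_lock);    /* "preemption on" */
        return target;
}

static void offline_cpu(int cpu)
{
        pthread_rwlock_wrlock(&online_lock);    /* "hotplug" waits for pickers */
        cpu_online[cpu] = 0;
        pthread_rwlock_unlock(&online_lock);
}

int main(void)
{
        printf("picked cpu %d\n", pick_cpu());
        offline_cpu(0);
        printf("picked cpu %d\n", pick_cpu());
        return 0;
}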
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index e85c23404d34..13700833c181 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -343,7 +343,19 @@ static void clocksource_resume_watchdog(void)
 {
         unsigned long flags;
 
-        spin_lock_irqsave(&watchdog_lock, flags);
+        /*
+         * We use trylock here to avoid a potential dead lock when
+         * kgdb calls this code after the kernel has been stopped with
+         * watchdog_lock held. When watchdog_lock is held we just
+         * return and accept, that the watchdog might trigger and mark
+         * the monitored clock source (usually TSC) unstable.
+         *
+         * This does not affect the other caller clocksource_resume()
+         * because at this point the kernel is UP, interrupts are
+         * disabled and nothing can hold watchdog_lock.
+         */
+        if (!spin_trylock_irqsave(&watchdog_lock, flags))
+                return;
         clocksource_reset_watchdog();
         spin_unlock_irqrestore(&watchdog_lock, flags);
 }
@@ -458,8 +470,8 @@ void clocksource_resume(void)
  * clocksource_touch_watchdog - Update watchdog
  *
  * Update the watchdog after exception contexts such as kgdb so as not
- * to incorrectly trip the watchdog.
- *
+ * to incorrectly trip the watchdog. This might fail when the kernel
+ * was stopped in code which holds watchdog_lock.
  */
 void clocksource_touch_watchdog(void)
 {
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 6c22d8a2f289..60e2ce0181ee 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -27,9 +27,7 @@ config HAVE_FUNCTION_GRAPH_TRACER
 config HAVE_FUNCTION_GRAPH_FP_TEST
         bool
         help
-          An arch may pass in a unique value (frame pointer) to both the
-          entering and exiting of a function. On exit, the value is compared
-          and if it does not match, then it will panic the kernel.
+          See Documentation/trace/ftrace-design.txt
 
 config HAVE_FUNCTION_TRACE_MCOUNT_TEST
         bool
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index edefe3b2801b..8c1b2d290718 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -464,6 +464,8 @@ struct ring_buffer_iter {
         struct ring_buffer_per_cpu *cpu_buffer;
         unsigned long head;
         struct buffer_page *head_page;
+        struct buffer_page *cache_reader_page;
+        unsigned long cache_read;
         u64 read_stamp;
 };
 
@@ -2716,6 +2718,8 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
                 iter->read_stamp = cpu_buffer->read_stamp;
         else
                 iter->read_stamp = iter->head_page->page->time_stamp;
+        iter->cache_reader_page = cpu_buffer->reader_page;
+        iter->cache_read = cpu_buffer->read;
 }
 
 /**
@@ -3060,13 +3064,22 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
         struct ring_buffer_event *event;
         int nr_loops = 0;
 
-        if (ring_buffer_iter_empty(iter))
-                return NULL;
-
         cpu_buffer = iter->cpu_buffer;
         buffer = cpu_buffer->buffer;
 
+        /*
+         * Check if someone performed a consuming read to
+         * the buffer. A consuming read invalidates the iterator
+         * and we need to reset the iterator in this case.
+         */
+        if (unlikely(iter->cache_read != cpu_buffer->read ||
+                     iter->cache_reader_page != cpu_buffer->reader_page))
+                rb_iter_reset(iter);
+
 again:
+        if (ring_buffer_iter_empty(iter))
+                return NULL;
+
         /*
          * We repeat when a timestamp is encountered.
          * We can get multiple timestamps by nested interrupts or also
@@ -3081,6 +3094,11 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
         if (rb_per_cpu_empty(cpu_buffer))
                 return NULL;
 
+        if (iter->head >= local_read(&iter->head_page->page->commit)) {
+                rb_inc_iter(iter);
+                goto again;
+        }
+
         event = rb_iter_head_event(iter);
 
         switch (event->type_len) {
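The ring_buffer.c hunks above make the iterator snapshot cpu_buffer->read and the reader page when it is (re)initialised, and rb_iter_peek() resets the iterator as soon as either snapshot no longer matches, i.e. when a consuming read has invalidated it. The self-contained sketch below shows that invalidation-by-snapshot idea on a trivial array buffer; the types, the single counter, and the names are illustrative assumptions, not the ring_buffer.c structures.

/* Standalone sketch (not the ring_buffer.c types): an iterator remembers
 * how many consuming reads had happened when it was created and resets
 * itself once that snapshot goes stale. */
#include <stdio.h>

#define BUF_SIZE 8

struct buf {
        int data[BUF_SIZE];
        int head, tail;         /* consume from head, write at tail */
        unsigned long read;     /* counts consuming reads, like cpu_buffer->read */
};

struct iter {
        struct buf *b;
        int pos;
        unsigned long cache_read;       /* snapshot, like iter->cache_read */
};

static void iter_reset(struct iter *it)
{
        it->pos = it->b->head;
        it->cache_read = it->b->read;
}

static int iter_peek(struct iter *it, int *val)
{
        /* A consuming read invalidates the iterator: start over. */
        if (it->cache_read != it->b->read)
                iter_reset(it);
        if (it->pos == it->b->tail)
                return 0;               /* nothing left to look at */
        *val = it->b->data[it->pos];
        return 1;
}

static int consume(struct buf *b, int *val)
{
        if (b->head == b->tail)
                return 0;
        *val = b->data[b->head];
        b->head = (b->head + 1) % BUF_SIZE;
        b->read++;                      /* invalidates live iterators */
        return 1;
}

int main(void)
{
        struct buf b = { .data = { 1, 2, 3 }, .head = 0, .tail = 3 };
        struct iter it = { .b = &b };
        int v;

        iter_reset(&it);
        consume(&b, &v);        /* consuming read happens behind the iterator */
        if (iter_peek(&it, &v))
                printf("iterator resynced, now sees %d\n", v);  /* prints 2 */
        return 0;
}

The kernel version also compares the reader page, since a consuming read there swaps whole pages rather than advancing a simple index.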
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0df1b0f2cb9e..eac6875cb990 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -951,6 +951,11 @@ void trace_find_cmdline(int pid, char comm[])
                 return;
         }
 
+        if (WARN_ON_ONCE(pid < 0)) {
+                strcpy(comm, "<XXX>");
+                return;
+        }
+
         if (pid > PID_MAX_DEFAULT) {
                 strcpy(comm, "<...>");
                 return;