author     Al Viro <viro@zeniv.linux.org.uk>    2014-12-08 20:39:29 -0500
committer  Al Viro <viro@zeniv.linux.org.uk>    2014-12-08 20:39:29 -0500
commit     ba00410b8131b23edfb0e09f8b6dd26c8eb621fb (patch)
tree       c08504e4d2fa51ac91cef544f336d0169806c49f /kernel
parent     8ce74dd6057832618957fc2cbd38fa959c3a0a6c (diff)
parent     aa583096d9767892983332e7c1a984bd17e3cd39 (diff)
Merge branch 'iov_iter' into for-next
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile                 |   2
-rw-r--r--  kernel/audit.c                  |   2
-rw-r--r--  kernel/audit_tree.c             |   1
-rw-r--r--  kernel/bpf/Makefile             |   6
-rw-r--r--  kernel/bpf/core.c               |   9
-rw-r--r--  kernel/bpf/verifier.c           |   3
-rw-r--r--  kernel/context_tracking.c       |  40
-rw-r--r--  kernel/cpu.c                    |  14
-rw-r--r--  kernel/events/core.c            |  23
-rw-r--r--  kernel/events/hw_breakpoint.c   |   7
-rw-r--r--  kernel/events/uprobes.c         |   1
-rw-r--r--  kernel/futex.c                  |  36
-rw-r--r--  kernel/gcov/Kconfig             |   2
-rw-r--r--  kernel/kmod.c                   |  76
-rw-r--r--  kernel/panic.c                  |   1
-rw-r--r--  kernel/power/hibernate.c        |   8
-rw-r--r--  kernel/power/suspend.c          |   4
-rw-r--r--  kernel/rcu/tree.c               |  15
-rw-r--r--  kernel/rcu/tree.h               |   1
-rw-r--r--  kernel/rcu/tree_plugin.h        |  33
-rw-r--r--  kernel/sched/core.c             | 110
-rw-r--r--  kernel/sched/deadline.c         |  43
-rw-r--r--  kernel/sched/fair.c             |  35
-rw-r--r--  kernel/sched/idle_task.c        |   5
-rw-r--r--  kernel/sched/rt.c               |   2
-rw-r--r--  kernel/sched/sched.h            |   2
-rw-r--r--  kernel/sched/stop_task.c        |   5
-rw-r--r--  kernel/sysctl.c                 |   3
-rw-r--r--  kernel/time/clockevents.c       |   2
-rw-r--r--  kernel/time/posix-cpu-timers.c  |   2
-rw-r--r--  kernel/time/posix-timers.c      |   1
-rw-r--r--  kernel/trace/ftrace.c           |  54
-rw-r--r--  kernel/trace/ring_buffer.c      |  81
-rw-r--r--  kernel/trace/trace.c            |  33
-rw-r--r--  kernel/trace/trace_syscalls.c   |   8
35 files changed, 410 insertions(+), 260 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index dc5c77544fd6..17ea6d4a9a24 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -86,7 +86,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/
 obj-$(CONFIG_TRACEPOINTS) += trace/
 obj-$(CONFIG_IRQ_WORK) += irq_work.o
 obj-$(CONFIG_CPU_PM) += cpu_pm.o
-obj-$(CONFIG_NET) += bpf/
+obj-$(CONFIG_BPF) += bpf/
 
 obj-$(CONFIG_PERF_EVENTS) += events/
 
diff --git a/kernel/audit.c b/kernel/audit.c
index 80983df92cd4..cebb11db4d34 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -739,7 +739,7 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature
 
 	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE);
 	audit_log_task_info(ab, current);
-	audit_log_format(ab, "feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d",
+	audit_log_format(ab, " feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d",
 			 audit_feature_names[which], !!old_feature, !!new_feature,
 			 !!old_lock, !!new_lock, res);
 	audit_log_end(ab);
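
The one-character audit.c change matters because audit_log_format() appends to a record that audit_log_task_info() has already partially filled; without the leading space the "feature=" key runs into the previous field. A minimal userspace sketch of the same concatenation pitfall, using snprintf into a fixed buffer (buffer name and field values are illustrative, not from the kernel):

#include <stdio.h>

int main(void)
{
	char record[256];
	int len = 0;

	/* First helper writes its fields with no trailing separator, just
	 * as audit_log_task_info() leaves the audit buffer. */
	len += snprintf(record + len, sizeof(record) - len, "pid=1234 comm=\"foo\"");

	/* Without the leading space the next key fuses with the previous
	 * value: ...comm="foo"feature=...  With it, fields stay parseable. */
	len += snprintf(record + len, sizeof(record) - len,
			" feature=%s old=%u new=%u", "loginuid_immutable", 0u, 1u);

	puts(record);
	return 0;
}
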
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index e242e3a9864a..80f29e015570 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -154,6 +154,7 @@ static struct audit_chunk *alloc_chunk(int count)
154 chunk->owners[i].index = i; 154 chunk->owners[i].index = i;
155 } 155 }
156 fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch); 156 fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch);
157 chunk->mark.mask = FS_IN_IGNORED;
157 return chunk; 158 return chunk;
158} 159}
159 160
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 45427239f375..0daf7f6ae7df 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,5 +1,5 @@
-obj-y := core.o syscall.o verifier.o
-
+obj-y := core.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o
 ifdef CONFIG_TEST_BPF
-obj-y += test_stub.o
+obj-$(CONFIG_BPF_SYSCALL) += test_stub.o
 endif
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index f0c30c59b317..d6594e457a25 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -655,3 +655,12 @@ void bpf_prog_free(struct bpf_prog *fp)
 	schedule_work(&aux->work);
 }
 EXPORT_SYMBOL_GPL(bpf_prog_free);
+
+/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
+ * skb_copy_bits(), so provide a weak definition of it for NET-less config.
+ */
+int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
+			 int len)
+{
+	return -EFAULT;
+}
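
The bpf/core.c hunk relies on GCC/Clang weak symbols: when the kernel is built without the networking code there is no real skb_copy_bits(), so the BPF core supplies a __weak stub that the strong definition overrides whenever it is linked in. A small userspace sketch of the same linking trick (the function name here is made up for illustration); compiled on its own the weak default runs, and linking another object that provides a strong copy_bits() silently replaces it:

#include <stdio.h>

/* Weak default: used only if no other object file supplies a strong
 * definition at link time. */
int __attribute__((weak)) copy_bits(const void *src, int offset, void *dst, int len)
{
	(void)src; (void)offset; (void)dst; (void)len;
	return -1;	/* mirrors the kernel stub returning -EFAULT */
}

int main(void)
{
	char buf[16];
	int ret = copy_bits("payload", 0, buf, sizeof(buf));

	printf("copy_bits returned %d (%s definition)\n",
	       ret, ret < 0 ? "weak fallback" : "strong override");
	return 0;
}
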
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 801f5f3b9307..9f81818f2941 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1409,7 +1409,8 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
 		if (memcmp(&old->regs[i], &cur->regs[i],
 			   sizeof(old->regs[0])) != 0) {
 			if (old->regs[i].type == NOT_INIT ||
-			    old->regs[i].type == UNKNOWN_VALUE)
+			    (old->regs[i].type == UNKNOWN_VALUE &&
+			     cur->regs[i].type != NOT_INIT))
 				continue;
 			return false;
 		}
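
The verifier change tightens state pruning: an old UNKNOWN_VALUE register only covers the current state if the current register is actually initialized; treating it as covering NOT_INIT could prune a path that later reads an uninitialized register without that read ever being checked. A compact sketch of that predicate with a stand-in enum (types and names are simplified stand-ins, not the real verifier structures):

#include <stdbool.h>
#include <stdio.h>

enum reg_type { NOT_INIT, UNKNOWN_VALUE, PTR_TO_CTX };

/* Is the previously-verified register state at least as permissive as the
 * current one?  If yes, exploration of the current path can be pruned. */
static bool reg_state_covered(enum reg_type old, enum reg_type cur)
{
	if (old == cur)
		return true;
	if (old == NOT_INIT)
		return true;			/* old accepted anything */
	if (old == UNKNOWN_VALUE && cur != NOT_INIT)
		return true;			/* the fix: cur must be initialized */
	return false;
}

int main(void)
{
	printf("%d\n", reg_state_covered(UNKNOWN_VALUE, PTR_TO_CTX));	/* 1: prune */
	printf("%d\n", reg_state_covered(UNKNOWN_VALUE, NOT_INIT));	/* 0: keep verifying */
	return 0;
}
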
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 5664985c46a0..937ecdfdf258 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -107,46 +107,6 @@ void context_tracking_user_enter(void)
107} 107}
108NOKPROBE_SYMBOL(context_tracking_user_enter); 108NOKPROBE_SYMBOL(context_tracking_user_enter);
109 109
110#ifdef CONFIG_PREEMPT
111/**
112 * preempt_schedule_context - preempt_schedule called by tracing
113 *
114 * The tracing infrastructure uses preempt_enable_notrace to prevent
115 * recursion and tracing preempt enabling caused by the tracing
116 * infrastructure itself. But as tracing can happen in areas coming
117 * from userspace or just about to enter userspace, a preempt enable
118 * can occur before user_exit() is called. This will cause the scheduler
119 * to be called when the system is still in usermode.
120 *
121 * To prevent this, the preempt_enable_notrace will use this function
122 * instead of preempt_schedule() to exit user context if needed before
123 * calling the scheduler.
124 */
125asmlinkage __visible void __sched notrace preempt_schedule_context(void)
126{
127 enum ctx_state prev_ctx;
128
129 if (likely(!preemptible()))
130 return;
131
132 /*
133 * Need to disable preemption in case user_exit() is traced
134 * and the tracer calls preempt_enable_notrace() causing
135 * an infinite recursion.
136 */
137 preempt_disable_notrace();
138 prev_ctx = exception_enter();
139 preempt_enable_no_resched_notrace();
140
141 preempt_schedule();
142
143 preempt_disable_notrace();
144 exception_exit(prev_ctx);
145 preempt_enable_notrace();
146}
147EXPORT_SYMBOL_GPL(preempt_schedule_context);
148#endif /* CONFIG_PREEMPT */
149
150/** 110/**
151 * context_tracking_user_exit - Inform the context tracking that the CPU is 111 * context_tracking_user_exit - Inform the context tracking that the CPU is
152 * exiting userspace mode and entering the kernel. 112 * exiting userspace mode and entering the kernel.
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 356450f09c1f..90a3d017b90c 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -64,6 +64,8 @@ static struct {
64 * an ongoing cpu hotplug operation. 64 * an ongoing cpu hotplug operation.
65 */ 65 */
66 int refcount; 66 int refcount;
67 /* And allows lockless put_online_cpus(). */
68 atomic_t puts_pending;
67 69
68#ifdef CONFIG_DEBUG_LOCK_ALLOC 70#ifdef CONFIG_DEBUG_LOCK_ALLOC
69 struct lockdep_map dep_map; 71 struct lockdep_map dep_map;
@@ -113,7 +115,11 @@ void put_online_cpus(void)
113{ 115{
114 if (cpu_hotplug.active_writer == current) 116 if (cpu_hotplug.active_writer == current)
115 return; 117 return;
116 mutex_lock(&cpu_hotplug.lock); 118 if (!mutex_trylock(&cpu_hotplug.lock)) {
119 atomic_inc(&cpu_hotplug.puts_pending);
120 cpuhp_lock_release();
121 return;
122 }
117 123
118 if (WARN_ON(!cpu_hotplug.refcount)) 124 if (WARN_ON(!cpu_hotplug.refcount))
119 cpu_hotplug.refcount++; /* try to fix things up */ 125 cpu_hotplug.refcount++; /* try to fix things up */
@@ -155,6 +161,12 @@ void cpu_hotplug_begin(void)
155 cpuhp_lock_acquire(); 161 cpuhp_lock_acquire();
156 for (;;) { 162 for (;;) {
157 mutex_lock(&cpu_hotplug.lock); 163 mutex_lock(&cpu_hotplug.lock);
164 if (atomic_read(&cpu_hotplug.puts_pending)) {
165 int delta;
166
167 delta = atomic_xchg(&cpu_hotplug.puts_pending, 0);
168 cpu_hotplug.refcount -= delta;
169 }
158 if (likely(!cpu_hotplug.refcount)) 170 if (likely(!cpu_hotplug.refcount))
159 break; 171 break;
160 __set_current_state(TASK_UNINTERRUPTIBLE); 172 __set_current_state(TASK_UNINTERRUPTIBLE);
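
The cpu.c hunks make put_online_cpus() usable in contexts where taking the hotplug mutex could deadlock: if the trylock fails, the release is recorded in an atomic counter, and the next hotplug writer drains that counter before judging the refcount. A userspace analogue with pthreads and C11 atomics (structure and function names are illustrative, not the kernel API):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static struct {
	pthread_mutex_t lock;
	int refcount;			/* readers currently holding the resource */
	atomic_int puts_pending;	/* releases that could not take the lock */
} hp = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };

static void get_ref(void)
{
	pthread_mutex_lock(&hp.lock);
	hp.refcount++;
	pthread_mutex_unlock(&hp.lock);
}

static void put_ref(void)
{
	if (pthread_mutex_trylock(&hp.lock) != 0) {
		/* Lock contended: defer the decrement, lock-free. */
		atomic_fetch_add(&hp.puts_pending, 1);
		return;
	}
	hp.refcount--;
	pthread_mutex_unlock(&hp.lock);
}

static void writer_begin(void)
{
	pthread_mutex_lock(&hp.lock);
	/* Fold in any deferred puts before judging the refcount. */
	hp.refcount -= atomic_exchange(&hp.puts_pending, 0);
	printf("writer sees refcount=%d\n", hp.refcount);
	pthread_mutex_unlock(&hp.lock);
}

int main(void)
{
	get_ref();
	put_ref();		/* fast path or deferred path, result is the same */
	writer_begin();		/* prints 0 either way */
	return 0;
}
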
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 446fbeefad1c..e56923026dd8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1562,8 +1562,10 @@ static void perf_remove_from_context(struct perf_event *event, bool detach_group
1562 1562
1563 if (!task) { 1563 if (!task) {
1564 /* 1564 /*
1565 * Per cpu events are removed via an smp call and 1565 * Per cpu events are removed via an smp call. The removal can
1566 * the removal is always successful. 1566 * fail if the CPU is currently offline, but in that case we
1567 * already called __perf_remove_from_context from
1568 * perf_event_exit_cpu.
1567 */ 1569 */
1568 cpu_function_call(event->cpu, __perf_remove_from_context, &re); 1570 cpu_function_call(event->cpu, __perf_remove_from_context, &re);
1569 return; 1571 return;
@@ -6071,11 +6073,6 @@ static int perf_swevent_init(struct perf_event *event)
6071 return 0; 6073 return 0;
6072} 6074}
6073 6075
6074static int perf_swevent_event_idx(struct perf_event *event)
6075{
6076 return 0;
6077}
6078
6079static struct pmu perf_swevent = { 6076static struct pmu perf_swevent = {
6080 .task_ctx_nr = perf_sw_context, 6077 .task_ctx_nr = perf_sw_context,
6081 6078
@@ -6085,8 +6082,6 @@ static struct pmu perf_swevent = {
6085 .start = perf_swevent_start, 6082 .start = perf_swevent_start,
6086 .stop = perf_swevent_stop, 6083 .stop = perf_swevent_stop,
6087 .read = perf_swevent_read, 6084 .read = perf_swevent_read,
6088
6089 .event_idx = perf_swevent_event_idx,
6090}; 6085};
6091 6086
6092#ifdef CONFIG_EVENT_TRACING 6087#ifdef CONFIG_EVENT_TRACING
@@ -6204,8 +6199,6 @@ static struct pmu perf_tracepoint = {
6204 .start = perf_swevent_start, 6199 .start = perf_swevent_start,
6205 .stop = perf_swevent_stop, 6200 .stop = perf_swevent_stop,
6206 .read = perf_swevent_read, 6201 .read = perf_swevent_read,
6207
6208 .event_idx = perf_swevent_event_idx,
6209}; 6202};
6210 6203
6211static inline void perf_tp_register(void) 6204static inline void perf_tp_register(void)
@@ -6431,8 +6424,6 @@ static struct pmu perf_cpu_clock = {
6431 .start = cpu_clock_event_start, 6424 .start = cpu_clock_event_start,
6432 .stop = cpu_clock_event_stop, 6425 .stop = cpu_clock_event_stop,
6433 .read = cpu_clock_event_read, 6426 .read = cpu_clock_event_read,
6434
6435 .event_idx = perf_swevent_event_idx,
6436}; 6427};
6437 6428
6438/* 6429/*
@@ -6511,8 +6502,6 @@ static struct pmu perf_task_clock = {
6511 .start = task_clock_event_start, 6502 .start = task_clock_event_start,
6512 .stop = task_clock_event_stop, 6503 .stop = task_clock_event_stop,
6513 .read = task_clock_event_read, 6504 .read = task_clock_event_read,
6514
6515 .event_idx = perf_swevent_event_idx,
6516}; 6505};
6517 6506
6518static void perf_pmu_nop_void(struct pmu *pmu) 6507static void perf_pmu_nop_void(struct pmu *pmu)
@@ -6542,7 +6531,7 @@ static void perf_pmu_cancel_txn(struct pmu *pmu)
6542 6531
6543static int perf_event_idx_default(struct perf_event *event) 6532static int perf_event_idx_default(struct perf_event *event)
6544{ 6533{
6545 return event->hw.idx + 1; 6534 return 0;
6546} 6535}
6547 6536
6548/* 6537/*
@@ -8130,7 +8119,7 @@ static void perf_pmu_rotate_stop(struct pmu *pmu)
8130 8119
8131static void __perf_event_exit_context(void *__info) 8120static void __perf_event_exit_context(void *__info)
8132{ 8121{
8133 struct remove_event re = { .detach_group = false }; 8122 struct remove_event re = { .detach_group = true };
8134 struct perf_event_context *ctx = __info; 8123 struct perf_event_context *ctx = __info;
8135 8124
8136 perf_pmu_rotate_stop(ctx->pmu); 8125 perf_pmu_rotate_stop(ctx->pmu);
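
Most of the events/core.c hunk deletes per-PMU event_idx callbacks that all returned 0 and instead makes 0 the value of the shared default, perf_event_idx_default(). The underlying pattern is filling in an ops structure with a default when a driver leaves a hook NULL; a minimal sketch (the pmu struct and registration function below are simplified stand-ins, not the perf API):

#include <stdio.h>

struct pmu_ops {
	const char *name;
	int (*event_idx)(void);	/* optional; NULL means "use the default" */
};

static int event_idx_default(void)
{
	return 0;		/* matches the post-patch default in perf */
}

static void pmu_register(struct pmu_ops *ops)
{
	if (!ops->event_idx)
		ops->event_idx = event_idx_default;
}

int main(void)
{
	struct pmu_ops swevent = { .name = "software" };	/* hook left out */

	pmu_register(&swevent);
	printf("%s event_idx = %d\n", swevent.name, swevent.event_idx());
	return 0;
}
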
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 1559fb0b9296..9803a6600d49 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -605,11 +605,6 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags)
605 bp->hw.state = PERF_HES_STOPPED; 605 bp->hw.state = PERF_HES_STOPPED;
606} 606}
607 607
608static int hw_breakpoint_event_idx(struct perf_event *bp)
609{
610 return 0;
611}
612
613static struct pmu perf_breakpoint = { 608static struct pmu perf_breakpoint = {
614 .task_ctx_nr = perf_sw_context, /* could eventually get its own */ 609 .task_ctx_nr = perf_sw_context, /* could eventually get its own */
615 610
@@ -619,8 +614,6 @@ static struct pmu perf_breakpoint = {
619 .start = hw_breakpoint_start, 614 .start = hw_breakpoint_start,
620 .stop = hw_breakpoint_stop, 615 .stop = hw_breakpoint_stop,
621 .read = hw_breakpoint_pmu_read, 616 .read = hw_breakpoint_pmu_read,
622
623 .event_idx = hw_breakpoint_event_idx,
624}; 617};
625 618
626int __init init_hw_breakpoint(void) 619int __init init_hw_breakpoint(void)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 1d0af8a2c646..ed8f2cde34c5 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1640,7 +1640,6 @@ bool uprobe_deny_signal(void)
1640 if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) { 1640 if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) {
1641 utask->state = UTASK_SSTEP_TRAPPED; 1641 utask->state = UTASK_SSTEP_TRAPPED;
1642 set_tsk_thread_flag(t, TIF_UPROBE); 1642 set_tsk_thread_flag(t, TIF_UPROBE);
1643 set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
1644 } 1643 }
1645 } 1644 }
1646 1645
diff --git a/kernel/futex.c b/kernel/futex.c
index f3a3a071283c..63678b573d61 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -143,9 +143,8 @@
143 * 143 *
144 * Where (A) orders the waiters increment and the futex value read through 144 * Where (A) orders the waiters increment and the futex value read through
145 * atomic operations (see hb_waiters_inc) and where (B) orders the write 145 * atomic operations (see hb_waiters_inc) and where (B) orders the write
146 * to futex and the waiters read -- this is done by the barriers in 146 * to futex and the waiters read -- this is done by the barriers for both
147 * get_futex_key_refs(), through either ihold or atomic_inc, depending on the 147 * shared and private futexes in get_futex_key_refs().
148 * futex type.
149 * 148 *
150 * This yields the following case (where X:=waiters, Y:=futex): 149 * This yields the following case (where X:=waiters, Y:=futex):
151 * 150 *
@@ -344,13 +343,20 @@ static void get_futex_key_refs(union futex_key *key)
344 futex_get_mm(key); /* implies MB (B) */ 343 futex_get_mm(key); /* implies MB (B) */
345 break; 344 break;
346 default: 345 default:
346 /*
347 * Private futexes do not hold reference on an inode or
348 * mm, therefore the only purpose of calling get_futex_key_refs
349 * is because we need the barrier for the lockless waiter check.
350 */
347 smp_mb(); /* explicit MB (B) */ 351 smp_mb(); /* explicit MB (B) */
348 } 352 }
349} 353}
350 354
351/* 355/*
352 * Drop a reference to the resource addressed by a key. 356 * Drop a reference to the resource addressed by a key.
353 * The hash bucket spinlock must not be held. 357 * The hash bucket spinlock must not be held. This is
358 * a no-op for private futexes, see comment in the get
359 * counterpart.
354 */ 360 */
355static void drop_futex_key_refs(union futex_key *key) 361static void drop_futex_key_refs(union futex_key *key)
356{ 362{
@@ -641,8 +647,14 @@ static struct futex_pi_state * alloc_pi_state(void)
641 return pi_state; 647 return pi_state;
642} 648}
643 649
650/*
651 * Must be called with the hb lock held.
652 */
644static void free_pi_state(struct futex_pi_state *pi_state) 653static void free_pi_state(struct futex_pi_state *pi_state)
645{ 654{
655 if (!pi_state)
656 return;
657
646 if (!atomic_dec_and_test(&pi_state->refcount)) 658 if (!atomic_dec_and_test(&pi_state->refcount))
647 return; 659 return;
648 660
@@ -1521,15 +1533,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1521 } 1533 }
1522 1534
1523retry: 1535retry:
1524 if (pi_state != NULL) {
1525 /*
1526 * We will have to lookup the pi_state again, so free this one
1527 * to keep the accounting correct.
1528 */
1529 free_pi_state(pi_state);
1530 pi_state = NULL;
1531 }
1532
1533 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); 1536 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
1534 if (unlikely(ret != 0)) 1537 if (unlikely(ret != 0))
1535 goto out; 1538 goto out;
@@ -1619,6 +1622,8 @@ retry_private:
1619 case 0: 1622 case 0:
1620 break; 1623 break;
1621 case -EFAULT: 1624 case -EFAULT:
1625 free_pi_state(pi_state);
1626 pi_state = NULL;
1622 double_unlock_hb(hb1, hb2); 1627 double_unlock_hb(hb1, hb2);
1623 hb_waiters_dec(hb2); 1628 hb_waiters_dec(hb2);
1624 put_futex_key(&key2); 1629 put_futex_key(&key2);
@@ -1634,6 +1639,8 @@ retry_private:
1634 * exit to complete. 1639 * exit to complete.
1635 * - The user space value changed. 1640 * - The user space value changed.
1636 */ 1641 */
1642 free_pi_state(pi_state);
1643 pi_state = NULL;
1637 double_unlock_hb(hb1, hb2); 1644 double_unlock_hb(hb1, hb2);
1638 hb_waiters_dec(hb2); 1645 hb_waiters_dec(hb2);
1639 put_futex_key(&key2); 1646 put_futex_key(&key2);
@@ -1710,6 +1717,7 @@ retry_private:
1710 } 1717 }
1711 1718
1712out_unlock: 1719out_unlock:
1720 free_pi_state(pi_state);
1713 double_unlock_hb(hb1, hb2); 1721 double_unlock_hb(hb1, hb2);
1714 hb_waiters_dec(hb2); 1722 hb_waiters_dec(hb2);
1715 1723
@@ -1727,8 +1735,6 @@ out_put_keys:
1727out_put_key1: 1735out_put_key1:
1728 put_futex_key(&key1); 1736 put_futex_key(&key1);
1729out: 1737out:
1730 if (pi_state != NULL)
1731 free_pi_state(pi_state);
1732 return ret ? ret : task_count; 1738 return ret ? ret : task_count;
1733} 1739}
1734 1740
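
The futex_requeue() rework becomes simpler because free_pi_state() now tolerates a NULL argument, like kfree(): every exit path can call it unconditionally instead of the old "if (pi_state != NULL)" dance, and the error paths drop their reference right where they bail out. A small userspace sketch of that cleanup idiom with a refcounted object (types and names are simplified stand-ins):

#include <stdio.h>
#include <stdlib.h>

struct pi_state {
	int refcount;
};

/* NULL-tolerant release, so callers can free unconditionally on every
 * error/exit path (the same convenience kfree(NULL) gives). */
static void free_pi_state(struct pi_state *ps)
{
	if (!ps)
		return;
	if (--ps->refcount == 0) {
		puts("freeing pi_state");
		free(ps);
	}
}

static int do_requeue(int fail_early)
{
	struct pi_state *ps = NULL;

	if (fail_early)
		goto out;	/* ps is still NULL: the call below is a no-op */

	ps = calloc(1, sizeof(*ps));
	ps->refcount = 1;
out:
	free_pi_state(ps);	/* one unconditional cleanup for all paths */
	return 0;
}

int main(void)
{
	do_requeue(1);
	do_requeue(0);
	return 0;
}
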
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index cf66c5c8458e..3b7408759bdf 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -35,7 +35,7 @@ config GCOV_KERNEL
 config GCOV_PROFILE_ALL
 	bool "Profile entire Kernel"
 	depends on GCOV_KERNEL
-	depends on SUPERH || S390 || X86 || PPC || MICROBLAZE || ARM
+	depends on SUPERH || S390 || X86 || PPC || MICROBLAZE || ARM || ARM64
 	default n
 	---help---
 	This options activates profiling for the entire kernel.
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 8637e041a247..80f7a6d00519 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -196,12 +196,34 @@ int __request_module(bool wait, const char *fmt, ...)
196EXPORT_SYMBOL(__request_module); 196EXPORT_SYMBOL(__request_module);
197#endif /* CONFIG_MODULES */ 197#endif /* CONFIG_MODULES */
198 198
199static void call_usermodehelper_freeinfo(struct subprocess_info *info)
200{
201 if (info->cleanup)
202 (*info->cleanup)(info);
203 kfree(info);
204}
205
206static void umh_complete(struct subprocess_info *sub_info)
207{
208 struct completion *comp = xchg(&sub_info->complete, NULL);
209 /*
210 * See call_usermodehelper_exec(). If xchg() returns NULL
211 * we own sub_info, the UMH_KILLABLE caller has gone away
212 * or the caller used UMH_NO_WAIT.
213 */
214 if (comp)
215 complete(comp);
216 else
217 call_usermodehelper_freeinfo(sub_info);
218}
219
199/* 220/*
200 * This is the task which runs the usermode application 221 * This is the task which runs the usermode application
201 */ 222 */
202static int ____call_usermodehelper(void *data) 223static int ____call_usermodehelper(void *data)
203{ 224{
204 struct subprocess_info *sub_info = data; 225 struct subprocess_info *sub_info = data;
226 int wait = sub_info->wait & ~UMH_KILLABLE;
205 struct cred *new; 227 struct cred *new;
206 int retval; 228 int retval;
207 229
@@ -221,7 +243,7 @@ static int ____call_usermodehelper(void *data)
221 retval = -ENOMEM; 243 retval = -ENOMEM;
222 new = prepare_kernel_cred(current); 244 new = prepare_kernel_cred(current);
223 if (!new) 245 if (!new)
224 goto fail; 246 goto out;
225 247
226 spin_lock(&umh_sysctl_lock); 248 spin_lock(&umh_sysctl_lock);
227 new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); 249 new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset);
@@ -233,7 +255,7 @@ static int ____call_usermodehelper(void *data)
233 retval = sub_info->init(sub_info, new); 255 retval = sub_info->init(sub_info, new);
234 if (retval) { 256 if (retval) {
235 abort_creds(new); 257 abort_creds(new);
236 goto fail; 258 goto out;
237 } 259 }
238 } 260 }
239 261
@@ -242,12 +264,13 @@ static int ____call_usermodehelper(void *data)
242 retval = do_execve(getname_kernel(sub_info->path), 264 retval = do_execve(getname_kernel(sub_info->path),
243 (const char __user *const __user *)sub_info->argv, 265 (const char __user *const __user *)sub_info->argv,
244 (const char __user *const __user *)sub_info->envp); 266 (const char __user *const __user *)sub_info->envp);
267out:
268 sub_info->retval = retval;
269 /* wait_for_helper() will call umh_complete if UHM_WAIT_PROC. */
270 if (wait != UMH_WAIT_PROC)
271 umh_complete(sub_info);
245 if (!retval) 272 if (!retval)
246 return 0; 273 return 0;
247
248 /* Exec failed? */
249fail:
250 sub_info->retval = retval;
251 do_exit(0); 274 do_exit(0);
252} 275}
253 276
@@ -258,26 +281,6 @@ static int call_helper(void *data)
258 return ____call_usermodehelper(data); 281 return ____call_usermodehelper(data);
259} 282}
260 283
261static void call_usermodehelper_freeinfo(struct subprocess_info *info)
262{
263 if (info->cleanup)
264 (*info->cleanup)(info);
265 kfree(info);
266}
267
268static void umh_complete(struct subprocess_info *sub_info)
269{
270 struct completion *comp = xchg(&sub_info->complete, NULL);
271 /*
272 * See call_usermodehelper_exec(). If xchg() returns NULL
273 * we own sub_info, the UMH_KILLABLE caller has gone away.
274 */
275 if (comp)
276 complete(comp);
277 else
278 call_usermodehelper_freeinfo(sub_info);
279}
280
281/* Keventd can't block, but this (a child) can. */ 284/* Keventd can't block, but this (a child) can. */
282static int wait_for_helper(void *data) 285static int wait_for_helper(void *data)
283{ 286{
@@ -336,18 +339,8 @@ static void __call_usermodehelper(struct work_struct *work)
336 kmod_thread_locker = NULL; 339 kmod_thread_locker = NULL;
337 } 340 }
338 341
339 switch (wait) { 342 if (pid < 0) {
340 case UMH_NO_WAIT: 343 sub_info->retval = pid;
341 call_usermodehelper_freeinfo(sub_info);
342 break;
343
344 case UMH_WAIT_PROC:
345 if (pid > 0)
346 break;
347 /* FALLTHROUGH */
348 case UMH_WAIT_EXEC:
349 if (pid < 0)
350 sub_info->retval = pid;
351 umh_complete(sub_info); 344 umh_complete(sub_info);
352 } 345 }
353} 346}
@@ -588,7 +581,12 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
588 goto out; 581 goto out;
589 } 582 }
590 583
591 sub_info->complete = &done; 584 /*
585 * Set the completion pointer only if there is a waiter.
586 * This makes it possible to use umh_complete to free
587 * the data structure in case of UMH_NO_WAIT.
588 */
589 sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done;
592 sub_info->wait = wait; 590 sub_info->wait = wait;
593 591
594 queue_work(khelper_wq, &sub_info->work); 592 queue_work(khelper_wq, &sub_info->work);
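
The kmod.c rework hinges on umh_complete(): exactly one of the helper and the waiter "wins" an atomic exchange on sub_info->complete, and whoever sees NULL knows the other side is gone and owns the cleanup. That is also what lets UMH_NO_WAIT callers simply start with a NULL completion pointer. A userspace sketch of the handoff with C11 atomics (the struct layout and helper names are simplified, and a plain int stands in for a struct completion):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct subprocess_info {
	_Atomic(int *) complete;	/* stand-in for a struct completion * */
	int retval;
};

static void freeinfo(struct subprocess_info *info)
{
	puts("freeing subprocess_info");
	free(info);
}

/* Called by the helper side when it is done. */
static void umh_complete(struct subprocess_info *info)
{
	int *comp = atomic_exchange(&info->complete, (int *)NULL);

	if (comp)
		*comp = 1;	/* waiter still there: signal it */
	else
		freeinfo(info);	/* waiter gone (or never waited): we own the cleanup */
}

int main(void)
{
	int done = 0;
	struct subprocess_info *info = calloc(1, sizeof(*info));

	/* UMH_NO_WAIT analogue: no waiter, complete starts out NULL, so
	 * umh_complete() frees the info itself. */
	atomic_store(&info->complete, (int *)NULL);
	umh_complete(info);

	info = calloc(1, sizeof(*info));
	atomic_store(&info->complete, &done);	/* a waiter is present */
	umh_complete(info);
	printf("waiter signalled: %d\n", done);
	freeinfo(info);		/* here the waiter side does the final free */
	return 0;
}
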
diff --git a/kernel/panic.c b/kernel/panic.c
index d09dc5c32c67..cf80672b7924 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -244,6 +244,7 @@ static const struct tnt tnts[] = {
244 * 'I' - Working around severe firmware bug. 244 * 'I' - Working around severe firmware bug.
245 * 'O' - Out-of-tree module has been loaded. 245 * 'O' - Out-of-tree module has been loaded.
246 * 'E' - Unsigned module has been loaded. 246 * 'E' - Unsigned module has been loaded.
247 * 'L' - A soft lockup has previously occurred.
247 * 248 *
248 * The string is overwritten by the next call to print_tainted(). 249 * The string is overwritten by the next call to print_tainted().
249 */ 250 */
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index a9dfa79b6bab..1f35a3478f3c 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -502,8 +502,14 @@ int hibernation_restore(int platform_mode)
502 error = dpm_suspend_start(PMSG_QUIESCE); 502 error = dpm_suspend_start(PMSG_QUIESCE);
503 if (!error) { 503 if (!error) {
504 error = resume_target_kernel(platform_mode); 504 error = resume_target_kernel(platform_mode);
505 dpm_resume_end(PMSG_RECOVER); 505 /*
506 * The above should either succeed and jump to the new kernel,
507 * or return with an error. Otherwise things are just
508 * undefined, so let's be paranoid.
509 */
510 BUG_ON(!error);
506 } 511 }
512 dpm_resume_end(PMSG_RECOVER);
507 pm_restore_gfp_mask(); 513 pm_restore_gfp_mask();
508 resume_console(); 514 resume_console();
509 pm_restore_console(); 515 pm_restore_console();
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 4ca9a33ff620..c347e3ce3a55 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -146,7 +146,7 @@ static int platform_suspend_prepare(suspend_state_t state)
 
 static int platform_suspend_prepare_late(suspend_state_t state)
 {
-	return state == PM_SUSPEND_FREEZE && freeze_ops->prepare ?
+	return state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->prepare ?
 		freeze_ops->prepare() : 0;
 }
 
@@ -164,7 +164,7 @@ static void platform_resume_noirq(suspend_state_t state)
 
 static void platform_resume_early(suspend_state_t state)
 {
-	if (state == PM_SUSPEND_FREEZE && freeze_ops->restore)
+	if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->restore)
 		freeze_ops->restore();
 }
 
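
Both suspend.c fixes are the same defensive pattern: the freeze_ops pointer itself may be NULL when no platform has registered, so it must be checked before any of its optional callbacks. A tiny sketch of NULL-guarded optional ops (names are illustrative):

#include <stdio.h>

struct platform_freeze_ops {
	int (*prepare)(void);
	void (*restore)(void);
};

static const struct platform_freeze_ops *freeze_ops;	/* may stay NULL */

static int platform_prepare(void)
{
	/* Guard the ops pointer first, then the optional callback. */
	return freeze_ops && freeze_ops->prepare ? freeze_ops->prepare() : 0;
}

int main(void)
{
	printf("prepare -> %d (no ops registered)\n", platform_prepare());
	return 0;
}
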
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 133e47223095..9815447d22e0 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3299,11 +3299,16 @@ static void _rcu_barrier(struct rcu_state *rsp)
3299 continue; 3299 continue;
3300 rdp = per_cpu_ptr(rsp->rda, cpu); 3300 rdp = per_cpu_ptr(rsp->rda, cpu);
3301 if (rcu_is_nocb_cpu(cpu)) { 3301 if (rcu_is_nocb_cpu(cpu)) {
3302 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, 3302 if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) {
3303 rsp->n_barrier_done); 3303 _rcu_barrier_trace(rsp, "OfflineNoCB", cpu,
3304 atomic_inc(&rsp->barrier_cpu_count); 3304 rsp->n_barrier_done);
3305 __call_rcu(&rdp->barrier_head, rcu_barrier_callback, 3305 } else {
3306 rsp, cpu, 0); 3306 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
3307 rsp->n_barrier_done);
3308 atomic_inc(&rsp->barrier_cpu_count);
3309 __call_rcu(&rdp->barrier_head,
3310 rcu_barrier_callback, rsp, cpu, 0);
3311 }
3307 } else if (ACCESS_ONCE(rdp->qlen)) { 3312 } else if (ACCESS_ONCE(rdp->qlen)) {
3308 _rcu_barrier_trace(rsp, "OnlineQ", cpu, 3313 _rcu_barrier_trace(rsp, "OnlineQ", cpu,
3309 rsp->n_barrier_done); 3314 rsp->n_barrier_done);
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index d03764652d91..bbdc45d8d74f 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -587,6 +587,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
587static void print_cpu_stall_info_end(void); 587static void print_cpu_stall_info_end(void);
588static void zero_cpu_stall_ticks(struct rcu_data *rdp); 588static void zero_cpu_stall_ticks(struct rcu_data *rdp);
589static void increment_cpu_stall_ticks(void); 589static void increment_cpu_stall_ticks(void);
590static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu);
590static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); 591static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
591static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); 592static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
592static void rcu_init_one_nocb(struct rcu_node *rnp); 593static void rcu_init_one_nocb(struct rcu_node *rnp);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 387dd4599344..c1d7f27bd38f 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2050,6 +2050,33 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
2050} 2050}
2051 2051
2052/* 2052/*
2053 * Does the specified CPU need an RCU callback for the specified flavor
2054 * of rcu_barrier()?
2055 */
2056static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
2057{
2058 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2059 struct rcu_head *rhp;
2060
2061 /* No-CBs CPUs might have callbacks on any of three lists. */
2062 rhp = ACCESS_ONCE(rdp->nocb_head);
2063 if (!rhp)
2064 rhp = ACCESS_ONCE(rdp->nocb_gp_head);
2065 if (!rhp)
2066 rhp = ACCESS_ONCE(rdp->nocb_follower_head);
2067
2068 /* Having no rcuo kthread but CBs after scheduler starts is bad! */
2069 if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp) {
2070 /* RCU callback enqueued before CPU first came online??? */
2071 pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n",
2072 cpu, rhp->func);
2073 WARN_ON_ONCE(1);
2074 }
2075
2076 return !!rhp;
2077}
2078
2079/*
2053 * Enqueue the specified string of rcu_head structures onto the specified 2080 * Enqueue the specified string of rcu_head structures onto the specified
2054 * CPU's no-CBs lists. The CPU is specified by rdp, the head of the 2081 * CPU's no-CBs lists. The CPU is specified by rdp, the head of the
2055 * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy 2082 * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy
@@ -2642,6 +2669,12 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
2642 2669
2643#else /* #ifdef CONFIG_RCU_NOCB_CPU */ 2670#else /* #ifdef CONFIG_RCU_NOCB_CPU */
2644 2671
2672static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
2673{
2674 WARN_ON_ONCE(1); /* Should be dead code. */
2675 return false;
2676}
2677
2645static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) 2678static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
2646{ 2679{
2647} 2680}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 44999505e1bf..24beb9bb4c3e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2475,44 +2475,6 @@ EXPORT_PER_CPU_SYMBOL(kstat);
2475EXPORT_PER_CPU_SYMBOL(kernel_cpustat); 2475EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
2476 2476
2477/* 2477/*
2478 * Return any ns on the sched_clock that have not yet been accounted in
2479 * @p in case that task is currently running.
2480 *
2481 * Called with task_rq_lock() held on @rq.
2482 */
2483static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2484{
2485 u64 ns = 0;
2486
2487 /*
2488 * Must be ->curr _and_ ->on_rq. If dequeued, we would
2489 * project cycles that may never be accounted to this
2490 * thread, breaking clock_gettime().
2491 */
2492 if (task_current(rq, p) && task_on_rq_queued(p)) {
2493 update_rq_clock(rq);
2494 ns = rq_clock_task(rq) - p->se.exec_start;
2495 if ((s64)ns < 0)
2496 ns = 0;
2497 }
2498
2499 return ns;
2500}
2501
2502unsigned long long task_delta_exec(struct task_struct *p)
2503{
2504 unsigned long flags;
2505 struct rq *rq;
2506 u64 ns = 0;
2507
2508 rq = task_rq_lock(p, &flags);
2509 ns = do_task_delta_exec(p, rq);
2510 task_rq_unlock(rq, p, &flags);
2511
2512 return ns;
2513}
2514
2515/*
2516 * Return accounted runtime for the task. 2478 * Return accounted runtime for the task.
2517 * In case the task is currently running, return the runtime plus current's 2479 * In case the task is currently running, return the runtime plus current's
2518 * pending runtime that have not been accounted yet. 2480 * pending runtime that have not been accounted yet.
@@ -2521,7 +2483,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
2521{ 2483{
2522 unsigned long flags; 2484 unsigned long flags;
2523 struct rq *rq; 2485 struct rq *rq;
2524 u64 ns = 0; 2486 u64 ns;
2525 2487
2526#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) 2488#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
2527 /* 2489 /*
@@ -2540,7 +2502,16 @@ unsigned long long task_sched_runtime(struct task_struct *p)
2540#endif 2502#endif
2541 2503
2542 rq = task_rq_lock(p, &flags); 2504 rq = task_rq_lock(p, &flags);
2543 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 2505 /*
2506 * Must be ->curr _and_ ->on_rq. If dequeued, we would
2507 * project cycles that may never be accounted to this
2508 * thread, breaking clock_gettime().
2509 */
2510 if (task_current(rq, p) && task_on_rq_queued(p)) {
2511 update_rq_clock(rq);
2512 p->sched_class->update_curr(rq);
2513 }
2514 ns = p->se.sum_exec_runtime;
2544 task_rq_unlock(rq, p, &flags); 2515 task_rq_unlock(rq, p, &flags);
2545 2516
2546 return ns; 2517 return ns;
@@ -2951,6 +2922,47 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
2951} 2922}
2952NOKPROBE_SYMBOL(preempt_schedule); 2923NOKPROBE_SYMBOL(preempt_schedule);
2953EXPORT_SYMBOL(preempt_schedule); 2924EXPORT_SYMBOL(preempt_schedule);
2925
2926#ifdef CONFIG_CONTEXT_TRACKING
2927/**
2928 * preempt_schedule_context - preempt_schedule called by tracing
2929 *
2930 * The tracing infrastructure uses preempt_enable_notrace to prevent
2931 * recursion and tracing preempt enabling caused by the tracing
2932 * infrastructure itself. But as tracing can happen in areas coming
2933 * from userspace or just about to enter userspace, a preempt enable
2934 * can occur before user_exit() is called. This will cause the scheduler
2935 * to be called when the system is still in usermode.
2936 *
2937 * To prevent this, the preempt_enable_notrace will use this function
2938 * instead of preempt_schedule() to exit user context if needed before
2939 * calling the scheduler.
2940 */
2941asmlinkage __visible void __sched notrace preempt_schedule_context(void)
2942{
2943 enum ctx_state prev_ctx;
2944
2945 if (likely(!preemptible()))
2946 return;
2947
2948 do {
2949 __preempt_count_add(PREEMPT_ACTIVE);
2950 /*
2951 * Needs preempt disabled in case user_exit() is traced
2952 * and the tracer calls preempt_enable_notrace() causing
2953 * an infinite recursion.
2954 */
2955 prev_ctx = exception_enter();
2956 __schedule();
2957 exception_exit(prev_ctx);
2958
2959 __preempt_count_sub(PREEMPT_ACTIVE);
2960 barrier();
2961 } while (need_resched());
2962}
2963EXPORT_SYMBOL_GPL(preempt_schedule_context);
2964#endif /* CONFIG_CONTEXT_TRACKING */
2965
2954#endif /* CONFIG_PREEMPT */ 2966#endif /* CONFIG_PREEMPT */
2955 2967
2956/* 2968/*
@@ -6327,6 +6339,10 @@ static void sched_init_numa(void)
6327 if (!sched_debug()) 6339 if (!sched_debug())
6328 break; 6340 break;
6329 } 6341 }
6342
6343 if (!level)
6344 return;
6345
6330 /* 6346 /*
6331 * 'level' contains the number of unique distances, excluding the 6347 * 'level' contains the number of unique distances, excluding the
6332 * identity distance node_distance(i,i). 6348 * identity distance node_distance(i,i).
@@ -7403,8 +7419,12 @@ void sched_move_task(struct task_struct *tsk)
7403 if (unlikely(running)) 7419 if (unlikely(running))
7404 put_prev_task(rq, tsk); 7420 put_prev_task(rq, tsk);
7405 7421
7406 tg = container_of(task_css_check(tsk, cpu_cgrp_id, 7422 /*
7407 lockdep_is_held(&tsk->sighand->siglock)), 7423 * All callers are synchronized by task_rq_lock(); we do not use RCU
7424 * which is pointless here. Thus, we pass "true" to task_css_check()
7425 * to prevent lockdep warnings.
7426 */
7427 tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
7408 struct task_group, css); 7428 struct task_group, css);
7409 tg = autogroup_task_group(tsk, tg); 7429 tg = autogroup_task_group(tsk, tg);
7410 tsk->sched_task_group = tg; 7430 tsk->sched_task_group = tg;
@@ -7833,6 +7853,11 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
7833 sched_offline_group(tg); 7853 sched_offline_group(tg);
7834} 7854}
7835 7855
7856static void cpu_cgroup_fork(struct task_struct *task)
7857{
7858 sched_move_task(task);
7859}
7860
7836static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, 7861static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
7837 struct cgroup_taskset *tset) 7862 struct cgroup_taskset *tset)
7838{ 7863{
@@ -8205,6 +8230,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
8205 .css_free = cpu_cgroup_css_free, 8230 .css_free = cpu_cgroup_css_free,
8206 .css_online = cpu_cgroup_css_online, 8231 .css_online = cpu_cgroup_css_online,
8207 .css_offline = cpu_cgroup_css_offline, 8232 .css_offline = cpu_cgroup_css_offline,
8233 .fork = cpu_cgroup_fork,
8208 .can_attach = cpu_cgroup_can_attach, 8234 .can_attach = cpu_cgroup_can_attach,
8209 .attach = cpu_cgroup_attach, 8235 .attach = cpu_cgroup_attach,
8210 .exit = cpu_cgroup_exit, 8236 .exit = cpu_cgroup_exit,
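
The sched/core.c change above is the heart of this part of the series: task_sched_runtime() no longer open-codes CFS delta accounting but asks the task's scheduling class to fold in the in-flight delta through a new update_curr() hook, which the following hunks add to the deadline, fair, idle, rt and stop classes and to struct sched_class. A compact sketch of that "let the class update, then read the accumulated sum" shape (the structs and values below are illustrative, not the kernel definitions):

#include <stdio.h>

struct rq;			/* opaque here */

struct sched_class {
	const char *name;
	void (*update_curr)(struct rq *rq);
};

struct task {
	const struct sched_class *sched_class;
	unsigned long long sum_exec_runtime;	/* ns accounted so far */
};

static struct task *rq_curr;	/* pretend this task is currently running */

static void update_curr_fair(struct rq *rq)
{
	(void)rq;
	rq_curr->sum_exec_runtime += 1000;	/* fold in the pending delta */
}

static const struct sched_class fair_sched_class = {
	.name = "fair",
	.update_curr = update_curr_fair,
};

/* Shape of the post-patch task_sched_runtime(): have the class account the
 * running delta, then simply return the accumulated sum. */
static unsigned long long task_sched_runtime(struct task *p, struct rq *rq)
{
	if (p == rq_curr)
		p->sched_class->update_curr(rq);
	return p->sum_exec_runtime;
}

int main(void)
{
	struct task t = { .sched_class = &fair_sched_class, .sum_exec_runtime = 5000 };

	rq_curr = &t;
	printf("runtime = %llu ns\n", task_sched_runtime(&t, NULL));
	return 0;
}
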
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 256e577faf1b..28fa9d9e9201 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -518,12 +518,20 @@ again:
518 } 518 }
519 519
520 /* 520 /*
521 * We need to take care of a possible races here. In fact, the 521 * We need to take care of several possible races here:
522 * task might have changed its scheduling policy to something 522 *
523 * different from SCHED_DEADLINE or changed its reservation 523 * - the task might have changed its scheduling policy
524 * parameters (through sched_setattr()). 524 * to something different than SCHED_DEADLINE
525 * - the task might have changed its reservation parameters
526 * (through sched_setattr())
527 * - the task might have been boosted by someone else and
528 * might be in the boosting/deboosting path
529 *
530 * In all this cases we bail out, as the task is already
531 * in the runqueue or is going to be enqueued back anyway.
525 */ 532 */
526 if (!dl_task(p) || dl_se->dl_new) 533 if (!dl_task(p) || dl_se->dl_new ||
534 dl_se->dl_boosted || !dl_se->dl_throttled)
527 goto unlock; 535 goto unlock;
528 536
529 sched_clock_tick(); 537 sched_clock_tick();
@@ -532,7 +540,7 @@ again:
532 dl_se->dl_yielded = 0; 540 dl_se->dl_yielded = 0;
533 if (task_on_rq_queued(p)) { 541 if (task_on_rq_queued(p)) {
534 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); 542 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
535 if (task_has_dl_policy(rq->curr)) 543 if (dl_task(rq->curr))
536 check_preempt_curr_dl(rq, p, 0); 544 check_preempt_curr_dl(rq, p, 0);
537 else 545 else
538 resched_curr(rq); 546 resched_curr(rq);
@@ -847,8 +855,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
847 * smaller than our one... OTW we keep our runtime and 855 * smaller than our one... OTW we keep our runtime and
848 * deadline. 856 * deadline.
849 */ 857 */
850 if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) 858 if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) {
851 pi_se = &pi_task->dl; 859 pi_se = &pi_task->dl;
860 } else if (!dl_prio(p->normal_prio)) {
861 /*
862 * Special case in which we have a !SCHED_DEADLINE task
863 * that is going to be deboosted, but exceedes its
864 * runtime while doing so. No point in replenishing
865 * it, as it's going to return back to its original
866 * scheduling class after this.
867 */
868 BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH);
869 return;
870 }
852 871
853 /* 872 /*
854 * If p is throttled, we do nothing. In fact, if it exhausted 873 * If p is throttled, we do nothing. In fact, if it exhausted
@@ -1607,8 +1626,12 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
1607 /* Only reschedule if pushing failed */ 1626 /* Only reschedule if pushing failed */
1608 check_resched = 0; 1627 check_resched = 0;
1609#endif /* CONFIG_SMP */ 1628#endif /* CONFIG_SMP */
1610 if (check_resched && task_has_dl_policy(rq->curr)) 1629 if (check_resched) {
1611 check_preempt_curr_dl(rq, p, 0); 1630 if (dl_task(rq->curr))
1631 check_preempt_curr_dl(rq, p, 0);
1632 else
1633 resched_curr(rq);
1634 }
1612 } 1635 }
1613} 1636}
1614 1637
@@ -1678,4 +1701,6 @@ const struct sched_class dl_sched_class = {
1678 .prio_changed = prio_changed_dl, 1701 .prio_changed = prio_changed_dl,
1679 .switched_from = switched_from_dl, 1702 .switched_from = switched_from_dl,
1680 .switched_to = switched_to_dl, 1703 .switched_to = switched_to_dl,
1704
1705 .update_curr = update_curr_dl,
1681}; 1706};
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0b069bf3e708..ef2b104b254c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -726,6 +726,11 @@ static void update_curr(struct cfs_rq *cfs_rq)
726 account_cfs_rq_runtime(cfs_rq, delta_exec); 726 account_cfs_rq_runtime(cfs_rq, delta_exec);
727} 727}
728 728
729static void update_curr_fair(struct rq *rq)
730{
731 update_curr(cfs_rq_of(&rq->curr->se));
732}
733
729static inline void 734static inline void
730update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) 735update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
731{ 736{
@@ -828,11 +833,12 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
828 833
829static unsigned int task_scan_min(struct task_struct *p) 834static unsigned int task_scan_min(struct task_struct *p)
830{ 835{
836 unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
831 unsigned int scan, floor; 837 unsigned int scan, floor;
832 unsigned int windows = 1; 838 unsigned int windows = 1;
833 839
834 if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW) 840 if (scan_size < MAX_SCAN_WINDOW)
835 windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size; 841 windows = MAX_SCAN_WINDOW / scan_size;
836 floor = 1000 / windows; 842 floor = 1000 / windows;
837 843
838 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); 844 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
@@ -1164,9 +1170,26 @@ static void task_numa_compare(struct task_numa_env *env,
1164 long moveimp = imp; 1170 long moveimp = imp;
1165 1171
1166 rcu_read_lock(); 1172 rcu_read_lock();
1167 cur = ACCESS_ONCE(dst_rq->curr); 1173
1168 if (cur->pid == 0) /* idle */ 1174 raw_spin_lock_irq(&dst_rq->lock);
1175 cur = dst_rq->curr;
1176 /*
1177 * No need to move the exiting task, and this ensures that ->curr
1178 * wasn't reaped and thus get_task_struct() in task_numa_assign()
1179 * is safe under RCU read lock.
1180 * Note that rcu_read_lock() itself can't protect from the final
1181 * put_task_struct() after the last schedule().
1182 */
1183 if ((cur->flags & PF_EXITING) || is_idle_task(cur))
1169 cur = NULL; 1184 cur = NULL;
1185 raw_spin_unlock_irq(&dst_rq->lock);
1186
1187 /*
1188 * Because we have preemption enabled we can get migrated around and
1189 * end try selecting ourselves (current == env->p) as a swap candidate.
1190 */
1191 if (cur == env->p)
1192 goto unlock;
1170 1193
1171 /* 1194 /*
1172 * "imp" is the fault differential for the source task between the 1195 * "imp" is the fault differential for the source task between the
@@ -1520,7 +1543,7 @@ static void update_task_scan_period(struct task_struct *p,
1520 * scanning faster if shared accesses dominate as it may 1543 * scanning faster if shared accesses dominate as it may
1521 * simply bounce migrations uselessly 1544 * simply bounce migrations uselessly
1522 */ 1545 */
1523 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); 1546 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
1524 diff = (diff * ratio) / NUMA_PERIOD_SLOTS; 1547 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1525 } 1548 }
1526 1549
@@ -7938,6 +7961,8 @@ const struct sched_class fair_sched_class = {
7938 7961
7939 .get_rr_interval = get_rr_interval_fair, 7962 .get_rr_interval = get_rr_interval_fair,
7940 7963
7964 .update_curr = update_curr_fair,
7965
7941#ifdef CONFIG_FAIR_GROUP_SCHED 7966#ifdef CONFIG_FAIR_GROUP_SCHED
7942 .task_move_group = task_move_group_fair, 7967 .task_move_group = task_move_group_fair,
7943#endif 7968#endif
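
The task_scan_min() fix reads sysctl_numa_balancing_scan_size once into a local before using it twice; a concurrent writer shrinking the sysctl between the comparison and the division could otherwise produce a bogus window count, which is also why the sysctl.c hunk below enforces a minimum of 1. A userspace sketch of the snapshot idiom, using a C11 relaxed load in place of ACCESS_ONCE (the harness and variable names are invented; the constant mirrors MAX_SCAN_WINDOW in fair.c):

#include <stdatomic.h>
#include <stdio.h>

#define MAX_SCAN_WINDOW 2560	/* MB */

static atomic_uint scan_size_sysctl = 256;	/* concurrently writable tunable */

static unsigned int task_scan_windows(void)
{
	/* Snapshot once; every later use sees the same value even if a
	 * writer changes the sysctl in between. */
	unsigned int scan_size = atomic_load_explicit(&scan_size_sysctl,
						      memory_order_relaxed);
	unsigned int windows = 1;

	if (scan_size < MAX_SCAN_WINDOW)
		windows = MAX_SCAN_WINDOW / scan_size;
	return windows;
}

int main(void)
{
	printf("windows = %u\n", task_scan_windows());
	return 0;
}
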
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 67ad4e7f506a..c65dac8c97cd 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -75,6 +75,10 @@ static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task
75 return 0; 75 return 0;
76} 76}
77 77
78static void update_curr_idle(struct rq *rq)
79{
80}
81
78/* 82/*
79 * Simple, special scheduling class for the per-CPU idle tasks: 83 * Simple, special scheduling class for the per-CPU idle tasks:
80 */ 84 */
@@ -101,4 +105,5 @@ const struct sched_class idle_sched_class = {
101 105
102 .prio_changed = prio_changed_idle, 106 .prio_changed = prio_changed_idle,
103 .switched_to = switched_to_idle, 107 .switched_to = switched_to_idle,
108 .update_curr = update_curr_idle,
104}; 109};
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index d024e6ce30ba..20bca398084a 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2128,6 +2128,8 @@ const struct sched_class rt_sched_class = {
2128 2128
2129 .prio_changed = prio_changed_rt, 2129 .prio_changed = prio_changed_rt,
2130 .switched_to = switched_to_rt, 2130 .switched_to = switched_to_rt,
2131
2132 .update_curr = update_curr_rt,
2131}; 2133};
2132 2134
2133#ifdef CONFIG_SCHED_DEBUG 2135#ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 24156c8434d1..2df8ef067cc5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1135,6 +1135,8 @@ struct sched_class {
1135 unsigned int (*get_rr_interval) (struct rq *rq, 1135 unsigned int (*get_rr_interval) (struct rq *rq,
1136 struct task_struct *task); 1136 struct task_struct *task);
1137 1137
1138 void (*update_curr) (struct rq *rq);
1139
1138#ifdef CONFIG_FAIR_GROUP_SCHED 1140#ifdef CONFIG_FAIR_GROUP_SCHED
1139 void (*task_move_group) (struct task_struct *p, int on_rq); 1141 void (*task_move_group) (struct task_struct *p, int on_rq);
1140#endif 1142#endif
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 67426e529f59..79ffec45a6ac 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -102,6 +102,10 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task)
102 return 0; 102 return 0;
103} 103}
104 104
105static void update_curr_stop(struct rq *rq)
106{
107}
108
105/* 109/*
106 * Simple, special scheduling class for the per-CPU stop tasks: 110 * Simple, special scheduling class for the per-CPU stop tasks:
107 */ 111 */
@@ -128,4 +132,5 @@ const struct sched_class stop_sched_class = {
128 132
129 .prio_changed = prio_changed_stop, 133 .prio_changed = prio_changed_stop,
130 .switched_to = switched_to_stop, 134 .switched_to = switched_to_stop,
135 .update_curr = update_curr_stop,
131}; 136};
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4aada6d9fe74..15f2511a1b7c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -387,7 +387,8 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_numa_balancing_scan_size,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one,
 	},
 	{
 		.procname	= "numa_balancing",
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 9c94c19f1305..55449909f114 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -72,7 +72,7 @@ static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt,
 	 * Also omit the add if it would overflow the u64 boundary.
 	 */
 	if ((~0ULL - clc > rnd) &&
-	    (!ismax || evt->mult <= (1U << evt->shift)))
+	    (!ismax || evt->mult <= (1ULL << evt->shift)))
 		clc += rnd;
 
 	do_div(clc, evt->mult);
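
The one-character clockevents fix promotes the shifted constant to 64 bits so the comparison against evt->mult stays meaningful for large shift values; with 1U the result is a 32-bit quantity that wraps (and is undefined once the shift reaches 32). A short demonstration of the arithmetic (only the well-defined 64-bit shift is executed; the 32-bit variant is shown as what not to do):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned int shift = 32;	/* an illustrative large shift value */
	uint32_t mult = 5;

	/* (1U << shift) would be undefined behaviour here (shift >= the
	 * width of unsigned int), so the pre-patch comparison could
	 * evaluate to anything. */
	uint64_t bound = 1ULL << shift;		/* well-defined: 4294967296 */

	printf("bound = %llu, mult <= bound: %d\n",
	       (unsigned long long)bound, mult <= bound);
	return 0;
}
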
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 492b986195d5..a16b67859e2a 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -553,7 +553,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
 		*sample = cputime_to_expires(cputime.utime);
 		break;
 	case CPUCLOCK_SCHED:
-		*sample = cputime.sum_exec_runtime + task_delta_exec(p);
+		*sample = cputime.sum_exec_runtime;
 		break;
 	}
 	return 0;
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 42b463ad90f2..31ea01f42e1f 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -636,6 +636,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
 			goto out;
 		}
 	} else {
+		memset(&event.sigev_value, 0, sizeof(event.sigev_value));
 		event.sigev_notify = SIGEV_SIGNAL;
 		event.sigev_signo = SIGALRM;
 		event.sigev_value.sival_int = new_timer->it_id;
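
The posix-timers hunk zeroes event.sigev_value before filling in the SIGEV_SIGNAL defaults, so the unused bytes of the sigval union do not carry stack garbage into later users of the timer. The same habit applies to any union or padded struct that is only partially assigned; a small userspace illustration with the libc struct sigevent (the surrounding setup is invented):

#define _POSIX_C_SOURCE 199309L
#include <signal.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	struct sigevent ev;

	/* Zero the whole structure first: sigev_value is a union, and only
	 * one member is set below, so the other bytes would otherwise be
	 * whatever happened to be on the stack. */
	memset(&ev, 0, sizeof(ev));
	ev.sigev_notify = SIGEV_SIGNAL;
	ev.sigev_signo = SIGALRM;
	ev.sigev_value.sival_int = 42;		/* stands in for the timer id */

	printf("notify=%d signo=%d sival_int=%d\n",
	       ev.sigev_notify, ev.sigev_signo, ev.sigev_value.sival_int);
	return 0;
}
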
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fb186b9ddf51..31c90fec4158 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1925,8 +1925,16 @@ ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec)
1925 * when we are adding another op to the rec or removing the 1925 * when we are adding another op to the rec or removing the
1926 * current one. Thus, if the op is being added, we can 1926 * current one. Thus, if the op is being added, we can
1927 * ignore it because it hasn't attached itself to the rec 1927 * ignore it because it hasn't attached itself to the rec
1928 * yet. That means we just need to find the op that has a 1928 * yet.
1929 * trampoline and is not beeing added. 1929 *
1930 * If an ops is being modified (hooking to different functions)
1931 * then we don't care about the new functions that are being
1932 * added, just the old ones (that are probably being removed).
1933 *
1934 * If we are adding an ops to a function that already is using
1935 * a trampoline, it needs to be removed (trampolines are only
1936 * for single ops connected), then an ops that is not being
1937 * modified also needs to be checked.
1930 */ 1938 */
1931 do_for_each_ftrace_op(op, ftrace_ops_list) { 1939 do_for_each_ftrace_op(op, ftrace_ops_list) {
1932 1940
@@ -1940,17 +1948,23 @@ ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec)
1940 if (op->flags & FTRACE_OPS_FL_ADDING) 1948 if (op->flags & FTRACE_OPS_FL_ADDING)
1941 continue; 1949 continue;
1942 1950
1951
1943 /* 1952 /*
1944 * If the ops is not being added and has a trampoline, 1953 * If the ops is being modified and is in the old
1945 * then it must be the one that we want! 1954 * hash, then it is probably being removed from this
1955 * function.
1946 */ 1956 */
1947 if (hash_contains_ip(ip, op->func_hash))
1948 return op;
1949
1950 /* If the ops is being modified, it may be in the old hash. */
1951 if ((op->flags & FTRACE_OPS_FL_MODIFYING) && 1957 if ((op->flags & FTRACE_OPS_FL_MODIFYING) &&
1952 hash_contains_ip(ip, &op->old_hash)) 1958 hash_contains_ip(ip, &op->old_hash))
1953 return op; 1959 return op;
1960 /*
1961 * If the ops is not being added or modified, and it's
1962 * in its normal filter hash, then this must be the one
1963 * we want!
1964 */
1965 if (!(op->flags & FTRACE_OPS_FL_MODIFYING) &&
1966 hash_contains_ip(ip, op->func_hash))
1967 return op;
1954 1968
1955 } while_for_each_ftrace_op(op); 1969 } while_for_each_ftrace_op(op);
1956 1970
@@ -2293,10 +2307,13 @@ static void ftrace_run_update_code(int command)
2293 FTRACE_WARN_ON(ret); 2307 FTRACE_WARN_ON(ret);
2294} 2308}
2295 2309
2296static void ftrace_run_modify_code(struct ftrace_ops *ops, int command) 2310static void ftrace_run_modify_code(struct ftrace_ops *ops, int command,
2311 struct ftrace_hash *old_hash)
2297{ 2312{
2298 ops->flags |= FTRACE_OPS_FL_MODIFYING; 2313 ops->flags |= FTRACE_OPS_FL_MODIFYING;
2314 ops->old_hash.filter_hash = old_hash;
2299 ftrace_run_update_code(command); 2315 ftrace_run_update_code(command);
2316 ops->old_hash.filter_hash = NULL;
2300 ops->flags &= ~FTRACE_OPS_FL_MODIFYING; 2317 ops->flags &= ~FTRACE_OPS_FL_MODIFYING;
2301} 2318}
2302 2319
@@ -3340,7 +3357,7 @@ static struct ftrace_ops trace_probe_ops __read_mostly =
3340 3357
3341static int ftrace_probe_registered; 3358static int ftrace_probe_registered;
3342 3359
3343static void __enable_ftrace_function_probe(void) 3360static void __enable_ftrace_function_probe(struct ftrace_hash *old_hash)
3344{ 3361{
3345 int ret; 3362 int ret;
3346 int i; 3363 int i;
@@ -3348,7 +3365,8 @@ static void __enable_ftrace_function_probe(void)
3348 if (ftrace_probe_registered) { 3365 if (ftrace_probe_registered) {
3349 /* still need to update the function call sites */ 3366 /* still need to update the function call sites */
3350 if (ftrace_enabled) 3367 if (ftrace_enabled)
3351 ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS); 3368 ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS,
3369 old_hash);
3352 return; 3370 return;
3353 } 3371 }
3354 3372
@@ -3477,13 +3495,14 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3477 } while_for_each_ftrace_rec(); 3495 } while_for_each_ftrace_rec();
3478 3496
3479 ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); 3497 ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
3498
3499 __enable_ftrace_function_probe(old_hash);
3500
3480 if (!ret) 3501 if (!ret)
3481 free_ftrace_hash_rcu(old_hash); 3502 free_ftrace_hash_rcu(old_hash);
3482 else 3503 else
3483 count = ret; 3504 count = ret;
3484 3505
3485 __enable_ftrace_function_probe();
3486
3487 out_unlock: 3506 out_unlock:
3488 mutex_unlock(&ftrace_lock); 3507 mutex_unlock(&ftrace_lock);
3489 out: 3508 out:
@@ -3764,10 +3783,11 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
3764 return add_hash_entry(hash, ip); 3783 return add_hash_entry(hash, ip);
3765} 3784}
3766 3785
3767static void ftrace_ops_update_code(struct ftrace_ops *ops) 3786static void ftrace_ops_update_code(struct ftrace_ops *ops,
3787 struct ftrace_hash *old_hash)
3768{ 3788{
3769 if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) 3789 if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled)
3770 ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS); 3790 ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash);
3771} 3791}
3772 3792
3773static int 3793static int
@@ -3813,7 +3833,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3813 old_hash = *orig_hash; 3833 old_hash = *orig_hash;
3814 ret = ftrace_hash_move(ops, enable, orig_hash, hash); 3834 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
3815 if (!ret) { 3835 if (!ret) {
3816 ftrace_ops_update_code(ops); 3836 ftrace_ops_update_code(ops, old_hash);
3817 free_ftrace_hash_rcu(old_hash); 3837 free_ftrace_hash_rcu(old_hash);
3818 } 3838 }
3819 mutex_unlock(&ftrace_lock); 3839 mutex_unlock(&ftrace_lock);
@@ -4058,7 +4078,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
4058 ret = ftrace_hash_move(iter->ops, filter_hash, 4078 ret = ftrace_hash_move(iter->ops, filter_hash,
4059 orig_hash, iter->hash); 4079 orig_hash, iter->hash);
4060 if (!ret) { 4080 if (!ret) {
4061 ftrace_ops_update_code(iter->ops); 4081 ftrace_ops_update_code(iter->ops, old_hash);
4062 free_ftrace_hash_rcu(old_hash); 4082 free_ftrace_hash_rcu(old_hash);
4063 } 4083 }
4064 mutex_unlock(&ftrace_lock); 4084 mutex_unlock(&ftrace_lock);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2d75c94ae87d..a56e07c8d15b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -538,16 +538,18 @@ static void rb_wake_up_waiters(struct irq_work *work)
538 * ring_buffer_wait - wait for input to the ring buffer 538 * ring_buffer_wait - wait for input to the ring buffer
539 * @buffer: buffer to wait on 539 * @buffer: buffer to wait on
540 * @cpu: the cpu buffer to wait on 540 * @cpu: the cpu buffer to wait on
541 * @full: wait until a full page is available, if @cpu != RING_BUFFER_ALL_CPUS
541 * 542 *
542 * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon 543 * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
543 * as data is added to any of the @buffer's cpu buffers. Otherwise 544 * as data is added to any of the @buffer's cpu buffers. Otherwise
544 * it will wait for data to be added to a specific cpu buffer. 545 * it will wait for data to be added to a specific cpu buffer.
545 */ 546 */
546int ring_buffer_wait(struct ring_buffer *buffer, int cpu) 547int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
547{ 548{
548 struct ring_buffer_per_cpu *cpu_buffer; 549 struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer);
549 DEFINE_WAIT(wait); 550 DEFINE_WAIT(wait);
550 struct rb_irq_work *work; 551 struct rb_irq_work *work;
552 int ret = 0;
551 553
552 /* 554 /*
553 * Depending on what the caller is waiting for, either any 555 * Depending on what the caller is waiting for, either any
@@ -564,36 +566,61 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu)
564 } 566 }
565 567
566 568
567 prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); 569 while (true) {
570 prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
568 571
569 /* 572 /*
570 * The events can happen in critical sections where 573 * The events can happen in critical sections where
571 * checking a work queue can cause deadlocks. 574 * checking a work queue can cause deadlocks.
572 * After adding a task to the queue, this flag is set 575 * After adding a task to the queue, this flag is set
573 * only to notify events to try to wake up the queue 576 * only to notify events to try to wake up the queue
574 * using irq_work. 577 * using irq_work.
575 * 578 *
576 * We don't clear it even if the buffer is no longer 579 * We don't clear it even if the buffer is no longer
577 * empty. The flag only causes the next event to run 580 * empty. The flag only causes the next event to run
578 * irq_work to do the work queue wake up. The worse 581 * irq_work to do the work queue wake up. The worse
579 * that can happen if we race with !trace_empty() is that 582 * that can happen if we race with !trace_empty() is that
580 * an event will cause an irq_work to try to wake up 583 * an event will cause an irq_work to try to wake up
581 * an empty queue. 584 * an empty queue.
582 * 585 *
583 * There's no reason to protect this flag either, as 586 * There's no reason to protect this flag either, as
584 * the work queue and irq_work logic will do the necessary 587 * the work queue and irq_work logic will do the necessary
585 * synchronization for the wake ups. The only thing 588 * synchronization for the wake ups. The only thing
586 * that is necessary is that the wake up happens after 589 * that is necessary is that the wake up happens after
587 * a task has been queued. It's OK for spurious wake ups. 590 * a task has been queued. It's OK for spurious wake ups.
588 */ 591 */
589 work->waiters_pending = true; 592 work->waiters_pending = true;
593
594 if (signal_pending(current)) {
595 ret = -EINTR;
596 break;
597 }
598
599 if (cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer))
600 break;
601
602 if (cpu != RING_BUFFER_ALL_CPUS &&
603 !ring_buffer_empty_cpu(buffer, cpu)) {
604 unsigned long flags;
605 bool pagebusy;
606
607 if (!full)
608 break;
609
610 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
611 pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
612 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
613
614 if (!pagebusy)
615 break;
616 }
590 617
591 if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) ||
592 (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu)))
593 schedule(); 618 schedule();
619 }
594 620
595 finish_wait(&work->waiters, &wait); 621 finish_wait(&work->waiters, &wait);
596 return 0; 622
623 return ret;
597} 624}
598 625
599/** 626/**
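ring_buffer_wait() is reworked into a loop: it gains a @full argument for callers that want to sleep until a complete page is available, and it now returns -EINTR itself instead of leaving the signal_pending() test to its callers. A hedged userspace model of the new control flow follows; the stubs stand in for the kernel's signal and buffer checks, and the per-cpu versus all-cpus distinction is collapsed.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static bool signal_pending_stub(void)   { return false; }
static bool buffer_empty_stub(void)     { static int n; return n++ < 3; }
static bool reader_page_busy_stub(void) { return false; }

static int ring_buffer_wait_model(bool full)
{
        int ret = 0;

        while (true) {
                /* kernel: prepare_to_wait() and waiters_pending = true */

                if (signal_pending_stub()) {
                        ret = -EINTR;           /* new: reported from here */
                        break;
                }

                if (!buffer_empty_stub()) {
                        /* with @full set, keep sleeping while the writer is
                         * still on the reader page (no complete page yet)  */
                        if (!full || !reader_page_busy_stub())
                                break;
                }

                /* kernel: schedule(); here we simply go around again */
        }

        /* kernel: finish_wait() */
        return ret;
}

int main(void)
{
        printf("wait returned %d\n", ring_buffer_wait_model(true));
        return 0;
}

In the real function the "page busy" test is the reader_page == commit_page comparison taken under reader_lock, as the hunk above shows.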
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 459a7b1251e5..426962b04183 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1076,13 +1076,14 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
1076} 1076}
1077#endif /* CONFIG_TRACER_MAX_TRACE */ 1077#endif /* CONFIG_TRACER_MAX_TRACE */
1078 1078
1079static int wait_on_pipe(struct trace_iterator *iter) 1079static int wait_on_pipe(struct trace_iterator *iter, bool full)
1080{ 1080{
1081 /* Iterators are static, they should be filled or empty */ 1081 /* Iterators are static, they should be filled or empty */
1082 if (trace_buffer_iter(iter, iter->cpu_file)) 1082 if (trace_buffer_iter(iter, iter->cpu_file))
1083 return 0; 1083 return 0;
1084 1084
1085 return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); 1085 return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file,
1086 full);
1086} 1087}
1087 1088
1088#ifdef CONFIG_FTRACE_STARTUP_TEST 1089#ifdef CONFIG_FTRACE_STARTUP_TEST
@@ -4434,15 +4435,12 @@ static int tracing_wait_pipe(struct file *filp)
4434 4435
4435 mutex_unlock(&iter->mutex); 4436 mutex_unlock(&iter->mutex);
4436 4437
4437 ret = wait_on_pipe(iter); 4438 ret = wait_on_pipe(iter, false);
4438 4439
4439 mutex_lock(&iter->mutex); 4440 mutex_lock(&iter->mutex);
4440 4441
4441 if (ret) 4442 if (ret)
4442 return ret; 4443 return ret;
4443
4444 if (signal_pending(current))
4445 return -EINTR;
4446 } 4444 }
4447 4445
4448 return 1; 4446 return 1;
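With the signal handling folded into ring_buffer_wait(), wait_on_pipe() only forwards a new 'full' flag and tracing_wait_pipe() can drop its own signal_pending() check; the same simplification repeats in tracing_buffers_read() and tracing_buffers_splice_read() below. A compact model of the caller side, using stand-in names:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static int wait_on_pipe_model(bool full)
{
        (void)full;
        return -EINTR;          /* pretend a signal arrived while sleeping */
}

static int tracing_wait_pipe_model(void)
{
        bool trace_empty = true;

        while (trace_empty) {
                int ret = wait_on_pipe_model(false);  /* byte readers: any data */

                if (ret)
                        return ret;   /* no separate signal_pending() test anymore */
        }
        return 1;                     /* data available */
}

int main(void)
{
        printf("%d\n", tracing_wait_pipe_model());    /* -EINTR, typically -4 */
        return 0;
}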
@@ -5372,16 +5370,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
5372 goto out_unlock; 5370 goto out_unlock;
5373 } 5371 }
5374 mutex_unlock(&trace_types_lock); 5372 mutex_unlock(&trace_types_lock);
5375 ret = wait_on_pipe(iter); 5373 ret = wait_on_pipe(iter, false);
5376 mutex_lock(&trace_types_lock); 5374 mutex_lock(&trace_types_lock);
5377 if (ret) { 5375 if (ret) {
5378 size = ret; 5376 size = ret;
5379 goto out_unlock; 5377 goto out_unlock;
5380 } 5378 }
5381 if (signal_pending(current)) {
5382 size = -EINTR;
5383 goto out_unlock;
5384 }
5385 goto again; 5379 goto again;
5386 } 5380 }
5387 size = 0; 5381 size = 0;
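tracing_buffers_read() follows the same pattern: the error returned by the wait becomes the value handed back to the reader, and the retry still goes through the existing 'again' label. An illustrative, userspace-only reduction of that flow (names invented):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <sys/types.h>

static int wait_on_pipe_model(bool full) { (void)full; return -EINTR; }

static ssize_t buffers_read_model(bool nonblock)
{
        ssize_t size = 0;       /* pretend the ring buffer page was empty */
        int ret;

again:
        if (size > 0)
                return size;
        if (nonblock)
                return -EAGAIN;

        ret = wait_on_pipe_model(false);
        if (ret)
                return ret;     /* -EINTR now propagates as the read result */
        goto again;
}

int main(void)
{
        printf("read returned %zd\n", buffers_read_model(false));
        return 0;
}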
@@ -5500,7 +5494,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
5500 }; 5494 };
5501 struct buffer_ref *ref; 5495 struct buffer_ref *ref;
5502 int entries, size, i; 5496 int entries, size, i;
5503 ssize_t ret; 5497 ssize_t ret = 0;
5504 5498
5505 mutex_lock(&trace_types_lock); 5499 mutex_lock(&trace_types_lock);
5506 5500
@@ -5538,13 +5532,16 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
5538 int r; 5532 int r;
5539 5533
5540 ref = kzalloc(sizeof(*ref), GFP_KERNEL); 5534 ref = kzalloc(sizeof(*ref), GFP_KERNEL);
5541 if (!ref) 5535 if (!ref) {
5536 ret = -ENOMEM;
5542 break; 5537 break;
5538 }
5543 5539
5544 ref->ref = 1; 5540 ref->ref = 1;
5545 ref->buffer = iter->trace_buffer->buffer; 5541 ref->buffer = iter->trace_buffer->buffer;
5546 ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); 5542 ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
5547 if (!ref->page) { 5543 if (!ref->page) {
5544 ret = -ENOMEM;
5548 kfree(ref); 5545 kfree(ref);
5549 break; 5546 break;
5550 } 5547 }
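This hunk also tightens the allocation loop in tracing_buffers_splice_read(): ret now starts at 0 and both allocation failures record -ENOMEM, so a loop that produced no pages can report the error instead of silently returning 0 or going on to wait. A small model of that error path, with a single allocation stub standing in for both kzalloc() and ring_buffer_alloc_read_page():

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* simulate the page allocation running out of memory */
static void *alloc_read_page_stub(void) { return NULL; }

static long splice_read_model(int want_pages)
{
        long ret = 0;      /* new: initialised so an early failure is reportable */
        int nr_pages = 0;
        int i;

        for (i = 0; i < want_pages; i++) {
                void *page = alloc_read_page_stub();

                if (!page) {
                        ret = -ENOMEM;  /* new: remember the failure */
                        break;
                }
                nr_pages++;             /* real code wires the page into
                                         * the splice pipe at this point */
                free(page);
        }

        if (!nr_pages && ret)
                return ret;             /* new: report -ENOMEM, do not wait */

        return nr_pages;
}

int main(void)
{
        printf("splice model returned %ld\n", splice_read_model(4));
        return 0;
}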
@@ -5582,19 +5579,19 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
5582 5579
5583 /* did we read anything? */ 5580 /* did we read anything? */
5584 if (!spd.nr_pages) { 5581 if (!spd.nr_pages) {
5582 if (ret)
5583 goto out;
5584
5585 if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) { 5585 if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) {
5586 ret = -EAGAIN; 5586 ret = -EAGAIN;
5587 goto out; 5587 goto out;
5588 } 5588 }
5589 mutex_unlock(&trace_types_lock); 5589 mutex_unlock(&trace_types_lock);
5590 ret = wait_on_pipe(iter); 5590 ret = wait_on_pipe(iter, true);
5591 mutex_lock(&trace_types_lock); 5591 mutex_lock(&trace_types_lock);
5592 if (ret) 5592 if (ret)
5593 goto out; 5593 goto out;
5594 if (signal_pending(current)) { 5594
5595 ret = -EINTR;
5596 goto out;
5597 }
5598 goto again; 5595 goto again;
5599 } 5596 }
5600 5597
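Finally, the splice path passes full=true through wait_on_pipe(), asking ring_buffer_wait() for a whole page, while the byte-oriented readers above keep passing false. A trivial sketch of the two call flavours as the callers see them; the model function merely prints what it would wait for.

#include <stdbool.h>
#include <stdio.h>

/* stand-in for ring_buffer_wait(buffer, cpu, full) */
static int ring_buffer_wait_model(int cpu, bool full)
{
        printf("waiting on cpu %d for %s\n",
               cpu, full ? "a full page" : "any data");
        return 0;
}

int main(void)
{
        ring_buffer_wait_model(0, false);   /* byte-oriented read path       */
        ring_buffer_wait_model(0, true);    /* splice path: whole pages only */
        return 0;
}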
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 4dc8b79c5f75..29228c4d5696 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -313,7 +313,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
313 int size; 313 int size;
314 314
315 syscall_nr = trace_get_syscall_nr(current, regs); 315 syscall_nr = trace_get_syscall_nr(current, regs);
316 if (syscall_nr < 0) 316 if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
317 return; 317 return;
318 318
319 /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */ 319 /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */
@@ -360,7 +360,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
360 int syscall_nr; 360 int syscall_nr;
361 361
362 syscall_nr = trace_get_syscall_nr(current, regs); 362 syscall_nr = trace_get_syscall_nr(current, regs);
363 if (syscall_nr < 0) 363 if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
364 return; 364 return;
365 365
366 /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */ 366 /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */
@@ -567,7 +567,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
567 int size; 567 int size;
568 568
569 syscall_nr = trace_get_syscall_nr(current, regs); 569 syscall_nr = trace_get_syscall_nr(current, regs);
570 if (syscall_nr < 0) 570 if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
571 return; 571 return;
572 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) 572 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
573 return; 573 return;
@@ -641,7 +641,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
641 int size; 641 int size;
642 642
643 syscall_nr = trace_get_syscall_nr(current, regs); 643 syscall_nr = trace_get_syscall_nr(current, regs);
644 if (syscall_nr < 0) 644 if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
645 return; 645 return;
646 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) 646 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
647 return; 647 return;
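All four trace_syscalls.c handlers gain the same upper-bound check, so a syscall number is validated against NR_syscalls before it is used to index the enabled-syscall bitmaps and metadata. A userspace model of the check; NR_SYSCALLS and the bitmap below are illustrative values, not the kernel's.

#include <stdbool.h>
#include <stdio.h>

#define NR_SYSCALLS 440                 /* illustrative size */

static unsigned char enabled[NR_SYSCALLS];

static void trace_syscall_model(int syscall_nr)
{
        /* old code only rejected negative numbers */
        if (syscall_nr < 0 || syscall_nr >= NR_SYSCALLS)
                return;                 /* out of range: never use as an index */

        if (!enabled[syscall_nr])
                return;

        printf("tracing syscall %d\n", syscall_nr);
}

int main(void)
{
        trace_syscall_model(-1);        /* rejected before and after        */
        trace_syscall_model(100000);    /* rejected only with this patch    */
        trace_syscall_model(1);         /* in range, but not enabled here   */
        return 0;
}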