Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cpu.c               2
-rw-r--r--  kernel/events/core.c      23
-rw-r--r--  kernel/events/uprobes.c    6
-rw-r--r--  kernel/irq/manage.c        1
-rw-r--r--  kernel/kcov.c              9
-rw-r--r--  kernel/power/suspend.c     4
-rw-r--r--  kernel/printk/printk.c     4
-rw-r--r--  kernel/ptrace.c           16
-rw-r--r--  kernel/sched/core.c       16
-rw-r--r--  kernel/sched/fair.c       23
-rw-r--r--  kernel/sched/wait.c       10
-rw-r--r--  kernel/softirq.c           2
-rw-r--r--  kernel/time/alarmtimer.c   2
-rw-r--r--  kernel/time/timer.c       74
14 files changed, 124 insertions, 68 deletions
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 5df20d6d1520..29de1a9352c0 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -228,7 +228,7 @@ static struct {
 	.wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
 	.lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
-	.dep_map = {.name = "cpu_hotplug.lock" },
+	.dep_map = STATIC_LOCKDEP_MAP_INIT("cpu_hotplug.dep_map", &cpu_hotplug.dep_map),
 #endif
 };
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c6e47e97b33f..0e292132efac 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1960,6 +1960,12 @@ void perf_event_disable(struct perf_event *event)
 }
 EXPORT_SYMBOL_GPL(perf_event_disable);
 
+void perf_event_disable_inatomic(struct perf_event *event)
+{
+	event->pending_disable = 1;
+	irq_work_queue(&event->pending);
+}
+
 static void perf_set_shadow_time(struct perf_event *event,
 				 struct perf_event_context *ctx,
 				 u64 tstamp)
@@ -7075,8 +7081,8 @@ static int __perf_event_overflow(struct perf_event *event,
 	if (events && atomic_dec_and_test(&event->event_limit)) {
 		ret = 1;
 		event->pending_kill = POLL_HUP;
-		event->pending_disable = 1;
-		irq_work_queue(&event->pending);
+
+		perf_event_disable_inatomic(event);
 	}
 
 	READ_ONCE(event->overflow_handler)(event, data, regs);
@@ -8855,7 +8861,10 @@ EXPORT_SYMBOL_GPL(perf_pmu_register);
 
 void perf_pmu_unregister(struct pmu *pmu)
 {
+	int remove_device;
+
 	mutex_lock(&pmus_lock);
+	remove_device = pmu_bus_running;
 	list_del_rcu(&pmu->entry);
 	mutex_unlock(&pmus_lock);
 
@@ -8869,10 +8878,12 @@ void perf_pmu_unregister(struct pmu *pmu)
 	free_percpu(pmu->pmu_disable_count);
 	if (pmu->type >= PERF_TYPE_MAX)
 		idr_remove(&pmu_idr, pmu->type);
-	if (pmu->nr_addr_filters)
-		device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
-	device_del(pmu->dev);
-	put_device(pmu->dev);
+	if (remove_device) {
+		if (pmu->nr_addr_filters)
+			device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
+		device_del(pmu->dev);
+		put_device(pmu->dev);
+	}
 	free_pmu_context(pmu);
 }
 EXPORT_SYMBOL_GPL(perf_pmu_unregister);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index d4129bb05e5d..f9ec9add2164 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -300,7 +300,8 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
 
 retry:
 	/* Read the page with vaddr into memory */
-	ret = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma);
+	ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page,
+			&vma);
 	if (ret <= 0)
 		return ret;
 
@@ -1710,7 +1711,8 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
 	 * but we treat this as a 'remote' access since it is
 	 * essentially a kernel access to the memory.
 	 */
-	result = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
+	result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page,
+			NULL);
 	if (result < 0)
 		return result;
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0c5f1a5db654..9c4d30483264 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -721,6 +721,7 @@ int irq_set_parent(int irq, int parent_irq)
 	irq_put_desc_unlock(desc, flags);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(irq_set_parent);
 #endif
 
 /*
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 8d44b3fea9d0..30e6d05aa5a9 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -53,8 +53,15 @@ void notrace __sanitizer_cov_trace_pc(void)
 	/*
 	 * We are interested in code coverage as a function of a syscall inputs,
 	 * so we ignore code executed in interrupts.
+	 * The checks for whether we are in an interrupt are open-coded, because
+	 * 1. We can't use in_interrupt() here, since it also returns true
+	 *    when we are inside local_bh_disable() section.
+	 * 2. We don't want to use (in_irq() | in_serving_softirq() | in_nmi()),
+	 *    since that leads to slower generated code (three separate tests,
+	 *    one for each of the flags).
 	 */
-	if (!t || in_interrupt())
+	if (!t || (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET
+							| NMI_MASK)))
 		return;
 	mode = READ_ONCE(t->kcov_mode);
 	if (mode == KCOV_MODE_TRACE) {
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 1e7f5da648d9..6ccb08f57fcb 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -498,9 +498,9 @@ static int enter_state(suspend_state_t state)
 
 #ifndef CONFIG_SUSPEND_SKIP_SYNC
 	trace_suspend_resume(TPS("sync_filesystems"), 0, true);
-	printk(KERN_INFO "PM: Syncing filesystems ... ");
+	pr_info("PM: Syncing filesystems ... ");
 	sys_sync();
-	printk("done.\n");
+	pr_cont("done.\n");
 	trace_suspend_resume(TPS("sync_filesystems"), 0, false);
 #endif
 
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index d5e397315473..de08fc90baaf 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1769,6 +1769,10 @@ static size_t log_output(int facility, int level, enum log_flags lflags, const c
 		cont_flush();
 	}
 
+	/* Skip empty continuation lines that couldn't be added - they just flush */
+	if (!text_len && (lflags & LOG_CONT))
+		return 0;
+
 	/* If it doesn't end in a newline, try to buffer the current line */
 	if (!(lflags & LOG_NEWLINE)) {
 		if (cont_add(facility, level, lflags, text, text_len))
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 2a99027312a6..e6474f7272ec 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -537,7 +537,7 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst
 		int this_len, retval;
 
 		this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
-		retval = access_process_vm(tsk, src, buf, this_len, 0);
+		retval = access_process_vm(tsk, src, buf, this_len, FOLL_FORCE);
 		if (!retval) {
 			if (copied)
 				break;
@@ -564,7 +564,8 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
 		this_len = (len > sizeof(buf)) ? sizeof(buf) : len;
 		if (copy_from_user(buf, src, this_len))
 			return -EFAULT;
-		retval = access_process_vm(tsk, dst, buf, this_len, 1);
+		retval = access_process_vm(tsk, dst, buf, this_len,
+				FOLL_FORCE | FOLL_WRITE);
 		if (!retval) {
 			if (copied)
 				break;
@@ -1127,7 +1128,7 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,
 	unsigned long tmp;
 	int copied;
 
-	copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0);
+	copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE);
 	if (copied != sizeof(tmp))
 		return -EIO;
 	return put_user(tmp, (unsigned long __user *)data);
@@ -1138,7 +1139,8 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
 {
 	int copied;
 
-	copied = access_process_vm(tsk, addr, &data, sizeof(data), 1);
+	copied = access_process_vm(tsk, addr, &data, sizeof(data),
+			FOLL_FORCE | FOLL_WRITE);
 	return (copied == sizeof(data)) ? 0 : -EIO;
 }
 
@@ -1155,7 +1157,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
 	switch (request) {
 	case PTRACE_PEEKTEXT:
 	case PTRACE_PEEKDATA:
-		ret = access_process_vm(child, addr, &word, sizeof(word), 0);
+		ret = access_process_vm(child, addr, &word, sizeof(word),
+				FOLL_FORCE);
 		if (ret != sizeof(word))
 			ret = -EIO;
 		else
@@ -1164,7 +1167,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
 
 	case PTRACE_POKETEXT:
 	case PTRACE_POKEDATA:
-		ret = access_process_vm(child, addr, &data, sizeof(data), 1);
+		ret = access_process_vm(child, addr, &data, sizeof(data),
+				FOLL_FORCE | FOLL_WRITE);
 		ret = (ret != sizeof(data) ? -EIO : 0);
 		break;
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 94732d1ab00a..42d4027f9e26 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7515,11 +7515,27 @@ static struct kmem_cache *task_group_cache __read_mostly;
 DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
 DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
 
+#define WAIT_TABLE_BITS 8
+#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
+static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;
+
+wait_queue_head_t *bit_waitqueue(void *word, int bit)
+{
+	const int shift = BITS_PER_LONG == 32 ? 5 : 6;
+	unsigned long val = (unsigned long)word << shift | bit;
+
+	return bit_wait_table + hash_long(val, WAIT_TABLE_BITS);
+}
+EXPORT_SYMBOL(bit_waitqueue);
+
 void __init sched_init(void)
 {
 	int i, j;
 	unsigned long alloc_size = 0, ptr;
 
+	for (i = 0; i < WAIT_TABLE_SIZE; i++)
+		init_waitqueue_head(bit_wait_table + i);
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	alloc_size += 2 * nr_cpu_ids * sizeof(void **);
 #endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2d4ad72f8f3c..c242944f5cbd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -690,7 +690,14 @@ void init_entity_runnable_average(struct sched_entity *se)
 	 * will definitely be update (after enqueue).
 	 */
 	sa->period_contrib = 1023;
-	sa->load_avg = scale_load_down(se->load.weight);
+	/*
+	 * Tasks are intialized with full load to be seen as heavy tasks until
+	 * they get a chance to stabilize to their real load level.
+	 * Group entities are intialized with zero load to reflect the fact that
+	 * nothing has been attached to the task group yet.
+	 */
+	if (entity_is_task(se))
+		sa->load_avg = scale_load_down(se->load.weight);
 	sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
 	/*
 	 * At this point, util_avg won't be used in select_task_rq_fair anyway
@@ -5471,13 +5478,18 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd
  */
 static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
 {
-	struct sched_domain *this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
-	u64 avg_idle = this_rq()->avg_idle;
-	u64 avg_cost = this_sd->avg_scan_cost;
+	struct sched_domain *this_sd;
+	u64 avg_cost, avg_idle = this_rq()->avg_idle;
 	u64 time, cost;
 	s64 delta;
 	int cpu, wrap;
 
+	this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
+	if (!this_sd)
+		return -1;
+
+	avg_cost = this_sd->avg_scan_cost;
+
 	/*
 	 * Due to large variance we need a large fuzz factor; hackbench in
 	 * particularly is sensitive here.
@@ -8827,7 +8839,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
 	struct sched_entity *se;
 	struct cfs_rq *cfs_rq;
-	struct rq *rq;
 	int i;
 
 	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8842,8 +8853,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
 
 	for_each_possible_cpu(i) {
-		rq = cpu_rq(i);
-
 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
 				      GFP_KERNEL, cpu_to_node(i));
 		if (!cfs_rq)
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 4f7053579fe3..9453efe9b25a 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -480,16 +480,6 @@ void wake_up_bit(void *word, int bit)
 }
 EXPORT_SYMBOL(wake_up_bit);
 
-wait_queue_head_t *bit_waitqueue(void *word, int bit)
-{
-	const int shift = BITS_PER_LONG == 32 ? 5 : 6;
-	const struct zone *zone = page_zone(virt_to_page(word));
-	unsigned long val = (unsigned long)word << shift | bit;
-
-	return &zone->wait_table[hash_long(val, zone->wait_table_bits)];
-}
-EXPORT_SYMBOL(bit_waitqueue);
-
 /*
  * Manipulate the atomic_t address to produce a better bit waitqueue table hash
  * index (we're keying off bit -1, but that would produce a horrible hash
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 1bf81ef91375..744fa611cae0 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -58,7 +58,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
 DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
 
 const char * const softirq_to_name[NR_SOFTIRQS] = {
-	"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
+	"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL",
 	"TASKLET", "SCHED", "HRTIMER", "RCU"
 };
 
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index c3aad685bbc0..12dd190634ab 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -542,7 +542,6 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
 static int alarm_timer_create(struct k_itimer *new_timer)
 {
 	enum alarmtimer_type type;
-	struct alarm_base *base;
 
 	if (!alarmtimer_get_rtcdev())
 		return -ENOTSUPP;
@@ -551,7 +550,6 @@ static int alarm_timer_create(struct k_itimer *new_timer)
 		return -EPERM;
 
 	type = clock2alarm(new_timer->it_clock);
-	base = &alarm_bases[type];
 	alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer);
 	return 0;
 }
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 2d47980a1bc4..c611c47de884 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -878,7 +878,7 @@ static inline struct timer_base *get_timer_base(u32 tflags)
 
 #ifdef CONFIG_NO_HZ_COMMON
 static inline struct timer_base *
-__get_target_base(struct timer_base *base, unsigned tflags)
+get_target_base(struct timer_base *base, unsigned tflags)
 {
 #ifdef CONFIG_SMP
 	if ((tflags & TIMER_PINNED) || !base->migration_enabled)
@@ -891,25 +891,27 @@ __get_target_base(struct timer_base *base, unsigned tflags)
 
 static inline void forward_timer_base(struct timer_base *base)
 {
+	unsigned long jnow = READ_ONCE(jiffies);
+
 	/*
 	 * We only forward the base when it's idle and we have a delta between
 	 * base clock and jiffies.
 	 */
-	if (!base->is_idle || (long) (jiffies - base->clk) < 2)
+	if (!base->is_idle || (long) (jnow - base->clk) < 2)
 		return;
 
 	/*
 	 * If the next expiry value is > jiffies, then we fast forward to
 	 * jiffies otherwise we forward to the next expiry value.
 	 */
-	if (time_after(base->next_expiry, jiffies))
-		base->clk = jiffies;
+	if (time_after(base->next_expiry, jnow))
+		base->clk = jnow;
 	else
 		base->clk = base->next_expiry;
 }
 #else
 static inline struct timer_base *
-__get_target_base(struct timer_base *base, unsigned tflags)
+get_target_base(struct timer_base *base, unsigned tflags)
 {
 	return get_timer_this_cpu_base(tflags);
 }
@@ -917,14 +919,6 @@ __get_target_base(struct timer_base *base, unsigned tflags)
 static inline void forward_timer_base(struct timer_base *base) { }
 #endif
 
-static inline struct timer_base *
-get_target_base(struct timer_base *base, unsigned tflags)
-{
-	struct timer_base *target = __get_target_base(base, tflags);
-
-	forward_timer_base(target);
-	return target;
-}
 
 /*
  * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
@@ -943,7 +937,14 @@ static struct timer_base *lock_timer_base(struct timer_list *timer,
 {
 	for (;;) {
 		struct timer_base *base;
-		u32 tf = timer->flags;
+		u32 tf;
+
+		/*
+		 * We need to use READ_ONCE() here, otherwise the compiler
+		 * might re-read @tf between the check for TIMER_MIGRATING
+		 * and spin_lock().
+		 */
+		tf = READ_ONCE(timer->flags);
 
 		if (!(tf & TIMER_MIGRATING)) {
 			base = get_timer_base(tf);
@@ -964,6 +965,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 	unsigned long clk = 0, flags;
 	int ret = 0;
 
+	BUG_ON(!timer->function);
+
 	/*
 	 * This is a common optimization triggered by the networking code - if
 	 * the timer is re-modified to have the same timeout or ends up in the
@@ -972,13 +975,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 	if (timer_pending(timer)) {
 		if (timer->expires == expires)
 			return 1;
+
 		/*
-		 * Take the current timer_jiffies of base, but without holding
-		 * the lock!
+		 * We lock timer base and calculate the bucket index right
+		 * here. If the timer ends up in the same bucket, then we
+		 * just update the expiry time and avoid the whole
+		 * dequeue/enqueue dance.
 		 */
-		base = get_timer_base(timer->flags);
-		clk = base->clk;
+		base = lock_timer_base(timer, &flags);
 
+		clk = base->clk;
 		idx = calc_wheel_index(expires, clk);
 
 		/*
@@ -988,14 +994,14 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 		 */
 		if (idx == timer_get_idx(timer)) {
 			timer->expires = expires;
-			return 1;
+			ret = 1;
+			goto out_unlock;
 		}
+	} else {
+		base = lock_timer_base(timer, &flags);
 	}
 
 	timer_stats_timer_set_start_info(timer);
-	BUG_ON(!timer->function);
-
-	base = lock_timer_base(timer, &flags);
 
 	ret = detach_if_pending(timer, base, false);
 	if (!ret && pending_only)
@@ -1025,12 +1031,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 		}
 	}
 
+	/* Try to forward a stale timer base clock */
+	forward_timer_base(base);
+
 	timer->expires = expires;
 	/*
 	 * If 'idx' was calculated above and the base time did not advance
-	 * between calculating 'idx' and taking the lock, only enqueue_timer()
-	 * and trigger_dyntick_cpu() is required. Otherwise we need to
-	 * (re)calculate the wheel index via internal_add_timer().
+	 * between calculating 'idx' and possibly switching the base, only
+	 * enqueue_timer() and trigger_dyntick_cpu() is required. Otherwise
+	 * we need to (re)calculate the wheel index via
+	 * internal_add_timer().
 	 */
 	if (idx != UINT_MAX && clk == base->clk) {
 		enqueue_timer(base, timer, idx);
@@ -1510,12 +1520,16 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 	is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
 	base->next_expiry = nextevt;
 	/*
-	 * We have a fresh next event. Check whether we can forward the base:
+	 * We have a fresh next event. Check whether we can forward the
+	 * base. We can only do that when @basej is past base->clk
+	 * otherwise we might rewind base->clk.
 	 */
-	if (time_after(nextevt, jiffies))
-		base->clk = jiffies;
-	else if (time_after(nextevt, base->clk))
-		base->clk = nextevt;
+	if (time_after(basej, base->clk)) {
+		if (time_after(nextevt, basej))
+			base->clk = basej;
+		else if (time_after(nextevt, base->clk))
+			base->clk = nextevt;
+	}
 
 	if (time_before_eq(nextevt, basej)) {
 		expires = basem;