Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/hashtab.c                 |  84
-rw-r--r--  kernel/bpf/verifier.c                |   7
-rw-r--r--  kernel/events/core.c                 | 174
-rw-r--r--  kernel/events/uprobes.c              |   5
-rw-r--r--  kernel/futex.c                       |  23
-rw-r--r--  kernel/irq/affinity.c                |   2
-rw-r--r--  kernel/irq/chip.c                    |  11
-rw-r--r--  kernel/irq/manage.c                  |   8
-rw-r--r--  kernel/irq/msi.c                     |  11
-rw-r--r--  kernel/locking/qspinlock_paravirt.h  |   2
-rw-r--r--  kernel/locking/qspinlock_stat.h      |   1
-rw-r--r--  kernel/power/hibernate.c             |   4
-rw-r--r--  kernel/power/snapshot.c              |  10
-rw-r--r--  kernel/printk/braille.c              |   4
-rw-r--r--  kernel/sched/core.c                  |  19
-rw-r--r--  kernel/sched/cpudeadline.c           |   2
-rw-r--r--  kernel/sched/cputime.c               |  41
-rw-r--r--  kernel/sched/deadline.c              |   5
-rw-r--r--  kernel/sched/fair.c                  |   2
-rw-r--r--  kernel/sysctl.c                      |  45
-rw-r--r--  kernel/time/timekeeping.c            |   5
-rw-r--r--  kernel/time/timekeeping_debug.c      |   9
-rw-r--r--  kernel/time/timer.c                  |   5
-rw-r--r--  kernel/trace/blktrace.c              |   2
24 files changed, 382 insertions, 99 deletions
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index fff3650d52fc..570eeca7bdfa 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -26,11 +26,18 @@ struct bpf_htab {
 	struct bucket *buckets;
 	void *elems;
 	struct pcpu_freelist freelist;
+	void __percpu *extra_elems;
 	atomic_t count;	/* number of elements in this hashtable */
 	u32 n_buckets;	/* number of hash buckets */
 	u32 elem_size;	/* size of each element in bytes */
 };
 
+enum extra_elem_state {
+	HTAB_NOT_AN_EXTRA_ELEM = 0,
+	HTAB_EXTRA_ELEM_FREE,
+	HTAB_EXTRA_ELEM_USED
+};
+
 /* each htab element is struct htab_elem + key + value */
 struct htab_elem {
 	union {
@@ -38,7 +45,10 @@ struct htab_elem {
 		struct bpf_htab *htab;
 		struct pcpu_freelist_node fnode;
 	};
-	struct rcu_head rcu;
+	union {
+		struct rcu_head rcu;
+		enum extra_elem_state state;
+	};
 	u32 hash;
 	char key[0] __aligned(8);
 };
@@ -113,6 +123,23 @@ free_elems:
 	return err;
 }
 
+static int alloc_extra_elems(struct bpf_htab *htab)
+{
+	void __percpu *pptr;
+	int cpu;
+
+	pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN);
+	if (!pptr)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state =
+			HTAB_EXTRA_ELEM_FREE;
+	}
+	htab->extra_elems = pptr;
+	return 0;
+}
+
 /* Called from syscall */
 static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 {
@@ -185,6 +212,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	if (percpu)
 		cost += (u64) round_up(htab->map.value_size, 8) *
 			num_possible_cpus() * htab->map.max_entries;
+	else
+		cost += (u64) htab->elem_size * num_possible_cpus();
 
 	if (cost >= U32_MAX - PAGE_SIZE)
 		/* make sure page count doesn't overflow */
@@ -212,14 +241,22 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		raw_spin_lock_init(&htab->buckets[i].lock);
 	}
 
+	if (!percpu) {
+		err = alloc_extra_elems(htab);
+		if (err)
+			goto free_buckets;
+	}
+
 	if (!(attr->map_flags & BPF_F_NO_PREALLOC)) {
 		err = prealloc_elems_and_freelist(htab);
 		if (err)
-			goto free_buckets;
+			goto free_extra_elems;
 	}
 
 	return &htab->map;
 
+free_extra_elems:
+	free_percpu(htab->extra_elems);
 free_buckets:
 	kvfree(htab->buckets);
 free_htab:
@@ -349,7 +386,6 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
 	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
 		free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
 	kfree(l);
-
 }
 
 static void htab_elem_free_rcu(struct rcu_head *head)
@@ -370,6 +406,11 @@ static void htab_elem_free_rcu(struct rcu_head *head)
 
 static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 {
+	if (l->state == HTAB_EXTRA_ELEM_USED) {
+		l->state = HTAB_EXTRA_ELEM_FREE;
+		return;
+	}
+
 	if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) {
 		pcpu_freelist_push(&htab->freelist, &l->fnode);
 	} else {
@@ -381,25 +422,44 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 
 static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 					 void *value, u32 key_size, u32 hash,
-					 bool percpu, bool onallcpus)
+					 bool percpu, bool onallcpus,
+					 bool old_elem_exists)
 {
 	u32 size = htab->map.value_size;
 	bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC);
 	struct htab_elem *l_new;
 	void __percpu *pptr;
+	int err = 0;
 
 	if (prealloc) {
 		l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist);
 		if (!l_new)
-			return ERR_PTR(-E2BIG);
+			err = -E2BIG;
 	} else {
 		if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
 			atomic_dec(&htab->count);
-			return ERR_PTR(-E2BIG);
+			err = -E2BIG;
+		} else {
+			l_new = kmalloc(htab->elem_size,
+					GFP_ATOMIC | __GFP_NOWARN);
+			if (!l_new)
+				return ERR_PTR(-ENOMEM);
 		}
-		l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
-		if (!l_new)
-			return ERR_PTR(-ENOMEM);
+	}
+
+	if (err) {
+		if (!old_elem_exists)
+			return ERR_PTR(err);
+
+		/* if we're updating the existing element and the hash table
+		 * is full, use per-cpu extra elems
+		 */
+		l_new = this_cpu_ptr(htab->extra_elems);
+		if (l_new->state != HTAB_EXTRA_ELEM_FREE)
+			return ERR_PTR(-E2BIG);
+		l_new->state = HTAB_EXTRA_ELEM_USED;
+	} else {
+		l_new->state = HTAB_NOT_AN_EXTRA_ELEM;
 	}
 
 	memcpy(l_new->key, key, key_size);
@@ -489,7 +549,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	if (ret)
 		goto err;
 
-	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false);
+	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
+				!!l_old);
 	if (IS_ERR(l_new)) {
 		/* all pre-allocated elements are in use or memory exhausted */
 		ret = PTR_ERR(l_new);
@@ -563,7 +624,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 		}
 	} else {
 		l_new = alloc_htab_elem(htab, key, value, key_size,
-					hash, true, onallcpus);
+					hash, true, onallcpus, false);
 		if (IS_ERR(l_new)) {
 			ret = PTR_ERR(l_new);
 			goto err;
@@ -652,6 +713,7 @@ static void htab_map_free(struct bpf_map *map)
 		htab_free_elems(htab);
 		pcpu_freelist_destroy(&htab->freelist);
 	}
+	free_percpu(htab->extra_elems);
 	kvfree(htab->buckets);
 	kfree(htab);
 }
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f72f23b8fdab..daea765d72e6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -194,6 +194,7 @@ struct verifier_env {
 	struct verifier_state_list **explored_states; /* search pruning optimization */
 	struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
 	u32 used_map_cnt;		/* number of used maps */
+	u32 id_gen;			/* used to generate unique reg IDs */
 	bool allow_ptr_leaks;
 };
 
@@ -1052,7 +1053,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 			goto error;
 		break;
 	case BPF_MAP_TYPE_CGROUP_ARRAY:
-		if (func_id != BPF_FUNC_skb_in_cgroup)
+		if (func_id != BPF_FUNC_skb_under_cgroup)
 			goto error;
 		break;
 	default:
@@ -1074,7 +1075,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 		if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
 			goto error;
 		break;
-	case BPF_FUNC_skb_in_cgroup:
+	case BPF_FUNC_skb_under_cgroup:
 		if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
 			goto error;
 		break;
@@ -1301,7 +1302,7 @@ add_imm:
 		/* dst_reg stays as pkt_ptr type and since some positive
 		 * integer value was added to the pointer, increment its 'id'
 		 */
-		dst_reg->id++;
+		dst_reg->id = ++env->id_gen;
 
 		/* something was added to pkt_ptr, set range and off to zero */
 		dst_reg->off = 0;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index a19550d80ab1..3cfabdf7b942 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -242,18 +242,6 @@ unlock:
 	return ret;
 }
 
-static void event_function_local(struct perf_event *event, event_f func, void *data)
-{
-	struct event_function_struct efs = {
-		.event = event,
-		.func = func,
-		.data = data,
-	};
-
-	int ret = event_function(&efs);
-	WARN_ON_ONCE(ret);
-}
-
 static void event_function_call(struct perf_event *event, event_f func, void *data)
 {
 	struct perf_event_context *ctx = event->ctx;
@@ -303,6 +291,54 @@ again:
 	raw_spin_unlock_irq(&ctx->lock);
 }
 
+/*
+ * Similar to event_function_call() + event_function(), but hard assumes IRQs
+ * are already disabled and we're on the right CPU.
+ */
+static void event_function_local(struct perf_event *event, event_f func, void *data)
+{
+	struct perf_event_context *ctx = event->ctx;
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+	struct task_struct *task = READ_ONCE(ctx->task);
+	struct perf_event_context *task_ctx = NULL;
+
+	WARN_ON_ONCE(!irqs_disabled());
+
+	if (task) {
+		if (task == TASK_TOMBSTONE)
+			return;
+
+		task_ctx = ctx;
+	}
+
+	perf_ctx_lock(cpuctx, task_ctx);
+
+	task = ctx->task;
+	if (task == TASK_TOMBSTONE)
+		goto unlock;
+
+	if (task) {
+		/*
+		 * We must be either inactive or active and the right task,
+		 * otherwise we're screwed, since we cannot IPI to somewhere
+		 * else.
+		 */
+		if (ctx->is_active) {
+			if (WARN_ON_ONCE(task != current))
+				goto unlock;
+
+			if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
+				goto unlock;
+		}
+	} else {
+		WARN_ON_ONCE(&cpuctx->ctx != ctx);
+	}
+
+	func(event, cpuctx, ctx, data);
+unlock:
+	perf_ctx_unlock(cpuctx, task_ctx);
+}
+
 #define PERF_FLAG_ALL	(PERF_FLAG_FD_NO_GROUP |\
 			 PERF_FLAG_FD_OUTPUT  |\
 			 PERF_FLAG_PID_CGROUP |\
@@ -843,6 +879,32 @@ perf_cgroup_mark_enabled(struct perf_event *event,
 		}
 	}
 }
+
+/*
+ * Update cpuctx->cgrp so that it is set when first cgroup event is added and
+ * cleared when last cgroup event is removed.
+ */
+static inline void
+list_update_cgroup_event(struct perf_event *event,
+			 struct perf_event_context *ctx, bool add)
+{
+	struct perf_cpu_context *cpuctx;
+
+	if (!is_cgroup_event(event))
+		return;
+
+	if (add && ctx->nr_cgroups++)
+		return;
+	else if (!add && --ctx->nr_cgroups)
+		return;
+	/*
+	 * Because cgroup events are always per-cpu events,
+	 * this will always be called from the right CPU.
+	 */
+	cpuctx = __get_cpu_context(ctx);
+	cpuctx->cgrp = add ? event->cgrp : NULL;
+}
+
 #else /* !CONFIG_CGROUP_PERF */
 
 static inline bool
@@ -920,6 +982,13 @@ perf_cgroup_mark_enabled(struct perf_event *event,
 			 struct perf_event_context *ctx)
 {
 }
+
+static inline void
+list_update_cgroup_event(struct perf_event *event,
+			 struct perf_event_context *ctx, bool add)
+{
+}
+
 #endif
 
 /*
@@ -1392,6 +1461,7 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
 static void
 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 {
+
 	lockdep_assert_held(&ctx->lock);
 
 	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
@@ -1412,8 +1482,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 		list_add_tail(&event->group_entry, list);
 	}
 
-	if (is_cgroup_event(event))
-		ctx->nr_cgroups++;
+	list_update_cgroup_event(event, ctx, true);
 
 	list_add_rcu(&event->event_entry, &ctx->event_list);
 	ctx->nr_events++;
@@ -1581,8 +1650,6 @@ static void perf_group_attach(struct perf_event *event)
 static void
 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 {
-	struct perf_cpu_context *cpuctx;
-
 	WARN_ON_ONCE(event->ctx != ctx);
 	lockdep_assert_held(&ctx->lock);
 
@@ -1594,20 +1661,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 
 	event->attach_state &= ~PERF_ATTACH_CONTEXT;
 
-	if (is_cgroup_event(event)) {
-		ctx->nr_cgroups--;
-		/*
-		 * Because cgroup events are always per-cpu events, this will
-		 * always be called from the right CPU.
-		 */
-		cpuctx = __get_cpu_context(ctx);
-		/*
-		 * If there are no more cgroup events then clear cgrp to avoid
-		 * stale pointer in update_cgrp_time_from_cpuctx().
-		 */
-		if (!ctx->nr_cgroups)
-			cpuctx->cgrp = NULL;
-	}
+	list_update_cgroup_event(event, ctx, false);
 
 	ctx->nr_events--;
 	if (event->attr.inherit_stat)
@@ -1716,8 +1770,8 @@ static inline int pmu_filter_match(struct perf_event *event)
 static inline int
 event_filter_match(struct perf_event *event)
 {
-	return (event->cpu == -1 || event->cpu == smp_processor_id())
-	    && perf_cgroup_match(event) && pmu_filter_match(event);
+	return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
+	       perf_cgroup_match(event) && pmu_filter_match(event);
 }
 
 static void
@@ -1737,8 +1791,8 @@ event_sched_out(struct perf_event *event,
 	 * maintained, otherwise bogus information is return
 	 * via read() for time_enabled, time_running:
 	 */
-	if (event->state == PERF_EVENT_STATE_INACTIVE
-	    && !event_filter_match(event)) {
+	if (event->state == PERF_EVENT_STATE_INACTIVE &&
+	    !event_filter_match(event)) {
 		delta = tstamp - event->tstamp_stopped;
 		event->tstamp_running += delta;
 		event->tstamp_stopped = tstamp;
@@ -2236,10 +2290,15 @@ perf_install_in_context(struct perf_event_context *ctx,
 
 	lockdep_assert_held(&ctx->mutex);
 
-	event->ctx = ctx;
 	if (event->cpu != -1)
 		event->cpu = cpu;
 
+	/*
+	 * Ensures that if we can observe event->ctx, both the event and ctx
+	 * will be 'complete'. See perf_iterate_sb_cpu().
+	 */
+	smp_store_release(&event->ctx, ctx);
+
 	if (!task) {
 		cpu_function_call(cpu, __perf_install_in_context, event);
 		return;
@@ -3490,9 +3549,10 @@ static int perf_event_read(struct perf_event *event, bool group)
 			.group = group,
 			.ret = 0,
 		};
-		smp_call_function_single(event->oncpu,
-					 __perf_event_read, &data, 1);
-		ret = data.ret;
+		ret = smp_call_function_single(event->oncpu, __perf_event_read, &data, 1);
+		/* The event must have been read from an online CPU: */
+		WARN_ON_ONCE(ret);
+		ret = ret ? : data.ret;
 	} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
 		struct perf_event_context *ctx = event->ctx;
 		unsigned long flags;
@@ -5969,6 +6029,14 @@ static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
 	struct perf_event *event;
 
 	list_for_each_entry_rcu(event, &pel->list, sb_list) {
+		/*
+		 * Skip events that are not fully formed yet; ensure that
+		 * if we observe event->ctx, both event and ctx will be
+		 * complete enough. See perf_install_in_context().
+		 */
+		if (!smp_load_acquire(&event->ctx))
+			continue;
+
 		if (event->state < PERF_EVENT_STATE_INACTIVE)
 			continue;
 		if (!event_filter_match(event))
@@ -6098,7 +6166,7 @@ static int __perf_pmu_output_stop(void *info)
 {
 	struct perf_event *event = info;
 	struct pmu *pmu = event->pmu;
-	struct perf_cpu_context *cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 	struct remote_output ro = {
 		.rb	= event->rb,
 	};
@@ -6553,15 +6621,6 @@ got_name:
 }
 
 /*
- * Whether this @filter depends on a dynamic object which is not loaded
- * yet or its load addresses are not known.
- */
-static bool perf_addr_filter_needs_mmap(struct perf_addr_filter *filter)
-{
-	return filter->filter && filter->inode;
-}
-
-/*
  * Check whether inode and address range match filter criteria.
  */
 static bool perf_addr_filter_match(struct perf_addr_filter *filter,
@@ -6622,6 +6681,13 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma)
 	struct perf_event_context *ctx;
 	int ctxn;
 
+	/*
+	 * Data tracing isn't supported yet and as such there is no need
+	 * to keep track of anything that isn't related to executable code:
+	 */
+	if (!(vma->vm_flags & VM_EXEC))
+		return;
+
 	rcu_read_lock();
 	for_each_task_context_nr(ctxn) {
 		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
@@ -7774,7 +7840,11 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
 	list_for_each_entry(filter, &ifh->list, entry) {
 		event->addr_filters_offs[count] = 0;
 
-		if (perf_addr_filter_needs_mmap(filter))
+		/*
+		 * Adjust base offset if the filter is associated to a binary
+		 * that needs to be mapped:
+		 */
+		if (filter->inode)
 			event->addr_filters_offs[count] =
 				perf_addr_filter_apply(filter, mm);
 
@@ -7905,8 +7975,10 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
 			goto fail;
 		}
 
-		if (token == IF_SRC_FILE) {
-			filename = match_strdup(&args[2]);
+		if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
+			int fpos = filter->range ? 2 : 1;
+
+			filename = match_strdup(&args[fpos]);
 			if (!filename) {
 				ret = -ENOMEM;
 				goto fail;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index b7a525ab2083..8c50276b60d1 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -172,8 +172,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 	err = -EAGAIN;
 	ptep = page_check_address(page, mm, addr, &ptl, 0);
-	if (!ptep)
+	if (!ptep) {
+		mem_cgroup_cancel_charge(kpage, memcg, false);
 		goto unlock;
+	}
 
 	get_page(kpage);
 	page_add_new_anon_rmap(kpage, vma, addr, false);
@@ -200,7 +202,6 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 
 	err = 0;
  unlock:
-	mem_cgroup_cancel_charge(kpage, memcg, false);
 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 	unlock_page(page);
 	return err;
diff --git a/kernel/futex.c b/kernel/futex.c
index 33664f70e2d2..46cb3a301bc1 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -179,7 +179,15 @@ int __read_mostly futex_cmpxchg_enabled;
  * Futex flags used to encode options to functions and preserve them across
  * restarts.
  */
-#define FLAGS_SHARED		0x01
+#ifdef CONFIG_MMU
+# define FLAGS_SHARED		0x01
+#else
+/*
+ * NOMMU does not have per process address space. Let the compiler optimize
+ * code away.
+ */
+# define FLAGS_SHARED		0x00
+#endif
 #define FLAGS_CLOCKRT		0x02
 #define FLAGS_HAS_TIMEOUT	0x04
 
@@ -405,6 +413,16 @@ static void get_futex_key_refs(union futex_key *key)
 	if (!key->both.ptr)
 		return;
 
+	/*
+	 * On MMU less systems futexes are always "private" as there is no per
+	 * process address space. We need the smp wmb nevertheless - yes,
+	 * arch/blackfin has MMU less SMP ...
+	 */
+	if (!IS_ENABLED(CONFIG_MMU)) {
+		smp_mb(); /* explicit smp_mb(); (B) */
+		return;
+	}
+
 	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
 	case FUT_OFF_INODE:
 		ihold(key->shared.inode); /* implies smp_mb(); (B) */
@@ -436,6 +454,9 @@ static void drop_futex_key_refs(union futex_key *key)
 		return;
 	}
 
+	if (!IS_ENABLED(CONFIG_MMU))
+		return;
+
 	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
 	case FUT_OFF_INODE:
 		iput(key->shared.inode);
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index f68959341c0f..32f6cfcff212 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -39,6 +39,7 @@ struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs)
 		return NULL;
 	}
 
+	get_online_cpus();
 	if (max_vecs >= num_online_cpus()) {
 		cpumask_copy(affinity_mask, cpu_online_mask);
 		*nr_vecs = num_online_cpus();
@@ -56,6 +57,7 @@ struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs)
 		}
 		*nr_vecs = vecs;
 	}
+	put_online_cpus();
 
 	return affinity_mask;
 }
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index b4c1bc7c9ca2..637389088b3f 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -820,6 +820,17 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
 	desc->name = name;
 
 	if (handle != handle_bad_irq && is_chained) {
+		/*
+		 * We're about to start this interrupt immediately,
+		 * hence the need to set the trigger configuration.
+		 * But the .set_type callback may have overridden the
+		 * flow handler, ignoring that we're dealing with a
+		 * chained interrupt. Reset it immediately because we
+		 * do know better.
+		 */
+		__irq_set_trigger(desc, irqd_get_trigger_type(&desc->irq_data));
+		desc->handle_irq = handle;
+
 		irq_settings_set_noprobe(desc);
 		irq_settings_set_norequest(desc);
 		irq_settings_set_nothread(desc);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 73a2b786b5e9..9530fcd27704 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1681,8 +1681,10 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
 	action->dev_id = dev_id;
 
 	retval = irq_chip_pm_get(&desc->irq_data);
-	if (retval < 0)
+	if (retval < 0) {
+		kfree(action);
 		return retval;
+	}
 
 	chip_bus_lock(desc);
 	retval = __setup_irq(irq, desc, action);
@@ -1985,8 +1987,10 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
 	action->percpu_dev_id = dev_id;
 
 	retval = irq_chip_pm_get(&desc->irq_data);
-	if (retval < 0)
+	if (retval < 0) {
+		kfree(action);
 		return retval;
+	}
 
 	chip_bus_lock(desc);
 	retval = __setup_irq(irq, desc, action);
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 54999350162c..19e9dfbe97fa 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -359,6 +359,17 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
 		else
 			dev_dbg(dev, "irq [%d-%d] for MSI\n",
 				virq, virq + desc->nvec_used - 1);
+		/*
+		 * This flag is set by the PCI layer as we need to activate
+		 * the MSI entries before the PCI layer enables MSI in the
+		 * card. Otherwise the card latches a random msi message.
+		 */
+		if (info->flags & MSI_FLAG_ACTIVATE_EARLY) {
+			struct irq_data *irq_data;
+
+			irq_data = irq_domain_get_irq_data(domain, desc->irq);
+			irq_domain_activate_irq(irq_data);
+		}
 	}
 
 	return 0;
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 37649e69056c..8a99abf58080 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -450,7 +450,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
 				goto gotlock;
 			}
 		}
-		WRITE_ONCE(pn->state, vcpu_halted);
+		WRITE_ONCE(pn->state, vcpu_hashed);
 		qstat_inc(qstat_pv_wait_head, true);
 		qstat_inc(qstat_pv_wait_again, waitcnt);
 		pv_wait(&l->locked, _Q_SLOW_VAL);
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index 22e025309845..b9d031516254 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -153,7 +153,6 @@ static ssize_t qstat_read(struct file *file, char __user *user_buf,
 	 */
 	if ((counter == qstat_pv_latency_kick) ||
 	    (counter == qstat_pv_latency_wake)) {
-		stat = 0;
 		if (kicks)
 			stat = DIV_ROUND_CLOSEST_ULL(stat, kicks);
 	}
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index a881c6a7ba74..33c79b6105c5 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -300,12 +300,12 @@ static int create_image(int platform_mode)
 	save_processor_state();
 	trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true);
 	error = swsusp_arch_suspend();
+	/* Restore control flow magically appears here */
+	restore_processor_state();
 	trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false);
 	if (error)
 		printk(KERN_ERR "PM: Error %d creating hibernation image\n",
 			error);
-	/* Restore control flow magically appears here */
-	restore_processor_state();
 	if (!in_suspend)
 		events_check_enabled = false;
 
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 9a0178c2ac1d..b02228411d57 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -835,9 +835,9 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
  */
 static bool rtree_next_node(struct memory_bitmap *bm)
 {
-	bm->cur.node = list_entry(bm->cur.node->list.next,
-				  struct rtree_node, list);
-	if (&bm->cur.node->list != &bm->cur.zone->leaves) {
+	if (!list_is_last(&bm->cur.node->list, &bm->cur.zone->leaves)) {
+		bm->cur.node = list_entry(bm->cur.node->list.next,
+					  struct rtree_node, list);
 		bm->cur.node_pfn += BM_BITS_PER_BLOCK;
 		bm->cur.node_bit = 0;
 		touch_softlockup_watchdog();
@@ -845,9 +845,9 @@ static bool rtree_next_node(struct memory_bitmap *bm)
 	}
 
 	/* No more nodes, goto next zone */
-	bm->cur.zone = list_entry(bm->cur.zone->list.next,
+	if (!list_is_last(&bm->cur.zone->list, &bm->zones)) {
+		bm->cur.zone = list_entry(bm->cur.zone->list.next,
 				  struct mem_zone_bm_rtree, list);
-	if (&bm->cur.zone->list != &bm->zones) {
 		bm->cur.node = list_entry(bm->cur.zone->leaves.next,
 					  struct rtree_node, list);
 		bm->cur.node_pfn = 0;
diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c
index 276762f3a460..d5760c42f042 100644
--- a/kernel/printk/braille.c
+++ b/kernel/printk/braille.c
@@ -9,10 +9,10 @@
 
 char *_braille_console_setup(char **str, char **brl_options)
 {
-	if (!memcmp(*str, "brl,", 4)) {
+	if (!strncmp(*str, "brl,", 4)) {
 		*brl_options = "";
 		*str += 4;
-	} else if (!memcmp(str, "brl=", 4)) {
+	} else if (!strncmp(*str, "brl=", 4)) {
 		*brl_options = *str + 4;
 		*str = strchr(*brl_options, ',');
 		if (!*str)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5c883fe8e440..2a906f20fba7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -74,6 +74,7 @@
 #include <linux/context_tracking.h>
 #include <linux/compiler.h>
 #include <linux/frame.h>
+#include <linux/prefetch.h>
 
 #include <asm/switch_to.h>
 #include <asm/tlb.h>
@@ -2972,6 +2973,23 @@ EXPORT_PER_CPU_SYMBOL(kstat);
 EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
 
 /*
+ * The function fair_sched_class.update_curr accesses the struct curr
+ * and its field curr->exec_start; when called from task_sched_runtime(),
+ * we observe a high rate of cache misses in practice.
+ * Prefetching this data results in improved performance.
+ */
+static inline void prefetch_curr_exec_start(struct task_struct *p)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	struct sched_entity *curr = (&p->se)->cfs_rq->curr;
+#else
+	struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
+#endif
+	prefetch(curr);
+	prefetch(&curr->exec_start);
+}
+
+/*
  * Return accounted runtime for the task.
  * In case the task is currently running, return the runtime plus current's
  * pending runtime that have not been accounted yet.
@@ -3005,6 +3023,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 	 * thread, breaking clock_gettime().
 	 */
 	if (task_current(rq, p) && task_on_rq_queued(p)) {
+		prefetch_curr_exec_start(p);
 		update_rq_clock(rq);
 		p->sched_class->update_curr(rq);
 	}
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5be58820465c..d4184498c9f5 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -168,7 +168,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
 
 	if (old_idx == IDX_INVALID) {
 		cp->size++;
-		cp->elements[cp->size - 1].dl = 0;
+		cp->elements[cp->size - 1].dl = dl;
 		cp->elements[cp->size - 1].cpu = cpu;
 		cp->elements[cpu].idx = cp->size - 1;
 		cpudl_change_key(cp, cp->size - 1, dl);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 1934f658c036..a846cf89eb96 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -263,6 +263,11 @@ void account_idle_time(cputime_t cputime)
 	cpustat[CPUTIME_IDLE] += (__force u64) cputime;
 }
 
+/*
+ * When a guest is interrupted for a longer amount of time, missed clock
+ * ticks are not redelivered later. Due to that, this function may on
+ * occasion account more time than the calling functions think elapsed.
+ */
 static __always_inline cputime_t steal_account_process_time(cputime_t maxtime)
 {
 #ifdef CONFIG_PARAVIRT
@@ -371,7 +376,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 	 * idle, or potentially user or system time. Due to rounding,
 	 * other time can exceed ticks occasionally.
 	 */
-	other = account_other_time(cputime);
+	other = account_other_time(ULONG_MAX);
 	if (other >= cputime)
 		return;
 	cputime -= other;
@@ -486,7 +491,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
 	}
 
 	cputime = cputime_one_jiffy;
-	steal = steal_account_process_time(cputime);
+	steal = steal_account_process_time(ULONG_MAX);
 
 	if (steal >= cputime)
 		return;
@@ -508,13 +513,21 @@ void account_process_tick(struct task_struct *p, int user_tick)
  */
 void account_idle_ticks(unsigned long ticks)
 {
+	cputime_t cputime, steal;
 
 	if (sched_clock_irqtime) {
 		irqtime_account_idle_ticks(ticks);
 		return;
 	}
 
-	account_idle_time(jiffies_to_cputime(ticks));
+	cputime = jiffies_to_cputime(ticks);
+	steal = steal_account_process_time(ULONG_MAX);
+
+	if (steal >= cputime)
+		return;
+
+	cputime -= steal;
+	account_idle_time(cputime);
 }
 
 /*
@@ -606,19 +619,25 @@ static void cputime_adjust(struct task_cputime *curr,
 	stime = curr->stime;
 	utime = curr->utime;
 
-	if (utime == 0) {
-		stime = rtime;
+	/*
+	 * If either stime or both stime and utime are 0, assume all runtime is
+	 * userspace. Once a task gets some ticks, the monotonicy code at
+	 * 'update' will ensure things converge to the observed ratio.
+	 */
+	if (stime == 0) {
+		utime = rtime;
 		goto update;
 	}
 
-	if (stime == 0) {
-		utime = rtime;
+	if (utime == 0) {
+		stime = rtime;
 		goto update;
 	}
 
 	stime = scale_stime((__force u64)stime, (__force u64)rtime,
 			    (__force u64)(stime + utime));
 
+update:
 	/*
 	 * Make sure stime doesn't go backwards; this preserves monotonicity
 	 * for utime because rtime is monotonic.
@@ -641,7 +660,6 @@ static void cputime_adjust(struct task_cputime *curr,
 		stime = rtime - utime;
 	}
 
-update:
 	prev->stime = stime;
 	prev->utime = utime;
 out:
@@ -686,6 +704,13 @@ static cputime_t get_vtime_delta(struct task_struct *tsk)
 	unsigned long now = READ_ONCE(jiffies);
 	cputime_t delta, other;
 
+	/*
+	 * Unlike tick based timing, vtime based timing never has lost
+	 * ticks, and no need for steal time accounting to make up for
+	 * lost ticks. Vtime accounts a rounded version of actual
+	 * elapsed time. Limit account_other_time to prevent rounding
+	 * errors from causing elapsed vtime to go negative.
+	 */
 	delta = jiffies_to_cputime(now - tsk->vtime_snap);
 	other = account_other_time(delta);
 	WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fcb7f0217ff4..1ce8867283dc 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -658,8 +658,11 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 	 *
 	 * XXX figure out if select_task_rq_dl() deals with offline cpus.
 	 */
-	if (unlikely(!rq->online))
+	if (unlikely(!rq->online)) {
+		lockdep_unpin_lock(&rq->lock, rf.cookie);
 		rq = dl_task_offline_migration(rq, p);
+		rf.cookie = lockdep_pin_lock(&rq->lock);
+	}
 
 	/*
 	 * Queueing this task back might have overloaded rq, check if we need
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4088eedea763..039de34f1521 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4269,7 +4269,7 @@ static void sync_throttle(struct task_group *tg, int cpu)
 	pcfs_rq = tg->parent->cfs_rq[cpu];
 
 	cfs_rq->throttle_count = pcfs_rq->throttle_count;
-	pcfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
+	cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
 }
 
 /* conditionally throttle active cfs_rq's from put_prev_entity() */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b43d0b27c1fe..a13bbdaab47d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2140,6 +2140,21 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
 	return 0;
 }
 
+static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp,
+				 int *valp,
+				 int write, void *data)
+{
+	if (write) {
+		if (*negp)
+			return -EINVAL;
+		*valp = *lvalp;
+	} else {
+		unsigned int val = *valp;
+		*lvalp = (unsigned long)val;
+	}
+	return 0;
+}
+
 static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
 
 static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
@@ -2259,8 +2274,27 @@ static int do_proc_dointvec(struct ctl_table *table, int write,
 int proc_dointvec(struct ctl_table *table, int write,
 		     void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-    return do_proc_dointvec(table,write,buffer,lenp,ppos,
-		    	    NULL,NULL);
+	return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL);
+}
+
+/**
+ * proc_douintvec - read a vector of unsigned integers
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * Returns 0 on success.
+ */
+int proc_douintvec(struct ctl_table *table, int write,
+		   void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return do_proc_dointvec(table, write, buffer, lenp, ppos,
+				do_proc_douintvec_conv, NULL);
 }
 
 /*
@@ -2858,6 +2892,12 @@ int proc_dointvec(struct ctl_table *table, int write,
 	return -ENOSYS;
 }
 
+int proc_douintvec(struct ctl_table *table, int write,
+		  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+
 int proc_dointvec_minmax(struct ctl_table *table, int write,
 		    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -2903,6 +2943,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
  * exception granted :-)
  */
 EXPORT_SYMBOL(proc_dointvec);
+EXPORT_SYMBOL(proc_douintvec);
 EXPORT_SYMBOL(proc_dointvec_jiffies);
 EXPORT_SYMBOL(proc_dointvec_minmax);
 EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 3b65746c7f15..e07fb093f819 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -401,7 +401,10 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
 	do {
 		seq = raw_read_seqcount_latch(&tkf->seq);
 		tkr = tkf->base + (seq & 0x01);
-		now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr);
+		now = ktime_to_ns(tkr->base);
+
+		now += clocksource_delta(tkr->read(tkr->clock),
+					 tkr->cycle_last, tkr->mask);
 	} while (read_seqcount_retry(&tkf->seq, seq));
 
 	return now;
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index f6bd65236712..107310a6f36f 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -23,7 +23,9 @@
 
 #include "timekeeping_internal.h"
 
-static unsigned int sleep_time_bin[32] = {0};
+#define NUM_BINS 32
+
+static unsigned int sleep_time_bin[NUM_BINS] = {0};
 
 static int tk_debug_show_sleep_time(struct seq_file *s, void *data)
 {
@@ -69,6 +71,9 @@ late_initcall(tk_debug_sleep_time_init);
 
 void tk_debug_account_sleep_time(struct timespec64 *t)
 {
-	sleep_time_bin[fls(t->tv_sec)]++;
+	/* Cap bin index so we don't overflow the array */
+	int bin = min(fls(t->tv_sec), NUM_BINS-1);
+
+	sleep_time_bin[bin]++;
 }
 
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 555670a5143c..32bf6f75a8fe 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1496,6 +1496,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
 	u64 expires = KTIME_MAX;
 	unsigned long nextevt;
+	bool is_max_delta;
 
 	/*
 	 * Pretend that there is no timer pending if the cpu is offline.
@@ -1506,6 +1507,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 
 	spin_lock(&base->lock);
 	nextevt = __next_timer_interrupt(base);
+	is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
 	base->next_expiry = nextevt;
 	/*
 	 * We have a fresh next event. Check whether we can forward the base:
@@ -1519,7 +1521,8 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 		expires = basem;
 		base->is_idle = false;
 	} else {
-		expires = basem + (nextevt - basej) * TICK_NSEC;
+		if (!is_max_delta)
+			expires = basem + (nextevt - basej) * TICK_NSEC;
 		/*
 		 * If we expect to sleep more than a tick, mark the base idle:
 		 */
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 7598e6ca817a..dbafc5df03f3 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -223,7 +223,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	what |= MASK_TC_BIT(op_flags, META);
 	what |= MASK_TC_BIT(op_flags, PREFLUSH);
 	what |= MASK_TC_BIT(op_flags, FUA);
-	if (op == REQ_OP_DISCARD)
+	if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)
 		what |= BLK_TC_ACT(BLK_TC_DISCARD);
 	if (op == REQ_OP_FLUSH)
 		what |= BLK_TC_ACT(BLK_TC_FLUSH);