author		Thomas Gleixner <tglx@linutronix.de>	2016-09-26 15:47:03 -0400
committer	Thomas Gleixner <tglx@linutronix.de>	2016-09-26 15:47:03 -0400
commit		1e1b37273cf719545da50b76f214f983a710aaf4 (patch)
tree		033f6062325ef7aaeefe8559bb409ab7d2be3c76 /kernel
parent		c183a603e8d8a5a189729b77d0c623a3d5950e5f (diff)
parent		c291b015158577be533dd5a959dfc09bab119eed (diff)
Merge branch 'x86/urgent' into x86/apic
Bring in the upstream modifications so we can fix up the silent merge conflict introduced by this merge.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/audit_watch.c            |   8
-rw-r--r--	kernel/bpf/hashtab.c            |  84
-rw-r--r--	kernel/bpf/verifier.c           |   7
-rw-r--r--	kernel/configs/tiny.config      |   8
-rw-r--r--	kernel/cpuset.c                 |  15
-rw-r--r--	kernel/events/core.c            |  97
-rw-r--r--	kernel/events/uprobes.c         |   5
-rw-r--r--	kernel/exit.c                   |   7
-rw-r--r--	kernel/fork.c                   |  37
-rw-r--r--	kernel/irq/affinity.c           |   2
-rw-r--r--	kernel/irq/chip.c               |  11
-rw-r--r--	kernel/irq/manage.c             |   8
-rw-r--r--	kernel/kexec_file.c             |   3
-rw-r--r--	kernel/power/snapshot.c         |  10
-rw-r--r--	kernel/printk/braille.c         |   4
-rw-r--r--	kernel/printk/nmi.c             |  38
-rw-r--r--	kernel/sched/cputime.c          |  33
-rw-r--r--	kernel/seccomp.c                |  12
-rw-r--r--	kernel/sysctl.c                 |  45
-rw-r--r--	kernel/time/tick-sched.c        |   3
-rw-r--r--	kernel/time/timekeeping.c       |   5
-rw-r--r--	kernel/time/timekeeping_debug.c |   9
-rw-r--r--	kernel/trace/blktrace.c         |   2
23 files changed, 349 insertions, 104 deletions
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index d6709eb70970..0d302a87f21b 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -19,6 +19,7 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
+#include <linux/file.h>
 #include <linux/kernel.h>
 #include <linux/audit.h>
 #include <linux/kthread.h>
@@ -544,10 +545,11 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark)
 	unsigned long ino;
 	dev_t dev;
 
-	rcu_read_lock();
-	exe_file = rcu_dereference(tsk->mm->exe_file);
+	exe_file = get_task_exe_file(tsk);
+	if (!exe_file)
+		return 0;
 	ino = exe_file->f_inode->i_ino;
 	dev = exe_file->f_inode->i_sb->s_dev;
-	rcu_read_unlock();
+	fput(exe_file);
 	return audit_mark_compare(mark, ino, dev);
 }
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index fff3650d52fc..570eeca7bdfa 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -26,11 +26,18 @@ struct bpf_htab {
 	struct bucket *buckets;
 	void *elems;
 	struct pcpu_freelist freelist;
+	void __percpu *extra_elems;
 	atomic_t count;	/* number of elements in this hashtable */
 	u32 n_buckets;	/* number of hash buckets */
 	u32 elem_size;	/* size of each element in bytes */
 };
 
+enum extra_elem_state {
+	HTAB_NOT_AN_EXTRA_ELEM = 0,
+	HTAB_EXTRA_ELEM_FREE,
+	HTAB_EXTRA_ELEM_USED
+};
+
 /* each htab element is struct htab_elem + key + value */
 struct htab_elem {
 	union {
@@ -38,7 +45,10 @@ struct htab_elem {
 		struct bpf_htab *htab;
 		struct pcpu_freelist_node fnode;
 	};
-	struct rcu_head rcu;
+	union {
+		struct rcu_head rcu;
+		enum extra_elem_state state;
+	};
 	u32 hash;
 	char key[0] __aligned(8);
 };
@@ -113,6 +123,23 @@ free_elems:
 	return err;
 }
 
+static int alloc_extra_elems(struct bpf_htab *htab)
+{
+	void __percpu *pptr;
+	int cpu;
+
+	pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN);
+	if (!pptr)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state =
+			HTAB_EXTRA_ELEM_FREE;
+	}
+	htab->extra_elems = pptr;
+	return 0;
+}
+
 /* Called from syscall */
 static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 {
@@ -185,6 +212,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	if (percpu)
 		cost += (u64) round_up(htab->map.value_size, 8) *
 			num_possible_cpus() * htab->map.max_entries;
+	else
+		cost += (u64) htab->elem_size * num_possible_cpus();
 
 	if (cost >= U32_MAX - PAGE_SIZE)
 		/* make sure page count doesn't overflow */
@@ -212,14 +241,22 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		raw_spin_lock_init(&htab->buckets[i].lock);
 	}
 
+	if (!percpu) {
+		err = alloc_extra_elems(htab);
+		if (err)
+			goto free_buckets;
+	}
+
 	if (!(attr->map_flags & BPF_F_NO_PREALLOC)) {
 		err = prealloc_elems_and_freelist(htab);
 		if (err)
-			goto free_buckets;
+			goto free_extra_elems;
 	}
 
 	return &htab->map;
 
+free_extra_elems:
+	free_percpu(htab->extra_elems);
 free_buckets:
 	kvfree(htab->buckets);
 free_htab:
@@ -349,7 +386,6 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
 	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
 		free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
 	kfree(l);
-
 }
 
 static void htab_elem_free_rcu(struct rcu_head *head)
@@ -370,6 +406,11 @@ static void htab_elem_free_rcu(struct rcu_head *head)
 
 static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 {
+	if (l->state == HTAB_EXTRA_ELEM_USED) {
+		l->state = HTAB_EXTRA_ELEM_FREE;
+		return;
+	}
+
 	if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) {
 		pcpu_freelist_push(&htab->freelist, &l->fnode);
 	} else {
@@ -381,25 +422,44 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 
 static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 					 void *value, u32 key_size, u32 hash,
-					 bool percpu, bool onallcpus)
+					 bool percpu, bool onallcpus,
+					 bool old_elem_exists)
 {
 	u32 size = htab->map.value_size;
 	bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC);
 	struct htab_elem *l_new;
 	void __percpu *pptr;
+	int err = 0;
 
 	if (prealloc) {
 		l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist);
 		if (!l_new)
-			return ERR_PTR(-E2BIG);
+			err = -E2BIG;
 	} else {
 		if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
 			atomic_dec(&htab->count);
-			return ERR_PTR(-E2BIG);
+			err = -E2BIG;
+		} else {
+			l_new = kmalloc(htab->elem_size,
+					GFP_ATOMIC | __GFP_NOWARN);
+			if (!l_new)
+				return ERR_PTR(-ENOMEM);
 		}
-		l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
-		if (!l_new)
-			return ERR_PTR(-ENOMEM);
+	}
+
+	if (err) {
+		if (!old_elem_exists)
+			return ERR_PTR(err);
+
+		/* if we're updating the existing element and the hash table
+		 * is full, use per-cpu extra elems
+		 */
+		l_new = this_cpu_ptr(htab->extra_elems);
+		if (l_new->state != HTAB_EXTRA_ELEM_FREE)
+			return ERR_PTR(-E2BIG);
+		l_new->state = HTAB_EXTRA_ELEM_USED;
+	} else {
+		l_new->state = HTAB_NOT_AN_EXTRA_ELEM;
 	}
 
 	memcpy(l_new->key, key, key_size);
@@ -489,7 +549,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	if (ret)
 		goto err;
 
-	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false);
+	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
+				!!l_old);
 	if (IS_ERR(l_new)) {
 		/* all pre-allocated elements are in use or memory exhausted */
 		ret = PTR_ERR(l_new);
@@ -563,7 +624,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 		}
 	} else {
 		l_new = alloc_htab_elem(htab, key, value, key_size,
-					hash, true, onallcpus);
+					hash, true, onallcpus, false);
 		if (IS_ERR(l_new)) {
 			ret = PTR_ERR(l_new);
 			goto err;
@@ -652,6 +713,7 @@ static void htab_map_free(struct bpf_map *map)
 		htab_free_elems(htab);
 		pcpu_freelist_destroy(&htab->freelist);
 	}
+	free_percpu(htab->extra_elems);
 	kvfree(htab->buckets);
 	kfree(htab);
 }
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f72f23b8fdab..daea765d72e6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -194,6 +194,7 @@ struct verifier_env {
 	struct verifier_state_list **explored_states; /* search pruning optimization */
 	struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
 	u32 used_map_cnt;		/* number of used maps */
+	u32 id_gen;			/* used to generate unique reg IDs */
 	bool allow_ptr_leaks;
 };
 
@@ -1052,7 +1053,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 			goto error;
 		break;
 	case BPF_MAP_TYPE_CGROUP_ARRAY:
-		if (func_id != BPF_FUNC_skb_in_cgroup)
+		if (func_id != BPF_FUNC_skb_under_cgroup)
 			goto error;
 		break;
 	default:
@@ -1074,7 +1075,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 		if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
 			goto error;
 		break;
-	case BPF_FUNC_skb_in_cgroup:
+	case BPF_FUNC_skb_under_cgroup:
 		if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
 			goto error;
 		break;
@@ -1301,7 +1302,7 @@ add_imm:
 		/* dst_reg stays as pkt_ptr type and since some positive
 		 * integer value was added to the pointer, increment its 'id'
 		 */
-		dst_reg->id++;
+		dst_reg->id = ++env->id_gen;
 
 		/* something was added to pkt_ptr, set range and off to zero */
 		dst_reg->off = 0;
diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config
index c2de56ab0fce..7fa0c4ae6394 100644
--- a/kernel/configs/tiny.config
+++ b/kernel/configs/tiny.config
@@ -1,4 +1,12 @@
+# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
+# CONFIG_KERNEL_GZIP is not set
+# CONFIG_KERNEL_BZIP2 is not set
+# CONFIG_KERNEL_LZMA is not set
 CONFIG_KERNEL_XZ=y
+# CONFIG_KERNEL_LZO is not set
+# CONFIG_KERNEL_LZ4 is not set
 CONFIG_OPTIMIZE_INLINING=y
+# CONFIG_SLAB is not set
+# CONFIG_SLUB is not set
 CONFIG_SLOB=y
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index c7fd2778ed50..c27e53326bef 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2069,6 +2069,20 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
 	mutex_unlock(&cpuset_mutex);
 }
 
+/*
+ * Make sure the new task conform to the current state of its parent,
+ * which could have been changed by cpuset just after it inherits the
+ * state from the parent and before it sits on the cgroup's task list.
+ */
+void cpuset_fork(struct task_struct *task)
+{
+	if (task_css_is_root(task, cpuset_cgrp_id))
+		return;
+
+	set_cpus_allowed_ptr(task, &current->cpus_allowed);
+	task->mems_allowed = current->mems_allowed;
+}
+
 struct cgroup_subsys cpuset_cgrp_subsys = {
 	.css_alloc	= cpuset_css_alloc,
 	.css_online	= cpuset_css_online,
@@ -2079,6 +2093,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
 	.attach		= cpuset_attach,
 	.post_attach	= cpuset_post_attach,
 	.bind		= cpuset_bind,
+	.fork		= cpuset_fork,
 	.legacy_cftypes	= files,
 	.early_init	= true,
 };
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1903b8f3a705..3cfabdf7b942 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -242,18 +242,6 @@ unlock:
 	return ret;
 }
 
-static void event_function_local(struct perf_event *event, event_f func, void *data)
-{
-	struct event_function_struct efs = {
-		.event = event,
-		.func = func,
-		.data = data,
-	};
-
-	int ret = event_function(&efs);
-	WARN_ON_ONCE(ret);
-}
-
 static void event_function_call(struct perf_event *event, event_f func, void *data)
 {
 	struct perf_event_context *ctx = event->ctx;
@@ -303,6 +291,54 @@ again:
 	raw_spin_unlock_irq(&ctx->lock);
 }
 
+/*
+ * Similar to event_function_call() + event_function(), but hard assumes IRQs
+ * are already disabled and we're on the right CPU.
+ */
+static void event_function_local(struct perf_event *event, event_f func, void *data)
+{
+	struct perf_event_context *ctx = event->ctx;
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+	struct task_struct *task = READ_ONCE(ctx->task);
+	struct perf_event_context *task_ctx = NULL;
+
+	WARN_ON_ONCE(!irqs_disabled());
+
+	if (task) {
+		if (task == TASK_TOMBSTONE)
+			return;
+
+		task_ctx = ctx;
+	}
+
+	perf_ctx_lock(cpuctx, task_ctx);
+
+	task = ctx->task;
+	if (task == TASK_TOMBSTONE)
+		goto unlock;
+
+	if (task) {
+		/*
+		 * We must be either inactive or active and the right task,
+		 * otherwise we're screwed, since we cannot IPI to somewhere
+		 * else.
+		 */
+		if (ctx->is_active) {
+			if (WARN_ON_ONCE(task != current))
+				goto unlock;
+
+			if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
+				goto unlock;
+		}
+	} else {
+		WARN_ON_ONCE(&cpuctx->ctx != ctx);
+	}
+
+	func(event, cpuctx, ctx, data);
+unlock:
+	perf_ctx_unlock(cpuctx, task_ctx);
+}
+
 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
 		       PERF_FLAG_FD_OUTPUT  |\
 		       PERF_FLAG_PID_CGROUP |\
@@ -3513,9 +3549,10 @@ static int perf_event_read(struct perf_event *event, bool group)
 			.group = group,
 			.ret = 0,
 		};
-		smp_call_function_single(event->oncpu,
-					 __perf_event_read, &data, 1);
-		ret = data.ret;
+		ret = smp_call_function_single(event->oncpu, __perf_event_read, &data, 1);
+		/* The event must have been read from an online CPU: */
+		WARN_ON_ONCE(ret);
+		ret = ret ? : data.ret;
 	} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
 		struct perf_event_context *ctx = event->ctx;
 		unsigned long flags;
@@ -6129,7 +6166,7 @@ static int __perf_pmu_output_stop(void *info)
 {
 	struct perf_event *event = info;
 	struct pmu *pmu = event->pmu;
-	struct perf_cpu_context *cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 	struct remote_output ro = {
 		.rb	= event->rb,
 	};
@@ -6584,15 +6621,6 @@ got_name:
 }
 
 /*
- * Whether this @filter depends on a dynamic object which is not loaded
- * yet or its load addresses are not known.
- */
-static bool perf_addr_filter_needs_mmap(struct perf_addr_filter *filter)
-{
-	return filter->filter && filter->inode;
-}
-
-/*
  * Check whether inode and address range match filter criteria.
  */
 static bool perf_addr_filter_match(struct perf_addr_filter *filter,
@@ -6653,6 +6681,13 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma)
 	struct perf_event_context *ctx;
 	int ctxn;
 
+	/*
+	 * Data tracing isn't supported yet and as such there is no need
+	 * to keep track of anything that isn't related to executable code:
+	 */
+	if (!(vma->vm_flags & VM_EXEC))
+		return;
+
 	rcu_read_lock();
 	for_each_task_context_nr(ctxn) {
 		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
@@ -7805,7 +7840,11 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
 	list_for_each_entry(filter, &ifh->list, entry) {
 		event->addr_filters_offs[count] = 0;
 
-		if (perf_addr_filter_needs_mmap(filter))
+		/*
+		 * Adjust base offset if the filter is associated to a binary
+		 * that needs to be mapped:
+		 */
+		if (filter->inode)
 			event->addr_filters_offs[count] =
 				perf_addr_filter_apply(filter, mm);
 
@@ -7936,8 +7975,10 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
 			goto fail;
 		}
 
-		if (token == IF_SRC_FILE) {
-			filename = match_strdup(&args[2]);
+		if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
+			int fpos = filter->range ? 2 : 1;
+
+			filename = match_strdup(&args[fpos]);
 			if (!filename) {
 				ret = -ENOMEM;
 				goto fail;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index b7a525ab2083..8c50276b60d1 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -172,8 +172,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 	err = -EAGAIN;
 	ptep = page_check_address(page, mm, addr, &ptl, 0);
-	if (!ptep)
+	if (!ptep) {
+		mem_cgroup_cancel_charge(kpage, memcg, false);
 		goto unlock;
+	}
 
 	get_page(kpage);
 	page_add_new_anon_rmap(kpage, vma, addr, false);
@@ -200,7 +202,6 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 
 	err = 0;
  unlock:
-	mem_cgroup_cancel_charge(kpage, memcg, false);
 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 	unlock_page(page);
 	return err;
diff --git a/kernel/exit.c b/kernel/exit.c
index 2f974ae042a6..091a78be3b09 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -848,12 +848,7 @@ void do_exit(long code)
 	TASKS_RCU(preempt_enable());
 	exit_notify(tsk, group_dead);
 	proc_exit_connector(tsk);
-#ifdef CONFIG_NUMA
-	task_lock(tsk);
-	mpol_put(tsk->mempolicy);
-	tsk->mempolicy = NULL;
-	task_unlock(tsk);
-#endif
+	mpol_put_task_policy(tsk);
 #ifdef CONFIG_FUTEX
 	if (unlikely(current->pi_state_cache))
 		kfree(current->pi_state_cache);
diff --git a/kernel/fork.c b/kernel/fork.c
index 52e725d4a866..beb31725f7e2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -799,6 +799,29 @@ struct file *get_mm_exe_file(struct mm_struct *mm)
 EXPORT_SYMBOL(get_mm_exe_file);
 
 /**
+ * get_task_exe_file - acquire a reference to the task's executable file
+ *
+ * Returns %NULL if task's mm (if any) has no associated executable file or
+ * this is a kernel thread with borrowed mm (see the comment above get_task_mm).
+ * User must release file via fput().
+ */
+struct file *get_task_exe_file(struct task_struct *task)
+{
+	struct file *exe_file = NULL;
+	struct mm_struct *mm;
+
+	task_lock(task);
+	mm = task->mm;
+	if (mm) {
+		if (!(task->flags & PF_KTHREAD))
+			exe_file = get_mm_exe_file(mm);
+	}
+	task_unlock(task);
+	return exe_file;
+}
+EXPORT_SYMBOL(get_task_exe_file);
+
+/**
  * get_task_mm - acquire a reference to the task's mm
  *
  * Returns %NULL if the task has no mm.  Checks PF_KTHREAD (meaning
@@ -913,14 +936,12 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
 	deactivate_mm(tsk, mm);
 
 	/*
-	 * If we're exiting normally, clear a user-space tid field if
-	 * requested.  We leave this alone when dying by signal, to leave
-	 * the value intact in a core dump, and to save the unnecessary
-	 * trouble, say, a killed vfork parent shouldn't touch this mm.
-	 * Userland only wants this done for a sys_exit.
+	 * Signal userspace if we're not exiting with a core dump
+	 * because we want to leave the value intact for debugging
+	 * purposes.
 	 */
 	if (tsk->clear_child_tid) {
-		if (!(tsk->flags & PF_SIGNALED) &&
+		if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) &&
 		    atomic_read(&mm->mm_users) > 1) {
 			/*
 			 * We don't check the error code - if userspace has
@@ -1404,7 +1425,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->real_start_time = ktime_get_boot_ns();
 	p->io_context = NULL;
 	p->audit_context = NULL;
-	threadgroup_change_begin(current);
 	cgroup_fork(p);
 #ifdef CONFIG_NUMA
 	p->mempolicy = mpol_dup(p->mempolicy);
@@ -1556,6 +1576,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	INIT_LIST_HEAD(&p->thread_group);
 	p->task_works = NULL;
 
+	threadgroup_change_begin(current);
 	/*
 	 * Ensure that the cgroup subsystem policies allow the new process to be
 	 * forked. It should be noted the the new process's css_set can be changed
@@ -1656,6 +1677,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 bad_fork_cancel_cgroup:
 	cgroup_cancel_fork(p);
 bad_fork_free_pid:
+	threadgroup_change_end(current);
 	if (pid != &init_struct_pid)
 		free_pid(pid);
 bad_fork_cleanup_thread:
@@ -1688,7 +1710,6 @@ bad_fork_cleanup_policy:
 	mpol_put(p->mempolicy);
 bad_fork_cleanup_threadgroup_lock:
 #endif
-	threadgroup_change_end(current);
 	delayacct_tsk_free(p);
 bad_fork_cleanup_count:
 	atomic_dec(&p->cred->user->processes);
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index f68959341c0f..32f6cfcff212 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -39,6 +39,7 @@ struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs)
 		return NULL;
 	}
 
+	get_online_cpus();
 	if (max_vecs >= num_online_cpus()) {
 		cpumask_copy(affinity_mask, cpu_online_mask);
 		*nr_vecs = num_online_cpus();
@@ -56,6 +57,7 @@ struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs)
 		}
 		*nr_vecs = vecs;
 	}
+	put_online_cpus();
 
 	return affinity_mask;
 }
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index b4c1bc7c9ca2..637389088b3f 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -820,6 +820,17 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
 		desc->name = name;
 
 	if (handle != handle_bad_irq && is_chained) {
+		/*
+		 * We're about to start this interrupt immediately,
+		 * hence the need to set the trigger configuration.
+		 * But the .set_type callback may have overridden the
+		 * flow handler, ignoring that we're dealing with a
+		 * chained interrupt. Reset it immediately because we
+		 * do know better.
+		 */
+		__irq_set_trigger(desc, irqd_get_trigger_type(&desc->irq_data));
+		desc->handle_irq = handle;
+
 		irq_settings_set_noprobe(desc);
 		irq_settings_set_norequest(desc);
 		irq_settings_set_nothread(desc);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 73a2b786b5e9..9530fcd27704 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1681,8 +1681,10 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
 	action->dev_id = dev_id;
 
 	retval = irq_chip_pm_get(&desc->irq_data);
-	if (retval < 0)
+	if (retval < 0) {
+		kfree(action);
 		return retval;
+	}
 
 	chip_bus_lock(desc);
 	retval = __setup_irq(irq, desc, action);
@@ -1985,8 +1987,10 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
 	action->percpu_dev_id = dev_id;
 
 	retval = irq_chip_pm_get(&desc->irq_data);
-	if (retval < 0)
+	if (retval < 0) {
+		kfree(action);
 		return retval;
+	}
 
 	chip_bus_lock(desc);
 	retval = __setup_irq(irq, desc, action);
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 503bc2d348e5..037c321c5618 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -887,7 +887,10 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min,
 	return 0;
 out:
 	vfree(pi->sechdrs);
+	pi->sechdrs = NULL;
+
 	vfree(pi->purgatory_buf);
+	pi->purgatory_buf = NULL;
 	return ret;
 }
 
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 9a0178c2ac1d..b02228411d57 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -835,9 +835,9 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
  */
 static bool rtree_next_node(struct memory_bitmap *bm)
 {
-	bm->cur.node = list_entry(bm->cur.node->list.next,
-				  struct rtree_node, list);
-	if (&bm->cur.node->list != &bm->cur.zone->leaves) {
+	if (!list_is_last(&bm->cur.node->list, &bm->cur.zone->leaves)) {
+		bm->cur.node = list_entry(bm->cur.node->list.next,
+					  struct rtree_node, list);
 		bm->cur.node_pfn += BM_BITS_PER_BLOCK;
 		bm->cur.node_bit = 0;
 		touch_softlockup_watchdog();
@@ -845,9 +845,9 @@ static bool rtree_next_node(struct memory_bitmap *bm)
 	}
 
 	/* No more nodes, goto next zone */
-	bm->cur.zone = list_entry(bm->cur.zone->list.next,
+	if (!list_is_last(&bm->cur.zone->list, &bm->zones)) {
+		bm->cur.zone = list_entry(bm->cur.zone->list.next,
 				  struct mem_zone_bm_rtree, list);
-	if (&bm->cur.zone->list != &bm->zones) {
 		bm->cur.node = list_entry(bm->cur.zone->leaves.next,
 					  struct rtree_node, list);
 		bm->cur.node_pfn = 0;
diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c
index 276762f3a460..d5760c42f042 100644
--- a/kernel/printk/braille.c
+++ b/kernel/printk/braille.c
@@ -9,10 +9,10 @@
 
 char *_braille_console_setup(char **str, char **brl_options)
 {
-	if (!memcmp(*str, "brl,", 4)) {
+	if (!strncmp(*str, "brl,", 4)) {
 		*brl_options = "";
 		*str += 4;
-	} else if (!memcmp(str, "brl=", 4)) {
+	} else if (!strncmp(*str, "brl=", 4)) {
 		*brl_options = *str + 4;
 		*str = strchr(*brl_options, ',');
 		if (!*str)
diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c
index b69eb8a2876f..16bab471c7e2 100644
--- a/kernel/printk/nmi.c
+++ b/kernel/printk/nmi.c
@@ -99,27 +99,33 @@ again:
 	return add;
 }
 
-/*
- * printk one line from the temporary buffer from @start index until
- * and including the @end index.
- */
-static void print_nmi_seq_line(struct nmi_seq_buf *s, int start, int end)
+static void printk_nmi_flush_line(const char *text, int len)
 {
-	const char *buf = s->buffer + start;
-
 	/*
 	 * The buffers are flushed in NMI only on panic. The messages must
 	 * go only into the ring buffer at this stage. Consoles will get
 	 * explicitly called later when a crashdump is not generated.
 	 */
 	if (in_nmi())
-		printk_deferred("%.*s", (end - start) + 1, buf);
+		printk_deferred("%.*s", len, text);
 	else
-		printk("%.*s", (end - start) + 1, buf);
+		printk("%.*s", len, text);
 
 }
 
 /*
+ * printk one line from the temporary buffer from @start index until
+ * and including the @end index.
+ */
+static void printk_nmi_flush_seq_line(struct nmi_seq_buf *s,
+					int start, int end)
+{
+	const char *buf = s->buffer + start;
+
+	printk_nmi_flush_line(buf, (end - start) + 1);
+}
+
+/*
  * Flush data from the associated per_CPU buffer. The function
  * can be called either via IRQ work or independently.
  */
@@ -150,9 +156,11 @@ more:
 	 * the buffer an unexpected way. If we printed something then
	 * @len must only increase.
 	 */
-	if (i && i >= len)
-		pr_err("printk_nmi_flush: internal error: i=%d >= len=%zu\n",
-		       i, len);
+	if (i && i >= len) {
+		const char *msg = "printk_nmi_flush: internal error\n";
+
+		printk_nmi_flush_line(msg, strlen(msg));
+	}
 
 	if (!len)
 		goto out; /* Someone else has already flushed the buffer. */
@@ -166,14 +174,14 @@ more:
 	/* Print line by line. */
 	for (; i < size; i++) {
 		if (s->buffer[i] == '\n') {
-			print_nmi_seq_line(s, last_i, i);
+			printk_nmi_flush_seq_line(s, last_i, i);
 			last_i = i + 1;
 		}
 	}
 	/* Check if there was a partial line. */
 	if (last_i < size) {
-		print_nmi_seq_line(s, last_i, size - 1);
-		pr_cont("\n");
+		printk_nmi_flush_seq_line(s, last_i, size - 1);
+		printk_nmi_flush_line("\n", strlen("\n"));
 	}
 
 	/*
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 9858266fb0b3..a846cf89eb96 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -263,6 +263,11 @@ void account_idle_time(cputime_t cputime)
 		cpustat[CPUTIME_IDLE] += (__force u64) cputime;
 }
 
+/*
+ * When a guest is interrupted for a longer amount of time, missed clock
+ * ticks are not redelivered later. Due to that, this function may on
+ * occasion account more time than the calling functions think elapsed.
+ */
 static __always_inline cputime_t steal_account_process_time(cputime_t maxtime)
 {
 #ifdef CONFIG_PARAVIRT
@@ -371,7 +376,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 	 * idle, or potentially user or system time. Due to rounding,
 	 * other time can exceed ticks occasionally.
 	 */
-	other = account_other_time(cputime);
+	other = account_other_time(ULONG_MAX);
 	if (other >= cputime)
 		return;
 	cputime -= other;
@@ -486,7 +491,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
 	}
 
 	cputime = cputime_one_jiffy;
-	steal = steal_account_process_time(cputime);
+	steal = steal_account_process_time(ULONG_MAX);
 
 	if (steal >= cputime)
 		return;
@@ -516,7 +521,7 @@ void account_idle_ticks(unsigned long ticks)
 	}
 
 	cputime = jiffies_to_cputime(ticks);
-	steal = steal_account_process_time(cputime);
+	steal = steal_account_process_time(ULONG_MAX);
 
 	if (steal >= cputime)
 		return;
@@ -614,19 +619,25 @@ static void cputime_adjust(struct task_cputime *curr,
 	stime = curr->stime;
 	utime = curr->utime;
 
-	if (utime == 0) {
-		stime = rtime;
+	/*
+	 * If either stime or both stime and utime are 0, assume all runtime is
+	 * userspace. Once a task gets some ticks, the monotonicy code at
+	 * 'update' will ensure things converge to the observed ratio.
+	 */
+	if (stime == 0) {
+		utime = rtime;
 		goto update;
 	}
 
-	if (stime == 0) {
-		utime = rtime;
+	if (utime == 0) {
+		stime = rtime;
 		goto update;
 	}
 
 	stime = scale_stime((__force u64)stime, (__force u64)rtime,
 			    (__force u64)(stime + utime));
 
+update:
 	/*
 	 * Make sure stime doesn't go backwards; this preserves monotonicity
 	 * for utime because rtime is monotonic.
@@ -649,7 +660,6 @@ static void cputime_adjust(struct task_cputime *curr,
 		stime = rtime - utime;
 	}
 
-update:
 	prev->stime = stime;
 	prev->utime = utime;
 out:
@@ -694,6 +704,13 @@ static cputime_t get_vtime_delta(struct task_struct *tsk)
 	unsigned long now = READ_ONCE(jiffies);
 	cputime_t delta, other;
 
+	/*
+	 * Unlike tick based timing, vtime based timing never has lost
+	 * ticks, and no need for steal time accounting to make up for
+	 * lost ticks. Vtime accounts a rounded version of actual
+	 * elapsed time. Limit account_other_time to prevent rounding
+	 * errors from causing elapsed vtime to go negative.
+	 */
 	delta = jiffies_to_cputime(now - tsk->vtime_snap);
 	other = account_other_time(delta);
 	WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index ef6c6c3f9d8a..0db7c8a2afe2 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -605,12 +605,16 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
 		ptrace_event(PTRACE_EVENT_SECCOMP, data);
 		/*
 		 * The delivery of a fatal signal during event
-		 * notification may silently skip tracer notification.
-		 * Terminating the task now avoids executing a system
-		 * call that may not be intended.
+		 * notification may silently skip tracer notification,
+		 * which could leave us with a potentially unmodified
+		 * syscall that the tracer would have liked to have
+		 * changed. Since the process is about to die, we just
+		 * force the syscall to be skipped and let the signal
+		 * kill the process and correctly handle any tracer exit
+		 * notifications.
 		 */
 		if (fatal_signal_pending(current))
-			do_exit(SIGSYS);
+			goto skip;
 		/* Check if the tracer forced the syscall to be skipped. */
 		this_syscall = syscall_get_nr(current, task_pt_regs(current));
 		if (this_syscall < 0)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b43d0b27c1fe..a13bbdaab47d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2140,6 +2140,21 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
 	return 0;
 }
 
+static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp,
+				  int *valp,
+				  int write, void *data)
+{
+	if (write) {
+		if (*negp)
+			return -EINVAL;
+		*valp = *lvalp;
+	} else {
+		unsigned int val = *valp;
+		*lvalp = (unsigned long)val;
+	}
+	return 0;
+}
+
 static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
 
 static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
@@ -2259,8 +2274,27 @@ static int do_proc_dointvec(struct ctl_table *table, int write,
 int proc_dointvec(struct ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-	return do_proc_dointvec(table,write,buffer,lenp,ppos,
-				NULL,NULL);
+	return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL);
+}
+
+/**
+ * proc_douintvec - read a vector of unsigned integers
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * Returns 0 on success.
+ */
+int proc_douintvec(struct ctl_table *table, int write,
+		   void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return do_proc_dointvec(table, write, buffer, lenp, ppos,
+				do_proc_douintvec_conv, NULL);
 }
 
 /*
@@ -2858,6 +2892,12 @@ int proc_dointvec(struct ctl_table *table, int write,
 	return -ENOSYS;
 }
 
+int proc_douintvec(struct ctl_table *table, int write,
+		   void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+
 int proc_dointvec_minmax(struct ctl_table *table, int write,
 		    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -2903,6 +2943,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
  * exception granted :-)
  */
 EXPORT_SYMBOL(proc_dointvec);
+EXPORT_SYMBOL(proc_douintvec);
 EXPORT_SYMBOL(proc_dointvec_jiffies);
 EXPORT_SYMBOL(proc_dointvec_minmax);
 EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 204fdc86863d..2ec7c00228f3 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -908,10 +908,11 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts)
 	ktime_t now, expires;
 	int cpu = smp_processor_id();
 
+	now = tick_nohz_start_idle(ts);
+
 	if (can_stop_idle_tick(cpu, ts)) {
 		int was_stopped = ts->tick_stopped;
 
-		now = tick_nohz_start_idle(ts);
 		ts->idle_calls++;
 
 		expires = tick_nohz_stop_sched_tick(ts, now, cpu);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 3b65746c7f15..e07fb093f819 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -401,7 +401,10 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
 	do {
 		seq = raw_read_seqcount_latch(&tkf->seq);
 		tkr = tkf->base + (seq & 0x01);
-		now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr);
+		now = ktime_to_ns(tkr->base);
+
+		now += clocksource_delta(tkr->read(tkr->clock),
+					 tkr->cycle_last, tkr->mask);
 	} while (read_seqcount_retry(&tkf->seq, seq));
 
 	return now;
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index f6bd65236712..107310a6f36f 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -23,7 +23,9 @@
 
 #include "timekeeping_internal.h"
 
-static unsigned int sleep_time_bin[32] = {0};
+#define NUM_BINS 32
+
+static unsigned int sleep_time_bin[NUM_BINS] = {0};
 
 static int tk_debug_show_sleep_time(struct seq_file *s, void *data)
 {
@@ -69,6 +71,9 @@ late_initcall(tk_debug_sleep_time_init);
 
 void tk_debug_account_sleep_time(struct timespec64 *t)
 {
-	sleep_time_bin[fls(t->tv_sec)]++;
+	/* Cap bin index so we don't overflow the array */
+	int bin = min(fls(t->tv_sec), NUM_BINS-1);
+
+	sleep_time_bin[bin]++;
 }
 
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 7598e6ca817a..dbafc5df03f3 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -223,7 +223,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	what |= MASK_TC_BIT(op_flags, META);
 	what |= MASK_TC_BIT(op_flags, PREFLUSH);
 	what |= MASK_TC_BIT(op_flags, FUA);
-	if (op == REQ_OP_DISCARD)
+	if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)
 		what |= BLK_TC_ACT(BLK_TC_DISCARD);
 	if (op == REQ_OP_FLUSH)
 		what |= BLK_TC_ACT(BLK_TC_FLUSH);