path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cpu.c                   |   2
-rw-r--r--  kernel/cpuset.c                | 130
-rw-r--r--  kernel/events/uprobes.c        | 213
-rw-r--r--  kernel/exit.c                  |   2
-rw-r--r--  kernel/fork.c                  |  46
-rw-r--r--  kernel/irq/handle.c            |   7
-rw-r--r--  kernel/irq/manage.c            |  17
-rw-r--r--  kernel/kexec.c                 |   2
-rw-r--r--  kernel/kmod.c                  |  37
-rw-r--r--  kernel/panic.c                 |   8
-rw-r--r--  kernel/power/suspend.c         |   3
-rw-r--r--  kernel/printk.c                | 191
-rw-r--r--  kernel/resource.c              |  24
-rw-r--r--  kernel/sched/core.c            |  94
-rw-r--r--  kernel/sched/fair.c            | 113
-rw-r--r--  kernel/sched/sched.h           |  23
-rw-r--r--  kernel/softirq.c               |   9
-rw-r--r--  kernel/sys.c                   |  57
-rw-r--r--  kernel/sysctl.c                |  51
-rw-r--r--  kernel/sysctl_binary.c         |   2
-rw-r--r--  kernel/taskstats.c             |   5
-rw-r--r--  kernel/trace/trace.c           |   7
-rw-r--r--  kernel/trace/trace_functions.c |  36
-rw-r--r--  kernel/watchdog.c              |  21
24 files changed, 742 insertions, 358 deletions
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a4eb5227a19e..14d32588cccd 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -416,7 +416,7 @@ int __cpuinit cpu_up(unsigned int cpu)
416 416
417 if (pgdat->node_zonelists->_zonerefs->zone == NULL) { 417 if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
418 mutex_lock(&zonelists_mutex); 418 mutex_lock(&zonelists_mutex);
419 build_all_zonelists(NULL); 419 build_all_zonelists(NULL, NULL);
420 mutex_unlock(&zonelists_mutex); 420 mutex_unlock(&zonelists_mutex);
421 } 421 }
422#endif 422#endif
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8c8bd652dd12..f33c7153b6d7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -147,6 +147,12 @@ typedef enum {
147 CS_SPREAD_SLAB, 147 CS_SPREAD_SLAB,
148} cpuset_flagbits_t; 148} cpuset_flagbits_t;
149 149
150/* the type of hotplug event */
151enum hotplug_event {
152 CPUSET_CPU_OFFLINE,
153 CPUSET_MEM_OFFLINE,
154};
155
150/* convenient tests for these bits */ 156/* convenient tests for these bits */
151static inline int is_cpu_exclusive(const struct cpuset *cs) 157static inline int is_cpu_exclusive(const struct cpuset *cs)
152{ 158{
@@ -1990,8 +1996,36 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
1990} 1996}
1991 1997
1992/* 1998/*
1993 * Walk the specified cpuset subtree and look for empty cpusets. 1999 * Helper function to traverse cpusets.
1994 * The tasks of such cpuset must be moved to a parent cpuset. 2000 * It can be used to walk the cpuset tree from top to bottom, completing
2001 * one layer before dropping down to the next (thus always processing a
2002 * node before any of its children).
2003 */
2004static struct cpuset *cpuset_next(struct list_head *queue)
2005{
2006 struct cpuset *cp;
2007 struct cpuset *child; /* scans child cpusets of cp */
2008 struct cgroup *cont;
2009
2010 if (list_empty(queue))
2011 return NULL;
2012
2013 cp = list_first_entry(queue, struct cpuset, stack_list);
2014 list_del(queue->next);
2015 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
2016 child = cgroup_cs(cont);
2017 list_add_tail(&child->stack_list, queue);
2018 }
2019
2020 return cp;
2021}
2022
2023
2024/*
2025 * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory
2026 * online/offline) and update the cpusets accordingly.
2027 * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such
2028 * cpuset must be moved to a parent cpuset.
1995 * 2029 *
1996 * Called with cgroup_mutex held. We take callback_mutex to modify 2030 * Called with cgroup_mutex held. We take callback_mutex to modify
1997 * cpus_allowed and mems_allowed. 2031 * cpus_allowed and mems_allowed.
@@ -2000,50 +2034,61 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2000 * before dropping down to the next. It always processes a node before 2034 * before dropping down to the next. It always processes a node before
2001 * any of its children. 2035 * any of its children.
2002 * 2036 *
2003 * For now, since we lack memory hot unplug, we'll never see a cpuset 2037 * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY
2004 * that has tasks along with an empty 'mems'. But if we did see such 2038 * if all present pages from a node are offlined.
2005 * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
2006 */ 2039 */
2007static void scan_for_empty_cpusets(struct cpuset *root) 2040static void
2041scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
2008{ 2042{
2009 LIST_HEAD(queue); 2043 LIST_HEAD(queue);
2010 struct cpuset *cp; /* scans cpusets being updated */ 2044 struct cpuset *cp; /* scans cpusets being updated */
2011 struct cpuset *child; /* scans child cpusets of cp */
2012 struct cgroup *cont;
2013 static nodemask_t oldmems; /* protected by cgroup_mutex */ 2045 static nodemask_t oldmems; /* protected by cgroup_mutex */
2014 2046
2015 list_add_tail((struct list_head *)&root->stack_list, &queue); 2047 list_add_tail((struct list_head *)&root->stack_list, &queue);
2016 2048
2017 while (!list_empty(&queue)) { 2049 switch (event) {
2018 cp = list_first_entry(&queue, struct cpuset, stack_list); 2050 case CPUSET_CPU_OFFLINE:
2019 list_del(queue.next); 2051 while ((cp = cpuset_next(&queue)) != NULL) {
2020 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { 2052
2021 child = cgroup_cs(cont); 2053 /* Continue past cpusets with all cpus online */
2022 list_add_tail(&child->stack_list, &queue); 2054 if (cpumask_subset(cp->cpus_allowed, cpu_active_mask))
2055 continue;
2056
2057 /* Remove offline cpus from this cpuset. */
2058 mutex_lock(&callback_mutex);
2059 cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
2060 cpu_active_mask);
2061 mutex_unlock(&callback_mutex);
2062
2063 /* Move tasks from the empty cpuset to a parent */
2064 if (cpumask_empty(cp->cpus_allowed))
2065 remove_tasks_in_empty_cpuset(cp);
2066 else
2067 update_tasks_cpumask(cp, NULL);
2023 } 2068 }
2069 break;
2024 2070
2025 /* Continue past cpusets with all cpus, mems online */ 2071 case CPUSET_MEM_OFFLINE:
2026 if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) && 2072 while ((cp = cpuset_next(&queue)) != NULL) {
2027 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
2028 continue;
2029 2073
2030 oldmems = cp->mems_allowed; 2074 /* Continue past cpusets with all mems online */
2075 if (nodes_subset(cp->mems_allowed,
2076 node_states[N_HIGH_MEMORY]))
2077 continue;
2031 2078
2032 /* Remove offline cpus and mems from this cpuset. */ 2079 oldmems = cp->mems_allowed;
2033 mutex_lock(&callback_mutex); 2080
2034 cpumask_and(cp->cpus_allowed, cp->cpus_allowed, 2081 /* Remove offline mems from this cpuset. */
2035 cpu_active_mask); 2082 mutex_lock(&callback_mutex);
2036 nodes_and(cp->mems_allowed, cp->mems_allowed, 2083 nodes_and(cp->mems_allowed, cp->mems_allowed,
2037 node_states[N_HIGH_MEMORY]); 2084 node_states[N_HIGH_MEMORY]);
2038 mutex_unlock(&callback_mutex); 2085 mutex_unlock(&callback_mutex);
2039 2086
2040 /* Move tasks from the empty cpuset to a parent */ 2087 /* Move tasks from the empty cpuset to a parent */
2041 if (cpumask_empty(cp->cpus_allowed) || 2088 if (nodes_empty(cp->mems_allowed))
2042 nodes_empty(cp->mems_allowed)) 2089 remove_tasks_in_empty_cpuset(cp);
2043 remove_tasks_in_empty_cpuset(cp); 2090 else
2044 else { 2091 update_tasks_nodemask(cp, &oldmems, NULL);
2045 update_tasks_cpumask(cp, NULL);
2046 update_tasks_nodemask(cp, &oldmems, NULL);
2047 } 2092 }
2048 } 2093 }
2049} 2094}
@@ -2054,13 +2099,19 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2054 * (of no affect) on systems that are actively using CPU hotplug 2099 * (of no affect) on systems that are actively using CPU hotplug
2055 * but making no active use of cpusets. 2100 * but making no active use of cpusets.
2056 * 2101 *
2102 * The only exception to this is suspend/resume, where we don't
2103 * modify cpusets at all.
2104 *
2057 * This routine ensures that top_cpuset.cpus_allowed tracks 2105 * This routine ensures that top_cpuset.cpus_allowed tracks
2058 * cpu_active_mask on each CPU hotplug (cpuhp) event. 2106 * cpu_active_mask on each CPU hotplug (cpuhp) event.
2059 * 2107 *
2060 * Called within get_online_cpus(). Needs to call cgroup_lock() 2108 * Called within get_online_cpus(). Needs to call cgroup_lock()
2061 * before calling generate_sched_domains(). 2109 * before calling generate_sched_domains().
2110 *
2111 * @cpu_online: Indicates whether this is a CPU online event (true) or
2112 * a CPU offline event (false).
2062 */ 2113 */
2063void cpuset_update_active_cpus(void) 2114void cpuset_update_active_cpus(bool cpu_online)
2064{ 2115{
2065 struct sched_domain_attr *attr; 2116 struct sched_domain_attr *attr;
2066 cpumask_var_t *doms; 2117 cpumask_var_t *doms;
@@ -2070,7 +2121,10 @@ void cpuset_update_active_cpus(void)
2070 mutex_lock(&callback_mutex); 2121 mutex_lock(&callback_mutex);
2071 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2122 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2072 mutex_unlock(&callback_mutex); 2123 mutex_unlock(&callback_mutex);
2073 scan_for_empty_cpusets(&top_cpuset); 2124
2125 if (!cpu_online)
2126 scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE);
2127
2074 ndoms = generate_sched_domains(&doms, &attr); 2128 ndoms = generate_sched_domains(&doms, &attr);
2075 cgroup_unlock(); 2129 cgroup_unlock();
2076 2130
@@ -2082,7 +2136,7 @@ void cpuset_update_active_cpus(void)
2082/* 2136/*
2083 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. 2137 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
2084 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. 2138 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
2085 * See also the previous routine cpuset_track_online_cpus(). 2139 * See cpuset_update_active_cpus() for CPU hotplug handling.
2086 */ 2140 */
2087static int cpuset_track_online_nodes(struct notifier_block *self, 2141static int cpuset_track_online_nodes(struct notifier_block *self,
2088 unsigned long action, void *arg) 2142 unsigned long action, void *arg)
@@ -2101,9 +2155,9 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2101 case MEM_OFFLINE: 2155 case MEM_OFFLINE:
2102 /* 2156 /*
2103 * needn't update top_cpuset.mems_allowed explicitly because 2157 * needn't update top_cpuset.mems_allowed explicitly because
2104 * scan_for_empty_cpusets() will update it. 2158 * scan_cpusets_upon_hotplug() will update it.
2105 */ 2159 */
2106 scan_for_empty_cpusets(&top_cpuset); 2160 scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE);
2107 break; 2161 break;
2108 default: 2162 default:
2109 break; 2163 break;
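The new cpuset_next() helper above replaces the open-coded queue handling with a top-down walk that always returns a cpuset before any of its children. A minimal user-space sketch of that queue-based traversal, using hypothetical node types rather than the kernel's struct cpuset/cgroup:

/*
 * Minimal user-space sketch of the breadth-first walk that
 * cpuset_next() performs: pop the head of a queue, enqueue its
 * children, return the popped node. The struct and field names
 * below are illustrative only; they are not the kernel's types.
 */
#include <stdio.h>

struct node {
	const char *name;
	struct node **children;
	int nr_children;
	struct node *next_in_queue;	/* plays the role of stack_list */
};

static struct node *queue_head, *queue_tail;

static void enqueue(struct node *n)
{
	n->next_in_queue = NULL;
	if (queue_tail)
		queue_tail->next_in_queue = n;
	else
		queue_head = n;
	queue_tail = n;
}

/* Analogue of cpuset_next(): parents are always returned before children. */
static struct node *next_node(void)
{
	struct node *n = queue_head;
	int i;

	if (!n)
		return NULL;
	queue_head = n->next_in_queue;
	if (!queue_head)
		queue_tail = NULL;
	for (i = 0; i < n->nr_children; i++)
		enqueue(n->children[i]);
	return n;
}

int main(void)
{
	struct node c1 = { .name = "child1" }, c2 = { .name = "child2" };
	struct node *kids[] = { &c1, &c2 };
	struct node root = { .name = "root", .children = kids, .nr_children = 2 };
	struct node *n;

	enqueue(&root);
	while ((n = next_node()) != NULL)
		printf("%s\n", n->name);	/* prints root, child1, child2 */
	return 0;
}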
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index f93532748bca..c08a22d02f72 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -32,6 +32,7 @@
32#include <linux/swap.h> /* try_to_free_swap */ 32#include <linux/swap.h> /* try_to_free_swap */
33#include <linux/ptrace.h> /* user_enable_single_step */ 33#include <linux/ptrace.h> /* user_enable_single_step */
34#include <linux/kdebug.h> /* notifier mechanism */ 34#include <linux/kdebug.h> /* notifier mechanism */
35#include "../../mm/internal.h" /* munlock_vma_page */
35 36
36#include <linux/uprobes.h> 37#include <linux/uprobes.h>
37 38
@@ -112,14 +113,14 @@ static bool valid_vma(struct vm_area_struct *vma, bool is_register)
112 return false; 113 return false;
113} 114}
114 115
115static loff_t vma_address(struct vm_area_struct *vma, loff_t offset) 116static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
116{ 117{
117 loff_t vaddr; 118 return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
118 119}
119 vaddr = vma->vm_start + offset;
120 vaddr -= vma->vm_pgoff << PAGE_SHIFT;
121 120
122 return vaddr; 121static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
122{
123 return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
123} 124}
124 125
125/** 126/**
@@ -127,25 +128,27 @@ static loff_t vma_address(struct vm_area_struct *vma, loff_t offset)
127 * based on replace_page in mm/ksm.c 128 * based on replace_page in mm/ksm.c
128 * 129 *
129 * @vma: vma that holds the pte pointing to page 130 * @vma: vma that holds the pte pointing to page
131 * @addr: address the old @page is mapped at
130 * @page: the cowed page we are replacing by kpage 132 * @page: the cowed page we are replacing by kpage
131 * @kpage: the modified page we replace page by 133 * @kpage: the modified page we replace page by
132 * 134 *
133 * Returns 0 on success, -EFAULT on failure. 135 * Returns 0 on success, -EFAULT on failure.
134 */ 136 */
135static int __replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage) 137static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
138 struct page *page, struct page *kpage)
136{ 139{
137 struct mm_struct *mm = vma->vm_mm; 140 struct mm_struct *mm = vma->vm_mm;
138 unsigned long addr;
139 spinlock_t *ptl; 141 spinlock_t *ptl;
140 pte_t *ptep; 142 pte_t *ptep;
143 int err;
141 144
142 addr = page_address_in_vma(page, vma); 145 /* For try_to_free_swap() and munlock_vma_page() below */
143 if (addr == -EFAULT) 146 lock_page(page);
144 return -EFAULT;
145 147
148 err = -EAGAIN;
146 ptep = page_check_address(page, mm, addr, &ptl, 0); 149 ptep = page_check_address(page, mm, addr, &ptl, 0);
147 if (!ptep) 150 if (!ptep)
148 return -EAGAIN; 151 goto unlock;
149 152
150 get_page(kpage); 153 get_page(kpage);
151 page_add_new_anon_rmap(kpage, vma, addr); 154 page_add_new_anon_rmap(kpage, vma, addr);
@@ -162,10 +165,16 @@ static int __replace_page(struct vm_area_struct *vma, struct page *page, struct
162 page_remove_rmap(page); 165 page_remove_rmap(page);
163 if (!page_mapped(page)) 166 if (!page_mapped(page))
164 try_to_free_swap(page); 167 try_to_free_swap(page);
165 put_page(page);
166 pte_unmap_unlock(ptep, ptl); 168 pte_unmap_unlock(ptep, ptl);
167 169
168 return 0; 170 if (vma->vm_flags & VM_LOCKED)
171 munlock_vma_page(page);
172 put_page(page);
173
174 err = 0;
175 unlock:
176 unlock_page(page);
177 return err;
169} 178}
170 179
171/** 180/**
@@ -206,45 +215,23 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
206 unsigned long vaddr, uprobe_opcode_t opcode) 215 unsigned long vaddr, uprobe_opcode_t opcode)
207{ 216{
208 struct page *old_page, *new_page; 217 struct page *old_page, *new_page;
209 struct address_space *mapping;
210 void *vaddr_old, *vaddr_new; 218 void *vaddr_old, *vaddr_new;
211 struct vm_area_struct *vma; 219 struct vm_area_struct *vma;
212 struct uprobe *uprobe;
213 int ret; 220 int ret;
221
214retry: 222retry:
215 /* Read the page with vaddr into memory */ 223 /* Read the page with vaddr into memory */
216 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma); 224 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma);
217 if (ret <= 0) 225 if (ret <= 0)
218 return ret; 226 return ret;
219 227
220 ret = -EINVAL;
221
222 /*
223 * We are interested in text pages only. Our pages of interest
224 * should be mapped for read and execute only. We desist from
225 * adding probes in write mapped pages since the breakpoints
226 * might end up in the file copy.
227 */
228 if (!valid_vma(vma, is_swbp_insn(&opcode)))
229 goto put_out;
230
231 uprobe = container_of(auprobe, struct uprobe, arch);
232 mapping = uprobe->inode->i_mapping;
233 if (mapping != vma->vm_file->f_mapping)
234 goto put_out;
235
236 ret = -ENOMEM; 228 ret = -ENOMEM;
237 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); 229 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
238 if (!new_page) 230 if (!new_page)
239 goto put_out; 231 goto put_old;
240 232
241 __SetPageUptodate(new_page); 233 __SetPageUptodate(new_page);
242 234
243 /*
244 * lock page will serialize against do_wp_page()'s
245 * PageAnon() handling
246 */
247 lock_page(old_page);
248 /* copy the page now that we've got it stable */ 235 /* copy the page now that we've got it stable */
249 vaddr_old = kmap_atomic(old_page); 236 vaddr_old = kmap_atomic(old_page);
250 vaddr_new = kmap_atomic(new_page); 237 vaddr_new = kmap_atomic(new_page);
@@ -257,17 +244,13 @@ retry:
257 244
258 ret = anon_vma_prepare(vma); 245 ret = anon_vma_prepare(vma);
259 if (ret) 246 if (ret)
260 goto unlock_out; 247 goto put_new;
261 248
262 lock_page(new_page); 249 ret = __replace_page(vma, vaddr, old_page, new_page);
263 ret = __replace_page(vma, old_page, new_page);
264 unlock_page(new_page);
265 250
266unlock_out: 251put_new:
267 unlock_page(old_page);
268 page_cache_release(new_page); 252 page_cache_release(new_page);
269 253put_old:
270put_out:
271 put_page(old_page); 254 put_page(old_page);
272 255
273 if (unlikely(ret == -EAGAIN)) 256 if (unlikely(ret == -EAGAIN))
@@ -791,7 +774,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
791 curr = info; 774 curr = info;
792 775
793 info->mm = vma->vm_mm; 776 info->mm = vma->vm_mm;
794 info->vaddr = vma_address(vma, offset); 777 info->vaddr = offset_to_vaddr(vma, offset);
795 } 778 }
796 mutex_unlock(&mapping->i_mmap_mutex); 779 mutex_unlock(&mapping->i_mmap_mutex);
797 780
@@ -839,12 +822,13 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
839 goto free; 822 goto free;
840 823
841 down_write(&mm->mmap_sem); 824 down_write(&mm->mmap_sem);
842 vma = find_vma(mm, (unsigned long)info->vaddr); 825 vma = find_vma(mm, info->vaddr);
843 if (!vma || !valid_vma(vma, is_register)) 826 if (!vma || !valid_vma(vma, is_register) ||
827 vma->vm_file->f_mapping->host != uprobe->inode)
844 goto unlock; 828 goto unlock;
845 829
846 if (vma->vm_file->f_mapping->host != uprobe->inode || 830 if (vma->vm_start > info->vaddr ||
847 vma_address(vma, uprobe->offset) != info->vaddr) 831 vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
848 goto unlock; 832 goto unlock;
849 833
850 if (is_register) { 834 if (is_register) {
@@ -960,59 +944,66 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
960 put_uprobe(uprobe); 944 put_uprobe(uprobe);
961} 945}
962 946
963/* 947static struct rb_node *
964 * Of all the nodes that correspond to the given inode, return the node 948find_node_in_range(struct inode *inode, loff_t min, loff_t max)
965 * with the least offset.
966 */
967static struct rb_node *find_least_offset_node(struct inode *inode)
968{ 949{
969 struct uprobe u = { .inode = inode, .offset = 0};
970 struct rb_node *n = uprobes_tree.rb_node; 950 struct rb_node *n = uprobes_tree.rb_node;
971 struct rb_node *close_node = NULL;
972 struct uprobe *uprobe;
973 int match;
974 951
975 while (n) { 952 while (n) {
976 uprobe = rb_entry(n, struct uprobe, rb_node); 953 struct uprobe *u = rb_entry(n, struct uprobe, rb_node);
977 match = match_uprobe(&u, uprobe);
978
979 if (uprobe->inode == inode)
980 close_node = n;
981
982 if (!match)
983 return close_node;
984 954
985 if (match < 0) 955 if (inode < u->inode) {
986 n = n->rb_left; 956 n = n->rb_left;
987 else 957 } else if (inode > u->inode) {
988 n = n->rb_right; 958 n = n->rb_right;
959 } else {
960 if (max < u->offset)
961 n = n->rb_left;
962 else if (min > u->offset)
963 n = n->rb_right;
964 else
965 break;
966 }
989 } 967 }
990 968
991 return close_node; 969 return n;
992} 970}
993 971
994/* 972/*
995 * For a given inode, build a list of probes that need to be inserted. 973 * For a given range in vma, build a list of probes that need to be inserted.
996 */ 974 */
997static void build_probe_list(struct inode *inode, struct list_head *head) 975static void build_probe_list(struct inode *inode,
976 struct vm_area_struct *vma,
977 unsigned long start, unsigned long end,
978 struct list_head *head)
998{ 979{
999 struct uprobe *uprobe; 980 loff_t min, max;
1000 unsigned long flags; 981 unsigned long flags;
1001 struct rb_node *n; 982 struct rb_node *n, *t;
1002 983 struct uprobe *u;
1003 spin_lock_irqsave(&uprobes_treelock, flags);
1004
1005 n = find_least_offset_node(inode);
1006 984
1007 for (; n; n = rb_next(n)) { 985 INIT_LIST_HEAD(head);
1008 uprobe = rb_entry(n, struct uprobe, rb_node); 986 min = vaddr_to_offset(vma, start);
1009 if (uprobe->inode != inode) 987 max = min + (end - start) - 1;
1010 break;
1011 988
1012 list_add(&uprobe->pending_list, head); 989 spin_lock_irqsave(&uprobes_treelock, flags);
1013 atomic_inc(&uprobe->ref); 990 n = find_node_in_range(inode, min, max);
991 if (n) {
992 for (t = n; t; t = rb_prev(t)) {
993 u = rb_entry(t, struct uprobe, rb_node);
994 if (u->inode != inode || u->offset < min)
995 break;
996 list_add(&u->pending_list, head);
997 atomic_inc(&u->ref);
998 }
999 for (t = n; (t = rb_next(t)); ) {
1000 u = rb_entry(t, struct uprobe, rb_node);
1001 if (u->inode != inode || u->offset > max)
1002 break;
1003 list_add(&u->pending_list, head);
1004 atomic_inc(&u->ref);
1005 }
1014 } 1006 }
1015
1016 spin_unlock_irqrestore(&uprobes_treelock, flags); 1007 spin_unlock_irqrestore(&uprobes_treelock, flags);
1017} 1008}
1018 1009
@@ -1031,7 +1022,7 @@ static void build_probe_list(struct inode *inode, struct list_head *head)
1031int uprobe_mmap(struct vm_area_struct *vma) 1022int uprobe_mmap(struct vm_area_struct *vma)
1032{ 1023{
1033 struct list_head tmp_list; 1024 struct list_head tmp_list;
1034 struct uprobe *uprobe; 1025 struct uprobe *uprobe, *u;
1035 struct inode *inode; 1026 struct inode *inode;
1036 int ret, count; 1027 int ret, count;
1037 1028
@@ -1042,21 +1033,15 @@ int uprobe_mmap(struct vm_area_struct *vma)
1042 if (!inode) 1033 if (!inode)
1043 return 0; 1034 return 0;
1044 1035
1045 INIT_LIST_HEAD(&tmp_list);
1046 mutex_lock(uprobes_mmap_hash(inode)); 1036 mutex_lock(uprobes_mmap_hash(inode));
1047 build_probe_list(inode, &tmp_list); 1037 build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
1048 1038
1049 ret = 0; 1039 ret = 0;
1050 count = 0; 1040 count = 0;
1051 1041
1052 list_for_each_entry(uprobe, &tmp_list, pending_list) { 1042 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
1053 if (!ret) { 1043 if (!ret) {
1054 loff_t vaddr = vma_address(vma, uprobe->offset); 1044 unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
1055
1056 if (vaddr < vma->vm_start || vaddr >= vma->vm_end) {
1057 put_uprobe(uprobe);
1058 continue;
1059 }
1060 1045
1061 ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); 1046 ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
1062 /* 1047 /*
@@ -1097,12 +1082,15 @@ int uprobe_mmap(struct vm_area_struct *vma)
1097void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) 1082void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
1098{ 1083{
1099 struct list_head tmp_list; 1084 struct list_head tmp_list;
1100 struct uprobe *uprobe; 1085 struct uprobe *uprobe, *u;
1101 struct inode *inode; 1086 struct inode *inode;
1102 1087
1103 if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) 1088 if (!atomic_read(&uprobe_events) || !valid_vma(vma, false))
1104 return; 1089 return;
1105 1090
1091 if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
1092 return;
1093
1106 if (!atomic_read(&vma->vm_mm->uprobes_state.count)) 1094 if (!atomic_read(&vma->vm_mm->uprobes_state.count))
1107 return; 1095 return;
1108 1096
@@ -1110,21 +1098,17 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
1110 if (!inode) 1098 if (!inode)
1111 return; 1099 return;
1112 1100
1113 INIT_LIST_HEAD(&tmp_list);
1114 mutex_lock(uprobes_mmap_hash(inode)); 1101 mutex_lock(uprobes_mmap_hash(inode));
1115 build_probe_list(inode, &tmp_list); 1102 build_probe_list(inode, vma, start, end, &tmp_list);
1116 1103
1117 list_for_each_entry(uprobe, &tmp_list, pending_list) { 1104 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
1118 loff_t vaddr = vma_address(vma, uprobe->offset); 1105 unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
1119 1106 /*
1120 if (vaddr >= start && vaddr < end) { 1107 * An unregister could have removed the probe before
1121 /* 1108 * unmap. So check before we decrement the count.
1122 * An unregister could have removed the probe before 1109 */
1123 * unmap. So check before we decrement the count. 1110 if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1)
1124 */ 1111 atomic_dec(&vma->vm_mm->uprobes_state.count);
1125 if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1)
1126 atomic_dec(&vma->vm_mm->uprobes_state.count);
1127 }
1128 put_uprobe(uprobe); 1112 put_uprobe(uprobe);
1129 } 1113 }
1130 mutex_unlock(uprobes_mmap_hash(inode)); 1114 mutex_unlock(uprobes_mmap_hash(inode));
@@ -1463,12 +1447,9 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
1463 vma = find_vma(mm, bp_vaddr); 1447 vma = find_vma(mm, bp_vaddr);
1464 if (vma && vma->vm_start <= bp_vaddr) { 1448 if (vma && vma->vm_start <= bp_vaddr) {
1465 if (valid_vma(vma, false)) { 1449 if (valid_vma(vma, false)) {
1466 struct inode *inode; 1450 struct inode *inode = vma->vm_file->f_mapping->host;
1467 loff_t offset; 1451 loff_t offset = vaddr_to_offset(vma, bp_vaddr);
1468 1452
1469 inode = vma->vm_file->f_mapping->host;
1470 offset = bp_vaddr - vma->vm_start;
1471 offset += (vma->vm_pgoff << PAGE_SHIFT);
1472 uprobe = find_uprobe(inode, offset); 1453 uprobe = find_uprobe(inode, offset);
1473 } 1454 }
1474 1455
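Several of the uprobes changes hinge on the offset_to_vaddr()/vaddr_to_offset() pair introduced above, which convert between a file offset and the virtual address it maps to within a given vma. A small user-space sketch of that arithmetic, assuming 4 KiB pages and a stripped-down vma that carries only the two fields the math needs:

/*
 * User-space sketch of the file-offset <-> virtual-address conversion
 * that offset_to_vaddr()/vaddr_to_offset() perform. The struct below
 * mimics only vm_start and vm_pgoff; PAGE_SHIFT is assumed to be 12
 * (4 KiB pages) for the example.
 */
#include <assert.h>
#include <stdio.h>

#define PAGE_SHIFT 12

struct fake_vma {
	unsigned long vm_start;	/* first mapped virtual address */
	unsigned long vm_pgoff;	/* file offset of vm_start, in pages */
};

static unsigned long offset_to_vaddr(const struct fake_vma *vma, long long offset)
{
	return vma->vm_start + offset - ((long long)vma->vm_pgoff << PAGE_SHIFT);
}

static long long vaddr_to_offset(const struct fake_vma *vma, unsigned long vaddr)
{
	return ((long long)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
}

int main(void)
{
	struct fake_vma vma = { .vm_start = 0x400000, .vm_pgoff = 2 };
	long long probe_offset = 0x2345;	/* offset of the probe in the file */
	unsigned long vaddr = offset_to_vaddr(&vma, probe_offset);

	assert(vaddr_to_offset(&vma, vaddr) == probe_offset);	/* round-trips */
	printf("offset 0x%llx maps to vaddr 0x%lx\n", probe_offset, vaddr);
	return 0;
}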
diff --git a/kernel/exit.c b/kernel/exit.c
index d17f6c4ddfa9..f65345f9e5bb 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -483,7 +483,7 @@ static void close_files(struct files_struct * files)
483 rcu_read_unlock(); 483 rcu_read_unlock();
484 for (;;) { 484 for (;;) {
485 unsigned long set; 485 unsigned long set;
486 i = j * __NFDBITS; 486 i = j * BITS_PER_LONG;
487 if (i >= fdt->max_fds) 487 if (i >= fdt->max_fds)
488 break; 488 break;
489 set = fdt->open_fds[j++]; 489 set = fdt->open_fds[j++];
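The close_files() hunk above only swaps the stride constant from __NFDBITS to BITS_PER_LONG; the loop itself walks the open-fds bitmap one long at a time, deriving the bit index of each word from j * BITS_PER_LONG. A sketch of that word-at-a-time walk with illustrative names, not the kernel's fdtable types:

/*
 * Sketch of the word-at-a-time bitmap walk close_files() relies on:
 * the index of the first bit in word j is j * BITS_PER_LONG, which
 * is why the stride constant has to match the word size.
 */
#include <limits.h>
#include <stdio.h>

#define BITS_PER_LONG (CHAR_BIT * (int)sizeof(long))

static void print_set_bits(const unsigned long *words, int max_bits)
{
	int i, j = 0;

	for (;;) {
		unsigned long set;

		i = j * BITS_PER_LONG;	/* first bit index in this word */
		if (i >= max_bits)
			break;
		set = words[j++];
		while (set) {
			if (set & 1)
				printf("bit %d is set\n", i);
			i++;
			set >>= 1;
		}
	}
}

int main(void)
{
	unsigned long fds[2] = { 0x5UL, 0x1UL };	/* bits 0, 2 and BITS_PER_LONG */

	print_set_bits(fds, 2 * BITS_PER_LONG);
	return 0;
}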
diff --git a/kernel/fork.c b/kernel/fork.c
index ff1cad3b7bdc..3bd2280d79f6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -114,6 +114,10 @@ int nr_processes(void)
114 return total; 114 return total;
115} 115}
116 116
117void __weak arch_release_task_struct(struct task_struct *tsk)
118{
119}
120
117#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR 121#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
118static struct kmem_cache *task_struct_cachep; 122static struct kmem_cache *task_struct_cachep;
119 123
@@ -122,17 +126,17 @@ static inline struct task_struct *alloc_task_struct_node(int node)
122 return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node); 126 return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
123} 127}
124 128
125void __weak arch_release_task_struct(struct task_struct *tsk) { }
126
127static inline void free_task_struct(struct task_struct *tsk) 129static inline void free_task_struct(struct task_struct *tsk)
128{ 130{
129 arch_release_task_struct(tsk);
130 kmem_cache_free(task_struct_cachep, tsk); 131 kmem_cache_free(task_struct_cachep, tsk);
131} 132}
132#endif 133#endif
133 134
135void __weak arch_release_thread_info(struct thread_info *ti)
136{
137}
138
134#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR 139#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
135void __weak arch_release_thread_info(struct thread_info *ti) { }
136 140
137/* 141/*
138 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a 142 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
@@ -150,7 +154,6 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
150 154
151static inline void free_thread_info(struct thread_info *ti) 155static inline void free_thread_info(struct thread_info *ti)
152{ 156{
153 arch_release_thread_info(ti);
154 free_pages((unsigned long)ti, THREAD_SIZE_ORDER); 157 free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
155} 158}
156# else 159# else
@@ -164,7 +167,6 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
164 167
165static void free_thread_info(struct thread_info *ti) 168static void free_thread_info(struct thread_info *ti)
166{ 169{
167 arch_release_thread_info(ti);
168 kmem_cache_free(thread_info_cache, ti); 170 kmem_cache_free(thread_info_cache, ti);
169} 171}
170 172
@@ -205,10 +207,12 @@ static void account_kernel_stack(struct thread_info *ti, int account)
205void free_task(struct task_struct *tsk) 207void free_task(struct task_struct *tsk)
206{ 208{
207 account_kernel_stack(tsk->stack, -1); 209 account_kernel_stack(tsk->stack, -1);
210 arch_release_thread_info(tsk->stack);
208 free_thread_info(tsk->stack); 211 free_thread_info(tsk->stack);
209 rt_mutex_debug_task_free(tsk); 212 rt_mutex_debug_task_free(tsk);
210 ftrace_graph_exit_task(tsk); 213 ftrace_graph_exit_task(tsk);
211 put_seccomp_filter(tsk); 214 put_seccomp_filter(tsk);
215 arch_release_task_struct(tsk);
212 free_task_struct(tsk); 216 free_task_struct(tsk);
213} 217}
214EXPORT_SYMBOL(free_task); 218EXPORT_SYMBOL(free_task);
@@ -298,23 +302,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
298 return NULL; 302 return NULL;
299 303
300 ti = alloc_thread_info_node(tsk, node); 304 ti = alloc_thread_info_node(tsk, node);
301 if (!ti) { 305 if (!ti)
302 free_task_struct(tsk); 306 goto free_tsk;
303 return NULL;
304 }
305 307
306 err = arch_dup_task_struct(tsk, orig); 308 err = arch_dup_task_struct(tsk, orig);
309 if (err)
310 goto free_ti;
307 311
308 /*
309 * We defer looking at err, because we will need this setup
310 * for the clean up path to work correctly.
311 */
312 tsk->stack = ti; 312 tsk->stack = ti;
313 setup_thread_stack(tsk, orig);
314
315 if (err)
316 goto out;
317 313
314 setup_thread_stack(tsk, orig);
318 clear_user_return_notifier(tsk); 315 clear_user_return_notifier(tsk);
319 clear_tsk_need_resched(tsk); 316 clear_tsk_need_resched(tsk);
320 stackend = end_of_stack(tsk); 317 stackend = end_of_stack(tsk);
@@ -338,8 +335,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
338 335
339 return tsk; 336 return tsk;
340 337
341out: 338free_ti:
342 free_thread_info(ti); 339 free_thread_info(ti);
340free_tsk:
343 free_task_struct(tsk); 341 free_task_struct(tsk);
344 return NULL; 342 return NULL;
345} 343}
@@ -383,16 +381,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
383 struct file *file; 381 struct file *file;
384 382
385 if (mpnt->vm_flags & VM_DONTCOPY) { 383 if (mpnt->vm_flags & VM_DONTCOPY) {
386 long pages = vma_pages(mpnt);
387 mm->total_vm -= pages;
388 vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, 384 vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
389 -pages); 385 -vma_pages(mpnt));
390 continue; 386 continue;
391 } 387 }
392 charge = 0; 388 charge = 0;
393 if (mpnt->vm_flags & VM_ACCOUNT) { 389 if (mpnt->vm_flags & VM_ACCOUNT) {
394 unsigned long len; 390 unsigned long len = vma_pages(mpnt);
395 len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; 391
396 if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ 392 if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
397 goto fail_nomem; 393 goto fail_nomem;
398 charge = len; 394 charge = len;
@@ -1310,7 +1306,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1310#ifdef CONFIG_DEBUG_MUTEXES 1306#ifdef CONFIG_DEBUG_MUTEXES
1311 p->blocked_on = NULL; /* not blocked yet */ 1307 p->blocked_on = NULL; /* not blocked yet */
1312#endif 1308#endif
1313#ifdef CONFIG_CGROUP_MEM_RES_CTLR 1309#ifdef CONFIG_MEMCG
1314 p->memcg_batch.do_batch = 0; 1310 p->memcg_batch.do_batch = 0;
1315 p->memcg_batch.memcg = NULL; 1311 p->memcg_batch.memcg = NULL;
1316#endif 1312#endif
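The dup_task_struct() hunk above converts the early-return error handling into the usual goto-unwind pattern: each failure jumps to a label that frees only what has already been allocated, in reverse order. A generic sketch of that idiom with hypothetical allocations:

/*
 * Sketch of the goto-unwind error handling dup_task_struct() now uses.
 * The widget and its members are made-up stand-ins for the task_struct
 * and thread_info allocations.
 */
#include <stdlib.h>

struct widget { char *a; char *b; };

static struct widget *widget_create(void)
{
	struct widget *w = malloc(sizeof(*w));

	if (!w)
		return NULL;
	w->a = malloc(64);
	if (!w->a)
		goto free_w;		/* nothing else to undo yet */
	w->b = malloc(64);
	if (!w->b)
		goto free_a;		/* undo in reverse allocation order */
	return w;

free_a:
	free(w->a);
free_w:
	free(w);
	return NULL;
}

int main(void)
{
	struct widget *w = widget_create();

	if (w) {
		free(w->b);
		free(w->a);
		free(w);
	}
	return 0;
}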
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index bdb180325551..131ca176b497 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -133,7 +133,7 @@ irqreturn_t
133handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) 133handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
134{ 134{
135 irqreturn_t retval = IRQ_NONE; 135 irqreturn_t retval = IRQ_NONE;
136 unsigned int random = 0, irq = desc->irq_data.irq; 136 unsigned int flags = 0, irq = desc->irq_data.irq;
137 137
138 do { 138 do {
139 irqreturn_t res; 139 irqreturn_t res;
@@ -161,7 +161,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
161 161
162 /* Fall through to add to randomness */ 162 /* Fall through to add to randomness */
163 case IRQ_HANDLED: 163 case IRQ_HANDLED:
164 random |= action->flags; 164 flags |= action->flags;
165 break; 165 break;
166 166
167 default: 167 default:
@@ -172,8 +172,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
172 action = action->next; 172 action = action->next;
173 } while (action); 173 } while (action);
174 174
175 if (random & IRQF_SAMPLE_RANDOM) 175 add_interrupt_randomness(irq, flags);
176 add_interrupt_randomness(irq);
177 176
178 if (!noirqdebug) 177 if (!noirqdebug)
179 note_interrupt(irq, desc, retval); 178 note_interrupt(irq, desc, retval);
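The handle_irq_event_percpu() change above accumulates the flags of every handler that returned IRQ_HANDLED and hands the combined value to add_interrupt_randomness() once, after the loop. A rough user-space sketch of that accumulation, with made-up flag values and an add_entropy() stub standing in for the kernel call:

/*
 * Sketch of the flag accumulation handle_irq_event_percpu() now does:
 * OR together the flags of every handler that ran, then pass the
 * combined value to the entropy code once, after the whole chain.
 */
#include <stdio.h>

#define EXAMPLE_FLAG 0x80	/* example flag value, not the kernel's */

struct irqaction {
	unsigned int flags;
	int (*handler)(void);
	struct irqaction *next;
};

static void add_entropy(int irq, unsigned int flags)
{
	printf("irq %d: accumulated flags 0x%x\n", irq, flags);
}

static void handle_irq_event(int irq, struct irqaction *action)
{
	unsigned int flags = 0;

	do {
		if (action->handler() == 1)	/* 1 stands in for IRQ_HANDLED */
			flags |= action->flags;
		action = action->next;
	} while (action);

	add_entropy(irq, flags);		/* one call, after the loop */
}

static int handled(void) { return 1; }

int main(void)
{
	struct irqaction b = { .flags = 0x01, .handler = handled };
	struct irqaction a = { .flags = EXAMPLE_FLAG, .handler = handled, .next = &b };

	handle_irq_event(10, &a);
	return 0;
}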
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 814c9ef6bba1..0a8e8f059627 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -893,22 +893,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
893 return -ENOSYS; 893 return -ENOSYS;
894 if (!try_module_get(desc->owner)) 894 if (!try_module_get(desc->owner))
895 return -ENODEV; 895 return -ENODEV;
896 /*
897 * Some drivers like serial.c use request_irq() heavily,
898 * so we have to be careful not to interfere with a
899 * running system.
900 */
901 if (new->flags & IRQF_SAMPLE_RANDOM) {
902 /*
903 * This function might sleep, we want to call it first,
904 * outside of the atomic block.
905 * Yes, this might clear the entropy pool if the wrong
906 * driver is attempted to be loaded, without actually
907 * installing a new handler, but is this really a problem,
908 * only the sysadmin is able to do this.
909 */
910 rand_initialize_irq(irq);
911 }
912 896
913 /* 897 /*
914 * Check whether the interrupt nests into another interrupt 898 * Check whether the interrupt nests into another interrupt
@@ -1354,7 +1338,6 @@ EXPORT_SYMBOL(free_irq);
1354 * Flags: 1338 * Flags:
1355 * 1339 *
1356 * IRQF_SHARED Interrupt is shared 1340 * IRQF_SHARED Interrupt is shared
1357 * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy
1358 * IRQF_TRIGGER_* Specify active edge(s) or level 1341 * IRQF_TRIGGER_* Specify active edge(s) or level
1359 * 1342 *
1360 */ 1343 */
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 4e2e472f6aeb..0668d58d6413 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1424,7 +1424,7 @@ static void update_vmcoreinfo_note(void)
1424 1424
1425void crash_save_vmcoreinfo(void) 1425void crash_save_vmcoreinfo(void)
1426{ 1426{
1427 vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds()); 1427 vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
1428 update_vmcoreinfo_note(); 1428 update_vmcoreinfo_note();
1429} 1429}
1430 1430
diff --git a/kernel/kmod.c b/kernel/kmod.c
index ff2c7cb86d77..6f99aead66c6 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -45,6 +45,13 @@ extern int max_threads;
45 45
46static struct workqueue_struct *khelper_wq; 46static struct workqueue_struct *khelper_wq;
47 47
48/*
49 * kmod_thread_locker is used for deadlock avoidance. There is no explicit
50 * locking to protect this global - it is private to the singleton khelper
51 * thread and should only ever be modified by that thread.
52 */
53static const struct task_struct *kmod_thread_locker;
54
48#define CAP_BSET (void *)1 55#define CAP_BSET (void *)1
49#define CAP_PI (void *)2 56#define CAP_PI (void *)2
50 57
@@ -221,6 +228,13 @@ fail:
221 return 0; 228 return 0;
222} 229}
223 230
231static int call_helper(void *data)
232{
233 /* Worker thread started blocking khelper thread. */
234 kmod_thread_locker = current;
235 return ____call_usermodehelper(data);
236}
237
224static void call_usermodehelper_freeinfo(struct subprocess_info *info) 238static void call_usermodehelper_freeinfo(struct subprocess_info *info)
225{ 239{
226 if (info->cleanup) 240 if (info->cleanup)
@@ -295,9 +309,12 @@ static void __call_usermodehelper(struct work_struct *work)
295 if (wait == UMH_WAIT_PROC) 309 if (wait == UMH_WAIT_PROC)
296 pid = kernel_thread(wait_for_helper, sub_info, 310 pid = kernel_thread(wait_for_helper, sub_info,
297 CLONE_FS | CLONE_FILES | SIGCHLD); 311 CLONE_FS | CLONE_FILES | SIGCHLD);
298 else 312 else {
299 pid = kernel_thread(____call_usermodehelper, sub_info, 313 pid = kernel_thread(call_helper, sub_info,
300 CLONE_VFORK | SIGCHLD); 314 CLONE_VFORK | SIGCHLD);
315 /* Worker thread stopped blocking khelper thread. */
316 kmod_thread_locker = NULL;
317 }
301 318
302 switch (wait) { 319 switch (wait) {
303 case UMH_NO_WAIT: 320 case UMH_NO_WAIT:
@@ -548,6 +565,16 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
548 retval = -EBUSY; 565 retval = -EBUSY;
549 goto out; 566 goto out;
550 } 567 }
568 /*
569 * Worker thread must not wait for khelper thread at below
570 * wait_for_completion() if the thread was created with CLONE_VFORK
571 * flag, for khelper thread is already waiting for the thread at
572 * wait_for_completion() in do_fork().
573 */
574 if (wait != UMH_NO_WAIT && current == kmod_thread_locker) {
575 retval = -EBUSY;
576 goto out;
577 }
551 578
552 sub_info->complete = &done; 579 sub_info->complete = &done;
553 sub_info->wait = wait; 580 sub_info->wait = wait;
@@ -577,6 +604,12 @@ unlock:
577 return retval; 604 return retval;
578} 605}
579 606
607/*
608 * call_usermodehelper_fns() will not run the caller-provided cleanup function
609 * if a memory allocation failure is experienced. So the caller might need to
610 * check the call_usermodehelper_fns() return value: if it is -ENOMEM, perform
611 * the necessaary cleanup within the caller.
612 */
580int call_usermodehelper_fns( 613int call_usermodehelper_fns(
581 char *path, char **argv, char **envp, int wait, 614 char *path, char **argv, char **envp, int wait,
582 int (*init)(struct subprocess_info *info, struct cred *new), 615 int (*init)(struct subprocess_info *info, struct cred *new),
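The kmod.c changes above record which task the khelper thread is currently blocked on (kmod_thread_locker) and make call_usermodehelper_exec() bail out with -EBUSY when that same task submits another synchronous request, which would deadlock. A loose user-space analogue of that self-wait check, tracking thread identity with pthread_self() purely for illustration:

/*
 * Loose analogue of the kmod_thread_locker check: remember which
 * thread the single worker is currently blocked on, and refuse a
 * request that would make that same thread wait on itself.
 */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_t locker;
static int locker_valid;

static int submit_and_wait(void (*fn)(void))
{
	if (locker_valid && pthread_equal(locker, pthread_self()))
		return -EBUSY;		/* waiting here would deadlock */

	locker = pthread_self();
	locker_valid = 1;
	fn();				/* stand-in for running the helper */
	locker_valid = 0;
	return 0;
}

static void nested(void)
{
	/* A helper that tries to submit another request from inside. */
	int ret = submit_and_wait(NULL);

	printf("nested request returned %d (expected %d)\n", ret, -EBUSY);
}

int main(void)
{
	submit_and_wait(nested);
	return 0;
}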
diff --git a/kernel/panic.c b/kernel/panic.c
index d2a5f4ecc6dd..e1b2822fff97 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -75,6 +75,14 @@ void panic(const char *fmt, ...)
75 int state = 0; 75 int state = 0;
76 76
77 /* 77 /*
78 * Disable local interrupts. This will prevent panic_smp_self_stop
79 * from deadlocking the first cpu that invokes the panic, since
80 * there is nothing to prevent an interrupt handler (that runs
81 * after the panic_lock is acquired) from invoking panic again.
82 */
83 local_irq_disable();
84
85 /*
78 * It's possible to come here directly from a panic-assertion and 86 * It's possible to come here directly from a panic-assertion and
79 * not have preempt disabled. Some functions called from here want 87 * not have preempt disabled. Some functions called from here want
80 * preempt to be disabled. No point enabling it later though... 88 * preempt to be disabled. No point enabling it later though...
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index c8b7446b27df..1da39ea248fd 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -178,6 +178,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
178 arch_suspend_enable_irqs(); 178 arch_suspend_enable_irqs();
179 BUG_ON(irqs_disabled()); 179 BUG_ON(irqs_disabled());
180 180
181 /* Kick the lockup detector */
182 lockup_detector_bootcpu_resume();
183
181 Enable_cpus: 184 Enable_cpus:
182 enable_nonboot_cpus(); 185 enable_nonboot_cpus();
183 186
diff --git a/kernel/printk.c b/kernel/printk.c
index ac4bc9e79465..6a76ab9d4476 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -216,6 +216,7 @@ struct log {
216 */ 216 */
217static DEFINE_RAW_SPINLOCK(logbuf_lock); 217static DEFINE_RAW_SPINLOCK(logbuf_lock);
218 218
219#ifdef CONFIG_PRINTK
219/* the next printk record to read by syslog(READ) or /proc/kmsg */ 220/* the next printk record to read by syslog(READ) or /proc/kmsg */
220static u64 syslog_seq; 221static u64 syslog_seq;
221static u32 syslog_idx; 222static u32 syslog_idx;
@@ -228,14 +229,19 @@ static u32 log_first_idx;
228 229
229/* index and sequence number of the next record to store in the buffer */ 230/* index and sequence number of the next record to store in the buffer */
230static u64 log_next_seq; 231static u64 log_next_seq;
231#ifdef CONFIG_PRINTK
232static u32 log_next_idx; 232static u32 log_next_idx;
233 233
234/* the next printk record to write to the console */
235static u64 console_seq;
236static u32 console_idx;
237static enum log_flags console_prev;
238
234/* the next printk record to read after the last 'clear' command */ 239/* the next printk record to read after the last 'clear' command */
235static u64 clear_seq; 240static u64 clear_seq;
236static u32 clear_idx; 241static u32 clear_idx;
237 242
238#define LOG_LINE_MAX 1024 243#define PREFIX_MAX 32
244#define LOG_LINE_MAX 1024 - PREFIX_MAX
239 245
240/* record buffer */ 246/* record buffer */
241#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) 247#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
@@ -360,6 +366,7 @@ static void log_store(int facility, int level,
360struct devkmsg_user { 366struct devkmsg_user {
361 u64 seq; 367 u64 seq;
362 u32 idx; 368 u32 idx;
369 enum log_flags prev;
363 struct mutex lock; 370 struct mutex lock;
364 char buf[8192]; 371 char buf[8192];
365}; 372};
@@ -382,8 +389,10 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
382 389
383 line = buf; 390 line = buf;
384 for (i = 0; i < count; i++) { 391 for (i = 0; i < count; i++) {
385 if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) 392 if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) {
393 ret = -EFAULT;
386 goto out; 394 goto out;
395 }
387 line += iv[i].iov_len; 396 line += iv[i].iov_len;
388 } 397 }
389 398
@@ -425,6 +434,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
425 struct log *msg; 434 struct log *msg;
426 u64 ts_usec; 435 u64 ts_usec;
427 size_t i; 436 size_t i;
437 char cont = '-';
428 size_t len; 438 size_t len;
429 ssize_t ret; 439 ssize_t ret;
430 440
@@ -462,8 +472,25 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
462 msg = log_from_idx(user->idx); 472 msg = log_from_idx(user->idx);
463 ts_usec = msg->ts_nsec; 473 ts_usec = msg->ts_nsec;
464 do_div(ts_usec, 1000); 474 do_div(ts_usec, 1000);
465 len = sprintf(user->buf, "%u,%llu,%llu;", 475
466 (msg->facility << 3) | msg->level, user->seq, ts_usec); 476 /*
477 * If we couldn't merge continuation line fragments during the print,
478 * export the stored flags to allow an optional external merge of the
479 * records. Merging the records isn't always neccessarily correct, like
480 * when we hit a race during printing. In most cases though, it produces
481 * better readable output. 'c' in the record flags mark the first
482 * fragment of a line, '+' the following.
483 */
484 if (msg->flags & LOG_CONT && !(user->prev & LOG_CONT))
485 cont = 'c';
486 else if ((msg->flags & LOG_CONT) ||
487 ((user->prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)))
488 cont = '+';
489
490 len = sprintf(user->buf, "%u,%llu,%llu,%c;",
491 (msg->facility << 3) | msg->level,
492 user->seq, ts_usec, cont);
493 user->prev = msg->flags;
467 494
468 /* escape non-printable characters */ 495 /* escape non-printable characters */
469 for (i = 0; i < msg->text_len; i++) { 496 for (i = 0; i < msg->text_len; i++) {
@@ -646,6 +673,15 @@ void log_buf_kexec_setup(void)
646 VMCOREINFO_SYMBOL(log_buf_len); 673 VMCOREINFO_SYMBOL(log_buf_len);
647 VMCOREINFO_SYMBOL(log_first_idx); 674 VMCOREINFO_SYMBOL(log_first_idx);
648 VMCOREINFO_SYMBOL(log_next_idx); 675 VMCOREINFO_SYMBOL(log_next_idx);
676 /*
677 * Export struct log size and field offsets. User space tools can
678 * parse it and detect any changes to structure down the line.
679 */
680 VMCOREINFO_STRUCT_SIZE(log);
681 VMCOREINFO_OFFSET(log, ts_nsec);
682 VMCOREINFO_OFFSET(log, len);
683 VMCOREINFO_OFFSET(log, text_len);
684 VMCOREINFO_OFFSET(log, dict_len);
649} 685}
650#endif 686#endif
651 687
@@ -876,7 +912,7 @@ static size_t msg_print_text(const struct log *msg, enum log_flags prev,
876 912
877 if (buf) { 913 if (buf) {
878 if (print_prefix(msg, syslog, NULL) + 914 if (print_prefix(msg, syslog, NULL) +
879 text_len + 1>= size - len) 915 text_len + 1 >= size - len)
880 break; 916 break;
881 917
882 if (prefix) 918 if (prefix)
@@ -907,7 +943,7 @@ static int syslog_print(char __user *buf, int size)
907 struct log *msg; 943 struct log *msg;
908 int len = 0; 944 int len = 0;
909 945
910 text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); 946 text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
911 if (!text) 947 if (!text)
912 return -ENOMEM; 948 return -ENOMEM;
913 949
@@ -930,7 +966,8 @@ static int syslog_print(char __user *buf, int size)
930 966
931 skip = syslog_partial; 967 skip = syslog_partial;
932 msg = log_from_idx(syslog_idx); 968 msg = log_from_idx(syslog_idx);
933 n = msg_print_text(msg, syslog_prev, true, text, LOG_LINE_MAX); 969 n = msg_print_text(msg, syslog_prev, true, text,
970 LOG_LINE_MAX + PREFIX_MAX);
934 if (n - syslog_partial <= size) { 971 if (n - syslog_partial <= size) {
935 /* message fits into buffer, move forward */ 972 /* message fits into buffer, move forward */
936 syslog_idx = log_next(syslog_idx); 973 syslog_idx = log_next(syslog_idx);
@@ -969,7 +1006,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
969 char *text; 1006 char *text;
970 int len = 0; 1007 int len = 0;
971 1008
972 text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); 1009 text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
973 if (!text) 1010 if (!text)
974 return -ENOMEM; 1011 return -ENOMEM;
975 1012
@@ -1022,7 +1059,8 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
1022 struct log *msg = log_from_idx(idx); 1059 struct log *msg = log_from_idx(idx);
1023 int textlen; 1060 int textlen;
1024 1061
1025 textlen = msg_print_text(msg, prev, true, text, LOG_LINE_MAX); 1062 textlen = msg_print_text(msg, prev, true, text,
1063 LOG_LINE_MAX + PREFIX_MAX);
1026 if (textlen < 0) { 1064 if (textlen < 0) {
1027 len = textlen; 1065 len = textlen;
1028 break; 1066 break;
@@ -1349,20 +1387,36 @@ static struct cont {
1349 u64 ts_nsec; /* time of first print */ 1387 u64 ts_nsec; /* time of first print */
1350 u8 level; /* log level of first message */ 1388 u8 level; /* log level of first message */
1351 u8 facility; /* log level of first message */ 1389 u8 facility; /* log level of first message */
1390 enum log_flags flags; /* prefix, newline flags */
1352 bool flushed:1; /* buffer sealed and committed */ 1391 bool flushed:1; /* buffer sealed and committed */
1353} cont; 1392} cont;
1354 1393
1355static void cont_flush(void) 1394static void cont_flush(enum log_flags flags)
1356{ 1395{
1357 if (cont.flushed) 1396 if (cont.flushed)
1358 return; 1397 return;
1359 if (cont.len == 0) 1398 if (cont.len == 0)
1360 return; 1399 return;
1361 1400
1362 log_store(cont.facility, cont.level, LOG_NOCONS, cont.ts_nsec, 1401 if (cont.cons) {
1363 NULL, 0, cont.buf, cont.len); 1402 /*
1364 1403 * If a fragment of this line was directly flushed to the
1365 cont.flushed = true; 1404 * console; wait for the console to pick up the rest of the
1405 * line. LOG_NOCONS suppresses a duplicated output.
1406 */
1407 log_store(cont.facility, cont.level, flags | LOG_NOCONS,
1408 cont.ts_nsec, NULL, 0, cont.buf, cont.len);
1409 cont.flags = flags;
1410 cont.flushed = true;
1411 } else {
1412 /*
1413 * If no fragment of this line ever reached the console,
1414 * just submit it to the store and free the buffer.
1415 */
1416 log_store(cont.facility, cont.level, flags, 0,
1417 NULL, 0, cont.buf, cont.len);
1418 cont.len = 0;
1419 }
1366} 1420}
1367 1421
1368static bool cont_add(int facility, int level, const char *text, size_t len) 1422static bool cont_add(int facility, int level, const char *text, size_t len)
@@ -1371,7 +1425,8 @@ static bool cont_add(int facility, int level, const char *text, size_t len)
1371 return false; 1425 return false;
1372 1426
1373 if (cont.len + len > sizeof(cont.buf)) { 1427 if (cont.len + len > sizeof(cont.buf)) {
1374 cont_flush(); 1428 /* the line gets too long, split it up in separate records */
1429 cont_flush(LOG_CONT);
1375 return false; 1430 return false;
1376 } 1431 }
1377 1432
@@ -1380,12 +1435,17 @@ static bool cont_add(int facility, int level, const char *text, size_t len)
1380 cont.level = level; 1435 cont.level = level;
1381 cont.owner = current; 1436 cont.owner = current;
1382 cont.ts_nsec = local_clock(); 1437 cont.ts_nsec = local_clock();
1438 cont.flags = 0;
1383 cont.cons = 0; 1439 cont.cons = 0;
1384 cont.flushed = false; 1440 cont.flushed = false;
1385 } 1441 }
1386 1442
1387 memcpy(cont.buf + cont.len, text, len); 1443 memcpy(cont.buf + cont.len, text, len);
1388 cont.len += len; 1444 cont.len += len;
1445
1446 if (cont.len > (sizeof(cont.buf) * 80) / 100)
1447 cont_flush(LOG_CONT);
1448
1389 return true; 1449 return true;
1390} 1450}
1391 1451
@@ -1394,7 +1454,7 @@ static size_t cont_print_text(char *text, size_t size)
1394 size_t textlen = 0; 1454 size_t textlen = 0;
1395 size_t len; 1455 size_t len;
1396 1456
1397 if (cont.cons == 0) { 1457 if (cont.cons == 0 && (console_prev & LOG_NEWLINE)) {
1398 textlen += print_time(cont.ts_nsec, text); 1458 textlen += print_time(cont.ts_nsec, text);
1399 size -= textlen; 1459 size -= textlen;
1400 } 1460 }
@@ -1409,7 +1469,8 @@ static size_t cont_print_text(char *text, size_t size)
1409 } 1469 }
1410 1470
1411 if (cont.flushed) { 1471 if (cont.flushed) {
1412 text[textlen++] = '\n'; 1472 if (cont.flags & LOG_NEWLINE)
1473 text[textlen++] = '\n';
1413 /* got everything, release buffer */ 1474 /* got everything, release buffer */
1414 cont.len = 0; 1475 cont.len = 0;
1415 } 1476 }
@@ -1481,17 +1542,23 @@ asmlinkage int vprintk_emit(int facility, int level,
1481 lflags |= LOG_NEWLINE; 1542 lflags |= LOG_NEWLINE;
1482 } 1543 }
1483 1544
1484 /* strip syslog prefix and extract log level or control flags */ 1545 /* strip kernel syslog prefix and extract log level or control flags */
1485 if (text[0] == '<' && text[1] && text[2] == '>') { 1546 if (facility == 0) {
1486 switch (text[1]) { 1547 int kern_level = printk_get_level(text);
1487 case '0' ... '7': 1548
1488 if (level == -1) 1549 if (kern_level) {
1489 level = text[1] - '0'; 1550 const char *end_of_header = printk_skip_level(text);
1490 case 'd': /* KERN_DEFAULT */ 1551 switch (kern_level) {
1491 lflags |= LOG_PREFIX; 1552 case '0' ... '7':
1492 case 'c': /* KERN_CONT */ 1553 if (level == -1)
1493 text += 3; 1554 level = kern_level - '0';
1494 text_len -= 3; 1555 case 'd': /* KERN_DEFAULT */
1556 lflags |= LOG_PREFIX;
1557 case 'c': /* KERN_CONT */
1558 break;
1559 }
1560 text_len -= end_of_header - text;
1561 text = (char *)end_of_header;
1495 } 1562 }
1496 } 1563 }
1497 1564
@@ -1507,7 +1574,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1507 * or another task also prints continuation lines. 1574 * or another task also prints continuation lines.
1508 */ 1575 */
1509 if (cont.len && (lflags & LOG_PREFIX || cont.owner != current)) 1576 if (cont.len && (lflags & LOG_PREFIX || cont.owner != current))
1510 cont_flush(); 1577 cont_flush(LOG_NEWLINE);
1511 1578
1512 /* buffer line if possible, otherwise store it right away */ 1579 /* buffer line if possible, otherwise store it right away */
1513 if (!cont_add(facility, level, text, text_len)) 1580 if (!cont_add(facility, level, text, text_len))
@@ -1525,7 +1592,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1525 if (cont.len && cont.owner == current) { 1592 if (cont.len && cont.owner == current) {
1526 if (!(lflags & LOG_PREFIX)) 1593 if (!(lflags & LOG_PREFIX))
1527 stored = cont_add(facility, level, text, text_len); 1594 stored = cont_add(facility, level, text, text_len);
1528 cont_flush(); 1595 cont_flush(LOG_NEWLINE);
1529 } 1596 }
1530 1597
1531 if (!stored) 1598 if (!stored)
@@ -1616,9 +1683,20 @@ asmlinkage int printk(const char *fmt, ...)
1616} 1683}
1617EXPORT_SYMBOL(printk); 1684EXPORT_SYMBOL(printk);
1618 1685
1619#else 1686#else /* CONFIG_PRINTK */
1620 1687
1688#define LOG_LINE_MAX 0
1689#define PREFIX_MAX 0
1621#define LOG_LINE_MAX 0 1690#define LOG_LINE_MAX 0
1691static u64 syslog_seq;
1692static u32 syslog_idx;
1693static u64 console_seq;
1694static u32 console_idx;
1695static enum log_flags syslog_prev;
1696static u64 log_first_seq;
1697static u32 log_first_idx;
1698static u64 log_next_seq;
1699static enum log_flags console_prev;
1622static struct cont { 1700static struct cont {
1623 size_t len; 1701 size_t len;
1624 size_t cons; 1702 size_t cons;
@@ -1902,10 +1980,34 @@ void wake_up_klogd(void)
1902 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); 1980 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
1903} 1981}
1904 1982
1905/* the next printk record to write to the console */ 1983static void console_cont_flush(char *text, size_t size)
1906static u64 console_seq; 1984{
1907static u32 console_idx; 1985 unsigned long flags;
1908static enum log_flags console_prev; 1986 size_t len;
1987
1988 raw_spin_lock_irqsave(&logbuf_lock, flags);
1989
1990 if (!cont.len)
1991 goto out;
1992
1993 /*
1994 * We still queue earlier records, likely because the console was
1995 * busy. The earlier ones need to be printed before this one, we
1996 * did not flush any fragment so far, so just let it queue up.
1997 */
1998 if (console_seq < log_next_seq && !cont.cons)
1999 goto out;
2000
2001 len = cont_print_text(text, size);
2002 raw_spin_unlock(&logbuf_lock);
2003 stop_critical_timings();
2004 call_console_drivers(cont.level, text, len);
2005 start_critical_timings();
2006 local_irq_restore(flags);
2007 return;
2008out:
2009 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2010}
1909 2011
1910/** 2012/**
1911 * console_unlock - unlock the console system 2013 * console_unlock - unlock the console system
@@ -1923,7 +2025,7 @@ static enum log_flags console_prev;
1923 */ 2025 */
1924void console_unlock(void) 2026void console_unlock(void)
1925{ 2027{
1926 static char text[LOG_LINE_MAX]; 2028 static char text[LOG_LINE_MAX + PREFIX_MAX];
1927 static u64 seen_seq; 2029 static u64 seen_seq;
1928 unsigned long flags; 2030 unsigned long flags;
1929 bool wake_klogd = false; 2031 bool wake_klogd = false;
@@ -1937,19 +2039,7 @@ void console_unlock(void)
1937 console_may_schedule = 0; 2039 console_may_schedule = 0;
1938 2040
1939 /* flush buffered message fragment immediately to console */ 2041 /* flush buffered message fragment immediately to console */
1940 raw_spin_lock_irqsave(&logbuf_lock, flags); 2042 console_cont_flush(text, sizeof(text));
1941 if (cont.len && (cont.cons < cont.len || cont.flushed)) {
1942 size_t len;
1943
1944 len = cont_print_text(text, sizeof(text));
1945 raw_spin_unlock(&logbuf_lock);
1946 stop_critical_timings();
1947 call_console_drivers(cont.level, text, len);
1948 start_critical_timings();
1949 local_irq_restore(flags);
1950 } else
1951 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1952
1953again: 2043again:
1954 for (;;) { 2044 for (;;) {
1955 struct log *msg; 2045 struct log *msg;
@@ -1986,6 +2076,7 @@ skip:
1986 * will properly dump everything later. 2076 * will properly dump everything later.
1987 */ 2077 */
1988 msg->flags &= ~LOG_NOCONS; 2078 msg->flags &= ~LOG_NOCONS;
2079 console_prev = msg->flags;
1989 goto skip; 2080 goto skip;
1990 } 2081 }
1991 2082
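Among the printk changes above, devkmsg_read() now exports a continuation marker per record: '-' for a self-contained message, 'c' for the first fragment of a continuation line, '+' for follow-on fragments. A small sketch of that flag-to-character mapping, with illustrative flag values rather than the kernel's enum log_flags:

/*
 * Sketch of the continuation marker devkmsg_read() now emits, driven
 * by the current record's flags and the flags of the previous record.
 * The flag values below are illustrative only.
 */
#include <stdio.h>

enum log_flags {
	LOG_NEWLINE = 1,	/* record ends a line */
	LOG_PREFIX  = 2,	/* record starts a new line */
	LOG_CONT    = 4,	/* record is a fragment of a line */
};

static char cont_char(enum log_flags prev, enum log_flags cur)
{
	if ((cur & LOG_CONT) && !(prev & LOG_CONT))
		return 'c';			/* first fragment */
	if ((cur & LOG_CONT) ||
	    ((prev & LOG_CONT) && !(cur & LOG_PREFIX)))
		return '+';			/* follow-on fragment */
	return '-';				/* ordinary record */
}

int main(void)
{
	/* A full line, then a line emitted as two fragments plus its end. */
	enum log_flags records[] = { LOG_NEWLINE, LOG_CONT, LOG_CONT, LOG_NEWLINE };
	enum log_flags prev = LOG_NEWLINE;
	unsigned int i;

	for (i = 0; i < sizeof(records) / sizeof(records[0]); i++) {
		printf("record %u: %c\n", i, cont_char(prev, records[i]));
		prev = records[i];
	}
	return 0;
}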
diff --git a/kernel/resource.c b/kernel/resource.c
index dc8b47764443..34d45886ee84 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -7,6 +7,8 @@
7 * Arbitrary resource management. 7 * Arbitrary resource management.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/export.h> 12#include <linux/export.h>
11#include <linux/errno.h> 13#include <linux/errno.h>
12#include <linux/ioport.h> 14#include <linux/ioport.h>
@@ -791,8 +793,28 @@ void __init reserve_region_with_split(struct resource *root,
791 resource_size_t start, resource_size_t end, 793 resource_size_t start, resource_size_t end,
792 const char *name) 794 const char *name)
793{ 795{
796 int abort = 0;
797
794 write_lock(&resource_lock); 798 write_lock(&resource_lock);
795 __reserve_region_with_split(root, start, end, name); 799 if (root->start > start || root->end < end) {
800 pr_err("requested range [0x%llx-0x%llx] not in root %pr\n",
801 (unsigned long long)start, (unsigned long long)end,
802 root);
803 if (start > root->end || end < root->start)
804 abort = 1;
805 else {
806 if (end > root->end)
807 end = root->end;
808 if (start < root->start)
809 start = root->start;
810 pr_err("fixing request to [0x%llx-0x%llx]\n",
811 (unsigned long long)start,
812 (unsigned long long)end);
813 }
814 dump_stack();
815 }
816 if (!abort)
817 __reserve_region_with_split(root, start, end, name);
796 write_unlock(&resource_lock); 818 write_unlock(&resource_lock);
797} 819}
798 820
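The reserve_region_with_split() change above refuses requests that lie entirely outside the root resource and trims requests that only partially overlap it before reserving. A small standalone sketch of that clamp-or-abort step, using plain integers instead of struct resource (illustrative only):

#include <stdio.h>

/* Clamp [start, end] into [root_start, root_end].
 * Returns 0 and updates *start/*end on (possibly partial) overlap,
 * -1 if the request lies entirely outside the root range. */
static int clamp_into_root(unsigned long long root_start,
			   unsigned long long root_end,
			   unsigned long long *start,
			   unsigned long long *end)
{
	if (*start > root_end || *end < root_start)
		return -1;			/* no overlap: abort */
	if (*end > root_end)
		*end = root_end;		/* trim the tail */
	if (*start < root_start)
		*start = root_start;		/* trim the head */
	return 0;
}

int main(void)
{
	unsigned long long s = 0x90000, e = 0x120000;

	if (clamp_into_root(0x100000, 0x1fffff, &s, &e) == 0)
		printf("fixing request to [0x%llx-0x%llx]\n", s, e);
	return 0;
}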
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 468bdd44c1ba..d325c4b2dcbb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1096,7 +1096,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1096 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. 1096 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
1097 * 1097 *
1098 * sched_move_task() holds both and thus holding either pins the cgroup, 1098 * sched_move_task() holds both and thus holding either pins the cgroup,
1099 * see set_task_rq(). 1099 * see task_group().
1100 * 1100 *
1101 * Furthermore, all task_rq users should acquire both locks, see 1101 * Furthermore, all task_rq users should acquire both locks, see
1102 * task_rq_lock(). 1102 * task_rq_lock().
@@ -1910,12 +1910,12 @@ static inline void
1910prepare_task_switch(struct rq *rq, struct task_struct *prev, 1910prepare_task_switch(struct rq *rq, struct task_struct *prev,
1911 struct task_struct *next) 1911 struct task_struct *next)
1912{ 1912{
1913 trace_sched_switch(prev, next);
1913 sched_info_switch(prev, next); 1914 sched_info_switch(prev, next);
1914 perf_event_task_sched_out(prev, next); 1915 perf_event_task_sched_out(prev, next);
1915 fire_sched_out_preempt_notifiers(prev, next); 1916 fire_sched_out_preempt_notifiers(prev, next);
1916 prepare_lock_switch(rq, next); 1917 prepare_lock_switch(rq, next);
1917 prepare_arch_switch(next); 1918 prepare_arch_switch(next);
1918 trace_sched_switch(prev, next);
1919} 1919}
1920 1920
1921/** 1921/**
@@ -6024,6 +6024,11 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
6024 * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this 6024 * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
6025 * allows us to avoid some pointer chasing in select_idle_sibling(). 6025 * allows us to avoid some pointer chasing in select_idle_sibling().
6026 * 6026 *
6027 * Iterate domains and sched_groups downward, assigning CPUs to be
6028 * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing
6029 * due to random perturbation self canceling, ie sw buddies pull
6030 * their counterpart to their CPU's hw counterpart.
6031 *
6027 * Also keep a unique ID per domain (we use the first cpu number in 6032 * Also keep a unique ID per domain (we use the first cpu number in
6028 * the cpumask of the domain), this allows us to quickly tell if 6033 * the cpumask of the domain), this allows us to quickly tell if
6029 * two cpus are in the same cache domain, see cpus_share_cache(). 6034 * two cpus are in the same cache domain, see cpus_share_cache().
@@ -6037,8 +6042,40 @@ static void update_top_cache_domain(int cpu)
6037 int id = cpu; 6042 int id = cpu;
6038 6043
6039 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); 6044 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
6040 if (sd) 6045 if (sd) {
6046 struct sched_domain *tmp = sd;
6047 struct sched_group *sg, *prev;
6048 bool right;
6049
6050 /*
6051 * Traverse to first CPU in group, and count hops
6052 * to cpu from there, switching direction on each
6053 * hop, never ever pointing the last CPU rightward.
6054 */
6055 do {
6056 id = cpumask_first(sched_domain_span(tmp));
6057 prev = sg = tmp->groups;
6058 right = 1;
6059
6060 while (cpumask_first(sched_group_cpus(sg)) != id)
6061 sg = sg->next;
6062
6063 while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
6064 prev = sg;
6065 sg = sg->next;
6066 right = !right;
6067 }
6068
6069 /* A CPU went down, never point back to domain start. */
6070 if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
6071 right = false;
6072
6073 sg = right ? sg->next : prev;
6074 tmp->idle_buddy = cpumask_first(sched_group_cpus(sg));
6075 } while ((tmp = tmp->child));
6076
6041 id = cpumask_first(sched_domain_span(sd)); 6077 id = cpumask_first(sched_domain_span(sd));
6078 }
6042 6079
6043 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 6080 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
6044 per_cpu(sd_llc_id, cpu) = id; 6081 per_cpu(sd_llc_id, cpu) = id;
@@ -7097,34 +7134,66 @@ match2:
7097 mutex_unlock(&sched_domains_mutex); 7134 mutex_unlock(&sched_domains_mutex);
7098} 7135}
7099 7136
7137static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */
7138
7100/* 7139/*
7101 * Update cpusets according to cpu_active mask. If cpusets are 7140 * Update cpusets according to cpu_active mask. If cpusets are
7102 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 7141 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
7103 * around partition_sched_domains(). 7142 * around partition_sched_domains().
7143 *
7144 * If we come here as part of a suspend/resume, don't touch cpusets because we
7145 * want to restore it back to its original state upon resume anyway.
7104 */ 7146 */
7105static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, 7147static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
7106 void *hcpu) 7148 void *hcpu)
7107{ 7149{
7108 switch (action & ~CPU_TASKS_FROZEN) { 7150 switch (action) {
7151 case CPU_ONLINE_FROZEN:
7152 case CPU_DOWN_FAILED_FROZEN:
7153
7154 /*
7155 * num_cpus_frozen tracks how many CPUs are involved in suspend
7156 * resume sequence. As long as this is not the last online
7157 * operation in the resume sequence, just build a single sched
7158 * domain, ignoring cpusets.
7159 */
7160 num_cpus_frozen--;
7161 if (likely(num_cpus_frozen)) {
7162 partition_sched_domains(1, NULL, NULL);
7163 break;
7164 }
7165
7166 /*
7167 * This is the last CPU online operation. So fall through and
7168 * restore the original sched domains by considering the
7169 * cpuset configurations.
7170 */
7171
7109 case CPU_ONLINE: 7172 case CPU_ONLINE:
7110 case CPU_DOWN_FAILED: 7173 case CPU_DOWN_FAILED:
7111 cpuset_update_active_cpus(); 7174 cpuset_update_active_cpus(true);
7112 return NOTIFY_OK; 7175 break;
7113 default: 7176 default:
7114 return NOTIFY_DONE; 7177 return NOTIFY_DONE;
7115 } 7178 }
7179 return NOTIFY_OK;
7116} 7180}
7117 7181
7118static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, 7182static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
7119 void *hcpu) 7183 void *hcpu)
7120{ 7184{
7121 switch (action & ~CPU_TASKS_FROZEN) { 7185 switch (action) {
7122 case CPU_DOWN_PREPARE: 7186 case CPU_DOWN_PREPARE:
7123 cpuset_update_active_cpus(); 7187 cpuset_update_active_cpus(false);
7124 return NOTIFY_OK; 7188 break;
7189 case CPU_DOWN_PREPARE_FROZEN:
7190 num_cpus_frozen++;
7191 partition_sched_domains(1, NULL, NULL);
7192 break;
7125 default: 7193 default:
7126 return NOTIFY_DONE; 7194 return NOTIFY_DONE;
7127 } 7195 }
7196 return NOTIFY_OK;
7128} 7197}
7129 7198
7130void __init sched_init_smp(void) 7199void __init sched_init_smp(void)
@@ -7589,6 +7658,7 @@ void sched_destroy_group(struct task_group *tg)
7589 */ 7658 */
7590void sched_move_task(struct task_struct *tsk) 7659void sched_move_task(struct task_struct *tsk)
7591{ 7660{
7661 struct task_group *tg;
7592 int on_rq, running; 7662 int on_rq, running;
7593 unsigned long flags; 7663 unsigned long flags;
7594 struct rq *rq; 7664 struct rq *rq;
@@ -7603,6 +7673,12 @@ void sched_move_task(struct task_struct *tsk)
7603 if (unlikely(running)) 7673 if (unlikely(running))
7604 tsk->sched_class->put_prev_task(rq, tsk); 7674 tsk->sched_class->put_prev_task(rq, tsk);
7605 7675
7676 tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id,
7677 lockdep_is_held(&tsk->sighand->siglock)),
7678 struct task_group, css);
7679 tg = autogroup_task_group(tsk, tg);
7680 tsk->sched_task_group = tg;
7681
7606#ifdef CONFIG_FAIR_GROUP_SCHED 7682#ifdef CONFIG_FAIR_GROUP_SCHED
7607 if (tsk->sched_class->task_move_group) 7683 if (tsk->sched_class->task_move_group)
7608 tsk->sched_class->task_move_group(tsk, on_rq); 7684 tsk->sched_class->task_move_group(tsk, on_rq);
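The cpuset_cpu_active()/cpuset_cpu_inactive() notifiers above count how many CPUs were taken down with the _FROZEN variants so that, during resume, only the final CPU-online event rebuilds the cpuset-derived sched domains; every earlier event just keeps a single flat domain. A toy model of that counter driven by an invented event sequence (the enum and return convention below are made up for illustration):

#include <stdio.h>

enum hp_event { DOWN_FROZEN, UP_FROZEN, UP_NORMAL };

static int num_cpus_frozen;

/* Returns 1 when the full cpuset configuration should be rebuilt,
 * 0 when a single flat sched domain is enough for now. */
static int handle_event(enum hp_event ev)
{
	switch (ev) {
	case DOWN_FROZEN:
		num_cpus_frozen++;
		return 0;			/* suspend path: flat domain */
	case UP_FROZEN:
		num_cpus_frozen--;
		if (num_cpus_frozen)
			return 0;		/* more CPUs still to come up */
		/* fall through: last CPU of the resume, restore cpusets */
	case UP_NORMAL:
		return 1;
	}
	return 0;
}

int main(void)
{
	enum hp_event seq[] = { DOWN_FROZEN, DOWN_FROZEN, DOWN_FROZEN,
				UP_FROZEN, UP_FROZEN, UP_FROZEN };

	for (unsigned i = 0; i < sizeof(seq) / sizeof(seq[0]); i++)
		printf("event %u -> rebuild cpusets: %d\n",
		       i, handle_event(seq[i]));
	return 0;
}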
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c099cc6eebe3..22321db64952 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2637,8 +2637,6 @@ static int select_idle_sibling(struct task_struct *p, int target)
2637 int cpu = smp_processor_id(); 2637 int cpu = smp_processor_id();
2638 int prev_cpu = task_cpu(p); 2638 int prev_cpu = task_cpu(p);
2639 struct sched_domain *sd; 2639 struct sched_domain *sd;
2640 struct sched_group *sg;
2641 int i;
2642 2640
2643 /* 2641 /*
2644 * If the task is going to be woken-up on this cpu and if it is 2642 * If the task is going to be woken-up on this cpu and if it is
@@ -2655,29 +2653,17 @@ static int select_idle_sibling(struct task_struct *p, int target)
2655 return prev_cpu; 2653 return prev_cpu;
2656 2654
2657 /* 2655 /*
2658 * Otherwise, iterate the domains and find an eligible idle cpu. 2656 * Otherwise, check assigned siblings to find an eligible idle cpu.
2659 */ 2657 */
2660 sd = rcu_dereference(per_cpu(sd_llc, target)); 2658 sd = rcu_dereference(per_cpu(sd_llc, target));
2661 for_each_lower_domain(sd) {
2662 sg = sd->groups;
2663 do {
2664 if (!cpumask_intersects(sched_group_cpus(sg),
2665 tsk_cpus_allowed(p)))
2666 goto next;
2667
2668 for_each_cpu(i, sched_group_cpus(sg)) {
2669 if (!idle_cpu(i))
2670 goto next;
2671 }
2672 2659
2673 target = cpumask_first_and(sched_group_cpus(sg), 2660 for_each_lower_domain(sd) {
2674 tsk_cpus_allowed(p)); 2661 if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p)))
2675 goto done; 2662 continue;
2676next: 2663 if (idle_cpu(sd->idle_buddy))
2677 sg = sg->next; 2664 return sd->idle_buddy;
2678 } while (sg != sd->groups);
2679 } 2665 }
2680done: 2666
2681 return target; 2667 return target;
2682} 2668}
2683 2669
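After the select_idle_sibling() hunk above, wake-up placement no longer walks every group of every lower domain; it only tests the idle_buddy CPU recorded per domain level. A compact sketch of that lookup, with affinity and idleness faked by plain arrays (toy_domain, cpu_allowed and cpu_idle are invented stand-ins):

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 8

/* One entry per domain level, from the LLC domain downward. */
struct toy_domain {
	int idle_buddy;		/* buddy CPU chosen when the domain was built */
};

static bool cpu_allowed[NR_CPUS];	/* stand-in for tsk_cpus_allowed() */
static bool cpu_idle[NR_CPUS];		/* stand-in for idle_cpu() */

/* Check only the pre-computed buddy of each level instead of scanning
 * every group; fall back to the original target if none is idle. */
static int pick_idle_sibling(const struct toy_domain *levels, int nr, int target)
{
	for (int i = 0; i < nr; i++) {
		int buddy = levels[i].idle_buddy;

		if (!cpu_allowed[buddy])
			continue;
		if (cpu_idle[buddy])
			return buddy;
	}
	return target;
}

int main(void)
{
	struct toy_domain levels[] = { { .idle_buddy = 3 }, { .idle_buddy = 1 } };

	for (int i = 0; i < NR_CPUS; i++)
		cpu_allowed[i] = true;
	cpu_idle[1] = true;			/* only CPU 1 is idle */

	printf("chosen cpu: %d\n", pick_idle_sibling(levels, 2, 0));
	return 0;
}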
@@ -3068,16 +3054,19 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3068 3054
3069#define LBF_ALL_PINNED 0x01 3055#define LBF_ALL_PINNED 0x01
3070#define LBF_NEED_BREAK 0x02 3056#define LBF_NEED_BREAK 0x02
3057#define LBF_SOME_PINNED 0x04
3071 3058
3072struct lb_env { 3059struct lb_env {
3073 struct sched_domain *sd; 3060 struct sched_domain *sd;
3074 3061
3075 int src_cpu;
3076 struct rq *src_rq; 3062 struct rq *src_rq;
3063 int src_cpu;
3077 3064
3078 int dst_cpu; 3065 int dst_cpu;
3079 struct rq *dst_rq; 3066 struct rq *dst_rq;
3080 3067
3068 struct cpumask *dst_grpmask;
3069 int new_dst_cpu;
3081 enum cpu_idle_type idle; 3070 enum cpu_idle_type idle;
3082 long imbalance; 3071 long imbalance;
3083 unsigned int flags; 3072 unsigned int flags;
@@ -3145,9 +3134,31 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3145 * 3) are cache-hot on their current CPU. 3134 * 3) are cache-hot on their current CPU.
3146 */ 3135 */
3147 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { 3136 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
3137 int new_dst_cpu;
3138
3148 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 3139 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3140
3141 /*
3142 * Remember if this task can be migrated to any other cpu in
3143 * our sched_group. We may want to revisit it if we couldn't
3144 * meet load balance goals by pulling other tasks on src_cpu.
3145 *
3146 * Also avoid computing new_dst_cpu if we have already computed
3147 * one in the current iteration.
3148 */
3149 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
3150 return 0;
3151
3152 new_dst_cpu = cpumask_first_and(env->dst_grpmask,
3153 tsk_cpus_allowed(p));
3154 if (new_dst_cpu < nr_cpu_ids) {
3155 env->flags |= LBF_SOME_PINNED;
3156 env->new_dst_cpu = new_dst_cpu;
3157 }
3149 return 0; 3158 return 0;
3150 } 3159 }
3160
3161 /* Record that we found at least one task that could run on dst_cpu */
3151 env->flags &= ~LBF_ALL_PINNED; 3162 env->flags &= ~LBF_ALL_PINNED;
3152 3163
3153 if (task_running(env->src_rq, p)) { 3164 if (task_running(env->src_rq, p)) {
@@ -4227,7 +4238,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4227 struct sched_domain *sd, enum cpu_idle_type idle, 4238 struct sched_domain *sd, enum cpu_idle_type idle,
4228 int *balance) 4239 int *balance)
4229{ 4240{
4230 int ld_moved, active_balance = 0; 4241 int ld_moved, cur_ld_moved, active_balance = 0;
4242 int lb_iterations, max_lb_iterations;
4231 struct sched_group *group; 4243 struct sched_group *group;
4232 struct rq *busiest; 4244 struct rq *busiest;
4233 unsigned long flags; 4245 unsigned long flags;
@@ -4237,11 +4249,13 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4237 .sd = sd, 4249 .sd = sd,
4238 .dst_cpu = this_cpu, 4250 .dst_cpu = this_cpu,
4239 .dst_rq = this_rq, 4251 .dst_rq = this_rq,
4252 .dst_grpmask = sched_group_cpus(sd->groups),
4240 .idle = idle, 4253 .idle = idle,
4241 .loop_break = sched_nr_migrate_break, 4254 .loop_break = sched_nr_migrate_break,
4242 }; 4255 };
4243 4256
4244 cpumask_copy(cpus, cpu_active_mask); 4257 cpumask_copy(cpus, cpu_active_mask);
4258 max_lb_iterations = cpumask_weight(env.dst_grpmask);
4245 4259
4246 schedstat_inc(sd, lb_count[idle]); 4260 schedstat_inc(sd, lb_count[idle]);
4247 4261
@@ -4267,6 +4281,7 @@ redo:
4267 schedstat_add(sd, lb_imbalance[idle], env.imbalance); 4281 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
4268 4282
4269 ld_moved = 0; 4283 ld_moved = 0;
4284 lb_iterations = 1;
4270 if (busiest->nr_running > 1) { 4285 if (busiest->nr_running > 1) {
4271 /* 4286 /*
4272 * Attempt to move tasks. If find_busiest_group has found 4287 * Attempt to move tasks. If find_busiest_group has found
@@ -4284,7 +4299,13 @@ more_balance:
4284 double_rq_lock(this_rq, busiest); 4299 double_rq_lock(this_rq, busiest);
4285 if (!env.loop) 4300 if (!env.loop)
4286 update_h_load(env.src_cpu); 4301 update_h_load(env.src_cpu);
4287 ld_moved += move_tasks(&env); 4302
4303 /*
4304 * cur_ld_moved - load moved in current iteration
4305 * ld_moved - cumulative load moved across iterations
4306 */
4307 cur_ld_moved = move_tasks(&env);
4308 ld_moved += cur_ld_moved;
4288 double_rq_unlock(this_rq, busiest); 4309 double_rq_unlock(this_rq, busiest);
4289 local_irq_restore(flags); 4310 local_irq_restore(flags);
4290 4311
@@ -4296,14 +4317,52 @@ more_balance:
4296 /* 4317 /*
4297 * some other cpu did the load balance for us. 4318 * some other cpu did the load balance for us.
4298 */ 4319 */
4299 if (ld_moved && this_cpu != smp_processor_id()) 4320 if (cur_ld_moved && env.dst_cpu != smp_processor_id())
4300 resched_cpu(this_cpu); 4321 resched_cpu(env.dst_cpu);
4322
4323 /*
4324 * Revisit (affine) tasks on src_cpu that couldn't be moved to
4325 * us and move them to an alternate dst_cpu in our sched_group
4326 * where they can run. The upper limit on how many times we
4327 * iterate on same src_cpu is dependent on number of cpus in our
4328 * sched_group.
4329 *
4330 * This changes load balance semantics a bit on who can move
4331 * load to a given_cpu. In addition to the given_cpu itself
4332 * (or an ilb_cpu acting on its behalf where given_cpu is
4333 * nohz-idle), we now have balance_cpu in a position to move
4334 * load to given_cpu. In rare situations, this may cause
4335 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
4336 * _independently_ and at _same_ time to move some load to
4337 * given_cpu) causing excess load to be moved to given_cpu.
4338 * This however should not happen so much in practice and
4339 * moreover subsequent load balance cycles should correct the
4340 * excess load moved.
4341 */
4342 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
4343 lb_iterations++ < max_lb_iterations) {
4344
4345 this_rq = cpu_rq(env.new_dst_cpu);
4346 env.dst_rq = this_rq;
4347 env.dst_cpu = env.new_dst_cpu;
4348 env.flags &= ~LBF_SOME_PINNED;
4349 env.loop = 0;
4350 env.loop_break = sched_nr_migrate_break;
4351 /*
4352 * Go back to "more_balance" rather than "redo" since we
4353 * need to continue with same src_cpu.
4354 */
4355 goto more_balance;
4356 }
4301 4357
4302 /* All tasks on this runqueue were pinned by CPU affinity */ 4358 /* All tasks on this runqueue were pinned by CPU affinity */
4303 if (unlikely(env.flags & LBF_ALL_PINNED)) { 4359 if (unlikely(env.flags & LBF_ALL_PINNED)) {
4304 cpumask_clear_cpu(cpu_of(busiest), cpus); 4360 cpumask_clear_cpu(cpu_of(busiest), cpus);
4305 if (!cpumask_empty(cpus)) 4361 if (!cpumask_empty(cpus)) {
4362 env.loop = 0;
4363 env.loop_break = sched_nr_migrate_break;
4306 goto redo; 4364 goto redo;
4365 }
4307 goto out_balanced; 4366 goto out_balanced;
4308 } 4367 }
4309 } 4368 }
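The LBF_SOME_PINNED logic above lets load_balance() retry the same busy src_cpu with an alternate destination CPU from its own group when affinity-pinned tasks blocked the first attempt, bounded by the group size. A simplified standalone model of that retry loop (toy_task, pull_to() and the fixed-size group are all invented for the sketch; no runqueue locking is modelled):

#include <stdio.h>

#define GRP_SIZE 3

/* Toy tasks on a busy source CPU, pinned to a subset of the destination group. */
struct toy_task {
	int load;
	int allowed[GRP_SIZE];		/* allowed[i]: task may run on group CPU i */
	int moved;
};

static struct toy_task tasks[] = {
	{ .load = 2, .allowed = { 1, 1, 1 } },
	{ .load = 3, .allowed = { 0, 1, 0 } },	/* pinned away from CPU 0 */
};

/* Pull up to 'imbalance' load to group CPU 'dst'.  If a task cannot run
 * on dst but can run elsewhere in the group, remember that CPU so the
 * caller can retry with a different destination (the LBF_SOME_PINNED idea). */
static int pull_to(int dst, int imbalance, int *new_dst)
{
	int moved = 0;

	for (size_t i = 0; i < sizeof(tasks) / sizeof(tasks[0]); i++) {
		struct toy_task *p = &tasks[i];

		if (p->moved || moved >= imbalance)
			continue;
		if (!p->allowed[dst]) {
			for (int c = 0; c < GRP_SIZE; c++)
				if (p->allowed[c]) {
					*new_dst = c;	/* some pinned task fits here */
					break;
				}
			continue;
		}
		p->moved = 1;
		moved += p->load;
	}
	return moved;
}

int main(void)
{
	int dst = 0, imbalance = 5, iterations = 0, new_dst = -1;

	while (imbalance > 0 && iterations++ < GRP_SIZE) {
		imbalance -= pull_to(dst, imbalance, &new_dst);
		if (imbalance > 0 && new_dst >= 0 && new_dst != dst) {
			dst = new_dst;		/* retry with an alternate dst in the group */
			new_dst = -1;
		} else {
			break;
		}
	}
	printf("remaining imbalance: %d\n", imbalance);
	return 0;
}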
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 55844f24435a..c35a1a7dd4d6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -538,22 +538,19 @@ extern int group_balance_cpu(struct sched_group *sg);
538/* 538/*
539 * Return the group to which this task belongs. 539 * Return the group to which this task belongs.
540 * 540 *
541 * We use task_subsys_state_check() and extend the RCU verification with 541 * We cannot use task_subsys_state() and friends because the cgroup
542 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each 542 * subsystem changes that value before the cgroup_subsys::attach() method
543 * task it moves into the cgroup. Therefore by holding either of those locks, 543 * is called, therefore we cannot pin it and might observe the wrong value.
544 * we pin the task to the current cgroup. 544 *
545 * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
546 * core changes this before calling sched_move_task().
547 *
548 * Instead we use a 'copy' which is updated from sched_move_task() while
549 * holding both task_struct::pi_lock and rq::lock.
545 */ 550 */
546static inline struct task_group *task_group(struct task_struct *p) 551static inline struct task_group *task_group(struct task_struct *p)
547{ 552{
548 struct task_group *tg; 553 return p->sched_task_group;
549 struct cgroup_subsys_state *css;
550
551 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
552 lockdep_is_held(&p->pi_lock) ||
553 lockdep_is_held(&task_rq(p)->lock));
554 tg = container_of(css, struct task_group, css);
555
556 return autogroup_task_group(p, tg);
557} 554}
558 555
559/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 556/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
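The rewritten task_group() above just returns a cached pointer that sched_move_task() updates while holding both ->pi_lock and the runqueue lock, so any reader holding either lock sees a stable value. A minimal illustration of that cache-under-both-locks pattern, with pthread mutexes standing in for the two kernel locks (purely a sketch, not the scheduler code):

#include <pthread.h>
#include <stdio.h>

struct toy_group { const char *name; };

static struct toy_group groups[] = { { "root" }, { "autogroup-17" } };

struct toy_task {
	pthread_mutex_t pi_lock;	/* stand-in for the task's pi_lock */
	pthread_mutex_t rq_lock;	/* stand-in for the runqueue lock */
	struct toy_group *cached_group;	/* like p->sched_task_group */
};

/* Writer: update the cached group while holding both locks, so a reader
 * holding either one of them sees a stable value. */
static void move_task(struct toy_task *p, struct toy_group *tg)
{
	pthread_mutex_lock(&p->pi_lock);
	pthread_mutex_lock(&p->rq_lock);
	p->cached_group = tg;
	pthread_mutex_unlock(&p->rq_lock);
	pthread_mutex_unlock(&p->pi_lock);
}

/* Reader: a caller that already holds pi_lock or rq_lock may simply
 * dereference the cached pointer. */
static struct toy_group *task_group(struct toy_task *p)
{
	return p->cached_group;
}

int main(void)
{
	static struct toy_task t = {
		.pi_lock = PTHREAD_MUTEX_INITIALIZER,
		.rq_lock = PTHREAD_MUTEX_INITIALIZER,
		.cached_group = &groups[0],
	};

	move_task(&t, &groups[1]);
	printf("task group: %s\n", task_group(&t)->name);
	return 0;
}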
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 671f9594e368..b73e681df09e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -210,6 +210,14 @@ asmlinkage void __do_softirq(void)
210 __u32 pending; 210 __u32 pending;
211 int max_restart = MAX_SOFTIRQ_RESTART; 211 int max_restart = MAX_SOFTIRQ_RESTART;
212 int cpu; 212 int cpu;
213 unsigned long old_flags = current->flags;
214
215 /*
216 * Mask out PF_MEMALLOC as the current task context is borrowed for the
217 * softirq. A softirq handler such as network RX might set PF_MEMALLOC
218 * again if the socket is related to swap
219 */
220 current->flags &= ~PF_MEMALLOC;
213 221
214 pending = local_softirq_pending(); 222 pending = local_softirq_pending();
215 account_system_vtime(current); 223 account_system_vtime(current);
@@ -265,6 +273,7 @@ restart:
265 273
266 account_system_vtime(current); 274 account_system_vtime(current);
267 __local_bh_enable(SOFTIRQ_OFFSET); 275 __local_bh_enable(SOFTIRQ_OFFSET);
276 tsk_restore_flags(current, old_flags, PF_MEMALLOC);
268} 277}
269 278
270#ifndef __ARCH_HAS_DO_SOFTIRQ 279#ifndef __ARCH_HAS_DO_SOFTIRQ
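The __do_softirq() hunk above snapshots the task flags, clears PF_MEMALLOC while softirq handlers borrow the task context, and afterwards restores only that one bit, leaving any other flag changes intact. A standalone sketch of that restore-one-bit-from-a-snapshot idea, which is what tsk_restore_flags() amounts to (the flag values and helper name here are invented):

#include <stdio.h>

#define TF_MEMALLOC	0x0800		/* made-up flag bit */
#define TF_KTHREAD	0x0020

/* Restore only the bits in 'mask' from 'saved', leaving every other
 * bit at its current value. */
static unsigned long restore_flags(unsigned long cur, unsigned long saved,
				   unsigned long mask)
{
	return (cur & ~mask) | (saved & mask);
}

int main(void)
{
	unsigned long flags = TF_MEMALLOC | TF_KTHREAD;
	unsigned long old_flags = flags;

	flags &= ~TF_MEMALLOC;			/* borrow the context: mask it out */
	flags |= 0x0001;			/* unrelated bit changes meanwhile */
	flags = restore_flags(flags, old_flags, TF_MEMALLOC);

	printf("memalloc restored: %d, other bit kept: %d\n",
	       !!(flags & TF_MEMALLOC), !!(flags & 0x0001));
	return 0;
}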
diff --git a/kernel/sys.c b/kernel/sys.c
index 2d39a84cd857..241507f23eca 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2015,7 +2015,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2015 break; 2015 break;
2016 } 2016 }
2017 me->pdeath_signal = arg2; 2017 me->pdeath_signal = arg2;
2018 error = 0;
2019 break; 2018 break;
2020 case PR_GET_PDEATHSIG: 2019 case PR_GET_PDEATHSIG:
2021 error = put_user(me->pdeath_signal, (int __user *)arg2); 2020 error = put_user(me->pdeath_signal, (int __user *)arg2);
@@ -2029,7 +2028,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2029 break; 2028 break;
2030 } 2029 }
2031 set_dumpable(me->mm, arg2); 2030 set_dumpable(me->mm, arg2);
2032 error = 0;
2033 break; 2031 break;
2034 2032
2035 case PR_SET_UNALIGN: 2033 case PR_SET_UNALIGN:
@@ -2056,10 +2054,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2056 case PR_SET_TIMING: 2054 case PR_SET_TIMING:
2057 if (arg2 != PR_TIMING_STATISTICAL) 2055 if (arg2 != PR_TIMING_STATISTICAL)
2058 error = -EINVAL; 2056 error = -EINVAL;
2059 else
2060 error = 0;
2061 break; 2057 break;
2062
2063 case PR_SET_NAME: 2058 case PR_SET_NAME:
2064 comm[sizeof(me->comm)-1] = 0; 2059 comm[sizeof(me->comm)-1] = 0;
2065 if (strncpy_from_user(comm, (char __user *)arg2, 2060 if (strncpy_from_user(comm, (char __user *)arg2,
@@ -2067,20 +2062,19 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2067 return -EFAULT; 2062 return -EFAULT;
2068 set_task_comm(me, comm); 2063 set_task_comm(me, comm);
2069 proc_comm_connector(me); 2064 proc_comm_connector(me);
2070 return 0; 2065 break;
2071 case PR_GET_NAME: 2066 case PR_GET_NAME:
2072 get_task_comm(comm, me); 2067 get_task_comm(comm, me);
2073 if (copy_to_user((char __user *)arg2, comm, 2068 if (copy_to_user((char __user *)arg2, comm,
2074 sizeof(comm))) 2069 sizeof(comm)))
2075 return -EFAULT; 2070 return -EFAULT;
2076 return 0; 2071 break;
2077 case PR_GET_ENDIAN: 2072 case PR_GET_ENDIAN:
2078 error = GET_ENDIAN(me, arg2); 2073 error = GET_ENDIAN(me, arg2);
2079 break; 2074 break;
2080 case PR_SET_ENDIAN: 2075 case PR_SET_ENDIAN:
2081 error = SET_ENDIAN(me, arg2); 2076 error = SET_ENDIAN(me, arg2);
2082 break; 2077 break;
2083
2084 case PR_GET_SECCOMP: 2078 case PR_GET_SECCOMP:
2085 error = prctl_get_seccomp(); 2079 error = prctl_get_seccomp();
2086 break; 2080 break;
@@ -2108,7 +2102,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2108 current->default_timer_slack_ns; 2102 current->default_timer_slack_ns;
2109 else 2103 else
2110 current->timer_slack_ns = arg2; 2104 current->timer_slack_ns = arg2;
2111 error = 0;
2112 break; 2105 break;
2113 case PR_MCE_KILL: 2106 case PR_MCE_KILL:
2114 if (arg4 | arg5) 2107 if (arg4 | arg5)
@@ -2134,7 +2127,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2134 default: 2127 default:
2135 return -EINVAL; 2128 return -EINVAL;
2136 } 2129 }
2137 error = 0;
2138 break; 2130 break;
2139 case PR_MCE_KILL_GET: 2131 case PR_MCE_KILL_GET:
2140 if (arg2 | arg3 | arg4 | arg5) 2132 if (arg2 | arg3 | arg4 | arg5)
@@ -2153,7 +2145,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2153 break; 2145 break;
2154 case PR_SET_CHILD_SUBREAPER: 2146 case PR_SET_CHILD_SUBREAPER:
2155 me->signal->is_child_subreaper = !!arg2; 2147 me->signal->is_child_subreaper = !!arg2;
2156 error = 0;
2157 break; 2148 break;
2158 case PR_GET_CHILD_SUBREAPER: 2149 case PR_GET_CHILD_SUBREAPER:
2159 error = put_user(me->signal->is_child_subreaper, 2150 error = put_user(me->signal->is_child_subreaper,
@@ -2195,46 +2186,52 @@ static void argv_cleanup(struct subprocess_info *info)
2195 argv_free(info->argv); 2186 argv_free(info->argv);
2196} 2187}
2197 2188
2198/** 2189static int __orderly_poweroff(void)
2199 * orderly_poweroff - Trigger an orderly system poweroff
2200 * @force: force poweroff if command execution fails
2201 *
2202 * This may be called from any context to trigger a system shutdown.
2203 * If the orderly shutdown fails, it will force an immediate shutdown.
2204 */
2205int orderly_poweroff(bool force)
2206{ 2190{
2207 int argc; 2191 int argc;
2208 char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); 2192 char **argv;
2209 static char *envp[] = { 2193 static char *envp[] = {
2210 "HOME=/", 2194 "HOME=/",
2211 "PATH=/sbin:/bin:/usr/sbin:/usr/bin", 2195 "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
2212 NULL 2196 NULL
2213 }; 2197 };
2214 int ret = -ENOMEM; 2198 int ret;
2215 2199
2200 argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc);
2216 if (argv == NULL) { 2201 if (argv == NULL) {
2217 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", 2202 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
2218 __func__, poweroff_cmd); 2203 __func__, poweroff_cmd);
2219 goto out; 2204 return -ENOMEM;
2220 } 2205 }
2221 2206
2222 ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT, 2207 ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT,
2223 NULL, argv_cleanup, NULL); 2208 NULL, argv_cleanup, NULL);
2224out:
2225 if (likely(!ret))
2226 return 0;
2227
2228 if (ret == -ENOMEM) 2209 if (ret == -ENOMEM)
2229 argv_free(argv); 2210 argv_free(argv);
2230 2211
2231 if (force) { 2212 return ret;
2213}
2214
2215/**
2216 * orderly_poweroff - Trigger an orderly system poweroff
2217 * @force: force poweroff if command execution fails
2218 *
2219 * This may be called from any context to trigger a system shutdown.
2220 * If the orderly shutdown fails, it will force an immediate shutdown.
2221 */
2222int orderly_poweroff(bool force)
2223{
2224 int ret = __orderly_poweroff();
2225
2226 if (ret && force) {
2232 printk(KERN_WARNING "Failed to start orderly shutdown: " 2227 printk(KERN_WARNING "Failed to start orderly shutdown: "
2233 "forcing the issue\n"); 2228 "forcing the issue\n");
2234 2229
2235 /* I guess this should try to kick off some daemon to 2230 /*
2236 sync and poweroff asap. Or not even bother syncing 2231 * I guess this should try to kick off some daemon to sync and
2237 if we're doing an emergency shutdown? */ 2232 * poweroff asap. Or not even bother syncing if we're doing an
2233 * emergency shutdown?
2234 */
2238 emergency_sync(); 2235 emergency_sync();
2239 kernel_power_off(); 2236 kernel_power_off();
2240 } 2237 }
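orderly_poweroff() above becomes a thin wrapper: launching the userspace command moves into __orderly_poweroff(), and the force-fallback now keys off that helper's return value alone. A toy version of the same split, with the usermode helper replaced by a stub (all names invented):

#include <stdbool.h>
#include <stdio.h>

/* Stub for spawning the userspace poweroff command. */
static int run_poweroff_cmd(bool simulate_failure)
{
	return simulate_failure ? -1 : 0;
}

/* Mirror of the new structure: the helper only reports success or
 * failure, and the caller alone decides whether to force the issue. */
static void toy_orderly_poweroff(bool force, bool simulate_failure)
{
	int ret = run_poweroff_cmd(simulate_failure);

	if (ret && force) {
		printf("Failed to start orderly shutdown: forcing the issue\n");
		/* the real code would sync and power off at this point */
	}
}

int main(void)
{
	toy_orderly_poweroff(true, true);
	return 0;
}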
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4ab11879aeb4..6502d35a25ba 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -30,6 +30,7 @@
30#include <linux/security.h> 30#include <linux/security.h>
31#include <linux/ctype.h> 31#include <linux/ctype.h>
32#include <linux/kmemcheck.h> 32#include <linux/kmemcheck.h>
33#include <linux/kmemleak.h>
33#include <linux/fs.h> 34#include <linux/fs.h>
34#include <linux/init.h> 35#include <linux/init.h>
35#include <linux/kernel.h> 36#include <linux/kernel.h>
@@ -174,6 +175,11 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
174 void __user *buffer, size_t *lenp, loff_t *ppos); 175 void __user *buffer, size_t *lenp, loff_t *ppos);
175#endif 176#endif
176 177
178static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
179 void __user *buffer, size_t *lenp, loff_t *ppos);
180static int proc_dostring_coredump(struct ctl_table *table, int write,
181 void __user *buffer, size_t *lenp, loff_t *ppos);
182
177#ifdef CONFIG_MAGIC_SYSRQ 183#ifdef CONFIG_MAGIC_SYSRQ
178/* Note: sysrq code uses its own private copy */ 184/* Note: sysrq code uses its own private copy */
179static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; 185static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
@@ -410,7 +416,7 @@ static struct ctl_table kern_table[] = {
410 .data = core_pattern, 416 .data = core_pattern,
411 .maxlen = CORENAME_MAX_SIZE, 417 .maxlen = CORENAME_MAX_SIZE,
412 .mode = 0644, 418 .mode = 0644,
413 .proc_handler = proc_dostring, 419 .proc_handler = proc_dostring_coredump,
414 }, 420 },
415 { 421 {
416 .procname = "core_pipe_limit", 422 .procname = "core_pipe_limit",
@@ -1095,11 +1101,9 @@ static struct ctl_table vm_table[] = {
1095 .extra1 = &zero, 1101 .extra1 = &zero,
1096 }, 1102 },
1097 { 1103 {
1098 .procname = "nr_pdflush_threads", 1104 .procname = "nr_pdflush_threads",
1099 .data = &nr_pdflush_threads, 1105 .mode = 0444 /* read-only */,
1100 .maxlen = sizeof nr_pdflush_threads, 1106 .proc_handler = pdflush_proc_obsolete,
1101 .mode = 0444 /* read-only*/,
1102 .proc_handler = proc_dointvec,
1103 }, 1107 },
1104 { 1108 {
1105 .procname = "swappiness", 1109 .procname = "swappiness",
@@ -1498,7 +1502,7 @@ static struct ctl_table fs_table[] = {
1498 .data = &suid_dumpable, 1502 .data = &suid_dumpable,
1499 .maxlen = sizeof(int), 1503 .maxlen = sizeof(int),
1500 .mode = 0644, 1504 .mode = 0644,
1501 .proc_handler = proc_dointvec_minmax, 1505 .proc_handler = proc_dointvec_minmax_coredump,
1502 .extra1 = &zero, 1506 .extra1 = &zero,
1503 .extra2 = &two, 1507 .extra2 = &two,
1504 }, 1508 },
@@ -1551,7 +1555,10 @@ static struct ctl_table dev_table[] = {
1551 1555
1552int __init sysctl_init(void) 1556int __init sysctl_init(void)
1553{ 1557{
1554 register_sysctl_table(sysctl_base_table); 1558 struct ctl_table_header *hdr;
1559
1560 hdr = register_sysctl_table(sysctl_base_table);
1561 kmemleak_not_leak(hdr);
1555 return 0; 1562 return 0;
1556} 1563}
1557 1564
@@ -2009,6 +2016,34 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
2009 do_proc_dointvec_minmax_conv, &param); 2016 do_proc_dointvec_minmax_conv, &param);
2010} 2017}
2011 2018
2019static void validate_coredump_safety(void)
2020{
2021 if (suid_dumpable == SUID_DUMPABLE_SAFE &&
2022 core_pattern[0] != '/' && core_pattern[0] != '|') {
2023 printk(KERN_WARNING "Unsafe core_pattern used with "\
2024 "suid_dumpable=2. Pipe handler or fully qualified "\
2025 "core dump path required.\n");
2026 }
2027}
2028
2029static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
2030 void __user *buffer, size_t *lenp, loff_t *ppos)
2031{
2032 int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
2033 if (!error)
2034 validate_coredump_safety();
2035 return error;
2036}
2037
2038static int proc_dostring_coredump(struct ctl_table *table, int write,
2039 void __user *buffer, size_t *lenp, loff_t *ppos)
2040{
2041 int error = proc_dostring(table, write, buffer, lenp, ppos);
2042 if (!error)
2043 validate_coredump_safety();
2044 return error;
2045}
2046
2012static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, 2047static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
2013 void __user *buffer, 2048 void __user *buffer,
2014 size_t *lenp, loff_t *ppos, 2049 size_t *lenp, loff_t *ppos,
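The two coredump proc handlers above simply delegate to the generic handlers and, on a successful write, run validate_coredump_safety() so that an unsafe core_pattern/suid_dumpable combination is reported immediately. A tiny userspace sketch of that wrap-then-validate pattern (the setters and the hard-coded value 2 standing in for SUID_DUMPABLE_SAFE are assumptions of the sketch):

#include <stdio.h>

static char core_pattern[64] = "core";
static int suid_dumpable;

/* Warn when the combination of the two settings is unsafe. */
static void validate_coredump_safety(void)
{
	if (suid_dumpable == 2 &&
	    core_pattern[0] != '/' && core_pattern[0] != '|')
		printf("warning: unsafe core_pattern used with suid_dumpable=2\n");
}

/* Generic "handler" plus a wrapper that validates after a write. */
static int set_pattern(const char *val)
{
	snprintf(core_pattern, sizeof(core_pattern), "%s", val);
	return 0;
}

static int set_pattern_coredump(const char *val)
{
	int error = set_pattern(val);

	if (!error)
		validate_coredump_safety();
	return error;
}

int main(void)
{
	suid_dumpable = 2;
	set_pattern_coredump("core.%p");		/* relative path: warns */
	set_pattern_coredump("/var/crash/core.%p");	/* absolute path: silent */
	return 0;
}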
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index a650694883a1..65bdcf198d4e 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -147,7 +147,7 @@ static const struct bin_table bin_vm_table[] = {
147 { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" }, 147 { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" },
148 /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */ 148 /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */
149 /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */ 149 /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */
150 { CTL_INT, VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" }, 150 /* VM_NR_PDFLUSH_THREADS "nr_pdflush_threads" no longer used */
151 { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" }, 151 { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" },
152 /* VM_PAGEBUF unused */ 152 /* VM_PAGEBUF unused */
153 /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */ 153 /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index e66046456f4f..d0a32796550f 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -436,6 +436,11 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
436 436
437 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, 437 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS,
438 sizeof(struct cgroupstats)); 438 sizeof(struct cgroupstats));
439 if (na == NULL) {
440 rc = -EMSGSIZE;
441 goto err;
442 }
443
439 stats = nla_data(na); 444 stats = nla_data(na);
440 memset(stats, 0, sizeof(*stats)); 445 memset(stats, 0, sizeof(*stats));
441 446
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a120f98c4112..5c38c81496ce 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3187,10 +3187,10 @@ static int tracing_set_tracer(const char *buf)
3187 } 3187 }
3188 destroy_trace_option_files(topts); 3188 destroy_trace_option_files(topts);
3189 3189
3190 current_trace = t; 3190 current_trace = &nop_trace;
3191 3191
3192 topts = create_trace_option_files(current_trace); 3192 topts = create_trace_option_files(t);
3193 if (current_trace->use_max_tr) { 3193 if (t->use_max_tr) {
3194 int cpu; 3194 int cpu;
3195 /* we need to make per cpu buffer sizes equivalent */ 3195 /* we need to make per cpu buffer sizes equivalent */
3196 for_each_tracing_cpu(cpu) { 3196 for_each_tracing_cpu(cpu) {
@@ -3210,6 +3210,7 @@ static int tracing_set_tracer(const char *buf)
3210 goto out; 3210 goto out;
3211 } 3211 }
3212 3212
3213 current_trace = t;
3213 trace_branch_enable(tr); 3214 trace_branch_enable(tr);
3214 out: 3215 out:
3215 mutex_unlock(&trace_types_lock); 3216 mutex_unlock(&trace_types_lock);
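The tracing_set_tracer() reorder above parks current_trace on the nop tracer while buffers and option files are rebuilt, and only publishes the new tracer once that setup has succeeded, so readers never see a half-initialized tracer. A small sketch of that publish-last ordering with invented types:

#include <stdio.h>

struct toy_tracer {
	const char *name;
	int (*init)(void);		/* may allocate per-cpu buffers, etc. */
};

static int blk_init(void)
{
	printf("setting up blk tracer state\n");
	return 0;
}

static struct toy_tracer nop_tracer = { "nop", NULL };
static struct toy_tracer blk_tracer = { "blk", blk_init };

/* What readers of the trace files currently see. */
static struct toy_tracer *current_tracer = &nop_tracer;

static int set_tracer(struct toy_tracer *t)
{
	/* Park readers on the safe nop tracer while state is rebuilt. */
	current_tracer = &nop_tracer;

	if (t->init && t->init())
		return -1;		/* on failure, readers still see nop */

	current_tracer = t;		/* publish only after successful init */
	return 0;
}

int main(void)
{
	set_tracer(&blk_tracer);
	printf("current tracer: %s\n", current_tracer->name);
	return 0;
}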
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index c7b0c6a7db09..a426f410c060 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -13,6 +13,7 @@
13#include <linux/debugfs.h> 13#include <linux/debugfs.h>
14#include <linux/uaccess.h> 14#include <linux/uaccess.h>
15#include <linux/ftrace.h> 15#include <linux/ftrace.h>
16#include <linux/pstore.h>
16#include <linux/fs.h> 17#include <linux/fs.h>
17 18
18#include "trace.h" 19#include "trace.h"
@@ -74,6 +75,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
74 preempt_enable_notrace(); 75 preempt_enable_notrace();
75} 76}
76 77
78/* Our two options */
79enum {
80 TRACE_FUNC_OPT_STACK = 0x1,
81 TRACE_FUNC_OPT_PSTORE = 0x2,
82};
83
84static struct tracer_flags func_flags;
85
77static void 86static void
78function_trace_call(unsigned long ip, unsigned long parent_ip) 87function_trace_call(unsigned long ip, unsigned long parent_ip)
79{ 88{
@@ -97,6 +106,12 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
97 disabled = atomic_inc_return(&data->disabled); 106 disabled = atomic_inc_return(&data->disabled);
98 107
99 if (likely(disabled == 1)) { 108 if (likely(disabled == 1)) {
109 /*
110 * So far tracing doesn't support multiple buffers, so
111 * we make an explicit call for now.
112 */
113 if (unlikely(func_flags.val & TRACE_FUNC_OPT_PSTORE))
114 pstore_ftrace_call(ip, parent_ip);
100 pc = preempt_count(); 115 pc = preempt_count();
101 trace_function(tr, ip, parent_ip, flags, pc); 116 trace_function(tr, ip, parent_ip, flags, pc);
102 } 117 }
@@ -158,15 +173,13 @@ static struct ftrace_ops trace_stack_ops __read_mostly =
158 .flags = FTRACE_OPS_FL_GLOBAL, 173 .flags = FTRACE_OPS_FL_GLOBAL,
159}; 174};
160 175
161/* Our two options */
162enum {
163 TRACE_FUNC_OPT_STACK = 0x1,
164};
165
166static struct tracer_opt func_opts[] = { 176static struct tracer_opt func_opts[] = {
167#ifdef CONFIG_STACKTRACE 177#ifdef CONFIG_STACKTRACE
168 { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, 178 { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
169#endif 179#endif
180#ifdef CONFIG_PSTORE_FTRACE
181 { TRACER_OPT(func_pstore, TRACE_FUNC_OPT_PSTORE) },
182#endif
170 { } /* Always set a last empty entry */ 183 { } /* Always set a last empty entry */
171}; 184};
172 185
@@ -204,10 +217,11 @@ static void tracing_stop_function_trace(void)
204 217
205static int func_set_flag(u32 old_flags, u32 bit, int set) 218static int func_set_flag(u32 old_flags, u32 bit, int set)
206{ 219{
207 if (bit == TRACE_FUNC_OPT_STACK) { 220 switch (bit) {
221 case TRACE_FUNC_OPT_STACK:
208 /* do nothing if already set */ 222 /* do nothing if already set */
209 if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) 223 if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK))
210 return 0; 224 break;
211 225
212 if (set) { 226 if (set) {
213 unregister_ftrace_function(&trace_ops); 227 unregister_ftrace_function(&trace_ops);
@@ -217,10 +231,14 @@ static int func_set_flag(u32 old_flags, u32 bit, int set)
217 register_ftrace_function(&trace_ops); 231 register_ftrace_function(&trace_ops);
218 } 232 }
219 233
220 return 0; 234 break;
235 case TRACE_FUNC_OPT_PSTORE:
236 break;
237 default:
238 return -EINVAL;
221 } 239 }
222 240
223 return -EINVAL; 241 return 0;
224} 242}
225 243
226static struct tracer function_trace __read_mostly = 244static struct tracer function_trace __read_mostly =
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 4b1dfba70f7c..69add8a9da68 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -575,7 +575,7 @@ out:
575/* 575/*
576 * Create/destroy watchdog threads as CPUs come and go: 576 * Create/destroy watchdog threads as CPUs come and go:
577 */ 577 */
578static int __cpuinit 578static int
579cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 579cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
580{ 580{
581 int hotcpu = (unsigned long)hcpu; 581 int hotcpu = (unsigned long)hcpu;
@@ -610,10 +610,27 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
610 return NOTIFY_OK; 610 return NOTIFY_OK;
611} 611}
612 612
613static struct notifier_block __cpuinitdata cpu_nfb = { 613static struct notifier_block cpu_nfb = {
614 .notifier_call = cpu_callback 614 .notifier_call = cpu_callback
615}; 615};
616 616
617#ifdef CONFIG_SUSPEND
618/*
619 * On exit from suspend we force an offline->online transition on the boot CPU
620 * so that the PMU state that was lost while in suspended state gets set up
621 * properly for the boot CPU. This information is required for restarting the
622 * NMI watchdog.
623 */
624void lockup_detector_bootcpu_resume(void)
625{
626 void *cpu = (void *)(long)smp_processor_id();
627
628 cpu_callback(&cpu_nfb, CPU_DEAD_FROZEN, cpu);
629 cpu_callback(&cpu_nfb, CPU_UP_PREPARE_FROZEN, cpu);
630 cpu_callback(&cpu_nfb, CPU_ONLINE_FROZEN, cpu);
631}
632#endif
633
617void __init lockup_detector_init(void) 634void __init lockup_detector_init(void)
618{ 635{
619 void *cpu = (void *)(long)smp_processor_id(); 636 void *cpu = (void *)(long)smp_processor_id();