Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile                     |    1
-rw-r--r--  kernel/audit.c                      |    8
-rw-r--r--  kernel/auditfilter.c                |   10
-rw-r--r--  kernel/bounds.c                     |    2
-rw-r--r--  kernel/capability.c                 |   96
-rw-r--r--  kernel/cgroup.c                     |   14
-rw-r--r--  kernel/cpu.c                        |   11
-rw-r--r--  kernel/cpuset.c                     |   80
-rw-r--r--  kernel/crash_dump.c                 |   34
-rw-r--r--  kernel/cred.c                       |    6
-rw-r--r--  kernel/debug/gdbstub.c              |   30
-rw-r--r--  kernel/exit.c                       |    1
-rw-r--r--  kernel/fork.c                       |  155
-rw-r--r--  kernel/futex.c                      |   15
-rw-r--r--  kernel/futex_compat.c               |   11
-rw-r--r--  kernel/gcov/Kconfig                 |    2
-rw-r--r--  kernel/gcov/Makefile                |    2
-rw-r--r--  kernel/groups.c                     |    2
-rw-r--r--  kernel/irq/irqdesc.c                |   14
-rw-r--r--  kernel/irq/manage.c                 |    2
-rw-r--r--  kernel/irq/proc.c                   |    3
-rw-r--r--  kernel/kallsyms.c                   |   58
-rw-r--r--  kernel/kthread.c                    |   31
-rw-r--r--  kernel/lockdep_proc.c               |    9
-rw-r--r--  kernel/module.c                     |    4
-rw-r--r--  kernel/nsproxy.c                    |    4
-rw-r--r--  kernel/panic.c                      |   10
-rw-r--r--  kernel/perf_event.c                 |   32
-rw-r--r--  kernel/pid.c                        |    2
-rw-r--r--  kernel/pid_namespace.c              |   11
-rw-r--r--  kernel/pm_qos_params.c              |   24
-rw-r--r--  kernel/power/Kconfig                |  237
-rw-r--r--  kernel/power/Makefile               |    3
-rw-r--r--  kernel/power/block_io.c             |    2
-rw-r--r--  kernel/power/hibernate.c            |    9
-rw-r--r--  kernel/power/main.c                 |    3
-rw-r--r--  kernel/power/snapshot.c             |    8
-rw-r--r--  kernel/power/suspend.c              |    4
-rw-r--r--  kernel/printk.c                     |  174
-rw-r--r--  kernel/ptrace.c                     |   27
-rw-r--r--  kernel/rcupdate.c                   |   10
-rw-r--r--  kernel/rcutiny_plugin.h             |    2
-rw-r--r--  kernel/rcutorture.c                 |    1
-rw-r--r--  kernel/res_counter.c                |   14
-rw-r--r--  kernel/sched.c                      |   30
-rw-r--r--  kernel/signal.c                     |   46
-rw-r--r--  kernel/smp.c                        |  152
-rw-r--r--  kernel/softirq.c                    |    5
-rw-r--r--  kernel/stop_machine.c               |    6
-rw-r--r--  kernel/sys.c                        |   81
-rw-r--r--  kernel/sysctl.c                     |   40
-rw-r--r--  kernel/sysctl_check.c               |   10
-rw-r--r--  kernel/taskstats.c                  |    2
-rw-r--r--  kernel/trace/Kconfig                |    4
-rw-r--r--  kernel/trace/blktrace.c             |   15
-rw-r--r--  kernel/trace/ftrace.c               |    3
-rw-r--r--  kernel/trace/ring_buffer.c          |    2
-rw-r--r--  kernel/trace/trace_events_filter.c  |    2
-rw-r--r--  kernel/uid16.c                      |    2
-rw-r--r--  kernel/user.c                       |    8
-rw-r--r--  kernel/utsname.c                    |   12
-rw-r--r--  kernel/watchdog.c                   |   27
-rw-r--r--  kernel/workqueue.c                  |   12
63 files changed, 1061 insertions(+), 576 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 353d3fe8ba33..85cbfb31e73e 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -107,6 +107,7 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
 obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
 obj-$(CONFIG_PADATA) += padata.o
+obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/audit.c b/kernel/audit.c
index e4956244ae50..939500317066 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -74,6 +74,8 @@ static int audit_initialized;
 int audit_enabled;
 int audit_ever_enabled;
 
+EXPORT_SYMBOL_GPL(audit_enabled);
+
 /* Default state when kernel boots without any parameters. */
 static int audit_default;
 
@@ -671,9 +673,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 
        pid = NETLINK_CREDS(skb)->pid;
        uid = NETLINK_CREDS(skb)->uid;
-       loginuid = NETLINK_CB(skb).loginuid;
-       sessionid = NETLINK_CB(skb).sessionid;
-       sid = NETLINK_CB(skb).sid;
+       loginuid = audit_get_loginuid(current);
+       sessionid = audit_get_sessionid(current);
+       security_task_getsecid(current, &sid);
        seq = nlh->nlmsg_seq;
        data = NLMSG_DATA(nlh);
 
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index add2819af71b..f8277c80d678 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1238,6 +1238,7 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
        for (i = 0; i < rule->field_count; i++) {
                struct audit_field *f = &rule->fields[i];
                int result = 0;
+               u32 sid;
 
                switch (f->type) {
                case AUDIT_PID:
@@ -1250,19 +1251,22 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
                        result = audit_comparator(cb->creds.gid, f->op, f->val);
                        break;
                case AUDIT_LOGINUID:
-                       result = audit_comparator(cb->loginuid, f->op, f->val);
+                       result = audit_comparator(audit_get_loginuid(current),
+                                                 f->op, f->val);
                        break;
                case AUDIT_SUBJ_USER:
                case AUDIT_SUBJ_ROLE:
                case AUDIT_SUBJ_TYPE:
                case AUDIT_SUBJ_SEN:
                case AUDIT_SUBJ_CLR:
-                       if (f->lsm_rule)
-                               result = security_audit_rule_match(cb->sid,
+                       if (f->lsm_rule) {
+                               security_task_getsecid(current, &sid);
+                               result = security_audit_rule_match(sid,
                                                                   f->type,
                                                                   f->op,
                                                                   f->lsm_rule,
                                                                   NULL);
+                       }
                        break;
                }
 
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 98a51f26c136..0c9b862292b2 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -9,11 +9,13 @@
 #include <linux/page-flags.h>
 #include <linux/mmzone.h>
 #include <linux/kbuild.h>
+#include <linux/page_cgroup.h>
 
 void foo(void)
 {
        /* The enum constants to put into include/generated/bounds.h */
        DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
        DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
+       DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
        /* End of constants */
 }
diff --git a/kernel/capability.c b/kernel/capability.c
index 9e9385f132c8..bf0c734d0c12 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -14,6 +14,7 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
 #include <asm/uaccess.h>
 
 /*
@@ -290,6 +291,60 @@ error:
 }
 
 /**
+ * has_capability - Does a task have a capability in init_user_ns
+ * @t: The task in question
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect to the initial user namespace, false if not.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */
+bool has_capability(struct task_struct *t, int cap)
+{
+       int ret = security_real_capable(t, &init_user_ns, cap);
+
+       return (ret == 0);
+}
+
+/**
+ * has_ns_capability - Does a task have a capability in a specific user ns
+ * @t: The task in question
+ * @ns: target user namespace
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect to the specified user namespace, false if not.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */
+bool has_ns_capability(struct task_struct *t,
+                      struct user_namespace *ns, int cap)
+{
+       int ret = security_real_capable(t, ns, cap);
+
+       return (ret == 0);
+}
+
+/**
+ * has_capability_noaudit - Does a task have a capability (unaudited)
+ * @t: The task in question
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect to init_user_ns, false if not. Don't write an
+ * audit message for the check.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */
+bool has_capability_noaudit(struct task_struct *t, int cap)
+{
+       int ret = security_real_capable_noaudit(t, &init_user_ns, cap);
+
+       return (ret == 0);
+}
+
+/**
  * capable - Determine if the current task has a superior capability in effect
  * @cap: The capability to be tested for
  *
@@ -299,17 +354,48 @@ error:
  * This sets PF_SUPERPRIV on the task if the capability is available on the
  * assumption that it's about to be used.
  */
-int capable(int cap)
+bool capable(int cap)
+{
+       return ns_capable(&init_user_ns, cap);
+}
+EXPORT_SYMBOL(capable);
+
+/**
+ * ns_capable - Determine if the current task has a superior capability in effect
+ * @ns: The usernamespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+bool ns_capable(struct user_namespace *ns, int cap)
 {
        if (unlikely(!cap_valid(cap))) {
                printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap);
                BUG();
        }
 
-       if (security_capable(current_cred(), cap) == 0) {
+       if (security_capable(ns, current_cred(), cap) == 0) {
                current->flags |= PF_SUPERPRIV;
-               return 1;
+               return true;
        }
-       return 0;
+       return false;
 }
-EXPORT_SYMBOL(capable);
+EXPORT_SYMBOL(ns_capable);
+
+/**
+ * task_ns_capable - Determine whether current task has a superior
+ * capability targeted at a specific task's user namespace.
+ * @t: The task whose user namespace is targeted.
+ * @cap: The capability in question.
+ *
+ * Return true if it does, false otherwise.
+ */
+bool task_ns_capable(struct task_struct *t, int cap)
+{
+       return ns_capable(task_cred_xxx(t, user)->user_ns, cap);
+}
+EXPORT_SYMBOL(task_ns_capable);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 95362d15128c..e31b220a743d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1813,10 +1813,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 
        /* Update the css_set linked lists if we're using them */
        write_lock(&css_set_lock);
-       if (!list_empty(&tsk->cg_list)) {
-               list_del(&tsk->cg_list);
-               list_add(&tsk->cg_list, &newcg->tasks);
-       }
+       if (!list_empty(&tsk->cg_list))
+               list_move(&tsk->cg_list, &newcg->tasks);
        write_unlock(&css_set_lock);
 
        for_each_subsys(root, ss) {
@@ -3655,12 +3653,12 @@ again:
        spin_lock(&release_list_lock);
        set_bit(CGRP_REMOVED, &cgrp->flags);
        if (!list_empty(&cgrp->release_list))
-               list_del(&cgrp->release_list);
+               list_del_init(&cgrp->release_list);
        spin_unlock(&release_list_lock);
 
        cgroup_lock_hierarchy(cgrp->root);
        /* delete this cgroup from parent->children */
-       list_del(&cgrp->sibling);
+       list_del_init(&cgrp->sibling);
        cgroup_unlock_hierarchy(cgrp->root);
 
        d = dget(cgrp->dentry);
@@ -3879,7 +3877,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
        subsys[ss->subsys_id] = NULL;
 
        /* remove subsystem from rootnode's list of subsystems */
-       list_del(&ss->sibling);
+       list_del_init(&ss->sibling);
 
        /*
         * disentangle the css from all css_sets attached to the dummytop. as
@@ -4241,7 +4239,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
        if (!list_empty(&tsk->cg_list)) {
                write_lock(&css_set_lock);
                if (!list_empty(&tsk->cg_list))
-                       list_del(&tsk->cg_list);
+                       list_del_init(&tsk->cg_list);
                write_unlock(&css_set_lock);
        }
 
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 156cc5556140..c95fc4df0faa 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -160,7 +160,6 @@ static void cpu_notify_nofail(unsigned long val, void *v)
 {
        BUG_ON(cpu_notify(val, v));
 }
-
 EXPORT_SYMBOL(register_cpu_notifier);
 
 void __ref unregister_cpu_notifier(struct notifier_block *nb)
@@ -205,7 +204,6 @@ static int __ref take_cpu_down(void *_param)
                return err;
 
        cpu_notify(CPU_DYING | param->mod, param->hcpu);
-
        return 0;
 }
 
@@ -227,6 +225,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
                return -EINVAL;
 
        cpu_hotplug_begin();
+
        err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
        if (err) {
                nr_calls--;
@@ -304,7 +303,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
        ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
        if (ret) {
                nr_calls--;
-               printk("%s: attempt to bring up CPU %u failed\n",
+               printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n",
                        __func__, cpu);
                goto out_notify;
        }
@@ -450,14 +449,14 @@ void __ref enable_nonboot_cpus(void)
        if (cpumask_empty(frozen_cpus))
                goto out;
 
-       printk("Enabling non-boot CPUs ...\n");
+       printk(KERN_INFO "Enabling non-boot CPUs ...\n");
 
        arch_enable_nonboot_cpus_begin();
 
        for_each_cpu(cpu, frozen_cpus) {
                error = _cpu_up(cpu, 1);
                if (!error) {
-                       printk("CPU%d is up\n", cpu);
+                       printk(KERN_INFO "CPU%d is up\n", cpu);
                        continue;
                }
                printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
@@ -509,7 +508,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
  */
 
 /* cpu_bit_bitmap[0] is empty - so we can back into it */
-#define MASK_DECLARE_1(x)      [x+1][0] = 1UL << (x)
+#define MASK_DECLARE_1(x)      [x+1][0] = (1UL << (x))
 #define MASK_DECLARE_2(x)      MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
 #define MASK_DECLARE_4(x)      MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
 #define MASK_DECLARE_8(x)      MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index e92e98189032..33eee16addb8 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1015,17 +1015,12 @@ static void cpuset_change_nodemask(struct task_struct *p,
        struct cpuset *cs;
        int migrate;
        const nodemask_t *oldmem = scan->data;
-       NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
-
-       if (!newmems)
-               return;
+       static nodemask_t newmems;      /* protected by cgroup_mutex */
 
        cs = cgroup_cs(scan->cg);
-       guarantee_online_mems(cs, newmems);
-
-       cpuset_change_task_nodemask(p, newmems);
-
-       NODEMASK_FREE(newmems);
+       guarantee_online_mems(cs, &newmems);
+
+       cpuset_change_task_nodemask(p, &newmems);
 
        mm = get_task_mm(p);
        if (!mm)
@@ -1438,44 +1433,35 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
        struct mm_struct *mm;
        struct cpuset *cs = cgroup_cs(cont);
        struct cpuset *oldcs = cgroup_cs(oldcont);
-       NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
-       NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
-
-       if (from == NULL || to == NULL)
-               goto alloc_fail;
+       static nodemask_t to;   /* protected by cgroup_mutex */
 
        if (cs == &top_cpuset) {
                cpumask_copy(cpus_attach, cpu_possible_mask);
        } else {
                guarantee_online_cpus(cs, cpus_attach);
        }
-       guarantee_online_mems(cs, to);
+       guarantee_online_mems(cs, &to);
 
        /* do per-task migration stuff possibly for each in the threadgroup */
-       cpuset_attach_task(tsk, to, cs);
+       cpuset_attach_task(tsk, &to, cs);
        if (threadgroup) {
                struct task_struct *c;
                rcu_read_lock();
                list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
-                       cpuset_attach_task(c, to, cs);
+                       cpuset_attach_task(c, &to, cs);
                }
                rcu_read_unlock();
        }
 
        /* change mm; only needs to be done once even if threadgroup */
-       *from = oldcs->mems_allowed;
-       *to = cs->mems_allowed;
+       to = cs->mems_allowed;
        mm = get_task_mm(tsk);
        if (mm) {
-               mpol_rebind_mm(mm, to);
+               mpol_rebind_mm(mm, &to);
                if (is_memory_migrate(cs))
-                       cpuset_migrate_mm(mm, from, to);
+                       cpuset_migrate_mm(mm, &oldcs->mems_allowed, &to);
                mmput(mm);
        }
-
-alloc_fail:
-       NODEMASK_FREE(from);
-       NODEMASK_FREE(to);
 }
1480 1466
1481/* The various types of files and directories in a cpuset file system */ 1467/* The various types of files and directories in a cpuset file system */
@@ -1610,34 +1596,26 @@ out:
  * across a page fault.
  */
 
-static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
+static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
 {
-       int ret;
+       size_t count;
 
        mutex_lock(&callback_mutex);
-       ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
+       count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
        mutex_unlock(&callback_mutex);
 
-       return ret;
+       return count;
 }
 
-static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
+static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
 {
-       NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);
-       int retval;
-
-       if (mask == NULL)
-               return -ENOMEM;
+       size_t count;
 
        mutex_lock(&callback_mutex);
-       *mask = cs->mems_allowed;
+       count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
        mutex_unlock(&callback_mutex);
 
-       retval = nodelist_scnprintf(page, PAGE_SIZE, *mask);
-
-       NODEMASK_FREE(mask);
-
-       return retval;
+       return count;
 }
 
 static ssize_t cpuset_common_file_read(struct cgroup *cont,
@@ -1862,8 +1840,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,
        cs = cgroup_cs(cgroup);
        parent_cs = cgroup_cs(parent);
 
+       mutex_lock(&callback_mutex);
        cs->mems_allowed = parent_cs->mems_allowed;
        cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
+       mutex_unlock(&callback_mutex);
        return;
 }
 
@@ -2066,10 +2046,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
        struct cpuset *cp;      /* scans cpusets being updated */
        struct cpuset *child;   /* scans child cpusets of cp */
        struct cgroup *cont;
-       NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
-
-       if (oldmems == NULL)
-               return;
+       static nodemask_t oldmems;      /* protected by cgroup_mutex */
 
        list_add_tail((struct list_head *)&root->stack_list, &queue);
 
@@ -2086,7 +2063,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
                    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
                        continue;
 
-               *oldmems = cp->mems_allowed;
+               oldmems = cp->mems_allowed;
 
                /* Remove offline cpus and mems from this cpuset. */
                mutex_lock(&callback_mutex);
@@ -2102,10 +2079,9 @@ static void scan_for_empty_cpusets(struct cpuset *root)
                        remove_tasks_in_empty_cpuset(cp);
                else {
                        update_tasks_cpumask(cp, NULL);
-                       update_tasks_nodemask(cp, oldmems, NULL);
+                       update_tasks_nodemask(cp, &oldmems, NULL);
                }
        }
-       NODEMASK_FREE(oldmems);
 }
 
 /*
@@ -2147,19 +2123,16 @@ void cpuset_update_active_cpus(void)
 static int cpuset_track_online_nodes(struct notifier_block *self,
                                unsigned long action, void *arg)
 {
-       NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
-
-       if (oldmems == NULL)
-               return NOTIFY_DONE;
+       static nodemask_t oldmems;      /* protected by cgroup_mutex */
 
        cgroup_lock();
        switch (action) {
        case MEM_ONLINE:
-               *oldmems = top_cpuset.mems_allowed;
+               oldmems = top_cpuset.mems_allowed;
                mutex_lock(&callback_mutex);
                top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
                mutex_unlock(&callback_mutex);
-               update_tasks_nodemask(&top_cpuset, oldmems, NULL);
+               update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
                break;
        case MEM_OFFLINE:
                /*
@@ -2173,7 +2146,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
        }
        cgroup_unlock();
 
-       NODEMASK_FREE(oldmems);
        return NOTIFY_OK;
 }
 #endif
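
The recurring change in this file swaps a heap-allocated nodemask for a function-local static one. NODEMASK_ALLOC() exists because a nodemask_t can be too large for the stack with a big CONFIG_NODES_SHIFT, but the allocation can fail, and several of these callers had no sane way to report that. A static mask cannot fail; the cost is that every caller must be serialized, which cgroup_mutex already guarantees here, hence the repeated /* protected by cgroup_mutex */ annotations. Condensed sketch of the transformation:

    /* before: may return NULL, must be freed */
    NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);

    /* after: always available, but only valid under the outer lock */
    static nodemask_t mask;             /* protected by cgroup_mutex */
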
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
new file mode 100644
index 000000000000..5f85690285d4
--- /dev/null
+++ b/kernel/crash_dump.c
@@ -0,0 +1,34 @@
+#include <linux/kernel.h>
+#include <linux/crash_dump.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+
+/*
+ * If we have booted due to a crash, max_pfn will be a very low value. We need
+ * to know the amount of memory that the previous kernel used.
+ */
+unsigned long saved_max_pfn;
+
+/*
+ * stores the physical address of elf header of crash image
+ *
+ * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
+ * is_kdump_kernel() to determine if we are booting after a panic. Hence put
+ * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
+ */
+unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+
+/*
+ * elfcorehdr= specifies the location of elf core header stored by the crashed
+ * kernel. This option will be passed by kexec loader to the capture kernel.
+ */
+static int __init setup_elfcorehdr(char *arg)
+{
+       char *end;
+       if (!arg)
+               return -EINVAL;
+       elfcorehdr_addr = memparse(arg, &end);
+       return end > arg ? 0 : -EINVAL;
+}
+early_param("elfcorehdr", setup_elfcorehdr);
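
Since setup_elfcorehdr() parses its argument with memparse(), the parameter accepts the usual K/M/G size suffixes. A capture kernel loaded by kexec is typically booted with something like (hypothetical address):

    elfcorehdr=769468K

which leaves elfcorehdr_addr holding 769468 * 1024, the physical address of the ELF core header the crashed kernel left behind; is_kdump_kernel() then reports true because the address is no longer ELFCORE_ADDR_MAX.
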
diff --git a/kernel/cred.c b/kernel/cred.c
index 2343c132c5a7..5557b55048df 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -741,6 +741,12 @@ int set_create_files_as(struct cred *new, struct inode *inode)
 }
 EXPORT_SYMBOL(set_create_files_as);
 
+struct user_namespace *current_user_ns(void)
+{
+       return _current_user_ns();
+}
+EXPORT_SYMBOL(current_user_ns);
+
 #ifdef CONFIG_DEBUG_CREDENTIALS
 
 bool creds_are_invalid(const struct cred *cred)
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 481a7bd2dfe7..a11db956dd62 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -1093,3 +1093,33 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd)
        put_packet(remcom_out_buffer);
        return 0;
 }
+
+/**
+ * gdbstub_exit - Send an exit message to GDB
+ * @status: The exit code to report.
+ */
+void gdbstub_exit(int status)
+{
+       unsigned char checksum, ch, buffer[3];
+       int loop;
+
+       buffer[0] = 'W';
+       buffer[1] = hex_asc_hi(status);
+       buffer[2] = hex_asc_lo(status);
+
+       dbg_io_ops->write_char('$');
+       checksum = 0;
+
+       for (loop = 0; loop < 3; loop++) {
+               ch = buffer[loop];
+               checksum += ch;
+               dbg_io_ops->write_char(ch);
+       }
+
+       dbg_io_ops->write_char('#');
+       dbg_io_ops->write_char(hex_asc_hi(checksum));
+       dbg_io_ops->write_char(hex_asc_lo(checksum));
+
+       /* make sure the output is flushed, lest the bootloader clobber it */
+       dbg_io_ops->flush();
+}
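
For reference, this emits a standard GDB remote-serial-protocol exit packet, $W<xx>#<cc>, where <xx> is the exit status in hex and <cc> is the modulo-256 sum of the payload bytes in hex. Worked example for status 0: the payload is 'W' (0x57), '0' (0x30), '0' (0x30), which sums to 0xb7, so the bytes on the wire are:

    $W00#b7
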
diff --git a/kernel/exit.c b/kernel/exit.c
index f9a45ebcc7b1..6a488ad2dce5 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -908,6 +908,7 @@ NORET_TYPE void do_exit(long code)
        profile_task_exit(tsk);
 
        WARN_ON(atomic_read(&tsk->fs_excl));
+       WARN_ON(blk_needs_flush_plug(tsk));
 
        if (unlikely(in_interrupt()))
                panic("Aiee, killing interrupt handler!");
diff --git a/kernel/fork.c b/kernel/fork.c
index 25e429152ddc..e7548dee636b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -40,6 +40,7 @@
 #include <linux/tracehook.h>
 #include <linux/futex.h>
 #include <linux/compat.h>
+#include <linux/kthread.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/rcupdate.h>
 #include <linux/ptrace.h>
@@ -109,20 +110,25 @@ int nr_processes(void)
 }
 
 #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
-# define alloc_task_struct()   kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
-# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk))
+# define alloc_task_struct_node(node)          \
+               kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node)
+# define free_task_struct(tsk)                 \
+               kmem_cache_free(task_struct_cachep, (tsk))
 static struct kmem_cache *task_struct_cachep;
 #endif
 
 #ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
-static inline struct thread_info *alloc_thread_info(struct task_struct *tsk)
+static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+                                                 int node)
 {
 #ifdef CONFIG_DEBUG_STACK_USAGE
        gfp_t mask = GFP_KERNEL | __GFP_ZERO;
 #else
        gfp_t mask = GFP_KERNEL;
 #endif
-       return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER);
+       struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER);
+
+       return page ? page_address(page) : NULL;
 }
 
 static inline void free_thread_info(struct thread_info *ti)
@@ -193,6 +199,7 @@ void __put_task_struct(struct task_struct *tsk)
        if (!profile_handoff_task(tsk))
                free_task(tsk);
 }
+EXPORT_SYMBOL_GPL(__put_task_struct);
 
 /*
  * macro override instead of weak attribute alias, to workaround
@@ -248,16 +255,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
        struct task_struct *tsk;
        struct thread_info *ti;
        unsigned long *stackend;
-
+       int node = tsk_fork_get_node(orig);
        int err;
 
        prepare_to_copy(orig);
 
-       tsk = alloc_task_struct();
+       tsk = alloc_task_struct_node(node);
        if (!tsk)
                return NULL;
 
-       ti = alloc_thread_info(tsk);
+       ti = alloc_thread_info_node(tsk, node);
        if (!ti) {
                free_task_struct(tsk);
                return NULL;
@@ -1180,12 +1187,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                pid = alloc_pid(p->nsproxy->pid_ns);
                if (!pid)
                        goto bad_fork_cleanup_io;
-
-               if (clone_flags & CLONE_NEWPID) {
-                       retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
-                       if (retval < 0)
-                               goto bad_fork_free_pid;
-               }
        }
 
        p->pid = pid_nr(pid);
1190 1191
1191 p->pid = pid_nr(pid); 1192 p->pid = pid_nr(pid);
@@ -1204,6 +1205,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
         * Clear TID on mm_release()?
         */
        p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
+#ifdef CONFIG_BLOCK
+       p->plug = NULL;
+#endif
 #ifdef CONFIG_FUTEX
        p->robust_list = NULL;
 #ifdef CONFIG_COMPAT
@@ -1289,7 +1293,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        tracehook_finish_clone(p, clone_flags, trace);
 
        if (thread_group_leader(p)) {
-               if (clone_flags & CLONE_NEWPID)
+               if (is_child_reaper(pid))
                        p->nsproxy->pid_ns->child_reaper = p;
 
                p->signal->leader_pid = pid;
@@ -1512,38 +1516,24 @@ void __init proc_caches_init(void)
 }
 
 /*
- * Check constraints on flags passed to the unshare system call and
- * force unsharing of additional process context as appropriate.
+ * Check constraints on flags passed to the unshare system call.
  */
-static void check_unshare_flags(unsigned long *flags_ptr)
+static int check_unshare_flags(unsigned long unshare_flags)
 {
+       if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
+                               CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
+                               CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
+               return -EINVAL;
        /*
-        * If unsharing a thread from a thread group, must also
-        * unshare vm.
-        */
-       if (*flags_ptr & CLONE_THREAD)
-               *flags_ptr |= CLONE_VM;
-
-       /*
-        * If unsharing vm, must also unshare signal handlers.
-        */
-       if (*flags_ptr & CLONE_VM)
-               *flags_ptr |= CLONE_SIGHAND;
-
-       /*
-        * If unsharing namespace, must also unshare filesystem information.
+        * Not implemented, but pretend it works if there is nothing to
+        * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND
+        * needs to unshare vm.
         */
-       if (*flags_ptr & CLONE_NEWNS)
-               *flags_ptr |= CLONE_FS;
-}
-
-/*
- * Unsharing of tasks created with CLONE_THREAD is not supported yet
- */
-static int unshare_thread(unsigned long unshare_flags)
-{
-       if (unshare_flags & CLONE_THREAD)
-               return -EINVAL;
+       if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
+               /* FIXME: get_task_mm() increments ->mm_users */
+               if (atomic_read(&current->mm->mm_users) > 1)
+                       return -EINVAL;
+       }
 
        return 0;
 }
@@ -1570,34 +1560,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
 }
 
 /*
- * Unsharing of sighand is not supported yet
- */
-static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
-{
-       struct sighand_struct *sigh = current->sighand;
-
-       if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1)
-               return -EINVAL;
-       else
-               return 0;
-}
-
-/*
- * Unshare vm if it is being shared
- */
-static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
-{
-       struct mm_struct *mm = current->mm;
-
-       if ((unshare_flags & CLONE_VM) &&
-           (mm && atomic_read(&mm->mm_users) > 1)) {
-               return -EINVAL;
-       }
-
-       return 0;
-}
-
-/*
  * Unshare file descriptor table if it is being shared
  */
 static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
@@ -1625,45 +1587,37 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
  */
 SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
 {
-       int err = 0;
        struct fs_struct *fs, *new_fs = NULL;
-       struct sighand_struct *new_sigh = NULL;
-       struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
        struct files_struct *fd, *new_fd = NULL;
        struct nsproxy *new_nsproxy = NULL;
        int do_sysvsem = 0;
+       int err;
 
-       check_unshare_flags(&unshare_flags);
-
-       /* Return -EINVAL for all unsupported flags */
-       err = -EINVAL;
-       if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
-                               CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
-                               CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
+       err = check_unshare_flags(unshare_flags);
+       if (err)
                goto bad_unshare_out;
 
        /*
+        * If unsharing namespace, must also unshare filesystem information.
+        */
+       if (unshare_flags & CLONE_NEWNS)
+               unshare_flags |= CLONE_FS;
+       /*
         * CLONE_NEWIPC must also detach from the undolist: after switching
         * to a new ipc namespace, the semaphore arrays from the old
         * namespace are unreachable.
         */
        if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
                do_sysvsem = 1;
-       if ((err = unshare_thread(unshare_flags)))
-               goto bad_unshare_out;
        if ((err = unshare_fs(unshare_flags, &new_fs)))
-               goto bad_unshare_cleanup_thread;
-       if ((err = unshare_sighand(unshare_flags, &new_sigh)))
-               goto bad_unshare_cleanup_fs;
-       if ((err = unshare_vm(unshare_flags, &new_mm)))
-               goto bad_unshare_cleanup_sigh;
+               goto bad_unshare_out;
        if ((err = unshare_fd(unshare_flags, &new_fd)))
-               goto bad_unshare_cleanup_vm;
+               goto bad_unshare_cleanup_fs;
        if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
                                              new_fs)))
                goto bad_unshare_cleanup_fd;
 
-       if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) {
+       if (new_fs || new_fd || do_sysvsem || new_nsproxy) {
                if (do_sysvsem) {
                        /*
                         * CLONE_SYSVSEM is equivalent to sys_exit().
@@ -1689,19 +1643,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
                spin_unlock(&fs->lock);
        }
 
-       if (new_mm) {
-               mm = current->mm;
-               active_mm = current->active_mm;
-               current->mm = new_mm;
-               current->active_mm = new_mm;
-               if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
-                       atomic_dec(&mm->oom_disable_count);
-                       atomic_inc(&new_mm->oom_disable_count);
-               }
-               activate_mm(active_mm, new_mm);
-               new_mm = mm;
-       }
-
        if (new_fd) {
                fd = current->files;
                current->files = new_fd;
@@ -1718,20 +1659,10 @@ bad_unshare_cleanup_fd:
        if (new_fd)
                put_files_struct(new_fd);
 
-bad_unshare_cleanup_vm:
-       if (new_mm)
-               mmput(new_mm);
-
-bad_unshare_cleanup_sigh:
-       if (new_sigh)
-               if (atomic_dec_and_test(&new_sigh->count))
-                       kmem_cache_free(sighand_cachep, new_sigh);
-
 bad_unshare_cleanup_fs:
        if (new_fs)
                free_fs_struct(new_fs);
 
-bad_unshare_cleanup_thread:
 bad_unshare_out:
        return err;
 }
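
For context, a minimal userspace sketch of the syscall whose kernel side was just simplified (standard glibc wrapper; unsharing the mount namespace needs CAP_SYS_ADMIN):

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            /* Per check_unshare_flags()/sys_unshare() above, CLONE_NEWNS
             * implies CLONE_FS. */
            if (unshare(CLONE_NEWNS) != 0) {
                    perror("unshare");
                    return 1;
            }
            puts("now in a private mount namespace");
            return 0;
    }

Note the behavior change: requesting CLONE_THREAD, CLONE_SIGHAND or CLONE_VM while the mm is shared now fails up front with -EINVAL instead of the flag set being silently widened.
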
diff --git a/kernel/futex.c b/kernel/futex.c
index bda415715382..dfb924ffe65b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -782,8 +782,8 @@ static void __unqueue_futex(struct futex_q *q)
 {
        struct futex_hash_bucket *hb;
 
-       if (WARN_ON(!q->lock_ptr || !spin_is_locked(q->lock_ptr)
-                       || plist_node_empty(&q->list)))
+       if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr))
+           || WARN_ON(plist_node_empty(&q->list)))
                return;
 
        hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
@@ -2418,10 +2418,19 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
                        goto err_unlock;
                ret = -EPERM;
                pcred = __task_cred(p);
+               /* If victim is in different user_ns, then uids are not
+                  comparable, so we must have CAP_SYS_PTRACE */
+               if (cred->user->user_ns != pcred->user->user_ns) {
+                       if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
+                               goto err_unlock;
+                       goto ok;
+               }
+               /* If victim is in same user_ns, then uids are comparable */
                if (cred->euid != pcred->euid &&
                    cred->euid != pcred->uid &&
-                   !capable(CAP_SYS_PTRACE))
+                   !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
                        goto err_unlock;
+ok:
                head = p->robust_list;
                rcu_read_unlock();
        }
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index a7934ac75e5b..5f9e689dc8f0 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -153,10 +153,19 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
                        goto err_unlock;
                ret = -EPERM;
                pcred = __task_cred(p);
+               /* If victim is in different user_ns, then uids are not
+                  comparable, so we must have CAP_SYS_PTRACE */
+               if (cred->user->user_ns != pcred->user->user_ns) {
+                       if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
+                               goto err_unlock;
+                       goto ok;
+               }
+               /* If victim is in same user_ns, then uids are comparable */
                if (cred->euid != pcred->euid &&
                    cred->euid != pcred->uid &&
-                   !capable(CAP_SYS_PTRACE))
+                   !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
                        goto err_unlock;
+ok:
                head = p->compat_robust_list;
                rcu_read_unlock();
        }
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 70a298d6da71..b8cadf70b1fb 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -34,7 +34,7 @@ config GCOV_KERNEL
 config GCOV_PROFILE_ALL
        bool "Profile entire Kernel"
        depends on GCOV_KERNEL
-       depends on S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE
+       depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE
        default n
        ---help---
        This options activates profiling for the entire kernel.
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index 3f761001d517..e97ca59e2520 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -1,3 +1,3 @@
-EXTRA_CFLAGS := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
+ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
 
 obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o
diff --git a/kernel/groups.c b/kernel/groups.c
index 253dc0f35cf4..1cc476d52dd3 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -233,7 +233,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
        struct group_info *group_info;
        int retval;
 
-       if (!capable(CAP_SETGID))
+       if (!nsown_capable(CAP_SETGID))
                return -EPERM;
        if ((unsigned)gidsetsize > NGROUPS_MAX)
                return -EINVAL;
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index dbccc799407f..6fb014f172f7 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -198,15 +198,6 @@ err:
        return -ENOMEM;
 }
 
-struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
-{
-       int res = irq_alloc_descs(irq, irq, 1, node);
-
-       if (res == -EEXIST || res == irq)
-               return irq_to_desc(irq);
-       return NULL;
-}
-
 static int irq_expand_nr_irqs(unsigned int nr)
 {
        if (nr > IRQ_BITMAP_BITS)
@@ -283,11 +274,6 @@ struct irq_desc *irq_to_desc(unsigned int irq)
        return (irq < NR_IRQS) ? irq_desc + irq : NULL;
 }
 
-struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
-{
-       return irq_to_desc(irq);
-}
-
 static void free_desc(unsigned int irq)
 {
        dynamic_irq_cleanup(irq);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index acd599a43bfb..0a2aa73e536c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1064,10 +1064,10 @@ mismatch:
        ret = -EBUSY;
 
 out_mask:
+       raw_spin_unlock_irqrestore(&desc->lock, flags);
        free_cpumask_var(mask);
 
 out_thread:
-       raw_spin_unlock_irqrestore(&desc->lock, flags);
        if (new->thread) {
                struct task_struct *t = new->thread;
 
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 4cc2e5ed0bec..760248de109d 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -405,7 +405,8 @@ int show_interrupts(struct seq_file *p, void *v)
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
        seq_printf(p, " %8s", desc->irq_data.chip->name);
-       seq_printf(p, "-%-8s", desc->name);
+       if (desc->name)
+               seq_printf(p, "-%-8s", desc->name);
 
        if (action) {
                seq_printf(p, " %s", action->name);
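
Before this check, an IRQ descriptor with a NULL ->name had the field pushed through "%-8s" anyway, which the kernel's vsnprintf() renders as "(null)", so /proc/interrupts lines could end in "-(null)"; now the "-<name>" suffix is simply omitted.
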
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 6f6d091b5757..079f1d39a8b8 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -64,14 +64,14 @@ static inline int is_kernel_text(unsigned long addr)
        if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) ||
            arch_is_kernel_text(addr))
                return 1;
-       return in_gate_area_no_task(addr);
+       return in_gate_area_no_mm(addr);
 }
 
 static inline int is_kernel(unsigned long addr)
 {
        if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end)
                return 1;
-       return in_gate_area_no_task(addr);
+       return in_gate_area_no_mm(addr);
 }
 
 static int is_ksym_addr(unsigned long addr)
@@ -342,13 +342,15 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
 }
 
 /* Look up a kernel symbol and return it in a text buffer. */
-int sprint_symbol(char *buffer, unsigned long address)
+static int __sprint_symbol(char *buffer, unsigned long address,
+                          int symbol_offset)
 {
        char *modname;
        const char *name;
        unsigned long offset, size;
        int len;
 
+       address += symbol_offset;
        name = kallsyms_lookup(address, &size, &offset, &modname, buffer);
        if (!name)
                return sprintf(buffer, "0x%lx", address);
@@ -357,17 +359,53 @@ int sprint_symbol(char *buffer, unsigned long address)
        strcpy(buffer, name);
        len = strlen(buffer);
        buffer += len;
+       offset -= symbol_offset;
 
        if (modname)
-               len += sprintf(buffer, "+%#lx/%#lx [%s]",
-                               offset, size, modname);
+               len += sprintf(buffer, "+%#lx/%#lx [%s]", offset, size, modname);
        else
                len += sprintf(buffer, "+%#lx/%#lx", offset, size);
 
        return len;
 }
+
+/**
+ * sprint_symbol - Look up a kernel symbol and return it in a text buffer
+ * @buffer: buffer to be stored
+ * @address: address to lookup
+ *
+ * This function looks up a kernel symbol with @address and stores its name,
+ * offset, size and module name to @buffer if possible. If no symbol was found,
+ * just saves its @address as is.
+ *
+ * This function returns the number of bytes stored in @buffer.
+ */
+int sprint_symbol(char *buffer, unsigned long address)
+{
+       return __sprint_symbol(buffer, address, 0);
+}
+
 EXPORT_SYMBOL_GPL(sprint_symbol);
 
+/**
+ * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer
+ * @buffer: buffer to be stored
+ * @address: address to lookup
+ *
+ * This function is for stack backtrace and does the same thing as
+ * sprint_symbol() but with modified/decreased @address. If there is a
+ * tail-call to the function marked "noreturn", gcc optimized out code after
+ * the call so that the stack-saved return address could point outside of the
+ * caller. This function ensures that kallsyms will find the original caller
+ * by decreasing @address.
+ *
+ * This function returns the number of bytes stored in @buffer.
+ */
+int sprint_backtrace(char *buffer, unsigned long address)
+{
+       return __sprint_symbol(buffer, address, -1);
+}
+
 /* Look up a kernel symbol and print it to the kernel messages. */
 void __print_symbol(const char *fmt, unsigned long address)
 {
@@ -477,13 +515,11 @@ static int s_show(struct seq_file *m, void *p)
                 */
                type = iter->exported ? toupper(iter->type) :
                                        tolower(iter->type);
-               seq_printf(m, "%0*lx %c %s\t[%s]\n",
-                          (int)(2 * sizeof(void *)),
-                          iter->value, type, iter->name, iter->module_name);
+               seq_printf(m, "%pK %c %s\t[%s]\n", (void *)iter->value,
+                          type, iter->name, iter->module_name);
        } else
-               seq_printf(m, "%0*lx %c %s\n",
-                          (int)(2 * sizeof(void *)),
-                          iter->value, iter->type, iter->name);
+               seq_printf(m, "%pK %c %s\n", (void *)iter->value,
+                          iter->type, iter->name);
        return 0;
 }
 
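
Two usage notes. First, with hypothetical values, sprint_symbol() fills the buffer in the form name+offset/size, with the module appended when relevant:

    schedule+0x3f/0x21e
    usb_submit_urb+0x8c/0x1f3 [usbcore]

sprint_backtrace() looks up address - 1 and then adds the 1 back into the printed offset, so a stack-saved return address sitting just past a noreturn tail-call still resolves to the real caller. Second, the switch from %0*lx to %pK in s_show() means /proc/kallsyms addresses can be shown as zeros to unprivileged readers, under control of the kptr_restrict sysctl.
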
diff --git a/kernel/kthread.c b/kernel/kthread.c
index c55afba990a3..684ab3f7dd72 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -27,6 +27,7 @@ struct kthread_create_info
        /* Information passed to kthread() from kthreadd. */
        int (*threadfn)(void *data);
        void *data;
+       int node;
 
        /* Result passed back to kthread_create() from kthreadd. */
        struct task_struct *result;
@@ -98,10 +99,23 @@ static int kthread(void *_create)
        do_exit(ret);
 }
 
+/* called from do_fork() to get node information for about to be created task */
+int tsk_fork_get_node(struct task_struct *tsk)
+{
+#ifdef CONFIG_NUMA
+       if (tsk == kthreadd_task)
+               return tsk->pref_node_fork;
+#endif
+       return numa_node_id();
+}
+
 static void create_kthread(struct kthread_create_info *create)
 {
        int pid;
 
+#ifdef CONFIG_NUMA
+       current->pref_node_fork = create->node;
+#endif
        /* We want our own signal handler (we take no signals by default). */
        pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
        if (pid < 0) {
@@ -111,15 +125,18 @@ static void create_kthread(struct kthread_create_info *create)
 }
 
 /**
- * kthread_create - create a kthread.
+ * kthread_create_on_node - create a kthread.
  * @threadfn: the function to run until signal_pending(current).
  * @data: data ptr for @threadfn.
+ * @node: memory node number.
  * @namefmt: printf-style name for the thread.
  *
  * Description: This helper function creates and names a kernel
  * thread. The thread will be stopped: use wake_up_process() to start
  * it. See also kthread_run().
  *
+ * If thread is going to be bound on a particular cpu, give its node
+ * in @node, to get NUMA affinity for kthread stack, or else give -1.
  * When woken, the thread will run @threadfn() with @data as its
  * argument. @threadfn() can either call do_exit() directly if it is a
  * standalone thread for which noone will call kthread_stop(), or
129 * 146 *
130 * Returns a task_struct or ERR_PTR(-ENOMEM). 147 * Returns a task_struct or ERR_PTR(-ENOMEM).
131 */ 148 */
132struct task_struct *kthread_create(int (*threadfn)(void *data), 149struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
133 void *data, 150 void *data,
134 const char namefmt[], 151 int node,
135 ...) 152 const char namefmt[],
153 ...)
136{ 154{
137 struct kthread_create_info create; 155 struct kthread_create_info create;
138 156
139 create.threadfn = threadfn; 157 create.threadfn = threadfn;
140 create.data = data; 158 create.data = data;
159 create.node = node;
141 init_completion(&create.done); 160 init_completion(&create.done);
142 161
143 spin_lock(&kthread_create_lock); 162 spin_lock(&kthread_create_lock);
@@ -164,7 +183,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
        }
        return create.result;
 }
-EXPORT_SYMBOL(kthread_create);
+EXPORT_SYMBOL(kthread_create_on_node);
 
 /**
  * kthread_bind - bind a just-created kthread to a cpu.
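
A sketch of a typical caller, assuming a hypothetical worker function and cpu variable; the point of the new @node argument is that the task_struct and thread stack land on the worker's own memory node:

    struct task_struct *t;

    t = kthread_create_on_node(my_worker_fn, NULL, cpu_to_node(cpu),
                               "my_worker/%d", cpu);
    if (!IS_ERR(t)) {
            kthread_bind(t, cpu);
            wake_up_process(t);
    }

Callers that do not care pass -1 for @node; the matching header change (not shown here) keeps kthread_create() as a wrapper that does exactly that.
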
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 1969d2fc4b36..71edd2f60c02 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -225,7 +225,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
              nr_irq_read_safe = 0, nr_irq_read_unsafe = 0,
              nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0,
              nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0,
-             sum_forward_deps = 0, factor = 0;
+             sum_forward_deps = 0;
 
        list_for_each_entry(class, &all_lock_classes, lock_entry) {
 
@@ -283,13 +283,6 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
283 nr_hardirq_unsafe * nr_hardirq_safe + 283 nr_hardirq_unsafe * nr_hardirq_safe +
284 nr_list_entries); 284 nr_list_entries);
285 285
286 /*
287 * Estimated factor between direct and indirect
288 * dependencies:
289 */
290 if (nr_list_entries)
291 factor = sum_forward_deps / nr_list_entries;
292
293#ifdef CONFIG_PROVE_LOCKING 286#ifdef CONFIG_PROVE_LOCKING
294 seq_printf(m, " dependency chains: %11lu [max: %lu]\n", 287 seq_printf(m, " dependency chains: %11lu [max: %lu]\n",
295 nr_lock_chains, MAX_LOCKDEP_CHAINS); 288 nr_lock_chains, MAX_LOCKDEP_CHAINS);
diff --git a/kernel/module.c b/kernel/module.c
index efa290ea94bf..1f9f7bc56ca1 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1168,7 +1168,7 @@ static ssize_t module_sect_show(struct module_attribute *mattr,
1168{ 1168{
1169 struct module_sect_attr *sattr = 1169 struct module_sect_attr *sattr =
1170 container_of(mattr, struct module_sect_attr, mattr); 1170 container_of(mattr, struct module_sect_attr, mattr);
1171 return sprintf(buf, "0x%lx\n", sattr->address); 1171 return sprintf(buf, "0x%pK\n", (void *)sattr->address);
1172} 1172}
1173 1173
1174static void free_sect_attrs(struct module_sect_attrs *sect_attrs) 1174static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
@@ -3224,7 +3224,7 @@ static int m_show(struct seq_file *m, void *p)
3224 mod->state == MODULE_STATE_COMING ? "Loading": 3224 mod->state == MODULE_STATE_COMING ? "Loading":
3225 "Live"); 3225 "Live");
3226 /* Used by oprofile and other similar tools. */ 3226 /* Used by oprofile and other similar tools. */
3227 seq_printf(m, " 0x%p", mod->module_core); 3227 seq_printf(m, " 0x%pK", mod->module_core);
3228 3228
3229 /* Taints info */ 3229 /* Taints info */
3230 if (mod->taints) 3230 if (mod->taints)
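
Both hunks switch from raw %p to %pK, which prints zeros instead of the real address when the kptr_restrict sysctl forbids pointer disclosure to the reading process. A minimal illustrative sketch of the difference (not part of the patch):

	#include <linux/kernel.h>

	static void show_addr(const void *addr)
	{
		pr_info("raw:     %p\n", addr);	/* always the real pointer here */
		pr_info("guarded: %pK\n", addr);	/* zeroed when kptr_restrict says so */
	}
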
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f74e6c00e26d..a05d191ffdd9 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -69,13 +69,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
69 goto out_ns; 69 goto out_ns;
70 } 70 }
71 71
72 new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns); 72 new_nsp->uts_ns = copy_utsname(flags, tsk);
73 if (IS_ERR(new_nsp->uts_ns)) { 73 if (IS_ERR(new_nsp->uts_ns)) {
74 err = PTR_ERR(new_nsp->uts_ns); 74 err = PTR_ERR(new_nsp->uts_ns);
75 goto out_uts; 75 goto out_uts;
76 } 76 }
77 77
78 new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); 78 new_nsp->ipc_ns = copy_ipcs(flags, tsk);
79 if (IS_ERR(new_nsp->ipc_ns)) { 79 if (IS_ERR(new_nsp->ipc_ns)) {
80 err = PTR_ERR(new_nsp->ipc_ns); 80 err = PTR_ERR(new_nsp->ipc_ns);
81 goto out_ipc; 81 goto out_ipc;
diff --git a/kernel/panic.c b/kernel/panic.c
index 991bb87a1704..69231670eb95 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -433,3 +433,13 @@ EXPORT_SYMBOL(__stack_chk_fail);
433 433
434core_param(panic, panic_timeout, int, 0644); 434core_param(panic, panic_timeout, int, 0644);
435core_param(pause_on_oops, pause_on_oops, int, 0644); 435core_param(pause_on_oops, pause_on_oops, int, 0644);
436
437static int __init oops_setup(char *s)
438{
439 if (!s)
440 return -EINVAL;
441 if (!strcmp(s, "panic"))
442 panic_on_oops = 1;
443 return 0;
444}
445early_param("oops", oops_setup);
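
The oops_setup() hunk is the standard early_param() pattern: the handler runs during early boot when the matching option ("oops=panic" here) appears on the kernel command line. A hypothetical sketch of the same pattern, with my_feature as a made-up option name:

	#include <linux/init.h>
	#include <linux/kernel.h>
	#include <linux/string.h>

	static int my_feature_enabled;

	static int __init my_feature_setup(char *s)
	{
		if (!s)
			return -EINVAL;
		if (!strcmp(s, "on"))	/* boot with "my_feature=on" */
			my_feature_enabled = 1;
		return 0;
	}
	early_param("my_feature", my_feature_setup);
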
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index ed253aa24ba4..c75925c4d1e2 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -145,7 +145,8 @@ static struct srcu_struct pmus_srcu;
145 */ 145 */
146int sysctl_perf_event_paranoid __read_mostly = 1; 146int sysctl_perf_event_paranoid __read_mostly = 1;
147 147
148int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ 148/* Minimum for 128 pages + 1 for the user control page */
149int sysctl_perf_event_mlock __read_mostly = 516; /* 'free' kb per user */
149 150
150/* 151/*
151 * max perf event sample rate 152 * max perf event sample rate
@@ -941,6 +942,7 @@ static void perf_group_attach(struct perf_event *event)
941static void 942static void
942list_del_event(struct perf_event *event, struct perf_event_context *ctx) 943list_del_event(struct perf_event *event, struct perf_event_context *ctx)
943{ 944{
945 struct perf_cpu_context *cpuctx;
944 /* 946 /*
945 * We can have double detach due to exit/hot-unplug + close. 947 * We can have double detach due to exit/hot-unplug + close.
946 */ 948 */
@@ -949,8 +951,17 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
949 951
950 event->attach_state &= ~PERF_ATTACH_CONTEXT; 952 event->attach_state &= ~PERF_ATTACH_CONTEXT;
951 953
952 if (is_cgroup_event(event)) 954 if (is_cgroup_event(event)) {
953 ctx->nr_cgroups--; 955 ctx->nr_cgroups--;
956 cpuctx = __get_cpu_context(ctx);
957 /*
958 * if there are no more cgroup events
 959 * then clear cgrp to avoid stale pointer
960 * in update_cgrp_time_from_cpuctx()
961 */
962 if (!ctx->nr_cgroups)
963 cpuctx->cgrp = NULL;
964 }
954 965
955 ctx->nr_events--; 966 ctx->nr_events--;
956 if (event->attr.inherit_stat) 967 if (event->attr.inherit_stat)
@@ -5122,7 +5133,7 @@ static int perf_exclude_event(struct perf_event *event,
5122 struct pt_regs *regs) 5133 struct pt_regs *regs)
5123{ 5134{
5124 if (event->hw.state & PERF_HES_STOPPED) 5135 if (event->hw.state & PERF_HES_STOPPED)
5125 return 0; 5136 return 1;
5126 5137
5127 if (regs) { 5138 if (regs) {
5128 if (event->attr.exclude_user && user_mode(regs)) 5139 if (event->attr.exclude_user && user_mode(regs))
@@ -5478,6 +5489,8 @@ static int perf_tp_event_match(struct perf_event *event,
5478 struct perf_sample_data *data, 5489 struct perf_sample_data *data,
5479 struct pt_regs *regs) 5490 struct pt_regs *regs)
5480{ 5491{
5492 if (event->hw.state & PERF_HES_STOPPED)
5493 return 0;
5481 /* 5494 /*
5482 * All tracepoints are from kernel-space. 5495 * All tracepoints are from kernel-space.
5483 */ 5496 */
@@ -6720,17 +6733,20 @@ __perf_event_exit_task(struct perf_event *child_event,
6720 struct perf_event_context *child_ctx, 6733 struct perf_event_context *child_ctx,
6721 struct task_struct *child) 6734 struct task_struct *child)
6722{ 6735{
6723 struct perf_event *parent_event; 6736 if (child_event->parent) {
6737 raw_spin_lock_irq(&child_ctx->lock);
6738 perf_group_detach(child_event);
6739 raw_spin_unlock_irq(&child_ctx->lock);
6740 }
6724 6741
6725 perf_remove_from_context(child_event); 6742 perf_remove_from_context(child_event);
6726 6743
6727 parent_event = child_event->parent;
6728 /* 6744 /*
6729 * It can happen that parent exits first, and has events 6745 * It can happen that the parent exits first, and has events
6730 * that are still around due to the child reference. These 6746 * that are still around due to the child reference. These
6731 * events need to be zapped - but otherwise linger. 6747 * events need to be zapped.
6732 */ 6748 */
6733 if (parent_event) { 6749 if (child_event->parent) {
6734 sync_child_event(child_event, child); 6750 sync_child_event(child_event, child);
6735 free_event(child_event); 6751 free_event(child_event);
6736 } 6752 }
diff --git a/kernel/pid.c b/kernel/pid.c
index 39b65b69584f..02f221274265 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -435,6 +435,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
435 rcu_read_unlock(); 435 rcu_read_unlock();
436 return pid; 436 return pid;
437} 437}
438EXPORT_SYMBOL_GPL(get_task_pid);
438 439
439struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) 440struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
440{ 441{
@@ -446,6 +447,7 @@ struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
446 rcu_read_unlock(); 447 rcu_read_unlock();
447 return result; 448 return result;
448} 449}
450EXPORT_SYMBOL_GPL(get_pid_task);
449 451
450struct pid *find_get_pid(pid_t nr) 452struct pid *find_get_pid(pid_t nr)
451{ 453{
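
With these two exports, a module can hold a struct pid reference, which stays valid across task exit, instead of caching a raw task_struct pointer. A hedged sketch, assuming module context and the hypothetical helpers remember_task()/poke_task():

	#include <linux/kernel.h>
	#include <linux/pid.h>
	#include <linux/sched.h>

	static struct pid *saved_pid;

	static void remember_task(struct task_struct *task)
	{
		saved_pid = get_task_pid(task, PIDTYPE_PID);	/* takes a reference */
	}

	static void poke_task(void)
	{
		struct task_struct *tsk = get_pid_task(saved_pid, PIDTYPE_PID);

		if (tsk) {
			pr_info("task %s still alive\n", tsk->comm);
			put_task_struct(tsk);	/* get_pid_task took a task ref */
		}
		put_pid(saved_pid);		/* drop the pid reference when done */
	}
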
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a5aff94e1f0b..e9c9adc84ca6 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -14,6 +14,7 @@
14#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/acct.h> 15#include <linux/acct.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/proc_fs.h>
17 18
18#define BITS_PER_PAGE (PAGE_SIZE*8) 19#define BITS_PER_PAGE (PAGE_SIZE*8)
19 20
@@ -72,7 +73,7 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
72{ 73{
73 struct pid_namespace *ns; 74 struct pid_namespace *ns;
74 unsigned int level = parent_pid_ns->level + 1; 75 unsigned int level = parent_pid_ns->level + 1;
75 int i; 76 int i, err = -ENOMEM;
76 77
77 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); 78 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
78 if (ns == NULL) 79 if (ns == NULL)
@@ -96,14 +97,20 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
96 for (i = 1; i < PIDMAP_ENTRIES; i++) 97 for (i = 1; i < PIDMAP_ENTRIES; i++)
97 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); 98 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
98 99
100 err = pid_ns_prepare_proc(ns);
101 if (err)
102 goto out_put_parent_pid_ns;
103
99 return ns; 104 return ns;
100 105
106out_put_parent_pid_ns:
107 put_pid_ns(parent_pid_ns);
101out_free_map: 108out_free_map:
102 kfree(ns->pidmap[0].page); 109 kfree(ns->pidmap[0].page);
103out_free: 110out_free:
104 kmem_cache_free(pid_ns_cachep, ns); 111 kmem_cache_free(pid_ns_cachep, ns);
105out: 112out:
106 return ERR_PTR(-ENOMEM); 113 return ERR_PTR(err);
107} 114}
108 115
109static void destroy_pid_namespace(struct pid_namespace *ns) 116static void destroy_pid_namespace(struct pid_namespace *ns)
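
The new out_put_parent_pid_ns label extends the usual kernel unwind ladder: each label undoes exactly one successful setup step, in reverse order of setup. A generic sketch of the idiom, with hypothetical step/teardown helpers:

	/* hypothetical helpers, stubbed to make the shape concrete */
	static int step_a(void) { return 0; }
	static int step_b(void) { return 0; }
	static int step_c(void) { return 0; }
	static void teardown_a(void) { }
	static void teardown_b(void) { }

	static int setup_everything(void)
	{
		int err;

		err = step_a();
		if (err)
			goto out;
		err = step_b();
		if (err)
			goto undo_a;
		err = step_c();
		if (err)
			goto undo_b;
		return 0;

	undo_b:
		teardown_b();	/* undo in reverse order of setup */
	undo_a:
		teardown_a();
	out:
		return err;
	}
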
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index aeaa7f846821..0da058bff8eb 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -103,11 +103,14 @@ static struct pm_qos_object *pm_qos_array[] = {
103 103
104static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 104static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
105 size_t count, loff_t *f_pos); 105 size_t count, loff_t *f_pos);
106static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
107 size_t count, loff_t *f_pos);
106static int pm_qos_power_open(struct inode *inode, struct file *filp); 108static int pm_qos_power_open(struct inode *inode, struct file *filp);
107static int pm_qos_power_release(struct inode *inode, struct file *filp); 109static int pm_qos_power_release(struct inode *inode, struct file *filp);
108 110
109static const struct file_operations pm_qos_power_fops = { 111static const struct file_operations pm_qos_power_fops = {
110 .write = pm_qos_power_write, 112 .write = pm_qos_power_write,
113 .read = pm_qos_power_read,
111 .open = pm_qos_power_open, 114 .open = pm_qos_power_open,
112 .release = pm_qos_power_release, 115 .release = pm_qos_power_release,
113 .llseek = noop_llseek, 116 .llseek = noop_llseek,
@@ -376,6 +379,27 @@ static int pm_qos_power_release(struct inode *inode, struct file *filp)
376} 379}
377 380
378 381
382static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
383 size_t count, loff_t *f_pos)
384{
385 s32 value;
386 unsigned long flags;
387 struct pm_qos_object *o;
 388 struct pm_qos_request_list *pm_qos_req = filp->private_data;
389
390 if (!pm_qos_req)
391 return -EINVAL;
392 if (!pm_qos_request_active(pm_qos_req))
393 return -EINVAL;
394
395 o = pm_qos_array[pm_qos_req->pm_qos_class];
396 spin_lock_irqsave(&pm_qos_lock, flags);
397 value = pm_qos_get_value(o);
398 spin_unlock_irqrestore(&pm_qos_lock, flags);
399
400 return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32));
401}
402
379static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 403static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
380 size_t count, loff_t *f_pos) 404 size_t count, loff_t *f_pos)
381{ 405{
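
The new read handler returns the aggregated target as a raw s32, with simple_read_from_buffer() handling the offset and short-read bookkeeping. A userspace sketch of consuming it, assuming the /dev/cpu_dma_latency misc device (opening it also registers a default request, which is what makes the read valid):

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int32_t value;
		int fd = open("/dev/cpu_dma_latency", O_RDONLY);

		if (fd < 0)
			return 1;
		if (read(fd, &value, sizeof(value)) == sizeof(value))
			printf("current target: %d\n", (int)value);
		close(fd);	/* also drops the request made at open() */
		return 0;
	}
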
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 265729966ece..4603f08dc47b 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -1,125 +1,12 @@
1config PM
2 bool "Power Management support"
3 depends on !IA64_HP_SIM
4 ---help---
5 "Power Management" means that parts of your computer are shut
6 off or put into a power conserving "sleep" mode if they are not
7 being used. There are two competing standards for doing this: APM
8 and ACPI. If you want to use either one, say Y here and then also
9 to the requisite support below.
10
11 Power Management is most important for battery powered laptop
12 computers; if you have a laptop, check out the Linux Laptop home
13 page on the WWW at <http://www.linux-on-laptops.com/> or
14 Tuxmobil - Linux on Mobile Computers at <http://www.tuxmobil.org/>
15 and the Battery Powered Linux mini-HOWTO, available from
16 <http://www.tldp.org/docs.html#howto>.
17
18 Note that, even if you say N here, Linux on the x86 architecture
19 will issue the hlt instruction if nothing is to be done, thereby
20 sending the processor to sleep and saving power.
21
22config PM_DEBUG
23 bool "Power Management Debug Support"
24 depends on PM
25 ---help---
26 This option enables various debugging support in the Power Management
27 code. This is helpful when debugging and reporting PM bugs, like
28 suspend support.
29
30config PM_ADVANCED_DEBUG
31 bool "Extra PM attributes in sysfs for low-level debugging/testing"
32 depends on PM_DEBUG
33 default n
34 ---help---
35 Add extra sysfs attributes allowing one to access some Power Management
36 fields of device objects from user space. If you are not a kernel
37 developer interested in debugging/testing Power Management, say "no".
38
39config PM_VERBOSE
40 bool "Verbose Power Management debugging"
41 depends on PM_DEBUG
42 default n
43 ---help---
44 This option enables verbose messages from the Power Management code.
45
46config CAN_PM_TRACE
47 def_bool y
48 depends on PM_DEBUG && PM_SLEEP && EXPERIMENTAL
49
50config PM_TRACE
51 bool
52 help
53 This enables code to save the last PM event point across
54 reboot. The architecture needs to support this, x86 for
55 example does by saving things in the RTC, see below.
56
57 The architecture specific code must provide the extern
58 functions from <linux/resume-trace.h> as well as the
59 <asm/resume-trace.h> header with a TRACE_RESUME() macro.
60
61 The way the information is presented is architecture-
62 dependent, x86 will print the information during a
63 late_initcall.
64
65config PM_TRACE_RTC
66 bool "Suspend/resume event tracing"
67 depends on CAN_PM_TRACE
68 depends on X86
69 select PM_TRACE
70 default n
71 ---help---
72 This enables some cheesy code to save the last PM event point in the
73 RTC across reboots, so that you can debug a machine that just hangs
74 during suspend (or more commonly, during resume).
75
76 To use this debugging feature you should attempt to suspend the
77 machine, reboot it and then run
78
79 dmesg -s 1000000 | grep 'hash matches'
80
81 CAUTION: this option will cause your machine's real-time clock to be
82 set to an invalid time after a resume.
83
84config PM_SLEEP_SMP
85 bool
86 depends on SMP
87 depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE
88 depends on PM_SLEEP
89 select HOTPLUG
90 select HOTPLUG_CPU
91 default y
92
93config PM_SLEEP
94 bool
95 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
96 default y
97
98config PM_SLEEP_ADVANCED_DEBUG
99 bool
100 depends on PM_ADVANCED_DEBUG
101 default n
102
103config SUSPEND 1config SUSPEND
104 bool "Suspend to RAM and standby" 2 bool "Suspend to RAM and standby"
105 depends on PM && ARCH_SUSPEND_POSSIBLE 3 depends on ARCH_SUSPEND_POSSIBLE
106 default y 4 default y
107 ---help--- 5 ---help---
108 Allow the system to enter sleep states in which main memory is 6 Allow the system to enter sleep states in which main memory is
109 powered and thus its contents are preserved, such as the 7 powered and thus its contents are preserved, such as the
110 suspend-to-RAM state (e.g. the ACPI S3 state). 8 suspend-to-RAM state (e.g. the ACPI S3 state).
111 9
112config PM_TEST_SUSPEND
113 bool "Test suspend/resume and wakealarm during bootup"
114 depends on SUSPEND && PM_DEBUG && RTC_CLASS=y
115 ---help---
116 This option will let you suspend your machine during bootup, and
117 make it wake up a few seconds later using an RTC wakeup alarm.
118 Enable this with a kernel parameter like "test_suspend=mem".
119
120 You probably want to have your system's RTC driver statically
121 linked, ensuring that it's available when this test runs.
122
123config SUSPEND_FREEZER 10config SUSPEND_FREEZER
124 bool "Enable freezer for suspend to RAM/standby" \ 11 bool "Enable freezer for suspend to RAM/standby" \
125 if ARCH_WANTS_FREEZER_CONTROL || BROKEN 12 if ARCH_WANTS_FREEZER_CONTROL || BROKEN
@@ -133,7 +20,7 @@ config SUSPEND_FREEZER
133 20
134config HIBERNATION 21config HIBERNATION
135 bool "Hibernation (aka 'suspend to disk')" 22 bool "Hibernation (aka 'suspend to disk')"
136 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE 23 depends on SWAP && ARCH_HIBERNATION_POSSIBLE
137 select LZO_COMPRESS 24 select LZO_COMPRESS
138 select LZO_DECOMPRESS 25 select LZO_DECOMPRESS
139 ---help--- 26 ---help---
@@ -196,6 +83,106 @@ config PM_STD_PARTITION
196 suspended image to. It will simply pick the first available swap 83 suspended image to. It will simply pick the first available swap
197 device. 84 device.
198 85
86config PM_SLEEP
87 def_bool y
88 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
89
90config PM_SLEEP_SMP
91 def_bool y
92 depends on SMP
93 depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE
94 depends on PM_SLEEP
95 select HOTPLUG
96 select HOTPLUG_CPU
97
98config PM_RUNTIME
99 bool "Run-time PM core functionality"
100 depends on !IA64_HP_SIM
101 ---help---
102 Enable functionality allowing I/O devices to be put into energy-saving
103 (low power) states at run time (or autosuspended) after a specified
104 period of inactivity and woken up in response to a hardware-generated
105 wake-up event or a driver's request.
106
107 Hardware support is generally required for this functionality to work
108 and the bus type drivers of the buses the devices are on are
109 responsible for the actual handling of the autosuspend requests and
110 wake-up events.
111
112config PM
113 def_bool y
114 depends on PM_SLEEP || PM_RUNTIME
115
116config PM_DEBUG
117 bool "Power Management Debug Support"
118 depends on PM
119 ---help---
120 This option enables various debugging support in the Power Management
121 code. This is helpful when debugging and reporting PM bugs, like
122 suspend support.
123
124config PM_VERBOSE
125 bool "Verbose Power Management debugging"
126 depends on PM_DEBUG
127 ---help---
128 This option enables verbose messages from the Power Management code.
129
130config PM_ADVANCED_DEBUG
131 bool "Extra PM attributes in sysfs for low-level debugging/testing"
132 depends on PM_DEBUG
133 ---help---
134 Add extra sysfs attributes allowing one to access some Power Management
135 fields of device objects from user space. If you are not a kernel
136 developer interested in debugging/testing Power Management, say "no".
137
138config PM_TEST_SUSPEND
139 bool "Test suspend/resume and wakealarm during bootup"
140 depends on SUSPEND && PM_DEBUG && RTC_CLASS=y
141 ---help---
142 This option will let you suspend your machine during bootup, and
143 make it wake up a few seconds later using an RTC wakeup alarm.
144 Enable this with a kernel parameter like "test_suspend=mem".
145
146 You probably want to have your system's RTC driver statically
147 linked, ensuring that it's available when this test runs.
148
149config CAN_PM_TRACE
150 def_bool y
151 depends on PM_DEBUG && PM_SLEEP
152
153config PM_TRACE
154 bool
155 help
156 This enables code to save the last PM event point across
157 reboot. The architecture needs to support this, x86 for
158 example does by saving things in the RTC, see below.
159
160 The architecture specific code must provide the extern
161 functions from <linux/resume-trace.h> as well as the
162 <asm/resume-trace.h> header with a TRACE_RESUME() macro.
163
164 The way the information is presented is architecture-
165 dependent, x86 will print the information during a
166 late_initcall.
167
168config PM_TRACE_RTC
169 bool "Suspend/resume event tracing"
170 depends on CAN_PM_TRACE
171 depends on X86
172 select PM_TRACE
173 ---help---
174 This enables some cheesy code to save the last PM event point in the
175 RTC across reboots, so that you can debug a machine that just hangs
176 during suspend (or more commonly, during resume).
177
178 To use this debugging feature you should attempt to suspend the
179 machine, reboot it and then run
180
181 dmesg -s 1000000 | grep 'hash matches'
182
183 CAUTION: this option will cause your machine's real-time clock to be
184 set to an invalid time after a resume.
185
199config APM_EMULATION 186config APM_EMULATION
200 tristate "Advanced Power Management Emulation" 187 tristate "Advanced Power Management Emulation"
201 depends on PM && SYS_SUPPORTS_APM_EMULATION 188 depends on PM && SYS_SUPPORTS_APM_EMULATION
@@ -222,31 +209,11 @@ config APM_EMULATION
222 anything, try disabling/enabling this option (or disabling/enabling 209 anything, try disabling/enabling this option (or disabling/enabling
223 APM in your BIOS). 210 APM in your BIOS).
224 211
225config PM_RUNTIME
226 bool "Run-time PM core functionality"
227 depends on PM
228 ---help---
229 Enable functionality allowing I/O devices to be put into energy-saving
230 (low power) states at run time (or autosuspended) after a specified
231 period of inactivity and woken up in response to a hardware-generated
232 wake-up event or a driver's request.
233
234 Hardware support is generally required for this functionality to work
235 and the bus type drivers of the buses the devices are on are
236 responsible for the actual handling of the autosuspend requests and
237 wake-up events.
238
239config PM_OPS
240 bool
241 depends on PM_SLEEP || PM_RUNTIME
242 default y
243
244config ARCH_HAS_OPP 212config ARCH_HAS_OPP
245 bool 213 bool
246 214
247config PM_OPP 215config PM_OPP
248 bool "Operating Performance Point (OPP) Layer library" 216 bool "Operating Performance Point (OPP) Layer library"
249 depends on PM
250 depends on ARCH_HAS_OPP 217 depends on ARCH_HAS_OPP
251 ---help--- 218 ---help---
252 SOCs have a standard set of tuples consisting of frequency and 219 SOCs have a standard set of tuples consisting of frequency and
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index c350e18b53e3..c5ebc6a90643 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,4 +1,5 @@
1ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG 1
2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
2 3
3obj-$(CONFIG_PM) += main.o 4obj-$(CONFIG_PM) += main.o
4obj-$(CONFIG_PM_SLEEP) += console.o 5obj-$(CONFIG_PM_SLEEP) += console.o
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
index 83bbc7c02df9..d09dd10c5a5e 100644
--- a/kernel/power/block_io.c
+++ b/kernel/power/block_io.c
@@ -28,7 +28,7 @@
28static int submit(int rw, struct block_device *bdev, sector_t sector, 28static int submit(int rw, struct block_device *bdev, sector_t sector,
29 struct page *page, struct bio **bio_chain) 29 struct page *page, struct bio **bio_chain)
30{ 30{
31 const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG; 31 const int bio_rw = rw | REQ_SYNC;
32 struct bio *bio; 32 struct bio *bio;
33 33
34 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); 34 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 1832bd264219..aeabd26e3342 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -23,6 +23,7 @@
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <linux/gfp.h> 25#include <linux/gfp.h>
26#include <linux/syscore_ops.h>
26#include <scsi/scsi_scan.h> 27#include <scsi/scsi_scan.h>
27#include <asm/suspend.h> 28#include <asm/suspend.h>
28 29
@@ -272,6 +273,8 @@ static int create_image(int platform_mode)
272 local_irq_disable(); 273 local_irq_disable();
273 274
274 error = sysdev_suspend(PMSG_FREEZE); 275 error = sysdev_suspend(PMSG_FREEZE);
276 if (!error)
277 error = syscore_suspend();
275 if (error) { 278 if (error) {
276 printk(KERN_ERR "PM: Some system devices failed to power down, " 279 printk(KERN_ERR "PM: Some system devices failed to power down, "
277 "aborting hibernation\n"); 280 "aborting hibernation\n");
@@ -295,6 +298,7 @@ static int create_image(int platform_mode)
295 } 298 }
296 299
297 Power_up: 300 Power_up:
301 syscore_resume();
298 sysdev_resume(); 302 sysdev_resume();
299 /* NOTE: dpm_resume_noirq() is just a resume() for devices 303 /* NOTE: dpm_resume_noirq() is just a resume() for devices
300 * that suspended with irqs off ... no overall powerup. 304 * that suspended with irqs off ... no overall powerup.
@@ -403,6 +407,8 @@ static int resume_target_kernel(bool platform_mode)
403 local_irq_disable(); 407 local_irq_disable();
404 408
405 error = sysdev_suspend(PMSG_QUIESCE); 409 error = sysdev_suspend(PMSG_QUIESCE);
410 if (!error)
411 error = syscore_suspend();
406 if (error) 412 if (error)
407 goto Enable_irqs; 413 goto Enable_irqs;
408 414
@@ -429,6 +435,7 @@ static int resume_target_kernel(bool platform_mode)
429 restore_processor_state(); 435 restore_processor_state();
430 touch_softlockup_watchdog(); 436 touch_softlockup_watchdog();
431 437
438 syscore_resume();
432 sysdev_resume(); 439 sysdev_resume();
433 440
434 Enable_irqs: 441 Enable_irqs:
@@ -516,6 +523,7 @@ int hibernation_platform_enter(void)
516 523
517 local_irq_disable(); 524 local_irq_disable();
518 sysdev_suspend(PMSG_HIBERNATE); 525 sysdev_suspend(PMSG_HIBERNATE);
526 syscore_suspend();
519 if (pm_wakeup_pending()) { 527 if (pm_wakeup_pending()) {
520 error = -EAGAIN; 528 error = -EAGAIN;
521 goto Power_up; 529 goto Power_up;
@@ -526,6 +534,7 @@ int hibernation_platform_enter(void)
526 while (1); 534 while (1);
527 535
528 Power_up: 536 Power_up:
537 syscore_resume();
529 sysdev_resume(); 538 sysdev_resume();
530 local_irq_enable(); 539 local_irq_enable();
531 enable_nonboot_cpus(); 540 enable_nonboot_cpus();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 701853042c28..8eaba5f27b10 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -17,9 +17,6 @@
17 17
18DEFINE_MUTEX(pm_mutex); 18DEFINE_MUTEX(pm_mutex);
19 19
20unsigned int pm_flags;
21EXPORT_SYMBOL(pm_flags);
22
23#ifdef CONFIG_PM_SLEEP 20#ifdef CONFIG_PM_SLEEP
24 21
25/* Routines for PM-transition notifications */ 22/* Routines for PM-transition notifications */
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 64db648ff911..ca0aacc24874 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -42,15 +42,15 @@ static void swsusp_unset_page_forbidden(struct page *);
42 42
43/* 43/*
44 * Preferred image size in bytes (tunable via /sys/power/image_size). 44 * Preferred image size in bytes (tunable via /sys/power/image_size).
45 * When it is set to N, swsusp will do its best to ensure the image 45 * When it is set to N, the image creating code will do its best to
46 * size will not exceed N bytes, but if that is impossible, it will 46 * ensure the image size will not exceed N bytes, but if that is
47 * try to create the smallest image possible. 47 * impossible, it will try to create the smallest image possible.
48 */ 48 */
49unsigned long image_size; 49unsigned long image_size;
50 50
51void __init hibernate_image_size_init(void) 51void __init hibernate_image_size_init(void)
52{ 52{
53 image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; 53 image_size = (totalram_pages / 3) * PAGE_SIZE;
54} 54}
55 55
56/* List of PBEs needed for restoring the pages that were allocated before 56/* List of PBEs needed for restoring the pages that were allocated before
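
The default preferred image size drops from 2/5 to 1/3 of RAM. A back-of-envelope sketch for a machine with 4 GiB of 4 KiB pages:

	#include <stdio.h>

	int main(void)
	{
		unsigned long long pages = (4ULL << 30) / 4096;	/* 4 GiB of 4 KiB pages */
		unsigned long long old_sz = ((pages * 2) / 5) * 4096;
		unsigned long long new_sz = (pages / 3) * 4096;

		printf("old default: %llu MiB\n", old_sz >> 20);	/* 1638 MiB */
		printf("new default: %llu MiB\n", new_sz >> 20);	/* 1365 MiB */
		return 0;
	}
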
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index de6f86bfa303..2814c32aed51 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -22,6 +22,7 @@
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/suspend.h> 24#include <linux/suspend.h>
25#include <linux/syscore_ops.h>
25#include <trace/events/power.h> 26#include <trace/events/power.h>
26 27
27#include "power.h" 28#include "power.h"
@@ -163,11 +164,14 @@ static int suspend_enter(suspend_state_t state)
163 BUG_ON(!irqs_disabled()); 164 BUG_ON(!irqs_disabled());
164 165
165 error = sysdev_suspend(PMSG_SUSPEND); 166 error = sysdev_suspend(PMSG_SUSPEND);
167 if (!error)
168 error = syscore_suspend();
166 if (!error) { 169 if (!error) {
167 if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { 170 if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) {
168 error = suspend_ops->enter(state); 171 error = suspend_ops->enter(state);
169 events_check_enabled = false; 172 events_check_enabled = false;
170 } 173 }
174 syscore_resume();
171 sysdev_resume(); 175 sysdev_resume();
172 } 176 }
173 177
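
All of these hunks bracket the existing sysdev calls with syscore_suspend()/syscore_resume(), which run the registered struct syscore_ops callbacks with one CPU online and interrupts disabled. A minimal sketch of registering such a hook, with hypothetical callback names:

	#include <linux/syscore_ops.h>

	static int my_syscore_suspend(void)
	{
		/* runs late, with one CPU online and interrupts off */
		return 0;	/* a non-zero return aborts the transition */
	}

	static void my_syscore_resume(void)
	{
		/* restore whatever the suspend callback saved */
	}

	static struct syscore_ops my_syscore_ops = {
		.suspend = my_syscore_suspend,
		.resume  = my_syscore_resume,
	};

	static int __init my_syscore_init(void)
	{
		register_syscore_ops(&my_syscore_ops);
		return 0;
	}
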
diff --git a/kernel/printk.c b/kernel/printk.c
index 36231525e22f..da8ca817eae3 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -53,7 +53,7 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
53#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) 53#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
54 54
55/* printk's without a loglevel use this.. */ 55/* printk's without a loglevel use this.. */
56#define DEFAULT_MESSAGE_LOGLEVEL 4 /* KERN_WARNING */ 56#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
57 57
58/* We show everything that is MORE important than this.. */ 58/* We show everything that is MORE important than this.. */
59#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ 59#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
@@ -113,6 +113,11 @@ static unsigned con_start; /* Index into log_buf: next char to be sent to consol
113static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */ 113static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */
114 114
115/* 115/*
116 * If exclusive_console is non-NULL then only this console is to be printed to.
117 */
118static struct console *exclusive_console;
119
120/*
116 * Array of consoles built from command line options (console=) 121 * Array of consoles built from command line options (console=)
117 */ 122 */
118struct console_cmdline 123struct console_cmdline
@@ -476,6 +481,8 @@ static void __call_console_drivers(unsigned start, unsigned end)
476 struct console *con; 481 struct console *con;
477 482
478 for_each_console(con) { 483 for_each_console(con) {
484 if (exclusive_console && con != exclusive_console)
485 continue;
479 if ((con->flags & CON_ENABLED) && con->write && 486 if ((con->flags & CON_ENABLED) && con->write &&
480 (cpu_online(smp_processor_id()) || 487 (cpu_online(smp_processor_id()) ||
481 (con->flags & CON_ANYTIME))) 488 (con->flags & CON_ANYTIME)))
@@ -515,6 +522,71 @@ static void _call_console_drivers(unsigned start,
515} 522}
516 523
517/* 524/*
 525 * Parse the syslog header <[0-9]*>. The decimal value is a 32-bit number;
 526 * the lower 3 bits are the log level, the rest is the log facility. In case
527 * userspace passes usual userspace syslog messages to /dev/kmsg or
528 * /dev/ttyprintk, the log prefix might contain the facility. Printk needs
529 * to extract the correct log level for in-kernel processing, and not mangle
530 * the original value.
531 *
532 * If a prefix is found, the length of the prefix is returned. If 'level' is
533 * passed, it will be filled in with the log level without a possible facility
534 * value. If 'special' is passed, the special printk prefix chars are accepted
535 * and returned. If no valid header is found, 0 is returned and the passed
536 * variables are not touched.
537 */
538static size_t log_prefix(const char *p, unsigned int *level, char *special)
539{
540 unsigned int lev = 0;
541 char sp = '\0';
542 size_t len;
543
544 if (p[0] != '<' || !p[1])
545 return 0;
546 if (p[2] == '>') {
547 /* usual single digit level number or special char */
548 switch (p[1]) {
549 case '0' ... '7':
550 lev = p[1] - '0';
551 break;
552 case 'c': /* KERN_CONT */
553 case 'd': /* KERN_DEFAULT */
554 sp = p[1];
555 break;
556 default:
557 return 0;
558 }
559 len = 3;
560 } else {
561 /* multi digit including the level and facility number */
562 char *endp = NULL;
563
 564 if (p[1] < '0' || p[1] > '9')
565 return 0;
566
567 lev = (simple_strtoul(&p[1], &endp, 10) & 7);
568 if (endp == NULL || endp[0] != '>')
569 return 0;
570 len = (endp + 1) - p;
571 }
572
573 /* do not accept special char if not asked for */
574 if (sp && !special)
575 return 0;
576
577 if (special) {
578 *special = sp;
579 /* return special char, do not touch level */
580 if (sp)
581 return len;
582 }
583
584 if (level)
585 *level = lev;
586 return len;
587}
588
589/*
518 * Call the console drivers, asking them to write out 590 * Call the console drivers, asking them to write out
519 * log_buf[start] to log_buf[end - 1]. 591 * log_buf[start] to log_buf[end - 1].
520 * The console_lock must be held. 592 * The console_lock must be held.
@@ -529,13 +601,9 @@ static void call_console_drivers(unsigned start, unsigned end)
529 cur_index = start; 601 cur_index = start;
530 start_print = start; 602 start_print = start;
531 while (cur_index != end) { 603 while (cur_index != end) {
532 if (msg_level < 0 && ((end - cur_index) > 2) && 604 if (msg_level < 0 && ((end - cur_index) > 2)) {
533 LOG_BUF(cur_index + 0) == '<' && 605 /* strip log prefix */
534 LOG_BUF(cur_index + 1) >= '0' && 606 cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL);
535 LOG_BUF(cur_index + 1) <= '7' &&
536 LOG_BUF(cur_index + 2) == '>') {
537 msg_level = LOG_BUF(cur_index + 1) - '0';
538 cur_index += 3;
539 start_print = cur_index; 607 start_print = cur_index;
540 } 608 }
541 while (cur_index != end) { 609 while (cur_index != end) {
@@ -733,6 +801,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
733 unsigned long flags; 801 unsigned long flags;
734 int this_cpu; 802 int this_cpu;
735 char *p; 803 char *p;
804 size_t plen;
805 char special;
736 806
737 boot_delay_msec(); 807 boot_delay_msec();
738 printk_delay(); 808 printk_delay();
@@ -773,45 +843,52 @@ asmlinkage int vprintk(const char *fmt, va_list args)
773 printed_len += vscnprintf(printk_buf + printed_len, 843 printed_len += vscnprintf(printk_buf + printed_len,
774 sizeof(printk_buf) - printed_len, fmt, args); 844 sizeof(printk_buf) - printed_len, fmt, args);
775 845
776
777 p = printk_buf; 846 p = printk_buf;
778 847
779 /* Do we have a loglevel in the string? */ 848 /* Read log level and handle special printk prefix */
780 if (p[0] == '<') { 849 plen = log_prefix(p, &current_log_level, &special);
781 unsigned char c = p[1]; 850 if (plen) {
782 if (c && p[2] == '>') { 851 p += plen;
783 switch (c) { 852
784 case '0' ... '7': /* loglevel */ 853 switch (special) {
785 current_log_level = c - '0'; 854 case 'c': /* Strip <c> KERN_CONT, continue line */
786 /* Fallthrough - make sure we're on a new line */ 855 plen = 0;
787 case 'd': /* KERN_DEFAULT */ 856 break;
788 if (!new_text_line) { 857 case 'd': /* Strip <d> KERN_DEFAULT, start new line */
789 emit_log_char('\n'); 858 plen = 0;
790 new_text_line = 1; 859 default:
791 } 860 if (!new_text_line) {
792 /* Fallthrough - skip the loglevel */ 861 emit_log_char('\n');
793 case 'c': /* KERN_CONT */ 862 new_text_line = 1;
794 p += 3;
795 break;
796 } 863 }
797 } 864 }
798 } 865 }
799 866
800 /* 867 /*
801 * Copy the output into log_buf. If the caller didn't provide 868 * Copy the output into log_buf. If the caller didn't provide
802 * appropriate log level tags, we insert them here 869 * the appropriate log prefix, we insert them here
803 */ 870 */
804 for ( ; *p; p++) { 871 for (; *p; p++) {
805 if (new_text_line) { 872 if (new_text_line) {
806 /* Always output the token */
807 emit_log_char('<');
808 emit_log_char(current_log_level + '0');
809 emit_log_char('>');
810 printed_len += 3;
811 new_text_line = 0; 873 new_text_line = 0;
812 874
875 if (plen) {
876 /* Copy original log prefix */
877 int i;
878
879 for (i = 0; i < plen; i++)
880 emit_log_char(printk_buf[i]);
881 printed_len += plen;
882 } else {
883 /* Add log prefix */
884 emit_log_char('<');
885 emit_log_char(current_log_level + '0');
886 emit_log_char('>');
887 printed_len += 3;
888 }
889
813 if (printk_time) { 890 if (printk_time) {
814 /* Follow the token with the time */ 891 /* Add the current time stamp */
815 char tbuf[50], *tp; 892 char tbuf[50], *tp;
816 unsigned tlen; 893 unsigned tlen;
817 unsigned long long t; 894 unsigned long long t;
@@ -1160,6 +1237,11 @@ void console_unlock(void)
1160 local_irq_restore(flags); 1237 local_irq_restore(flags);
1161 } 1238 }
1162 console_locked = 0; 1239 console_locked = 0;
1240
1241 /* Release the exclusive_console once it is used */
1242 if (unlikely(exclusive_console))
1243 exclusive_console = NULL;
1244
1163 up(&console_sem); 1245 up(&console_sem);
1164 spin_unlock_irqrestore(&logbuf_lock, flags); 1246 spin_unlock_irqrestore(&logbuf_lock, flags);
1165 if (wake_klogd) 1247 if (wake_klogd)
@@ -1246,6 +1328,18 @@ void console_start(struct console *console)
1246} 1328}
1247EXPORT_SYMBOL(console_start); 1329EXPORT_SYMBOL(console_start);
1248 1330
1331static int __read_mostly keep_bootcon;
1332
1333static int __init keep_bootcon_setup(char *str)
1334{
1335 keep_bootcon = 1;
1336 printk(KERN_INFO "debug: skip boot console de-registration.\n");
1337
1338 return 0;
1339}
1340
1341early_param("keep_bootcon", keep_bootcon_setup);
1342
1249/* 1343/*
1250 * The console driver calls this routine during kernel initialization 1344 * The console driver calls this routine during kernel initialization
1251 * to register the console printing procedure with printk() and to 1345 * to register the console printing procedure with printk() and to
@@ -1382,6 +1476,12 @@ void register_console(struct console *newcon)
1382 spin_lock_irqsave(&logbuf_lock, flags); 1476 spin_lock_irqsave(&logbuf_lock, flags);
1383 con_start = log_start; 1477 con_start = log_start;
1384 spin_unlock_irqrestore(&logbuf_lock, flags); 1478 spin_unlock_irqrestore(&logbuf_lock, flags);
1479 /*
1480 * We're about to replay the log buffer. Only do this to the
1481 * just-registered console to avoid excessive message spam to
1482 * the already-registered consoles.
1483 */
1484 exclusive_console = newcon;
1385 } 1485 }
1386 console_unlock(); 1486 console_unlock();
1387 console_sysfs_notify(); 1487 console_sysfs_notify();
@@ -1393,7 +1493,9 @@ void register_console(struct console *newcon)
1393 * users know there might be something in the kernel's log buffer that 1493 * users know there might be something in the kernel's log buffer that
1394 * went to the bootconsole (that they do not see on the real console) 1494 * went to the bootconsole (that they do not see on the real console)
1395 */ 1495 */
1396 if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { 1496 if (bcon &&
1497 ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
1498 !keep_bootcon) {
1397 /* we need to iterate through twice, to make sure we print 1499 /* we need to iterate through twice, to make sure we print
1398 * everything out, before we unregister the console(s) 1500 * everything out, before we unregister the console(s)
1399 */ 1501 */
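
log_prefix() accepts both the classic single-digit <4> header and the multi-digit syslog form, where the value is facility << 3 | level. A standalone (non-kernel) sketch of the same parsing rule:

	#include <stdio.h>
	#include <stdlib.h>

	/* returns prefix length, fills *level with the low 3 bits */
	static size_t parse_prefix(const char *p, unsigned int *level)
	{
		char *endp;
		unsigned long v;

		if (p[0] != '<' || !p[1])
			return 0;
		v = strtoul(p + 1, &endp, 10);
		if (endp == p + 1 || *endp != '>')
			return 0;
		*level = v & 7;		/* the facility lives in the upper bits */
		return (endp + 1) - p;
	}

	int main(void)
	{
		unsigned int level;

		/* <13> is facility 1 (user) << 3 | level 5 (notice) */
		if (parse_prefix("<13>user message", &level))
			printf("level %u\n", level);	/* prints "level 5" */
		return 0;
	}
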
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index e2302e40b360..0fc1eed28d27 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -134,21 +134,24 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
134 return 0; 134 return 0;
135 rcu_read_lock(); 135 rcu_read_lock();
136 tcred = __task_cred(task); 136 tcred = __task_cred(task);
137 if ((cred->uid != tcred->euid || 137 if (cred->user->user_ns == tcred->user->user_ns &&
138 cred->uid != tcred->suid || 138 (cred->uid == tcred->euid &&
139 cred->uid != tcred->uid || 139 cred->uid == tcred->suid &&
140 cred->gid != tcred->egid || 140 cred->uid == tcred->uid &&
141 cred->gid != tcred->sgid || 141 cred->gid == tcred->egid &&
142 cred->gid != tcred->gid) && 142 cred->gid == tcred->sgid &&
143 !capable(CAP_SYS_PTRACE)) { 143 cred->gid == tcred->gid))
144 rcu_read_unlock(); 144 goto ok;
145 return -EPERM; 145 if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE))
146 } 146 goto ok;
147 rcu_read_unlock();
148 return -EPERM;
149ok:
147 rcu_read_unlock(); 150 rcu_read_unlock();
148 smp_rmb(); 151 smp_rmb();
149 if (task->mm) 152 if (task->mm)
150 dumpable = get_dumpable(task->mm); 153 dumpable = get_dumpable(task->mm);
151 if (!dumpable && !capable(CAP_SYS_PTRACE)) 154 if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE))
152 return -EPERM; 155 return -EPERM;
153 156
154 return security_ptrace_access_check(task, mode); 157 return security_ptrace_access_check(task, mode);
@@ -198,7 +201,7 @@ static int ptrace_attach(struct task_struct *task)
198 goto unlock_tasklist; 201 goto unlock_tasklist;
199 202
200 task->ptrace = PT_PTRACED; 203 task->ptrace = PT_PTRACED;
201 if (capable(CAP_SYS_PTRACE)) 204 if (task_ns_capable(task, CAP_SYS_PTRACE))
202 task->ptrace |= PT_PTRACE_CAP; 205 task->ptrace |= PT_PTRACE_CAP;
203 206
204 __ptrace_link(task, current); 207 __ptrace_link(task, current);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a23a57a976d1..f3240e987928 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -214,11 +214,12 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
214 * Ensure that queued callbacks are all executed. 214 * Ensure that queued callbacks are all executed.
215 * If we detect that we are nested in a RCU read-side critical 215 * If we detect that we are nested in a RCU read-side critical
216 * section, we should simply fail, otherwise we would deadlock. 216 * section, we should simply fail, otherwise we would deadlock.
217 * Note that the machinery to reliably determine whether
218 * or not we are in an RCU read-side critical section
219 * exists only in the preemptible RCU implementations
220 * (TINY_PREEMPT_RCU and TREE_PREEMPT_RCU), which is why
221 * DEBUG_OBJECTS_RCU_HEAD is disallowed if !PREEMPT.
217 */ 222 */
218#ifndef CONFIG_PREEMPT
219 WARN_ON(1);
220 return 0;
221#else
222 if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 223 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
223 irqs_disabled()) { 224 irqs_disabled()) {
224 WARN_ON(1); 225 WARN_ON(1);
@@ -229,7 +230,6 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
229 rcu_barrier_bh(); 230 rcu_barrier_bh();
230 debug_object_free(head, &rcuhead_debug_descr); 231 debug_object_free(head, &rcuhead_debug_descr);
231 return 1; 232 return 1;
232#endif
233 default: 233 default:
234 return 0; 234 return 0;
235 } 235 }
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 015abaea962a..3cb8e362e883 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -852,7 +852,7 @@ void exit_rcu(void)
852 if (t->rcu_read_lock_nesting == 0) 852 if (t->rcu_read_lock_nesting == 0)
853 return; 853 return;
854 t->rcu_read_lock_nesting = 1; 854 t->rcu_read_lock_nesting = 1;
855 rcu_read_unlock(); 855 __rcu_read_unlock();
856} 856}
857 857
858#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ 858#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 89613f97ff26..c224da41890c 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -47,7 +47,6 @@
47#include <linux/srcu.h> 47#include <linux/srcu.h>
48#include <linux/slab.h> 48#include <linux/slab.h>
49#include <asm/byteorder.h> 49#include <asm/byteorder.h>
50#include <linux/sched.h>
51 50
52MODULE_LICENSE("GPL"); 51MODULE_LICENSE("GPL");
53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index c7eaa37a768b..34683efa2cce 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -126,10 +126,24 @@ ssize_t res_counter_read(struct res_counter *counter, int member,
126 pos, buf, s - buf); 126 pos, buf, s - buf);
127} 127}
128 128
129#if BITS_PER_LONG == 32
130u64 res_counter_read_u64(struct res_counter *counter, int member)
131{
132 unsigned long flags;
133 u64 ret;
134
135 spin_lock_irqsave(&counter->lock, flags);
136 ret = *res_counter_member(counter, member);
137 spin_unlock_irqrestore(&counter->lock, flags);
138
139 return ret;
140}
141#else
129u64 res_counter_read_u64(struct res_counter *counter, int member) 142u64 res_counter_read_u64(struct res_counter *counter, int member)
130{ 143{
131 return *res_counter_member(counter, member); 144 return *res_counter_member(counter, member);
132} 145}
146#endif
133 147
134int res_counter_memparse_write_strategy(const char *buf, 148int res_counter_memparse_write_strategy(const char *buf,
135 unsigned long long *res) 149 unsigned long long *res)
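
On 32-bit, a u64 load takes two instructions, so a reader can observe a half-updated ("torn") value while a writer is mid-store; the new #if branch closes that window with the counter's spinlock. A generic sketch of the pattern, with a hypothetical counter64 type:

	#include <linux/spinlock.h>
	#include <linux/types.h>

	struct counter64 {
		spinlock_t lock;
		u64 value;		/* updated under lock by writers */
	};

	static u64 counter64_read(struct counter64 *c)
	{
	#if BITS_PER_LONG == 32
		unsigned long flags;
		u64 v;

		/* two 32-bit loads must not interleave with a writer's stores */
		spin_lock_irqsave(&c->lock, flags);
		v = c->value;
		spin_unlock_irqrestore(&c->lock, flags);
		return v;
	#else
		return c->value;	/* one aligned 64-bit load is atomic */
	#endif
	}
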
diff --git a/kernel/sched.c b/kernel/sched.c
index a361e20ec2cd..f592ce6f8616 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -32,7 +32,6 @@
32#include <linux/init.h> 32#include <linux/init.h>
33#include <linux/uaccess.h> 33#include <linux/uaccess.h>
34#include <linux/highmem.h> 34#include <linux/highmem.h>
35#include <linux/smp_lock.h>
36#include <asm/mmu_context.h> 35#include <asm/mmu_context.h>
37#include <linux/interrupt.h> 36#include <linux/interrupt.h>
38#include <linux/capability.h> 37#include <linux/capability.h>
@@ -4086,9 +4085,6 @@ need_resched:
4086 rcu_note_context_switch(cpu); 4085 rcu_note_context_switch(cpu);
4087 prev = rq->curr; 4086 prev = rq->curr;
4088 4087
4089 release_kernel_lock(prev);
4090need_resched_nonpreemptible:
4091
4092 schedule_debug(prev); 4088 schedule_debug(prev);
4093 4089
4094 if (sched_feat(HRTICK)) 4090 if (sched_feat(HRTICK))
@@ -4119,6 +4115,16 @@ need_resched_nonpreemptible:
4119 switch_count = &prev->nvcsw; 4115 switch_count = &prev->nvcsw;
4120 } 4116 }
4121 4117
4118 /*
4119 * If we are going to sleep and we have plugged IO queued, make
4120 * sure to submit it to avoid deadlocks.
4121 */
4122 if (prev->state != TASK_RUNNING && blk_needs_flush_plug(prev)) {
4123 raw_spin_unlock(&rq->lock);
4124 blk_flush_plug(prev);
4125 raw_spin_lock(&rq->lock);
4126 }
4127
4122 pre_schedule(rq, prev); 4128 pre_schedule(rq, prev);
4123 4129
4124 if (unlikely(!rq->nr_running)) 4130 if (unlikely(!rq->nr_running))
@@ -4148,9 +4154,6 @@ need_resched_nonpreemptible:
4148 4154
4149 post_schedule(rq); 4155 post_schedule(rq);
4150 4156
4151 if (unlikely(reacquire_kernel_lock(prev)))
4152 goto need_resched_nonpreemptible;
4153
4154 preempt_enable_no_resched(); 4157 preempt_enable_no_resched();
4155 if (need_resched()) 4158 if (need_resched())
4156 goto need_resched; 4159 goto need_resched;
@@ -4899,8 +4902,11 @@ static bool check_same_owner(struct task_struct *p)
4899 4902
4900 rcu_read_lock(); 4903 rcu_read_lock();
4901 pcred = __task_cred(p); 4904 pcred = __task_cred(p);
4902 match = (cred->euid == pcred->euid || 4905 if (cred->user->user_ns == pcred->user->user_ns)
4903 cred->euid == pcred->uid); 4906 match = (cred->euid == pcred->euid ||
4907 cred->euid == pcred->uid);
4908 else
4909 match = false;
4904 rcu_read_unlock(); 4910 rcu_read_unlock();
4905 return match; 4911 return match;
4906} 4912}
@@ -5228,7 +5234,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
5228 goto out_free_cpus_allowed; 5234 goto out_free_cpus_allowed;
5229 } 5235 }
5230 retval = -EPERM; 5236 retval = -EPERM;
5231 if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) 5237 if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
5232 goto out_unlock; 5238 goto out_unlock;
5233 5239
5234 retval = security_task_setscheduler(p); 5240 retval = security_task_setscheduler(p);
@@ -5534,6 +5540,7 @@ void __sched io_schedule(void)
5534 5540
5535 delayacct_blkio_start(); 5541 delayacct_blkio_start();
5536 atomic_inc(&rq->nr_iowait); 5542 atomic_inc(&rq->nr_iowait);
5543 blk_flush_plug(current);
5537 current->in_iowait = 1; 5544 current->in_iowait = 1;
5538 schedule(); 5545 schedule();
5539 current->in_iowait = 0; 5546 current->in_iowait = 0;
@@ -5549,6 +5556,7 @@ long __sched io_schedule_timeout(long timeout)
5549 5556
5550 delayacct_blkio_start(); 5557 delayacct_blkio_start();
5551 atomic_inc(&rq->nr_iowait); 5558 atomic_inc(&rq->nr_iowait);
5559 blk_flush_plug(current);
5552 current->in_iowait = 1; 5560 current->in_iowait = 1;
5553 ret = schedule_timeout(timeout); 5561 ret = schedule_timeout(timeout);
5554 current->in_iowait = 0; 5562 current->in_iowait = 0;
@@ -8279,7 +8287,7 @@ static inline int preempt_count_equals(int preempt_offset)
8279{ 8287{
8280 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 8288 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
8281 8289
8282 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 8290 return (nested == preempt_offset);
8283} 8291}
8284 8292
8285void __might_sleep(const char *file, int line, int preempt_offset) 8293void __might_sleep(const char *file, int line, int preempt_offset)
diff --git a/kernel/signal.c b/kernel/signal.c
index 4e3cff10fdce..324eff5468ad 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -636,13 +636,33 @@ static inline bool si_fromuser(const struct siginfo *info)
636} 636}
637 637
638/* 638/*
639 * called with RCU read lock from check_kill_permission()
640 */
641static int kill_ok_by_cred(struct task_struct *t)
642{
643 const struct cred *cred = current_cred();
644 const struct cred *tcred = __task_cred(t);
645
646 if (cred->user->user_ns == tcred->user->user_ns &&
647 (cred->euid == tcred->suid ||
648 cred->euid == tcred->uid ||
649 cred->uid == tcred->suid ||
650 cred->uid == tcred->uid))
651 return 1;
652
653 if (ns_capable(tcred->user->user_ns, CAP_KILL))
654 return 1;
655
656 return 0;
657}
658
659/*
639 * Bad permissions for sending the signal 660 * Bad permissions for sending the signal
640 * - the caller must hold the RCU read lock 661 * - the caller must hold the RCU read lock
641 */ 662 */
642static int check_kill_permission(int sig, struct siginfo *info, 663static int check_kill_permission(int sig, struct siginfo *info,
643 struct task_struct *t) 664 struct task_struct *t)
644{ 665{
645 const struct cred *cred, *tcred;
646 struct pid *sid; 666 struct pid *sid;
647 int error; 667 int error;
648 668
@@ -656,14 +676,8 @@ static int check_kill_permission(int sig, struct siginfo *info,
656 if (error) 676 if (error)
657 return error; 677 return error;
658 678
659 cred = current_cred();
660 tcred = __task_cred(t);
661 if (!same_thread_group(current, t) && 679 if (!same_thread_group(current, t) &&
662 (cred->euid ^ tcred->suid) && 680 !kill_ok_by_cred(t)) {
663 (cred->euid ^ tcred->uid) &&
664 (cred->uid ^ tcred->suid) &&
665 (cred->uid ^ tcred->uid) &&
666 !capable(CAP_KILL)) {
667 switch (sig) { 681 switch (sig) {
668 case SIGCONT: 682 case SIGCONT:
669 sid = task_session(t); 683 sid = task_session(t);
@@ -2421,9 +2435,13 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
2421 return -EFAULT; 2435 return -EFAULT;
2422 2436
2423 /* Not even root can pretend to send signals from the kernel. 2437 /* Not even root can pretend to send signals from the kernel.
2424 Nor can they impersonate a kill(), which adds source info. */ 2438 * Nor can they impersonate a kill()/tgkill(), which adds source info.
2425 if (info.si_code >= 0) 2439 */
2440 if (info.si_code != SI_QUEUE) {
2441 /* We used to allow any < 0 si_code */
2442 WARN_ON_ONCE(info.si_code < 0);
2426 return -EPERM; 2443 return -EPERM;
2444 }
2427 info.si_signo = sig; 2445 info.si_signo = sig;
2428 2446
2429 /* POSIX.1b doesn't mention process groups. */ 2447 /* POSIX.1b doesn't mention process groups. */
@@ -2437,9 +2455,13 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
2437 return -EINVAL; 2455 return -EINVAL;
2438 2456
2439 /* Not even root can pretend to send signals from the kernel. 2457 /* Not even root can pretend to send signals from the kernel.
2440 Nor can they impersonate a kill(), which adds source info. */ 2458 * Nor can they impersonate a kill()/tgkill(), which adds source info.
2441 if (info->si_code >= 0) 2459 */
2460 if (info->si_code != SI_QUEUE) {
2461 /* We used to allow any < 0 si_code */
2462 WARN_ON_ONCE(info->si_code < 0);
2442 return -EPERM; 2463 return -EPERM;
2464 }
2443 info->si_signo = sig; 2465 info->si_signo = sig;
2444 2466
2445 return do_send_specific(tgid, pid, sig, info); 2467 return do_send_specific(tgid, pid, sig, info);
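
The deleted check relied on a ^ b being zero exactly when a == b, so the chained form denied only when every uid pair differed; kill_ok_by_cred() states the same predicate with == plus the new per-namespace capability test. A small exhaustive check that the two forms agree:

	#include <assert.h>

	static int xor_form(unsigned a, unsigned b, unsigned c, unsigned d)
	{
		/* old style: allow unless every pair differs */
		return !((a ^ c) && (a ^ d) && (b ^ c) && (b ^ d));
	}

	static int eq_form(unsigned a, unsigned b, unsigned c, unsigned d)
	{
		/* new style: allow if any pair matches */
		return a == c || a == d || b == c || b == d;
	}

	int main(void)
	{
		unsigned a, b, c, d;

		for (a = 0; a < 4; a++)
			for (b = 0; b < 4; b++)
				for (c = 0; c < 4; c++)
					for (d = 0; d < 4; d++)
						assert(xor_form(a, b, c, d) ==
						       eq_form(a, b, c, d));
		return 0;
	}
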
diff --git a/kernel/smp.c b/kernel/smp.c
index 9910744f0856..73a195193558 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -194,7 +194,7 @@ void generic_smp_call_function_interrupt(void)
194 */ 194 */
195 list_for_each_entry_rcu(data, &call_function.queue, csd.list) { 195 list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
196 int refs; 196 int refs;
197 void (*func) (void *info); 197 smp_call_func_t func;
198 198
199 /* 199 /*
200 * Since we walk the list without any locks, we might 200 * Since we walk the list without any locks, we might
@@ -214,17 +214,17 @@ void generic_smp_call_function_interrupt(void)
214 if (atomic_read(&data->refs) == 0) 214 if (atomic_read(&data->refs) == 0)
215 continue; 215 continue;
216 216
217 func = data->csd.func; /* for later warn */ 217 func = data->csd.func; /* save for later warn */
218 data->csd.func(data->csd.info); 218 func(data->csd.info);
219 219
220 /* 220 /*
221 * If the cpu mask is not still set then it enabled interrupts, 221 * If the cpu mask is not still set then func enabled
222 * we took another smp interrupt, and executed the function 222 * interrupts (BUG), and this cpu took another smp call
223 * twice on this cpu. In theory that copy decremented refs. 223 * function interrupt and executed func(info) twice
224 * on this cpu. That nested execution decremented refs.
224 */ 225 */
225 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) { 226 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
226 WARN(1, "%pS enabled interrupts and double executed\n", 227 WARN(1, "%pf enabled interrupts and double executed\n", func);
227 func);
228 continue; 228 continue;
229 } 229 }
230 230
@@ -450,7 +450,7 @@ void smp_call_function_many(const struct cpumask *mask,
450{ 450{
451 struct call_function_data *data; 451 struct call_function_data *data;
452 unsigned long flags; 452 unsigned long flags;
453 int cpu, next_cpu, this_cpu = smp_processor_id(); 453 int refs, cpu, next_cpu, this_cpu = smp_processor_id();
454 454
455 /* 455 /*
456 * Can deadlock when called with interrupts disabled. 456 * Can deadlock when called with interrupts disabled.
@@ -461,7 +461,7 @@ void smp_call_function_many(const struct cpumask *mask,
461 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() 461 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
462 && !oops_in_progress && !early_boot_irqs_disabled); 462 && !oops_in_progress && !early_boot_irqs_disabled);
463 463
464 /* So, what's a CPU they want? Ignoring this one. */ 464 /* Try to fastpath. So, what's a CPU they want? Ignoring this one. */
465 cpu = cpumask_first_and(mask, cpu_online_mask); 465 cpu = cpumask_first_and(mask, cpu_online_mask);
466 if (cpu == this_cpu) 466 if (cpu == this_cpu)
467 cpu = cpumask_next_and(cpu, mask, cpu_online_mask); 467 cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
@@ -483,22 +483,49 @@ void smp_call_function_many(const struct cpumask *mask,
483 483
484 data = &__get_cpu_var(cfd_data); 484 data = &__get_cpu_var(cfd_data);
485 csd_lock(&data->csd); 485 csd_lock(&data->csd);
486
487 /* This BUG_ON verifies our reuse assertions and can be removed */
486 BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask)); 488 BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
487 489
490 /*
491 * The global call function queue list add and delete are protected
492 * by a lock, but the list is traversed without any lock, relying
493 * on the rcu list add and delete to allow safe concurrent traversal.
494 * We reuse the call function data without waiting for any grace
495 * period after some other cpu removes it from the global queue.
496 * This means a cpu might find our data block as it is being
497 * filled out.
498 *
499 * We hold off the interrupt handler on the other cpu by
500 * ordering our writes to the cpu mask vs our setting of the
501 * refs counter. We assert only the cpu owning the data block
502 * will set a bit in cpumask, and each bit will only be cleared
503 * by the subject cpu. Each cpu must first find its bit is
504 * set and then check that refs is set indicating the element is
505 * ready to be processed, otherwise it must skip the entry.
506 *
507 * On the previous iteration refs was set to 0 by another cpu.
508 * To avoid the use of transitivity, set the counter to 0 here
509 * so the wmb will pair with the rmb in the interrupt handler.
510 */
511 atomic_set(&data->refs, 0); /* convert 3rd to 1st party write */
512
488 data->csd.func = func; 513 data->csd.func = func;
489 data->csd.info = info; 514 data->csd.info = info;
490 cpumask_and(data->cpumask, mask, cpu_online_mask);
491 cpumask_clear_cpu(this_cpu, data->cpumask);
492 515
493 /* 516 /* Ensure 0 refs is visible before mask. Also orders func and info */
494 * To ensure the interrupt handler gets a complete view
495 * we order the cpumask and refs writes and order the read
496 * of them in the interrupt handler. In addition we may
497 * only clear our own cpu bit from the mask.
498 */
499 smp_wmb(); 517 smp_wmb();
500 518
501 atomic_set(&data->refs, cpumask_weight(data->cpumask)); 519 /* We rely on the "and" being processed before the store */
520 cpumask_and(data->cpumask, mask, cpu_online_mask);
521 cpumask_clear_cpu(this_cpu, data->cpumask);
522 refs = cpumask_weight(data->cpumask);
523
524 /* Some callers race with other cpus changing the passed mask */
525 if (unlikely(!refs)) {
526 csd_unlock(&data->csd);
527 return;
528 }
502 529
503 raw_spin_lock_irqsave(&call_function.lock, flags); 530 raw_spin_lock_irqsave(&call_function.lock, flags);
504 /* 531 /*
@@ -507,6 +534,12 @@ void smp_call_function_many(const struct cpumask *mask,
507 * will not miss any other list entries: 534 * will not miss any other list entries:
508 */ 535 */
509 list_add_rcu(&data->csd.list, &call_function.queue); 536 list_add_rcu(&data->csd.list, &call_function.queue);
537 /*
538 * We rely on the wmb() in list_add_rcu to complete our writes
539 * to the cpumask before this write to refs, which indicates
540 * data is on the list and is ready to be processed.
541 */
542 atomic_set(&data->refs, refs);
510 raw_spin_unlock_irqrestore(&call_function.lock, flags); 543 raw_spin_unlock_irqrestore(&call_function.lock, flags);
511 544
512 /* 545 /*
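Condensed, the protocol the two sides of this patch agree on looks as follows. The producer lines are taken from the hunks above; the consumer-side smp_rmb() pairing is an assumption about the matching code in generic_smp_call_function_interrupt(), which this excerpt only shows in part:

    /* producer (smp_call_function_many), after csd_lock(&data->csd): */
    atomic_set(&data->refs, 0);          /* 1st-party write, see comment above */
    data->csd.func = func;
    data->csd.info = info;
    smp_wmb();                           /* refs==0, func, info visible first  */
    cpumask_and(data->cpumask, mask, cpu_online_mask);
    cpumask_clear_cpu(this_cpu, data->cpumask);
    list_add_rcu(&data->csd.list, &call_function.queue); /* implies a wmb      */
    atomic_set(&data->refs, refs);       /* publish: entry is ready            */

    /* consumer (generic_smp_call_function_interrupt), per entry; a sketch
     * assuming the usual smp_rmb() pairing on the read side:                  */
    if (!cpumask_test_cpu(cpu, data->cpumask))
            continue;
    smp_rmb();                           /* pairs with the producer's wmb()    */
    if (atomic_read(&data->refs) == 0)
            continue;                    /* entry caught mid-update; skip it   */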
@@ -571,6 +604,87 @@ void ipi_call_unlock_irq(void)
571} 604}
572#endif /* USE_GENERIC_SMP_HELPERS */ 605#endif /* USE_GENERIC_SMP_HELPERS */
573 606
607/* Setup configured maximum number of CPUs to activate */
608unsigned int setup_max_cpus = NR_CPUS;
609EXPORT_SYMBOL(setup_max_cpus);
610
611
612/*
613 * Setup routine for controlling SMP activation
614 *
615 * Command-line option of "nosmp" or "maxcpus=0" will disable SMP
616 * activation entirely (the MPS table probe still happens, though).
617 *
618 * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer
619 * greater than 0, limits the maximum number of CPUs activated in
620 * SMP mode to <NUM>.
621 */
622
623void __weak arch_disable_smp_support(void) { }
624
625static int __init nosmp(char *str)
626{
627 setup_max_cpus = 0;
628 arch_disable_smp_support();
629
630 return 0;
631}
632
633early_param("nosmp", nosmp);
634
635 /* this is the hard limit */
636static int __init nrcpus(char *str)
637{
638 int nr_cpus;
639
640 get_option(&str, &nr_cpus);
641 if (nr_cpus > 0 && nr_cpus < nr_cpu_ids)
642 nr_cpu_ids = nr_cpus;
643
644 return 0;
645}
646
647early_param("nr_cpus", nrcpus);
648
649static int __init maxcpus(char *str)
650{
651 get_option(&str, &setup_max_cpus);
652 if (setup_max_cpus == 0)
653 arch_disable_smp_support();
654
655 return 0;
656}
657
658early_param("maxcpus", maxcpus);
659
660/* Setup number of possible processor ids */
661int nr_cpu_ids __read_mostly = NR_CPUS;
662EXPORT_SYMBOL(nr_cpu_ids);
663
664/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */
665void __init setup_nr_cpu_ids(void)
666{
667 nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
668}
669
670/* Called by boot processor to activate the rest. */
671void __init smp_init(void)
672{
673 unsigned int cpu;
674
675 /* FIXME: This should be done in userspace --RR */
676 for_each_present_cpu(cpu) {
677 if (num_online_cpus() >= setup_max_cpus)
678 break;
679 if (!cpu_online(cpu))
680 cpu_up(cpu);
681 }
682
683 /* Any cleanup work */
684 printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus());
685 smp_cpus_done(setup_max_cpus);
686}
687
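The three early parameters moved here have different scopes: nosmp and maxcpus=0 disable SMP activation outright, maxcpus=N only limits how many CPUs smp_init() onlines at boot (the rest stay present and can be onlined through hotplug later), and nr_cpus=N caps nr_cpu_ids itself, which sizes per-cpu allocations and cannot be raised afterwards. Example command lines, as a sketch:

    maxcpus=2      # boot with 2 CPUs online; more can be onlined via hotplug
    nr_cpus=4      # hard cap: nr_cpu_ids, and per-cpu data, top out at 4
    nosmp          # uniprocessor boot; the MPS table probe still runs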
574/* 688/*
575 * Call a function on all processors. May be used during early boot while 689 * Call a function on all processors. May be used during early boot while
576 * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead 690 * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 56e5dec837f0..735d87095172 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -845,7 +845,10 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
845 switch (action) { 845 switch (action) {
846 case CPU_UP_PREPARE: 846 case CPU_UP_PREPARE:
847 case CPU_UP_PREPARE_FROZEN: 847 case CPU_UP_PREPARE_FROZEN:
848 p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); 848 p = kthread_create_on_node(run_ksoftirqd,
849 hcpu,
850 cpu_to_node(hotcpu),
851 "ksoftirqd/%d", hotcpu);
849 if (IS_ERR(p)) { 852 if (IS_ERR(p)) {
850 printk("ksoftirqd for %i failed\n", hotcpu); 853 printk("ksoftirqd for %i failed\n", hotcpu);
851 return notifier_from_errno(PTR_ERR(p)); 854 return notifier_from_errno(PTR_ERR(p));
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 2df820b03beb..e3516b29076c 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -301,8 +301,10 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
301 case CPU_UP_PREPARE: 301 case CPU_UP_PREPARE:
302 BUG_ON(stopper->thread || stopper->enabled || 302 BUG_ON(stopper->thread || stopper->enabled ||
303 !list_empty(&stopper->works)); 303 !list_empty(&stopper->works));
304 p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", 304 p = kthread_create_on_node(cpu_stopper_thread,
305 cpu); 305 stopper,
306 cpu_to_node(cpu),
307 "migration/%d", cpu);
306 if (IS_ERR(p)) 308 if (IS_ERR(p))
307 return notifier_from_errno(PTR_ERR(p)); 309 return notifier_from_errno(PTR_ERR(p));
308 get_task_struct(p); 310 get_task_struct(p);
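Both conversions above follow the same pattern: a per-cpu kthread passes the target CPU's memory node so that the thread's task_struct and stack are allocated NUMA-locally to the CPU it will serve. A sketch of the idiom for any per-cpu helper thread created from a hotplug callback; the function, data, and name are illustrative:

    struct task_struct *p;

    p = kthread_create_on_node(my_thread_fn, my_data,
                               cpu_to_node(cpu),       /* NUMA-local alloc */
                               "my_helper/%d", cpu);
    if (IS_ERR(p))
            return notifier_from_errno(PTR_ERR(p));
    kthread_bind(p, cpu);     /* pin to the cpu it was allocated near */
    wake_up_process(p);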
diff --git a/kernel/sys.c b/kernel/sys.c
index 18da702ec813..af468edf096a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -37,6 +37,7 @@
37#include <linux/ptrace.h> 37#include <linux/ptrace.h>
38#include <linux/fs_struct.h> 38#include <linux/fs_struct.h>
39#include <linux/gfp.h> 39#include <linux/gfp.h>
40#include <linux/syscore_ops.h>
40 41
41#include <linux/compat.h> 42#include <linux/compat.h>
42#include <linux/syscalls.h> 43#include <linux/syscalls.h>
@@ -119,16 +120,33 @@ EXPORT_SYMBOL(cad_pid);
119void (*pm_power_off_prepare)(void); 120void (*pm_power_off_prepare)(void);
120 121
121/* 122/*
123 * Returns true if current's euid is same as p's uid or euid,
124 * or has CAP_SYS_NICE to p's user_ns.
125 *
126 * Called with rcu_read_lock, creds are safe
127 */
128static bool set_one_prio_perm(struct task_struct *p)
129{
130 const struct cred *cred = current_cred(), *pcred = __task_cred(p);
131
132 if (pcred->user->user_ns == cred->user->user_ns &&
133 (pcred->uid == cred->euid ||
134 pcred->euid == cred->euid))
135 return true;
136 if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE))
137 return true;
138 return false;
139}
140
141/*
122 * set the priority of a task 142 * set the priority of a task
123 * - the caller must hold the RCU read lock 143 * - the caller must hold the RCU read lock
124 */ 144 */
125static int set_one_prio(struct task_struct *p, int niceval, int error) 145static int set_one_prio(struct task_struct *p, int niceval, int error)
126{ 146{
127 const struct cred *cred = current_cred(), *pcred = __task_cred(p);
128 int no_nice; 147 int no_nice;
129 148
130 if (pcred->uid != cred->euid && 149 if (!set_one_prio_perm(p)) {
131 pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) {
132 error = -EPERM; 150 error = -EPERM;
133 goto out; 151 goto out;
134 } 152 }
@@ -298,6 +316,7 @@ void kernel_restart_prepare(char *cmd)
298 system_state = SYSTEM_RESTART; 316 system_state = SYSTEM_RESTART;
299 device_shutdown(); 317 device_shutdown();
300 sysdev_shutdown(); 318 sysdev_shutdown();
319 syscore_shutdown();
301} 320}
302 321
303/** 322/**
@@ -336,6 +355,7 @@ void kernel_halt(void)
336{ 355{
337 kernel_shutdown_prepare(SYSTEM_HALT); 356 kernel_shutdown_prepare(SYSTEM_HALT);
338 sysdev_shutdown(); 357 sysdev_shutdown();
358 syscore_shutdown();
339 printk(KERN_EMERG "System halted.\n"); 359 printk(KERN_EMERG "System halted.\n");
340 kmsg_dump(KMSG_DUMP_HALT); 360 kmsg_dump(KMSG_DUMP_HALT);
341 machine_halt(); 361 machine_halt();
@@ -355,6 +375,7 @@ void kernel_power_off(void)
355 pm_power_off_prepare(); 375 pm_power_off_prepare();
356 disable_nonboot_cpus(); 376 disable_nonboot_cpus();
357 sysdev_shutdown(); 377 sysdev_shutdown();
378 syscore_shutdown();
358 printk(KERN_EMERG "Power down.\n"); 379 printk(KERN_EMERG "Power down.\n");
359 kmsg_dump(KMSG_DUMP_POWEROFF); 380 kmsg_dump(KMSG_DUMP_POWEROFF);
360 machine_power_off(); 381 machine_power_off();
@@ -502,7 +523,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
502 if (rgid != (gid_t) -1) { 523 if (rgid != (gid_t) -1) {
503 if (old->gid == rgid || 524 if (old->gid == rgid ||
504 old->egid == rgid || 525 old->egid == rgid ||
505 capable(CAP_SETGID)) 526 nsown_capable(CAP_SETGID))
506 new->gid = rgid; 527 new->gid = rgid;
507 else 528 else
508 goto error; 529 goto error;
@@ -511,7 +532,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
511 if (old->gid == egid || 532 if (old->gid == egid ||
512 old->egid == egid || 533 old->egid == egid ||
513 old->sgid == egid || 534 old->sgid == egid ||
514 capable(CAP_SETGID)) 535 nsown_capable(CAP_SETGID))
515 new->egid = egid; 536 new->egid = egid;
516 else 537 else
517 goto error; 538 goto error;
@@ -546,7 +567,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
546 old = current_cred(); 567 old = current_cred();
547 568
548 retval = -EPERM; 569 retval = -EPERM;
549 if (capable(CAP_SETGID)) 570 if (nsown_capable(CAP_SETGID))
550 new->gid = new->egid = new->sgid = new->fsgid = gid; 571 new->gid = new->egid = new->sgid = new->fsgid = gid;
551 else if (gid == old->gid || gid == old->sgid) 572 else if (gid == old->gid || gid == old->sgid)
552 new->egid = new->fsgid = gid; 573 new->egid = new->fsgid = gid;
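The capable()-to-nsown_capable() conversions throughout this file all target the caller's own user namespace. nsown_capable() itself is introduced elsewhere in this series (see the kernel/capability.c changes in the diffstat); a sketch of its expected shape, assuming it is the obvious wrapper:

    /* sketch: capability toward the user namespace current belongs to */
    bool nsown_capable(int cap)
    {
            return ns_capable(current_user_ns(), cap);
    }

The effect is that a namespace-root process can, for example, call setgid() within its own user namespace without holding CAP_SETGID in the initial namespace.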
@@ -613,7 +634,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
613 new->uid = ruid; 634 new->uid = ruid;
614 if (old->uid != ruid && 635 if (old->uid != ruid &&
615 old->euid != ruid && 636 old->euid != ruid &&
616 !capable(CAP_SETUID)) 637 !nsown_capable(CAP_SETUID))
617 goto error; 638 goto error;
618 } 639 }
619 640
@@ -622,7 +643,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
622 if (old->uid != euid && 643 if (old->uid != euid &&
623 old->euid != euid && 644 old->euid != euid &&
624 old->suid != euid && 645 old->suid != euid &&
625 !capable(CAP_SETUID)) 646 !nsown_capable(CAP_SETUID))
626 goto error; 647 goto error;
627 } 648 }
628 649
@@ -670,7 +691,7 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
670 old = current_cred(); 691 old = current_cred();
671 692
672 retval = -EPERM; 693 retval = -EPERM;
673 if (capable(CAP_SETUID)) { 694 if (nsown_capable(CAP_SETUID)) {
674 new->suid = new->uid = uid; 695 new->suid = new->uid = uid;
675 if (uid != old->uid) { 696 if (uid != old->uid) {
676 retval = set_user(new); 697 retval = set_user(new);
@@ -712,7 +733,7 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
712 old = current_cred(); 733 old = current_cred();
713 734
714 retval = -EPERM; 735 retval = -EPERM;
715 if (!capable(CAP_SETUID)) { 736 if (!nsown_capable(CAP_SETUID)) {
716 if (ruid != (uid_t) -1 && ruid != old->uid && 737 if (ruid != (uid_t) -1 && ruid != old->uid &&
717 ruid != old->euid && ruid != old->suid) 738 ruid != old->euid && ruid != old->suid)
718 goto error; 739 goto error;
@@ -776,7 +797,7 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
776 old = current_cred(); 797 old = current_cred();
777 798
778 retval = -EPERM; 799 retval = -EPERM;
779 if (!capable(CAP_SETGID)) { 800 if (!nsown_capable(CAP_SETGID)) {
780 if (rgid != (gid_t) -1 && rgid != old->gid && 801 if (rgid != (gid_t) -1 && rgid != old->gid &&
781 rgid != old->egid && rgid != old->sgid) 802 rgid != old->egid && rgid != old->sgid)
782 goto error; 803 goto error;
@@ -836,7 +857,7 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
836 857
837 if (uid == old->uid || uid == old->euid || 858 if (uid == old->uid || uid == old->euid ||
838 uid == old->suid || uid == old->fsuid || 859 uid == old->suid || uid == old->fsuid ||
839 capable(CAP_SETUID)) { 860 nsown_capable(CAP_SETUID)) {
840 if (uid != old_fsuid) { 861 if (uid != old_fsuid) {
841 new->fsuid = uid; 862 new->fsuid = uid;
842 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) 863 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
@@ -869,7 +890,7 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
869 890
870 if (gid == old->gid || gid == old->egid || 891 if (gid == old->gid || gid == old->egid ||
871 gid == old->sgid || gid == old->fsgid || 892 gid == old->sgid || gid == old->fsgid ||
872 capable(CAP_SETGID)) { 893 nsown_capable(CAP_SETGID)) {
873 if (gid != old_fsgid) { 894 if (gid != old_fsgid) {
874 new->fsgid = gid; 895 new->fsgid = gid;
875 goto change_okay; 896 goto change_okay;
@@ -1177,8 +1198,9 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
1177 int errno; 1198 int errno;
1178 char tmp[__NEW_UTS_LEN]; 1199 char tmp[__NEW_UTS_LEN];
1179 1200
1180 if (!capable(CAP_SYS_ADMIN)) 1201 if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
1181 return -EPERM; 1202 return -EPERM;
1203
1182 if (len < 0 || len > __NEW_UTS_LEN) 1204 if (len < 0 || len > __NEW_UTS_LEN)
1183 return -EINVAL; 1205 return -EINVAL;
1184 down_write(&uts_sem); 1206 down_write(&uts_sem);
@@ -1226,7 +1248,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
1226 int errno; 1248 int errno;
1227 char tmp[__NEW_UTS_LEN]; 1249 char tmp[__NEW_UTS_LEN];
1228 1250
1229 if (!capable(CAP_SYS_ADMIN)) 1251 if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
1230 return -EPERM; 1252 return -EPERM;
1231 if (len < 0 || len > __NEW_UTS_LEN) 1253 if (len < 0 || len > __NEW_UTS_LEN)
1232 return -EINVAL; 1254 return -EINVAL;
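sethostname() and setdomainname() now check CAP_SYS_ADMIN against the user namespace that owns the UTS namespace (the ownership link is established by the utsname.c change further down), so the owner of a private UTS namespace can rename it without global root. A hypothetical user-space sketch; creating the namespace is itself still a privileged operation at this point in the series:

    #define _GNU_SOURCE
    #include <sched.h>
    #include <unistd.h>

    if (unshare(CLONE_NEWUTS) == 0)
            sethostname("sandbox", 7);   /* visible only inside this ns */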
@@ -1341,6 +1363,8 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,
1341 rlim = tsk->signal->rlim + resource; 1363 rlim = tsk->signal->rlim + resource;
1342 task_lock(tsk->group_leader); 1364 task_lock(tsk->group_leader);
1343 if (new_rlim) { 1365 if (new_rlim) {
1366 /* Keep the capable check against init_user_ns until
1367 cgroups can contain all limits */
1344 if (new_rlim->rlim_max > rlim->rlim_max && 1368 if (new_rlim->rlim_max > rlim->rlim_max &&
1345 !capable(CAP_SYS_RESOURCE)) 1369 !capable(CAP_SYS_RESOURCE))
1346 retval = -EPERM; 1370 retval = -EPERM;
@@ -1384,19 +1408,22 @@ static int check_prlimit_permission(struct task_struct *task)
1384{ 1408{
1385 const struct cred *cred = current_cred(), *tcred; 1409 const struct cred *cred = current_cred(), *tcred;
1386 1410
1387 tcred = __task_cred(task); 1411 if (current == task)
1388 if (current != task && 1412 return 0;
1389 (cred->uid != tcred->euid ||
1390 cred->uid != tcred->suid ||
1391 cred->uid != tcred->uid ||
1392 cred->gid != tcred->egid ||
1393 cred->gid != tcred->sgid ||
1394 cred->gid != tcred->gid) &&
1395 !capable(CAP_SYS_RESOURCE)) {
1396 return -EPERM;
1397 }
1398 1413
1399 return 0; 1414 tcred = __task_cred(task);
1415 if (cred->user->user_ns == tcred->user->user_ns &&
1416 (cred->uid == tcred->euid &&
1417 cred->uid == tcred->suid &&
1418 cred->uid == tcred->uid &&
1419 cred->gid == tcred->egid &&
1420 cred->gid == tcred->sgid &&
1421 cred->gid == tcred->gid))
1422 return 0;
1423 if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE))
1424 return 0;
1425
1426 return -EPERM;
1400} 1427}
1401 1428
1402SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, 1429SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
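The uid/gid matching itself is unchanged by the rewrite (the old || chain of mismatches becomes an && chain of matches), but two things are new: both tasks must live in the same user namespace for the credential match to count, and the capability fallback is checked against the target's namespace rather than globally. From user space the syscall is reached through the glibc prlimit() wrapper; a hypothetical read-only query, with pid assumed to hold the target task's id:

    #include <stdio.h>
    #include <sys/resource.h>

    struct rlimit old;
    /* needs fully matching creds in the same user_ns,
     * or CAP_SYS_RESOURCE toward the target's user_ns */
    if (prlimit(pid, RLIMIT_NOFILE, NULL, &old) == 0)
            printf("nofile: %lu\n", (unsigned long)old.rlim_cur);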
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 51054fea5d99..c0bb32414b17 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -117,6 +117,7 @@ static int neg_one = -1;
117static int zero; 117static int zero;
118static int __maybe_unused one = 1; 118static int __maybe_unused one = 1;
119static int __maybe_unused two = 2; 119static int __maybe_unused two = 2;
120static int __maybe_unused three = 3;
120static unsigned long one_ul = 1; 121static unsigned long one_ul = 1;
121static int one_hundred = 100; 122static int one_hundred = 100;
122#ifdef CONFIG_PRINTK 123#ifdef CONFIG_PRINTK
@@ -169,6 +170,11 @@ static int proc_taint(struct ctl_table *table, int write,
169 void __user *buffer, size_t *lenp, loff_t *ppos); 170 void __user *buffer, size_t *lenp, loff_t *ppos);
170#endif 171#endif
171 172
173#ifdef CONFIG_PRINTK
174static int proc_dmesg_restrict(struct ctl_table *table, int write,
175 void __user *buffer, size_t *lenp, loff_t *ppos);
176#endif
177
172#ifdef CONFIG_MAGIC_SYSRQ 178#ifdef CONFIG_MAGIC_SYSRQ
173/* Note: sysrq code uses its own private copy */ 179/* Note: sysrq code uses its own private copy */
174static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; 180static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
@@ -706,7 +712,7 @@ static struct ctl_table kern_table[] = {
706 .data = &kptr_restrict, 712 .data = &kptr_restrict,
707 .maxlen = sizeof(int), 713 .maxlen = sizeof(int),
708 .mode = 0644, 714 .mode = 0644,
709 .proc_handler = proc_dointvec_minmax, 715 .proc_handler = proc_dmesg_restrict,
710 .extra1 = &zero, 716 .extra1 = &zero,
711 .extra2 = &two, 717 .extra2 = &two,
712 }, 718 },
@@ -971,14 +977,18 @@ static struct ctl_table vm_table[] = {
971 .data = &sysctl_overcommit_memory, 977 .data = &sysctl_overcommit_memory,
972 .maxlen = sizeof(sysctl_overcommit_memory), 978 .maxlen = sizeof(sysctl_overcommit_memory),
973 .mode = 0644, 979 .mode = 0644,
974 .proc_handler = proc_dointvec, 980 .proc_handler = proc_dointvec_minmax,
981 .extra1 = &zero,
982 .extra2 = &two,
975 }, 983 },
976 { 984 {
977 .procname = "panic_on_oom", 985 .procname = "panic_on_oom",
978 .data = &sysctl_panic_on_oom, 986 .data = &sysctl_panic_on_oom,
979 .maxlen = sizeof(sysctl_panic_on_oom), 987 .maxlen = sizeof(sysctl_panic_on_oom),
980 .mode = 0644, 988 .mode = 0644,
981 .proc_handler = proc_dointvec, 989 .proc_handler = proc_dointvec_minmax,
990 .extra1 = &zero,
991 .extra2 = &two,
982 }, 992 },
983 { 993 {
984 .procname = "oom_kill_allocating_task", 994 .procname = "oom_kill_allocating_task",
@@ -1006,7 +1016,8 @@ static struct ctl_table vm_table[] = {
1006 .data = &page_cluster, 1016 .data = &page_cluster,
1007 .maxlen = sizeof(int), 1017 .maxlen = sizeof(int),
1008 .mode = 0644, 1018 .mode = 0644,
1009 .proc_handler = proc_dointvec, 1019 .proc_handler = proc_dointvec_minmax,
1020 .extra1 = &zero,
1010 }, 1021 },
1011 { 1022 {
1012 .procname = "dirty_background_ratio", 1023 .procname = "dirty_background_ratio",
@@ -1054,7 +1065,8 @@ static struct ctl_table vm_table[] = {
1054 .data = &dirty_expire_interval, 1065 .data = &dirty_expire_interval,
1055 .maxlen = sizeof(dirty_expire_interval), 1066 .maxlen = sizeof(dirty_expire_interval),
1056 .mode = 0644, 1067 .mode = 0644,
1057 .proc_handler = proc_dointvec, 1068 .proc_handler = proc_dointvec_minmax,
1069 .extra1 = &zero,
1058 }, 1070 },
1059 { 1071 {
1060 .procname = "nr_pdflush_threads", 1072 .procname = "nr_pdflush_threads",
@@ -1130,6 +1142,8 @@ static struct ctl_table vm_table[] = {
1130 .maxlen = sizeof(int), 1142 .maxlen = sizeof(int),
1131 .mode = 0644, 1143 .mode = 0644,
1132 .proc_handler = drop_caches_sysctl_handler, 1144 .proc_handler = drop_caches_sysctl_handler,
1145 .extra1 = &one,
1146 .extra2 = &three,
1133 }, 1147 },
1134#ifdef CONFIG_COMPACTION 1148#ifdef CONFIG_COMPACTION
1135 { 1149 {
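Every vm_table conversion above follows one pattern: proc_dointvec, which stores whatever it is given, becomes proc_dointvec_minmax with explicit bounds in extra1/extra2, so an out-of-range write now fails with -EINVAL instead of silently landing in the variable. An illustrative entry (procname and backing variable hypothetical) showing a two-sided bound; leaving .extra2 NULL, as for page_cluster and dirty_expire_interval above, bounds from below only:

    static int example_mode;

    {
            .procname       = "example_mode",
            .data           = &example_mode,
            .maxlen         = sizeof(int),
            .mode           = 0644,
            .proc_handler   = proc_dointvec_minmax,
            .extra1         = &zero,    /* writes below 0 get -EINVAL */
            .extra2         = &two,     /* writes above 2 get -EINVAL */
    },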
@@ -1683,13 +1697,8 @@ static int test_perm(int mode, int op)
1683 1697
1684int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) 1698int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
1685{ 1699{
1686 int error;
1687 int mode; 1700 int mode;
1688 1701
1689 error = security_sysctl(table, op & (MAY_READ | MAY_WRITE | MAY_EXEC));
1690 if (error)
1691 return error;
1692
1693 if (root->permissions) 1702 if (root->permissions)
1694 mode = root->permissions(root, current->nsproxy, table); 1703 mode = root->permissions(root, current->nsproxy, table);
1695 else 1704 else
@@ -2390,6 +2399,17 @@ static int proc_taint(struct ctl_table *table, int write,
2390 return err; 2399 return err;
2391} 2400}
2392 2401
2402#ifdef CONFIG_PRINTK
2403static int proc_dmesg_restrict(struct ctl_table *table, int write,
2404 void __user *buffer, size_t *lenp, loff_t *ppos)
2405{
2406 if (write && !capable(CAP_SYS_ADMIN))
2407 return -EPERM;
2408
2409 return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
2410}
2411#endif
2412
2393struct do_proc_dointvec_minmax_conv_param { 2413struct do_proc_dointvec_minmax_conv_param {
2394 int *min; 2414 int *min;
2395 int *max; 2415 int *max;
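proc_dmesg_restrict() gates writes on CAP_SYS_ADMIN before delegating to proc_dointvec_minmax(), so kptr_restrict can no longer be relaxed by a root process that has dropped that capability, even though uid 0 and the 0644 file mode still let it open the file. A hypothetical demonstration from such a capability-dropped root process:

    #include <errno.h>
    #include <fcntl.h>
    #include <unistd.h>

    int fd = open("/proc/sys/kernel/kptr_restrict", O_WRONLY);
    /* open() succeeds on uid 0; the handler itself now rejects
     * the write when CAP_SYS_ADMIN is missing */
    if (fd >= 0 && write(fd, "0\n", 2) < 0 && errno == EPERM)
            ;  /* gated by capability, not by file mode */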
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index 10b90d8a03c4..4e4932a7b360 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -111,11 +111,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
111 const char *fail = NULL; 111 const char *fail = NULL;
112 112
113 if (table->parent) { 113 if (table->parent) {
114 if (table->procname && !table->parent->procname) 114 if (!table->parent->procname)
115 set_fail(&fail, table, "Parent without procname"); 115 set_fail(&fail, table, "Parent without procname");
116 } 116 }
117 if (!table->procname)
118 set_fail(&fail, table, "No procname");
119 if (table->child) { 117 if (table->child) {
120 if (table->data) 118 if (table->data)
121 set_fail(&fail, table, "Directory with data?"); 119 set_fail(&fail, table, "Directory with data?");
@@ -144,13 +142,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
144 set_fail(&fail, table, "No maxlen"); 142 set_fail(&fail, table, "No maxlen");
145 } 143 }
146#ifdef CONFIG_PROC_SYSCTL 144#ifdef CONFIG_PROC_SYSCTL
147 if (table->procname && !table->proc_handler) 145 if (!table->proc_handler)
148 set_fail(&fail, table, "No proc_handler"); 146 set_fail(&fail, table, "No proc_handler");
149#endif 147#endif
150#if 0
151 if (!table->procname && table->proc_handler)
152 set_fail(&fail, table, "proc_handler without procname");
153#endif
154 sysctl_check_leaf(namespaces, table, &fail); 148 sysctl_check_leaf(namespaces, table, &fail);
155 } 149 }
156 if (table->mode > 0777) 150 if (table->mode > 0777)
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 3971c6b9d58d..9ffea360a778 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -685,7 +685,7 @@ static int __init taskstats_init(void)
685 goto err_cgroup_ops; 685 goto err_cgroup_ops;
686 686
687 family_registered = 1; 687 family_registered = 1;
688 printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); 688 pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
689 return 0; 689 return 0;
690err_cgroup_ops: 690err_cgroup_ops:
691 genl_unregister_ops(&family, &taskstats_ops); 691 genl_unregister_ops(&family, &taskstats_ops);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 14674dce77a6..61d7d59f4a1a 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -275,7 +275,7 @@ config PROFILE_ANNOTATED_BRANCHES
275 This tracer profiles all the likely and unlikely macros 275 This tracer profiles all the likely and unlikely macros
276 in the kernel. It will display the results in: 276 in the kernel. It will display the results in:
277 277
278 /sys/kernel/debug/tracing/profile_annotated_branch 278 /sys/kernel/debug/tracing/trace_stat/branch_annotated
279 279
280 Note: this will add a significant overhead; only turn this 280 Note: this will add a significant overhead; only turn this
281 on if you need to profile the system's use of these macros. 281 on if you need to profile the system's use of these macros.
@@ -288,7 +288,7 @@ config PROFILE_ALL_BRANCHES
288 taken in the kernel is recorded whether it hit or miss. 288 taken in the kernel is recorded whether it hit or miss.
289 The results will be displayed in: 289 The results will be displayed in:
290 290
291 /sys/kernel/debug/tracing/profile_branch 291 /sys/kernel/debug/tracing/trace_stat/branch_all
292 292
293 This option also enables the likely/unlikely profiler. 293 This option also enables the likely/unlikely profiler.
294 294
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index cbafed7d4f38..7aa40f8e182d 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -703,28 +703,21 @@ void blk_trace_shutdown(struct request_queue *q)
703 * 703 *
704 **/ 704 **/
705static void blk_add_trace_rq(struct request_queue *q, struct request *rq, 705static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
706 u32 what) 706 u32 what)
707{ 707{
708 struct blk_trace *bt = q->blk_trace; 708 struct blk_trace *bt = q->blk_trace;
709 int rw = rq->cmd_flags & 0x03;
710 709
711 if (likely(!bt)) 710 if (likely(!bt))
712 return; 711 return;
713 712
714 if (rq->cmd_flags & REQ_DISCARD)
715 rw |= REQ_DISCARD;
716
717 if (rq->cmd_flags & REQ_SECURE)
718 rw |= REQ_SECURE;
719
720 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { 713 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
721 what |= BLK_TC_ACT(BLK_TC_PC); 714 what |= BLK_TC_ACT(BLK_TC_PC);
722 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, 715 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rq->cmd_flags,
723 what, rq->errors, rq->cmd_len, rq->cmd); 716 what, rq->errors, rq->cmd_len, rq->cmd);
724 } else { 717 } else {
725 what |= BLK_TC_ACT(BLK_TC_FS); 718 what |= BLK_TC_ACT(BLK_TC_FS);
726 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw, 719 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
727 what, rq->errors, 0, NULL); 720 rq->cmd_flags, what, rq->errors, 0, NULL);
728 } 721 }
729} 722}
730 723
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 888b611897d3..c075f4ea6b94 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1467,7 +1467,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1467 return t_hash_next(m, pos); 1467 return t_hash_next(m, pos);
1468 1468
1469 (*pos)++; 1469 (*pos)++;
1470 iter->pos = *pos; 1470 iter->pos = iter->func_pos = *pos;
1471 1471
1472 if (iter->flags & FTRACE_ITER_PRINTALL) 1472 if (iter->flags & FTRACE_ITER_PRINTALL)
1473 return t_hash_start(m, pos); 1473 return t_hash_start(m, pos);
@@ -1502,7 +1502,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
1502 if (!rec) 1502 if (!rec)
1503 return t_hash_start(m, pos); 1503 return t_hash_start(m, pos);
1504 1504
1505 iter->func_pos = *pos;
1506 iter->func = rec; 1505 iter->func = rec;
1507 1506
1508 return iter; 1507 return iter;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index db7b439d23ee..d9c8bcafb120 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -668,7 +668,7 @@ static struct list_head *rb_list_head(struct list_head *list)
668 * the reader page). But if the next page is a header page, 668 * the reader page). But if the next page is a header page,
669 * its flags will be non zero. 669 * its flags will be non zero.
670 */ 670 */
671static int inline 671static inline int
672rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer, 672rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
673 struct buffer_page *page, struct list_head *list) 673 struct buffer_page *page, struct list_head *list)
674{ 674{
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 3249b4f77ef0..8008ddcfbf20 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -391,8 +391,8 @@ static int process_ops(struct filter_pred *preds,
391 struct filter_pred *op, void *rec) 391 struct filter_pred *op, void *rec)
392{ 392{
393 struct filter_pred *pred; 393 struct filter_pred *pred;
394 int match = 0;
394 int type; 395 int type;
395 int match;
396 int i; 396 int i;
397 397
398 /* 398 /*
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 419209893d87..51c6e89e8619 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -189,7 +189,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
189 struct group_info *group_info; 189 struct group_info *group_info;
190 int retval; 190 int retval;
191 191
192 if (!capable(CAP_SETGID)) 192 if (!nsown_capable(CAP_SETGID))
193 return -EPERM; 193 return -EPERM;
194 if ((unsigned)gidsetsize > NGROUPS_MAX) 194 if ((unsigned)gidsetsize > NGROUPS_MAX)
195 return -EINVAL; 195 return -EINVAL;
diff --git a/kernel/user.c b/kernel/user.c
index 5c598ca781df..9e03e9c1df8d 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -17,9 +17,13 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19 19
20/*
21 * userns count is 1 for root user, 1 for init_uts_ns,
22 * and 1 for... ?
23 */
20struct user_namespace init_user_ns = { 24struct user_namespace init_user_ns = {
21 .kref = { 25 .kref = {
22 .refcount = ATOMIC_INIT(2), 26 .refcount = ATOMIC_INIT(3),
23 }, 27 },
24 .creator = &root_user, 28 .creator = &root_user,
25}; 29};
@@ -47,7 +51,7 @@ static struct kmem_cache *uid_cachep;
47 */ 51 */
48static DEFINE_SPINLOCK(uidhash_lock); 52static DEFINE_SPINLOCK(uidhash_lock);
49 53
50/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->creator */ 54/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */
51struct user_struct root_user = { 55struct user_struct root_user = {
52 .__count = ATOMIC_INIT(2), 56 .__count = ATOMIC_INIT(2),
53 .processes = ATOMIC_INIT(1), 57 .processes = ATOMIC_INIT(1),
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 8a82b4b8ea52..44646179eaba 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -14,6 +14,7 @@
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/err.h> 15#include <linux/err.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/user_namespace.h>
17 18
18static struct uts_namespace *create_uts_ns(void) 19static struct uts_namespace *create_uts_ns(void)
19{ 20{
@@ -30,7 +31,8 @@ static struct uts_namespace *create_uts_ns(void)
30 * @old_ns: namespace to clone 31 * @old_ns: namespace to clone
31 * Return NULL on error (failure to kmalloc), new ns otherwise 32 * Return NULL on error (failure to kmalloc), new ns otherwise
32 */ 33 */
33static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) 34static struct uts_namespace *clone_uts_ns(struct task_struct *tsk,
35 struct uts_namespace *old_ns)
34{ 36{
35 struct uts_namespace *ns; 37 struct uts_namespace *ns;
36 38
@@ -40,6 +42,7 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
40 42
41 down_read(&uts_sem); 43 down_read(&uts_sem);
42 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 44 memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
45 ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns);
43 up_read(&uts_sem); 46 up_read(&uts_sem);
44 return ns; 47 return ns;
45} 48}
@@ -50,8 +53,10 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
50 * utsname of this process won't be seen by parent, and vice 53 * utsname of this process won't be seen by parent, and vice
51 * versa. 54 * versa.
52 */ 55 */
53struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *old_ns) 56struct uts_namespace *copy_utsname(unsigned long flags,
57 struct task_struct *tsk)
54{ 58{
59 struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
55 struct uts_namespace *new_ns; 60 struct uts_namespace *new_ns;
56 61
57 BUG_ON(!old_ns); 62 BUG_ON(!old_ns);
@@ -60,7 +65,7 @@ struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *ol
60 if (!(flags & CLONE_NEWUTS)) 65 if (!(flags & CLONE_NEWUTS))
61 return old_ns; 66 return old_ns;
62 67
63 new_ns = clone_uts_ns(old_ns); 68 new_ns = clone_uts_ns(tsk, old_ns);
64 69
65 put_uts_ns(old_ns); 70 put_uts_ns(old_ns);
66 return new_ns; 71 return new_ns;
@@ -71,5 +76,6 @@ void free_uts_ns(struct kref *kref)
71 struct uts_namespace *ns; 76 struct uts_namespace *ns;
72 77
73 ns = container_of(kref, struct uts_namespace, kref); 78 ns = container_of(kref, struct uts_namespace, kref);
79 put_user_ns(ns->user_ns);
74 kfree(ns); 80 kfree(ns);
75} 81}
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 18bb15776c57..140dce750450 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -48,12 +48,15 @@ static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
48 * Should we panic when a soft-lockup or hard-lockup occurs: 48 * Should we panic when a soft-lockup or hard-lockup occurs:
49 */ 49 */
50#ifdef CONFIG_HARDLOCKUP_DETECTOR 50#ifdef CONFIG_HARDLOCKUP_DETECTOR
51static int hardlockup_panic; 51static int hardlockup_panic =
52 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
52 53
53static int __init hardlockup_panic_setup(char *str) 54static int __init hardlockup_panic_setup(char *str)
54{ 55{
55 if (!strncmp(str, "panic", 5)) 56 if (!strncmp(str, "panic", 5))
56 hardlockup_panic = 1; 57 hardlockup_panic = 1;
58 else if (!strncmp(str, "nopanic", 7))
59 hardlockup_panic = 0;
57 else if (!strncmp(str, "0", 1)) 60 else if (!strncmp(str, "0", 1))
58 watchdog_enabled = 0; 61 watchdog_enabled = 0;
59 return 1; 62 return 1;
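This setup handler is bound (outside the shown hunk) to the nmi_watchdog= boot parameter; with the new Kconfig-driven default, the nopanic keyword gives a boot-time override in both directions. Example settings, assuming that binding:

    nmi_watchdog=panic     # panic the machine on a hard lockup
    nmi_watchdog=nopanic   # override CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y
    nmi_watchdog=0         # disable the watchdog entirely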
@@ -415,19 +418,22 @@ static int watchdog_prepare_cpu(int cpu)
415static int watchdog_enable(int cpu) 418static int watchdog_enable(int cpu)
416{ 419{
417 struct task_struct *p = per_cpu(softlockup_watchdog, cpu); 420 struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
418 int err; 421 int err = 0;
419 422
420 /* enable the perf event */ 423 /* enable the perf event */
421 err = watchdog_nmi_enable(cpu); 424 err = watchdog_nmi_enable(cpu);
422 if (err) 425
423 return err; 426 /* Regardless of err above, fall through and start softlockup */
424 427
425 /* create the watchdog thread */ 428 /* create the watchdog thread */
426 if (!p) { 429 if (!p) {
427 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); 430 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
428 if (IS_ERR(p)) { 431 if (IS_ERR(p)) {
429 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); 432 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
430 return PTR_ERR(p); 433 if (!err)
434 /* if hardlockup hasn't already set this */
435 err = PTR_ERR(p);
436 goto out;
431 } 437 }
432 kthread_bind(p, cpu); 438 kthread_bind(p, cpu);
433 per_cpu(watchdog_touch_ts, cpu) = 0; 439 per_cpu(watchdog_touch_ts, cpu) = 0;
@@ -435,7 +441,8 @@ static int watchdog_enable(int cpu)
435 wake_up_process(p); 441 wake_up_process(p);
436 } 442 }
437 443
438 return 0; 444out:
445 return err;
439} 446}
440 447
441static void watchdog_disable(int cpu) 448static void watchdog_disable(int cpu)
@@ -547,7 +554,13 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
547 break; 554 break;
548#endif /* CONFIG_HOTPLUG_CPU */ 555#endif /* CONFIG_HOTPLUG_CPU */
549 } 556 }
550 return notifier_from_errno(err); 557
558 /*
559 * hardlockup and softlockup are not important enough
560 * to block cpu bring up. Just always succeed and
561 * rely on printk output to flag problems.
562 */
563 return NOTIFY_OK;
551} 564}
552 565
553static struct notifier_block __cpuinitdata cpu_nfb = { 566static struct notifier_block __cpuinitdata cpu_nfb = {
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b5fe4c00eb3c..04ef830690ec 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -251,10 +251,12 @@ struct workqueue_struct *system_wq __read_mostly;
251struct workqueue_struct *system_long_wq __read_mostly; 251struct workqueue_struct *system_long_wq __read_mostly;
252struct workqueue_struct *system_nrt_wq __read_mostly; 252struct workqueue_struct *system_nrt_wq __read_mostly;
253struct workqueue_struct *system_unbound_wq __read_mostly; 253struct workqueue_struct *system_unbound_wq __read_mostly;
254struct workqueue_struct *system_freezable_wq __read_mostly;
254EXPORT_SYMBOL_GPL(system_wq); 255EXPORT_SYMBOL_GPL(system_wq);
255EXPORT_SYMBOL_GPL(system_long_wq); 256EXPORT_SYMBOL_GPL(system_long_wq);
256EXPORT_SYMBOL_GPL(system_nrt_wq); 257EXPORT_SYMBOL_GPL(system_nrt_wq);
257EXPORT_SYMBOL_GPL(system_unbound_wq); 258EXPORT_SYMBOL_GPL(system_unbound_wq);
259EXPORT_SYMBOL_GPL(system_freezable_wq);
258 260
259#define CREATE_TRACE_POINTS 261#define CREATE_TRACE_POINTS
260#include <trace/events/workqueue.h> 262#include <trace/events/workqueue.h>
@@ -1364,8 +1366,10 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
1364 worker->id = id; 1366 worker->id = id;
1365 1367
1366 if (!on_unbound_cpu) 1368 if (!on_unbound_cpu)
1367 worker->task = kthread_create(worker_thread, worker, 1369 worker->task = kthread_create_on_node(worker_thread,
1368 "kworker/%u:%d", gcwq->cpu, id); 1370 worker,
1371 cpu_to_node(gcwq->cpu),
1372 "kworker/%u:%d", gcwq->cpu, id);
1369 else 1373 else
1370 worker->task = kthread_create(worker_thread, worker, 1374 worker->task = kthread_create(worker_thread, worker,
1371 "kworker/u:%d", id); 1375 "kworker/u:%d", id);
@@ -3781,8 +3785,10 @@ static int __init init_workqueues(void)
3781 system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); 3785 system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
3782 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, 3786 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
3783 WQ_UNBOUND_MAX_ACTIVE); 3787 WQ_UNBOUND_MAX_ACTIVE);
3788 system_freezable_wq = alloc_workqueue("events_freezable",
3789 WQ_FREEZABLE, 0);
3784 BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || 3790 BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq ||
3785 !system_unbound_wq); 3791 !system_unbound_wq || !system_freezable_wq);
3786 return 0; 3792 return 0;
3787} 3793}
3788early_initcall(init_workqueues); 3794early_initcall(init_workqueues);
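system_freezable_wq gives drivers a shared queue whose workers are frozen together with user space during suspend and hibernate, so queued work cannot run against a device that has already been powered down. A usage sketch with a hypothetical work item:

    static void my_resume_safe_fn(struct work_struct *work)
    {
            /* runs only while the system is not frozen */
    }
    static DECLARE_WORK(my_work, my_resume_safe_fn);

    queue_work(system_freezable_wq, &my_work);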