path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.locks | 4
-rw-r--r--  kernel/Kconfig.preempt | 1
-rw-r--r--  kernel/Makefile | 1
-rw-r--r--  kernel/audit.c | 2
-rw-r--r--  kernel/cgroup.c | 29
-rw-r--r--  kernel/compat.c | 68
-rw-r--r--  kernel/cpuset.c | 74
-rw-r--r--  kernel/cred.c | 3
-rw-r--r--  kernel/debug/debug_core.c | 87
-rw-r--r--  kernel/debug/gdbstub.c | 10
-rw-r--r--  kernel/debug/kdb/kdb_bp.c | 7
-rw-r--r--  kernel/debug/kdb/kdb_bt.c | 1
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 4
-rw-r--r--  kernel/debug/kdb/kdb_keyboard.c | 95
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 3
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 7
-rw-r--r--  kernel/debug/kdb/kdb_support.c | 4
-rw-r--r--  kernel/dma.c | 1
-rw-r--r--  kernel/events/core.c | 11
-rw-r--r--  kernel/exit.c | 47
-rw-r--r--  kernel/fork.c | 28
-rw-r--r--  kernel/freezer.c | 6
-rw-r--r--  kernel/futex.c | 38
-rw-r--r--  kernel/futex_compat.c | 38
-rw-r--r--  kernel/irq/Kconfig | 15
-rw-r--r--  kernel/irq/handle.c | 16
-rw-r--r--  kernel/irq/irqdomain.c | 815
-rw-r--r--  kernel/irq/manage.c | 19
-rw-r--r--  kernel/irq/migration.c | 10
-rw-r--r--  kernel/irq_work.c | 2
-rw-r--r--  kernel/itimer.c | 8
-rw-r--r--  kernel/kexec.c | 15
-rw-r--r--  kernel/kmod.c | 201
-rw-r--r--  kernel/module.c | 37
-rw-r--r--  kernel/padata.c | 57
-rw-r--r--  kernel/panic.c | 2
-rw-r--r--  kernel/params.c | 40
-rw-r--r--  kernel/pid_namespace.c | 41
-rw-r--r--  kernel/power/Makefile | 3
-rw-r--r--  kernel/power/hibernate.c | 59
-rw-r--r--  kernel/power/main.c | 20
-rw-r--r--  kernel/power/power.h | 7
-rw-r--r--  kernel/power/process.c | 32
-rw-r--r--  kernel/power/qos.c | 73
-rw-r--r--  kernel/power/snapshot.c | 35
-rw-r--r--  kernel/power/suspend.c | 91
-rw-r--r--  kernel/power/user.c | 22
-rw-r--r--  kernel/ptrace.c | 66
-rw-r--r--  kernel/resource.c | 3
-rw-r--r--  kernel/rwsem.c | 1
-rw-r--r--  kernel/sched/core.c | 73
-rw-r--r--  kernel/sched/fair.c | 16
-rw-r--r--  kernel/sched/rt.c | 2
-rw-r--r--  kernel/sched/sched.h | 3
-rw-r--r--  kernel/signal.c | 19
-rw-r--r--  kernel/smp.c | 90
-rw-r--r--  kernel/spinlock.c | 2
-rw-r--r--  kernel/sys.c | 17
-rw-r--r--  kernel/sysctl.c | 522
-rw-r--r--  kernel/sysctl_check.c | 160
-rw-r--r--  kernel/time.c | 6
-rw-r--r--  kernel/time/Kconfig | 4
-rw-r--r--  kernel/time/alarmtimer.c | 8
-rw-r--r--  kernel/time/clocksource.c | 2
-rw-r--r--  kernel/time/ntp.c | 134
-rw-r--r--  kernel/time/tick-broadcast.c | 4
-rw-r--r--  kernel/time/tick-sched.c | 4
-rw-r--r--  kernel/time/timekeeping.c | 53
-rw-r--r--  kernel/trace/Kconfig | 2
-rw-r--r--  kernel/trace/blktrace.c | 18
-rw-r--r--  kernel/trace/ftrace.c | 3
-rw-r--r--  kernel/trace/ring_buffer.c | 157
-rw-r--r--  kernel/trace/trace.c | 113
-rw-r--r--  kernel/trace/trace.h | 3
-rw-r--r--  kernel/trace/trace_entries.h | 16
-rw-r--r--  kernel/trace/trace_export.c | 2
-rw-r--r--  kernel/trace/trace_output.c | 2
-rw-r--r--  kernel/watchdog.c | 27
78 files changed, 2083 insertions, 1638 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 5068e2a4e75f..2251882daf53 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -124,8 +124,8 @@ config INLINE_SPIN_LOCK_IRQSAVE
 	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
 		 ARCH_INLINE_SPIN_LOCK_IRQSAVE
 
-config INLINE_SPIN_UNLOCK
-	def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK)
+config UNINLINE_SPIN_UNLOCK
+	bool
 
 config INLINE_SPIN_UNLOCK_BH
 	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 24e7cb0ba26a..3f9c97419f02 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -36,6 +36,7 @@ config PREEMPT_VOLUNTARY
 config PREEMPT
 	bool "Preemptible Kernel (Low-Latency Desktop)"
 	select PREEMPT_COUNT
+	select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
 	help
 	  This option reduces the latency of the kernel by making
 	  all kernel code (that is not executing in a critical section)
diff --git a/kernel/Makefile b/kernel/Makefile
index 2d9de86b7e76..cb41b9547c9f 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -27,7 +27,6 @@ obj-y += power/
 
 obj-$(CONFIG_FREEZER) += freezer.o
 obj-$(CONFIG_PROFILING) += profile.o
-obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
diff --git a/kernel/audit.c b/kernel/audit.c
index bb0eb5bb9a0a..1c7f2c61416b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1418,7 +1418,7 @@ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
 
 /* This is a helper-function to print the escaped d_path */
 void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
-		      struct path *path)
+		      const struct path *path)
 {
 	char *p, *pathname;
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c6877fe9a831..ed64ccac67c9 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1472,7 +1472,6 @@ static int cgroup_get_rootdir(struct super_block *sb)
 
 	struct inode *inode =
 		cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
-	struct dentry *dentry;
 
 	if (!inode)
 		return -ENOMEM;
@@ -1481,12 +1480,9 @@ static int cgroup_get_rootdir(struct super_block *sb)
 	inode->i_op = &cgroup_dir_inode_operations;
 	/* directories start off with i_nlink == 2 (for "." entry) */
 	inc_nlink(inode);
-	dentry = d_alloc_root(inode);
-	if (!dentry) {
-		iput(inode);
+	sb->s_root = d_make_root(inode);
+	if (!sb->s_root)
 		return -ENOMEM;
-	}
-	sb->s_root = dentry;
 	/* for everything else we want ->d_op set */
 	sb->s_d_op = &cgroup_dops;
 	return 0;
@@ -1887,7 +1883,7 @@ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
  */
 int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 {
-	int retval;
+	int retval = 0;
 	struct cgroup_subsys *ss, *failed_ss = NULL;
 	struct cgroup *oldcgrp;
 	struct cgroupfs_root *root = cgrp->root;
@@ -4885,9 +4881,9 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
 
 	rcu_assign_pointer(id->css, NULL);
 	rcu_assign_pointer(css->id, NULL);
-	write_lock(&ss->id_lock);
+	spin_lock(&ss->id_lock);
 	idr_remove(&ss->idr, id->id);
-	write_unlock(&ss->id_lock);
+	spin_unlock(&ss->id_lock);
 	kfree_rcu(id, rcu_head);
 }
 EXPORT_SYMBOL_GPL(free_css_id);
@@ -4913,10 +4909,10 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
 		error = -ENOMEM;
 		goto err_out;
 	}
-	write_lock(&ss->id_lock);
+	spin_lock(&ss->id_lock);
 	/* Don't use 0. allocates an ID of 1-65535 */
 	error = idr_get_new_above(&ss->idr, newid, 1, &myid);
-	write_unlock(&ss->id_lock);
+	spin_unlock(&ss->id_lock);
 
 	/* Returns error when there are no free spaces for new ID.*/
 	if (error) {
@@ -4931,9 +4927,9 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
 		return newid;
 remove_idr:
 	error = -ENOSPC;
-	write_lock(&ss->id_lock);
+	spin_lock(&ss->id_lock);
 	idr_remove(&ss->idr, myid);
-	write_unlock(&ss->id_lock);
+	spin_unlock(&ss->id_lock);
 err_out:
 	kfree(newid);
 	return ERR_PTR(error);
@@ -4945,7 +4941,7 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
 {
 	struct css_id *newid;
 
-	rwlock_init(&ss->id_lock);
+	spin_lock_init(&ss->id_lock);
 	idr_init(&ss->idr);
 
 	newid = get_new_cssid(ss, 0);
@@ -5033,6 +5029,8 @@ css_get_next(struct cgroup_subsys *ss, int id,
 		return NULL;
 
 	BUG_ON(!ss->use_id);
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
 	/* fill start point for scan */
 	tmpid = id;
 	while (1) {
@@ -5040,10 +5038,7 @@ css_get_next(struct cgroup_subsys *ss, int id,
 		 * scan next entry from bitmap(tree), tmpid is updated after
 		 * idr_get_next().
 		 */
-		read_lock(&ss->id_lock);
 		tmp = idr_get_next(&ss->idr, &tmpid);
-		read_unlock(&ss->id_lock);
-
 		if (!tmp)
 			break;
 		if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
diff --git a/kernel/compat.c b/kernel/compat.c
index f346cedfe24d..74ff8498809a 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -31,11 +31,10 @@
 #include <asm/uaccess.h>
 
 /*
- * Note that the native side is already converted to a timespec, because
- * that's what we want anyway.
+ * Get/set struct timeval with struct timespec on the native side
  */
-static int compat_get_timeval(struct timespec *o,
-		struct compat_timeval __user *i)
+static int compat_get_timeval_convert(struct timespec *o,
+				      struct compat_timeval __user *i)
 {
 	long usec;
 
@@ -46,8 +45,8 @@ static int compat_get_timeval(struct timespec *o,
 	return 0;
 }
 
-static int compat_put_timeval(struct compat_timeval __user *o,
-		struct timeval *i)
+static int compat_put_timeval_convert(struct compat_timeval __user *o,
+				      struct timeval *i)
 {
 	return (put_user(i->tv_sec, &o->tv_sec) ||
 		put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0;
@@ -117,7 +116,7 @@ asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
 	if (tv) {
 		struct timeval ktv;
 		do_gettimeofday(&ktv);
-		if (compat_put_timeval(tv, &ktv))
+		if (compat_put_timeval_convert(tv, &ktv))
 			return -EFAULT;
 	}
 	if (tz) {
@@ -135,7 +134,7 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
 	struct timezone ktz;
 
 	if (tv) {
-		if (compat_get_timeval(&kts, tv))
+		if (compat_get_timeval_convert(&kts, tv))
 			return -EFAULT;
 	}
 	if (tz) {
@@ -146,12 +145,29 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
 	return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
 }
 
+int get_compat_timeval(struct timeval *tv, const struct compat_timeval __user *ctv)
+{
+	return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) ||
+			__get_user(tv->tv_sec, &ctv->tv_sec) ||
+			__get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
+}
+EXPORT_SYMBOL_GPL(get_compat_timeval);
+
+int put_compat_timeval(const struct timeval *tv, struct compat_timeval __user *ctv)
+{
+	return (!access_ok(VERIFY_WRITE, ctv, sizeof(*ctv)) ||
+			__put_user(tv->tv_sec, &ctv->tv_sec) ||
+			__put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
+}
+EXPORT_SYMBOL_GPL(put_compat_timeval);
+
 int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
 {
 	return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) ||
 			__get_user(ts->tv_sec, &cts->tv_sec) ||
 			__get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
 }
+EXPORT_SYMBOL_GPL(get_compat_timespec);
 
 int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts)
 {
@@ -161,6 +177,42 @@ int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user
 }
 EXPORT_SYMBOL_GPL(put_compat_timespec);
 
+int compat_get_timeval(struct timeval *tv, const void __user *utv)
+{
+	if (COMPAT_USE_64BIT_TIME)
+		return copy_from_user(tv, utv, sizeof *tv) ? -EFAULT : 0;
+	else
+		return get_compat_timeval(tv, utv);
+}
+EXPORT_SYMBOL_GPL(compat_get_timeval);
+
+int compat_put_timeval(const struct timeval *tv, void __user *utv)
+{
+	if (COMPAT_USE_64BIT_TIME)
+		return copy_to_user(utv, tv, sizeof *tv) ? -EFAULT : 0;
+	else
+		return put_compat_timeval(tv, utv);
+}
+EXPORT_SYMBOL_GPL(compat_put_timeval);
+
+int compat_get_timespec(struct timespec *ts, const void __user *uts)
+{
+	if (COMPAT_USE_64BIT_TIME)
+		return copy_from_user(ts, uts, sizeof *ts) ? -EFAULT : 0;
+	else
+		return get_compat_timespec(ts, uts);
+}
+EXPORT_SYMBOL_GPL(compat_get_timespec);
+
+int compat_put_timespec(const struct timespec *ts, void __user *uts)
+{
+	if (COMPAT_USE_64BIT_TIME)
+		return copy_to_user(uts, ts, sizeof *ts) ? -EFAULT : 0;
+	else
+		return put_compat_timespec(ts, uts);
+}
+EXPORT_SYMBOL_GPL(compat_put_timespec);
+
 static long compat_nanosleep_restart(struct restart_block *restart)
 {
 	struct compat_timespec __user *rmtp;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 5d575836dba6..14f7070b4ba2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -270,11 +270,11 @@ static struct file_system_type cpuset_fs_type = {
  * are online.  If none are online, walk up the cpuset hierarchy
  * until we find one that does have some online cpus.  If we get
  * all the way to the top and still haven't found any online cpus,
- * return cpu_online_map.  Or if passed a NULL cs from an exit'ing
- * task, return cpu_online_map.
+ * return cpu_online_mask.  Or if passed a NULL cs from an exit'ing
+ * task, return cpu_online_mask.
  *
  * One way or another, we guarantee to return some non-empty subset
- * of cpu_online_map.
+ * of cpu_online_mask.
  *
  * Call with callback_mutex held.
  */
@@ -867,7 +867,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	int retval;
 	int is_load_balanced;
 
-	/* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
+	/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
 	if (cs == &top_cpuset)
 		return -EACCES;
 
@@ -964,7 +964,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
 {
 	bool need_loop;
 
-repeat:
 	/*
 	 * Allow tasks that have access to memory reserves because they have
 	 * been OOM killed to get memory anywhere.
@@ -983,45 +982,19 @@ repeat:
 	 */
 	need_loop = task_has_mempolicy(tsk) ||
 			!nodes_intersects(*newmems, tsk->mems_allowed);
-	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
-	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
-
-	/*
-	 * ensure checking ->mems_allowed_change_disable after setting all new
-	 * allowed nodes.
-	 *
-	 * the read-side task can see an nodemask with new allowed nodes and
-	 * old allowed nodes. and if it allocates page when cpuset clears newly
-	 * disallowed ones continuous, it can see the new allowed bits.
-	 *
-	 * And if setting all new allowed nodes is after the checking, setting
-	 * all new allowed nodes and clearing newly disallowed ones will be done
-	 * continuous, and the read-side task may find no node to alloc page.
-	 */
-	smp_mb();
 
-	/*
-	 * Allocation of memory is very fast, we needn't sleep when waiting
-	 * for the read-side.
-	 */
-	while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
-		task_unlock(tsk);
-		if (!task_curr(tsk))
-			yield();
-		goto repeat;
-	}
+	if (need_loop)
+		write_seqcount_begin(&tsk->mems_allowed_seq);
 
-	/*
-	 * ensure checking ->mems_allowed_change_disable before clearing all new
-	 * disallowed nodes.
-	 *
-	 * if clearing newly disallowed bits before the checking, the read-side
-	 * task may find no node to alloc page.
-	 */
-	smp_mb();
+	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
+	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
 
 	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
 	tsk->mems_allowed = *newmems;
+
+	if (need_loop)
+		write_seqcount_end(&tsk->mems_allowed_seq);
+
 	task_unlock(tsk);
 }
 
@@ -2176,7 +2149,7 @@ void __init cpuset_init_smp(void)
  *
  * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
  * attached to the specified @tsk.  Guaranteed to return some non-empty
- * subset of cpu_online_map, even if this means going outside the
+ * subset of cpu_online_mask, even if this means going outside the
  * tasks cpuset.
  **/
 
@@ -2189,10 +2162,9 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 	mutex_unlock(&callback_mutex);
 }
 
-int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
+void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
 {
 	const struct cpuset *cs;
-	int cpu;
 
 	rcu_read_lock();
 	cs = task_cs(tsk);
@@ -2213,22 +2185,10 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
 	 * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
 	 * set any mask even if it is not right from task_cs() pov,
 	 * the pending set_cpus_allowed_ptr() will fix things.
+	 *
+	 * select_fallback_rq() will fix things ups and set cpu_possible_mask
+	 * if required.
 	 */
-
-	cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
-	if (cpu >= nr_cpu_ids) {
-		/*
-		 * Either tsk->cpus_allowed is wrong (see above) or it
-		 * is actually empty. The latter case is only possible
-		 * if we are racing with remove_tasks_in_empty_cpuset().
-		 * Like above we can temporary set any mask and rely on
-		 * set_cpus_allowed_ptr() as synchronization point.
-		 */
-		do_set_cpus_allowed(tsk, cpu_possible_mask);
-		cpu = cpumask_any(cpu_active_mask);
-	}
-
-	return cpu;
 }
 
 void cpuset_init_current_mems_allowed(void)
diff --git a/kernel/cred.c b/kernel/cred.c
index 5791612a4045..e70683d9ec32 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -16,6 +16,7 @@
 #include <linux/keyctl.h>
 #include <linux/init_task.h>
 #include <linux/security.h>
+#include <linux/binfmts.h>
 #include <linux/cn_proc.h>
 
 #if 0
@@ -385,6 +386,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 	struct cred *new;
 	int ret;
 
+	p->replacement_session_keyring = NULL;
+
 	if (
 #ifdef CONFIG_KEYS
 		!p->cred->thread_keyring &&
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0d7c08784efb..0557f24c6bca 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -41,6 +41,7 @@
 #include <linux/delay.h>
 #include <linux/sched.h>
 #include <linux/sysrq.h>
+#include <linux/reboot.h>
 #include <linux/init.h>
 #include <linux/kgdb.h>
 #include <linux/kdb.h>
@@ -52,7 +53,6 @@
 #include <asm/cacheflush.h>
 #include <asm/byteorder.h>
 #include <linux/atomic.h>
-#include <asm/system.h>
 
 #include "debug_core.h"
 
@@ -75,6 +75,8 @@ static int exception_level;
 struct kgdb_io		*dbg_io_ops;
 static DEFINE_SPINLOCK(kgdb_registration_lock);
 
+/* Action for the reboot notifiter, a global allow kdb to change it */
+static int kgdbreboot;
 /* kgdb console driver is loaded */
 static int kgdb_con_registered;
 /* determine if kgdb console output should be used */
@@ -96,6 +98,7 @@ static int __init opt_kgdb_con(char *str)
 early_param("kgdbcon", opt_kgdb_con);
 
 module_param(kgdb_use_con, int, 0644);
+module_param(kgdbreboot, int, 0644);
 
 /*
  * Holds information about breakpoints in a kernel. These breakpoints are
@@ -157,37 +160,39 @@ early_param("nokgdbroundup", opt_nokgdbroundup);
  * Weak aliases for breakpoint management,
  * can be overriden by architectures when needed:
  */
-int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr)
+int __weak kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
 {
 	int err;
 
-	err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE);
+	err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
+				BREAK_INSTR_SIZE);
 	if (err)
 		return err;
-
-	return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr,
-				  BREAK_INSTR_SIZE);
+	err = probe_kernel_write((char *)bpt->bpt_addr,
+				 arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE);
+	return err;
 }
 
-int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle)
+int __weak kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
 {
-	return probe_kernel_write((char *)addr,
-				  (char *)bundle, BREAK_INSTR_SIZE);
+	return probe_kernel_write((char *)bpt->bpt_addr,
+				  (char *)bpt->saved_instr, BREAK_INSTR_SIZE);
 }
 
 int __weak kgdb_validate_break_address(unsigned long addr)
 {
-	char tmp_variable[BREAK_INSTR_SIZE];
+	struct kgdb_bkpt tmp;
 	int err;
-	/* Validate setting the breakpoint and then removing it.  In the
+	/* Validate setting the breakpoint and then removing it.  If the
 	 * remove fails, the kernel needs to emit a bad message because we
 	 * are deep trouble not being able to put things back the way we
 	 * found them.
 	 */
-	err = kgdb_arch_set_breakpoint(addr, tmp_variable);
+	tmp.bpt_addr = addr;
+	err = kgdb_arch_set_breakpoint(&tmp);
 	if (err)
 		return err;
-	err = kgdb_arch_remove_breakpoint(addr, tmp_variable);
+	err = kgdb_arch_remove_breakpoint(&tmp);
 	if (err)
 		printk(KERN_ERR "KGDB: Critical breakpoint error, kernel "
 		       "memory destroyed at: %lx", addr);
@@ -231,7 +236,6 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
  */
 int dbg_activate_sw_breakpoints(void)
 {
-	unsigned long addr;
 	int error;
 	int ret = 0;
 	int i;
@@ -240,16 +244,15 @@ int dbg_activate_sw_breakpoints(void)
 		if (kgdb_break[i].state != BP_SET)
 			continue;
 
-		addr = kgdb_break[i].bpt_addr;
-		error = kgdb_arch_set_breakpoint(addr,
-						 kgdb_break[i].saved_instr);
+		error = kgdb_arch_set_breakpoint(&kgdb_break[i]);
 		if (error) {
 			ret = error;
-			printk(KERN_INFO "KGDB: BP install failed: %lx", addr);
+			printk(KERN_INFO "KGDB: BP install failed: %lx",
+			       kgdb_break[i].bpt_addr);
 			continue;
 		}
 
-		kgdb_flush_swbreak_addr(addr);
+		kgdb_flush_swbreak_addr(kgdb_break[i].bpt_addr);
 		kgdb_break[i].state = BP_ACTIVE;
 	}
 	return ret;
@@ -298,7 +301,6 @@ int dbg_set_sw_break(unsigned long addr)
 
 int dbg_deactivate_sw_breakpoints(void)
 {
-	unsigned long addr;
 	int error;
 	int ret = 0;
 	int i;
@@ -306,15 +308,14 @@ int dbg_deactivate_sw_breakpoints(void)
 	for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
 		if (kgdb_break[i].state != BP_ACTIVE)
 			continue;
-		addr = kgdb_break[i].bpt_addr;
-		error = kgdb_arch_remove_breakpoint(addr,
-						    kgdb_break[i].saved_instr);
+		error = kgdb_arch_remove_breakpoint(&kgdb_break[i]);
 		if (error) {
-			printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr);
+			printk(KERN_INFO "KGDB: BP remove failed: %lx\n",
+			       kgdb_break[i].bpt_addr);
 			ret = error;
 		}
 
-		kgdb_flush_swbreak_addr(addr);
+		kgdb_flush_swbreak_addr(kgdb_break[i].bpt_addr);
 		kgdb_break[i].state = BP_SET;
 	}
 	return ret;
@@ -348,7 +349,6 @@ int kgdb_isremovedbreak(unsigned long addr)
 
 int dbg_remove_all_break(void)
 {
-	unsigned long addr;
 	int error;
 	int i;
 
@@ -356,12 +356,10 @@ int dbg_remove_all_break(void)
 	for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
 		if (kgdb_break[i].state != BP_ACTIVE)
 			goto setundefined;
-		addr = kgdb_break[i].bpt_addr;
-		error = kgdb_arch_remove_breakpoint(addr,
-						    kgdb_break[i].saved_instr);
+		error = kgdb_arch_remove_breakpoint(&kgdb_break[i]);
 		if (error)
 			printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n",
-			       addr);
+			       kgdb_break[i].bpt_addr);
 setundefined:
 		kgdb_break[i].state = BP_UNDEFINED;
 	}
@@ -784,6 +782,33 @@ void __init dbg_late_init(void)
 	kdb_init(KDB_INIT_FULL);
 }
 
+static int
+dbg_notify_reboot(struct notifier_block *this, unsigned long code, void *x)
+{
+	/*
+	 * Take the following action on reboot notify depending on value:
+	 *    1 == Enter debugger
+	 *    0 == [the default] detatch debug client
+	 *   -1 == Do nothing... and use this until the board resets
+	 */
+	switch (kgdbreboot) {
+	case 1:
+		kgdb_breakpoint();
+	case -1:
+		goto done;
+	}
+	if (!dbg_kdb_mode)
+		gdbstub_exit(code);
+done:
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block dbg_reboot_notifier = {
+	.notifier_call		= dbg_notify_reboot,
+	.next			= NULL,
+	.priority		= INT_MAX,
+};
+
 static void kgdb_register_callbacks(void)
 {
 	if (!kgdb_io_module_registered) {
@@ -791,6 +816,7 @@ static void kgdb_register_callbacks(void)
 		kgdb_arch_init();
 		if (!dbg_is_early)
 			kgdb_arch_late();
+		register_reboot_notifier(&dbg_reboot_notifier);
 		atomic_notifier_chain_register(&panic_notifier_list,
 					       &kgdb_panic_event_nb);
 #ifdef CONFIG_MAGIC_SYSRQ
@@ -812,6 +838,7 @@ static void kgdb_unregister_callbacks(void)
 	 */
 	if (kgdb_io_module_registered) {
 		kgdb_io_module_registered = 0;
+		unregister_reboot_notifier(&dbg_reboot_notifier);
 		atomic_notifier_chain_unregister(&panic_notifier_list,
 						 &kgdb_panic_event_nb);
 		kgdb_arch_exit();
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index c22d8c28ad84..ce615e064482 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -1111,6 +1111,13 @@ void gdbstub_exit(int status)
 	unsigned char checksum, ch, buffer[3];
 	int loop;
 
+	if (!kgdb_connected)
+		return;
+	kgdb_connected = 0;
+
+	if (!dbg_io_ops || dbg_kdb_mode)
+		return;
+
 	buffer[0] = 'W';
 	buffer[1] = hex_asc_hi(status);
 	buffer[2] = hex_asc_lo(status);
@@ -1129,5 +1136,6 @@ void gdbstub_exit(int status)
 	dbg_io_ops->write_char(hex_asc_lo(checksum));
 
 	/* make sure the output is flushed, lest the bootloader clobber it */
-	dbg_io_ops->flush();
+	if (dbg_io_ops->flush)
+		dbg_io_ops->flush();
 }
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 20059ef4459a..8418c2f8ec5d 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -153,6 +153,13 @@ static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp)
 	} else {
 		kdb_printf("%s: failed to set breakpoint at 0x%lx\n",
 			   __func__, bp->bp_addr);
+#ifdef CONFIG_DEBUG_RODATA
+		if (!bp->bp_type) {
+			kdb_printf("Software breakpoints are unavailable.\n"
+				   "  Change the kernel CONFIG_DEBUG_RODATA=n\n"
+				   "  OR use hw breaks: help bph\n");
+		}
+#endif
 		return 1;
 	}
 	return 0;
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 7179eac7b41c..07c9bbb94a0b 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -15,7 +15,6 @@
 #include <linux/sched.h>
 #include <linux/kdb.h>
 #include <linux/nmi.h>
-#include <asm/system.h>
 #include "kdb_private.h"
 
 
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 4802eb5840e1..bb9520f0f6ff 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -689,7 +689,7 @@ kdb_printit:
 	if (!dbg_kdb_mode && kgdb_connected) {
 		gdbstub_msg_write(kdb_buffer, retlen);
 	} else {
-		if (!dbg_io_ops->is_console) {
+		if (dbg_io_ops && !dbg_io_ops->is_console) {
 			len = strlen(kdb_buffer);
 			cp = kdb_buffer;
 			while (len--) {
@@ -743,7 +743,7 @@ kdb_printit:
 		kdb_input_flush();
 		c = console_drivers;
 
-		if (!dbg_io_ops->is_console) {
+		if (dbg_io_ops && !dbg_io_ops->is_console) {
 			len = strlen(moreprompt);
 			cp = moreprompt;
 			while (len--) {
diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c
index 4bca634975c0..118527aa60ea 100644
--- a/kernel/debug/kdb/kdb_keyboard.c
+++ b/kernel/debug/kdb/kdb_keyboard.c
@@ -25,6 +25,7 @@
 #define KBD_STAT_MOUSE_OBF	0x20	/* Mouse output buffer full */
 
 static int kbd_exists;
+static int kbd_last_ret;
 
 /*
  * Check if the keyboard controller has a keypress for us.
@@ -90,8 +91,11 @@ int kdb_get_kbd_char(void)
 		return -1;
 	}
 
-	if ((scancode & 0x80) != 0)
+	if ((scancode & 0x80) != 0) {
+		if (scancode == 0x9c)
+			kbd_last_ret = 0;
 		return -1;
+	}
 
 	scancode &= 0x7f;
 
@@ -178,35 +182,82 @@ int kdb_get_kbd_char(void)
 		return -1;	/* ignore unprintables */
 	}
 
-	if ((scancode & 0x7f) == 0x1c) {
-		/*
-		 * enter key.  All done.  Absorb the release scancode.
-		 */
+	if (scancode == 0x1c) {
+		kbd_last_ret = 1;
+		return 13;
+	}
+
+	return keychar & 0xff;
+}
+EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
+
+/*
+ * Best effort cleanup of ENTER break codes on leaving KDB. Called on
+ * exiting KDB, when we know we processed an ENTER or KP ENTER scan
+ * code.
+ */
+void kdb_kbd_cleanup_state(void)
+{
+	int scancode, scanstatus;
+
+	/*
+	 * Nothing to clean up, since either
+	 * ENTER was never pressed, or has already
+	 * gotten cleaned up.
+	 */
+	if (!kbd_last_ret)
+		return;
+
+	kbd_last_ret = 0;
+	/*
+	 * Enter key. Need to absorb the break code here, lest it gets
+	 * leaked out if we exit KDB as the result of processing 'g'.
+	 *
+	 * This has several interesting implications:
+	 * + Need to handle KP ENTER, which has break code 0xe0 0x9c.
+	 * + Need to handle repeat ENTER and repeat KP ENTER.  Repeats
+	 *   only get a break code at the end of the repeated
+	 *   sequence.  This means we can't propagate the repeated key
+	 *   press, and must swallow it away.
+	 * + Need to handle possible PS/2 mouse input.
+	 * + Need to handle mashed keys.
+	 */
+
+	while (1) {
 		while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
-			;
+			cpu_relax();
 
 		/*
-		 * Fetch the scancode
+		 * Fetch the scancode.
 		 */
 		scancode = inb(KBD_DATA_REG);
 		scanstatus = inb(KBD_STATUS_REG);
 
-		while (scanstatus & KBD_STAT_MOUSE_OBF) {
-			scancode = inb(KBD_DATA_REG);
-			scanstatus = inb(KBD_STATUS_REG);
-		}
+		/*
+		 * Skip mouse input.
+		 */
+		if (scanstatus & KBD_STAT_MOUSE_OBF)
+			continue;
 
-		if (scancode != 0x9c) {
-			/*
-			 * Wasn't an enter-release,  why not?
-			 */
-			kdb_printf("kdb: expected enter got 0x%x status 0x%x\n",
-				   scancode, scanstatus);
-		}
+		/*
+		 * If we see 0xe0, this is either a break code for KP
+		 * ENTER, or a repeat make for KP ENTER.  Either way,
+		 * since the second byte is equivalent to an ENTER,
+		 * skip the 0xe0 and try again.
+		 *
+		 * If we see 0x1c, this must be a repeat ENTER or KP
+		 * ENTER (and we swallowed 0xe0 before).  Try again.
+		 *
+		 * We can also see make and break codes for other keys
+		 * mashed before or after pressing ENTER.  Thus, if we
+		 * see anything other than 0x9c, we have to try again.
+		 *
+		 * Note, if you held some key as ENTER was depressed,
+		 * that break code would get leaked out.
+		 */
+		if (scancode != 0x9c)
+			continue;
 
-		return 13;
+		return;
 	}
-
-	return keychar & 0xff;
 }
-EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index e2ae7349437f..67b847dfa2bb 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1400,6 +1400,9 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
 	if (KDB_STATE(DOING_SS))
 		KDB_STATE_CLEAR(SSBPT);
 
+	/* Clean up any keyboard devices before leaving */
+	kdb_kbd_cleanup_state();
+
 	return result;
 }
 
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index e381d105b40b..47c4e56e513b 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -246,6 +246,13 @@ extern void debug_kusage(void);
 
 extern void kdb_set_current_task(struct task_struct *);
 extern struct task_struct *kdb_current_task;
+
+#ifdef CONFIG_KDB_KEYBOARD
+extern void kdb_kbd_cleanup_state(void);
+#else /* ! CONFIG_KDB_KEYBOARD */
+#define kdb_kbd_cleanup_state()
+#endif /* ! CONFIG_KDB_KEYBOARD */
+
 #ifdef CONFIG_MODULES
 extern struct list_head *kdb_modules;
 #endif /* CONFIG_MODULES */
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 7d6fb40d2188..d35cc2d3a4cc 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -384,9 +384,9 @@ static int kdb_getphys(void *res, unsigned long addr, size_t size)
 	if (!pfn_valid(pfn))
 		return 1;
 	page = pfn_to_page(pfn);
-	vaddr = kmap_atomic(page, KM_KDB);
+	vaddr = kmap_atomic(page);
 	memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size);
-	kunmap_atomic(vaddr, KM_KDB);
+	kunmap_atomic(vaddr);
 
 	return 0;
 }
diff --git a/kernel/dma.c b/kernel/dma.c
index 68a2306522c8..6c6262f86c17 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -18,7 +18,6 @@
 #include <linux/proc_fs.h>
 #include <linux/init.h>
 #include <asm/dma.h>
-#include <asm/system.h>
 
 
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4b50357914fb..a6a9ec4cd8f5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3348,7 +3348,7 @@ static void calc_timer_values(struct perf_event *event,
 	*running = ctx_time - event->tstamp_running;
 }
 
-void __weak perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now)
+void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
 {
 }
 
@@ -3398,7 +3398,7 @@ void perf_event_update_userpage(struct perf_event *event)
 	userpg->time_running = running +
 			atomic64_read(&event->child_total_time_running);
 
-	perf_update_user_clock(userpg, now);
+	arch_perf_update_userpage(userpg, now);
 
 	barrier();
 	++userpg->lock;
@@ -7116,6 +7116,13 @@ void __init perf_event_init(void)
 
 	/* do not patch jump label more than once per second */
 	jump_label_rate_limit(&perf_sched_events, HZ);
+
+	/*
+	 * Build time assertion that we keep the data_head at the intended
+	 * location.  IOW, validation we got the __reserved[] size right.
+	 */
+	BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
+		     != 1024);
 }
 
 static int __init perf_event_sysfs_init(void)
diff --git a/kernel/exit.c b/kernel/exit.c
index ce5f758f40bd..d8bd3b425fa7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -52,6 +52,7 @@
 #include <linux/hw_breakpoint.h>
 #include <linux/oom.h>
 #include <linux/writeback.h>
+#include <linux/shm.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -424,7 +425,7 @@ void daemonize(const char *name, ...)
 	 */
 	exit_mm(current);
 	/*
-	 * We don't want to have TIF_FREEZE set if the system-wide hibernation
+	 * We don't want to get frozen, in case system-wide hibernation
 	 * or suspend transition begins right now.
 	 */
 	current->flags |= (PF_NOFREEZE | PF_KTHREAD);
@@ -473,7 +474,7 @@ static void close_files(struct files_struct * files)
 		i = j * __NFDBITS;
 		if (i >= fdt->max_fds)
 			break;
-		set = fdt->open_fds->fds_bits[j++];
+		set = fdt->open_fds[j++];
 		while (set) {
 			if (set & 1) {
 				struct file * file = xchg(&fdt->fd[i], NULL);
@@ -686,11 +687,11 @@ static void exit_mm(struct task_struct * tsk)
 }
 
 /*
- * When we die, we re-parent all our children.
- * Try to give them to another thread in our thread
- * group, and if no such member exists, give it to
- * the child reaper process (ie "init") in our pid
- * space.
+ * When we die, we re-parent all our children, and try to:
+ * 1. give them to another thread in our thread group, if such a member exists
+ * 2. give it to the first ancestor process which prctl'd itself as a
+ *    child_subreaper for its children (like a service manager)
+ * 3. give it to the init process (PID 1) in our pid namespace
  */
 static struct task_struct *find_new_reaper(struct task_struct *father)
 	__releases(&tasklist_lock)
@@ -710,8 +711,11 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
 
 	if (unlikely(pid_ns->child_reaper == father)) {
 		write_unlock_irq(&tasklist_lock);
-		if (unlikely(pid_ns == &init_pid_ns))
-			panic("Attempted to kill init!");
+		if (unlikely(pid_ns == &init_pid_ns)) {
+			panic("Attempted to kill init! exitcode=0x%08x\n",
+				father->signal->group_exit_code ?:
+					father->exit_code);
+		}
 
 		zap_pid_ns_processes(pid_ns);
 		write_lock_irq(&tasklist_lock);
@@ -721,6 +725,29 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
 		 * forget_original_parent() must move them somewhere.
 		 */
 		pid_ns->child_reaper = init_pid_ns.child_reaper;
+	} else if (father->signal->has_child_subreaper) {
+		struct task_struct *reaper;
+
+		/*
+		 * Find the first ancestor marked as child_subreaper.
+		 * Note that the code below checks same_thread_group(reaper,
+		 * pid_ns->child_reaper).  This is what we need to DTRT in a
+		 * PID namespace. However we still need the check above, see
+		 * http://marc.info/?l=linux-kernel&m=131385460420380
+		 */
+		for (reaper = father->real_parent;
+		     reaper != &init_task;
+		     reaper = reaper->real_parent) {
+			if (same_thread_group(reaper, pid_ns->child_reaper))
+				break;
+			if (!reaper->signal->is_child_subreaper)
+				continue;
+			thread = reaper;
+			do {
+				if (!(thread->flags & PF_EXITING))
+					return reaper;
+			} while_each_thread(reaper, thread);
+		}
 	}
 
 	return pid_ns->child_reaper;
@@ -934,7 +961,7 @@ void do_exit(long code)
 	acct_update_integrals(tsk);
 	/* sync mm's RSS info before statistics gathering */
 	if (tsk->mm)
-		sync_mm_rss(tsk, tsk->mm);
+		sync_mm_rss(tsk->mm);
 	group_dead = atomic_dec_and_test(&tsk->signal->live);
 	if (group_dead) {
 		hrtimer_cancel(&tsk->signal->real_timer);
diff --git a/kernel/fork.c b/kernel/fork.c
index c4f38a849436..b9372a0bff18 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -193,6 +193,7 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
+	security_task_free(tsk);
 	exit_creds(tsk);
 	delayacct_tsk_free(tsk);
 	put_signal_struct(tsk->signal);
@@ -355,7 +356,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		charge = 0;
 		if (mpnt->vm_flags & VM_ACCOUNT) {
 			unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
-			if (security_vm_enough_memory(len))
+			if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
 				goto fail_nomem;
 			charge = len;
 		}
@@ -511,6 +512,23 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
 	return NULL;
 }
 
+static void check_mm(struct mm_struct *mm)
+{
+	int i;
+
+	for (i = 0; i < NR_MM_COUNTERS; i++) {
+		long x = atomic_long_read(&mm->rss_stat.count[i]);
+
+		if (unlikely(x))
+			printk(KERN_ALERT "BUG: Bad rss-counter state "
+			       "mm:%p idx:%d val:%ld\n", mm, i, x);
+	}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	VM_BUG_ON(mm->pmd_huge_pte);
+#endif
+}
+
 /*
  * Allocate and initialize an mm_struct.
  */
@@ -538,9 +556,7 @@ void __mmdrop(struct mm_struct *mm)
 	mm_free_pgd(mm);
 	destroy_context(mm);
 	mmu_notifier_mm_destroy(mm);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	VM_BUG_ON(mm->pmd_huge_pte);
-#endif
+	check_mm(mm);
 	free_mm(mm);
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
@@ -1035,6 +1051,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	sig->oom_score_adj = current->signal->oom_score_adj;
 	sig->oom_score_adj_min = current->signal->oom_score_adj_min;
 
+	sig->has_child_subreaper = current->signal->has_child_subreaper ||
+				   current->signal->is_child_subreaper;
+
 	mutex_init(&sig->cred_guard_mutex);
 
 	return 0;
@@ -1222,6 +1241,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #ifdef CONFIG_CPUSETS
 	p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
 	p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
+	seqcount_init(&p->mems_allowed_seq);
 #endif
 #ifdef CONFIG_TRACE_IRQFLAGS
 	p->irq_events = 0;
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 9815b8d1eed5..11f82a4d4eae 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -99,9 +99,9 @@ static void fake_signal_wake_up(struct task_struct *p)
  * freeze_task - send a freeze request to given task
  * @p: task to send the request to
  *
- * If @p is freezing, the freeze request is sent by setting %TIF_FREEZE
- * flag and either sending a fake signal to it or waking it up, depending
- * on whether it has %PF_FREEZER_NOSIG set.
+ * If @p is freezing, the freeze request is sent either by sending a fake
+ * signal (if it's not a kernel thread) or waking it up (if it's a kernel
+ * thread).
  *
  * RETURNS:
  * %false, if @p is not freezing or already frozen; %true, otherwise
diff --git a/kernel/futex.c b/kernel/futex.c
index 72efa1e4359a..e2b0fb9a0b3b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -59,6 +59,7 @@
 #include <linux/magic.h>
 #include <linux/pid.h>
 #include <linux/nsproxy.h>
+#include <linux/ptrace.h>
 
 #include <asm/futex.h>
 
@@ -2443,40 +2444,31 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
 {
 	struct robust_list_head __user *head;
 	unsigned long ret;
-	const struct cred *cred = current_cred(), *pcred;
+	struct task_struct *p;
 
 	if (!futex_cmpxchg_enabled)
 		return -ENOSYS;
 
+	WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n");
+
+	rcu_read_lock();
+
+	ret = -ESRCH;
 	if (!pid)
-		head = current->robust_list;
+		p = current;
 	else {
-		struct task_struct *p;
-
-		ret = -ESRCH;
-		rcu_read_lock();
 		p = find_task_by_vpid(pid);
 		if (!p)
 			goto err_unlock;
-		ret = -EPERM;
-		pcred = __task_cred(p);
-		/* If victim is in different user_ns, then uids are not
-		   comparable, so we must have CAP_SYS_PTRACE */
-		if (cred->user->user_ns != pcred->user->user_ns) {
-			if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
-				goto err_unlock;
-			goto ok;
-		}
-		/* If victim is in same user_ns, then uids are comparable */
-		if (cred->euid != pcred->euid &&
-		    cred->euid != pcred->uid &&
-		    !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
-			goto err_unlock;
-ok:
-		head = p->robust_list;
-		rcu_read_unlock();
 	}
 
+	ret = -EPERM;
+	if (!ptrace_may_access(p, PTRACE_MODE_READ))
+		goto err_unlock;
+
+	head = p->robust_list;
+	rcu_read_unlock();
+
 	if (put_user(sizeof(*head), len_ptr))
 		return -EFAULT;
 	return put_user(head, head_ptr);
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 5f9e689dc8f0..83e368b005fc 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -10,6 +10,7 @@
 #include <linux/compat.h>
 #include <linux/nsproxy.h>
 #include <linux/futex.h>
+#include <linux/ptrace.h>
 
 #include <asm/uaccess.h>
 
@@ -136,40 +137,31 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
 {
 	struct compat_robust_list_head __user *head;
 	unsigned long ret;
-	const struct cred *cred = current_cred(), *pcred;
+	struct task_struct *p;
 
 	if (!futex_cmpxchg_enabled)
 		return -ENOSYS;
 
+	WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n");
+
+	rcu_read_lock();
+
+	ret = -ESRCH;
 	if (!pid)
-		head = current->compat_robust_list;
+		p = current;
 	else {
-		struct task_struct *p;
-
-		ret = -ESRCH;
-		rcu_read_lock();
 		p = find_task_by_vpid(pid);
 		if (!p)
 			goto err_unlock;
-		ret = -EPERM;
-		pcred = __task_cred(p);
-		/* If victim is in different user_ns, then uids are not
-		   comparable, so we must have CAP_SYS_PTRACE */
-		if (cred->user->user_ns != pcred->user->user_ns) {
-			if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
-				goto err_unlock;
-			goto ok;
-		}
-		/* If victim is in same user_ns, then uids are comparable */
-		if (cred->euid != pcred->euid &&
-		    cred->euid != pcred->uid &&
-		    !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
-			goto err_unlock;
-ok:
-		head = p->compat_robust_list;
-		rcu_read_unlock();
 	}
 
+	ret = -EPERM;
+	if (!ptrace_may_access(p, PTRACE_MODE_READ))
+		goto err_unlock;
+
+	head = p->compat_robust_list;
+	rcu_read_unlock();
+
 	if (put_user(sizeof(*head), len_ptr))
 		return -EFAULT;
 	return put_user(ptr_to_compat(head), head_ptr);
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 5a38bf4de641..d1a758bc972a 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -13,7 +13,7 @@ config GENERIC_HARDIRQS
 # Options selectable by the architecture code
 
 # Make sparse irq Kconfig switch below available
-config HAVE_SPARSE_IRQ
+config MAY_HAVE_SPARSE_IRQ
 	bool
 
 # Enable the generic irq autoprobe mechanism
@@ -56,13 +56,22 @@ config GENERIC_IRQ_CHIP
56config IRQ_DOMAIN 56config IRQ_DOMAIN
57 bool 57 bool
58 58
59config IRQ_DOMAIN_DEBUG
60 bool "Expose hardware/virtual IRQ mapping via debugfs"
61 depends on IRQ_DOMAIN && DEBUG_FS
62 help
63 This option will show the mapping relationship between hardware irq
64 numbers and Linux irq numbers. The mapping is exposed via debugfs
65 in the file "irq_domain_mapping".
66
67 If you don't know what this means you don't need it.
68
59# Support forced irq threading 69# Support forced irq threading
60config IRQ_FORCED_THREADING 70config IRQ_FORCED_THREADING
61 bool 71 bool
62 72
63config SPARSE_IRQ 73config SPARSE_IRQ
64 bool "Support sparse irq numbering" 74 bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ
65 depends on HAVE_SPARSE_IRQ
66 ---help--- 75 ---help---
67 76
68 Sparse irq numbering is useful for distro kernels that want 77 Sparse irq numbering is useful for distro kernels that want
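When IRQ_DOMAIN_DEBUG is enabled, the mapping table described in the help text can be read from userspace. A small reader, assuming debugfs is mounted at the usual /sys/kernel/debug location:

#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/sys/kernel/debug/irq_domain_mapping", "r");

	if (!f) {
		perror("irq_domain_mapping");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}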
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 6ff84e6a954c..bdb180325551 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -54,14 +54,18 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
54static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action) 54static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
55{ 55{
56 /* 56 /*
57 * Wake up the handler thread for this action. In case the 57 * In case the thread crashed and was killed we just pretend that
58 * thread crashed and was killed we just pretend that we 58 * we handled the interrupt. The hardirq handler has disabled the
59 * handled the interrupt. The hardirq handler has disabled the 59 * device interrupt, so no irq storm is lurking.
60 * device interrupt, so no irq storm is lurking. If the 60 */
61 if (action->thread->flags & PF_EXITING)
62 return;
63
64 /*
65 * Wake up the handler thread for this action. If the
61 * RUNTHREAD bit is already set, nothing to do. 66 * RUNTHREAD bit is already set, nothing to do.
62 */ 67 */
63 if ((action->thread->flags & PF_EXITING) || 68 if (test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags))
64 test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags))
65 return; 69 return;
66 70
67 /* 71 /*
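The refactored irq_wake_thread() checks PF_EXITING first and then relies on test_and_set_bit() so the handler thread is only woken on a 0-to-1 transition of IRQTF_RUNTHREAD. The same wake-once idiom in isolation, with made-up my_* names:

#include <linux/bitops.h>
#include <linux/sched.h>

#define MY_RUNFLAG	0

static unsigned long my_flags;
static struct task_struct *my_worker;

static void my_kick_worker(void)
{
	/* If the bit was already set, work is pending and the thread is
	 * (or will be) running; a second wake-up is redundant. */
	if (test_and_set_bit(MY_RUNFLAG, &my_flags))
		return;
	wake_up_process(my_worker);
}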
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 1f9e26526b69..0e0ba5f840b2 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1,189 +1,780 @@
1#include <linux/debugfs.h>
2#include <linux/hardirq.h>
3#include <linux/interrupt.h>
1#include <linux/irq.h> 4#include <linux/irq.h>
5#include <linux/irqdesc.h>
2#include <linux/irqdomain.h> 6#include <linux/irqdomain.h>
3#include <linux/module.h> 7#include <linux/module.h>
4#include <linux/mutex.h> 8#include <linux/mutex.h>
5#include <linux/of.h> 9#include <linux/of.h>
6#include <linux/of_address.h> 10#include <linux/of_address.h>
11#include <linux/seq_file.h>
7#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/smp.h>
14#include <linux/fs.h>
15
16#define IRQ_DOMAIN_MAP_LEGACY 0 /* driver allocated fixed range of irqs.
17 * ie. legacy 8259, gets irqs 1..15 */
18#define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */
19#define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */
20#define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */
8 21
9static LIST_HEAD(irq_domain_list); 22static LIST_HEAD(irq_domain_list);
10static DEFINE_MUTEX(irq_domain_mutex); 23static DEFINE_MUTEX(irq_domain_mutex);
11 24
25static DEFINE_MUTEX(revmap_trees_mutex);
26static struct irq_domain *irq_default_domain;
27
12/** 28/**
13 * irq_domain_add() - Register an irq_domain 29 * irq_domain_alloc() - Allocate a new irq_domain data structure
14 * @domain: ptr to initialized irq_domain structure 30 * @of_node: optional device-tree node of the interrupt controller
31 * @revmap_type: type of reverse mapping to use
32 * @ops: map/unmap domain callbacks
33 * @host_data: Controller private data pointer
15 * 34 *
16 * Registers an irq_domain structure. The irq_domain must at a minimum be 35 * Allocates and initialize and irq_domain structure. Caller is expected to
17 * initialized with an ops structure pointer, and either a ->to_irq hook or 36 * register allocated irq_domain with irq_domain_register(). Returns pointer
18 * a valid irq_base value. Everything else is optional. 37 * to IRQ domain, or NULL on failure.
19 */ 38 */
20void irq_domain_add(struct irq_domain *domain) 39static struct irq_domain *irq_domain_alloc(struct device_node *of_node,
40 unsigned int revmap_type,
41 const struct irq_domain_ops *ops,
42 void *host_data)
21{ 43{
22 struct irq_data *d; 44 struct irq_domain *domain;
23 int hwirq, irq;
24 45
25 /* 46 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
26 * This assumes that the irq_domain owner has already allocated 47 if (WARN_ON(!domain))
27 * the irq_descs. This block will be removed when support for dynamic 48 return NULL;
28 * allocation of irq_descs is added to irq_domain. 49
29 */ 50 /* Fill structure */
30 irq_domain_for_each_irq(domain, hwirq, irq) { 51 domain->revmap_type = revmap_type;
31 d = irq_get_irq_data(irq); 52 domain->ops = ops;
32 if (!d) { 53 domain->host_data = host_data;
33 WARN(1, "error: assigning domain to non existant irq_desc"); 54 domain->of_node = of_node_get(of_node);
34 return; 55
35 } 56 return domain;
36 if (d->domain) { 57}
37 /* things are broken; just report, don't clean up */ 58
38 WARN(1, "error: irq_desc already assigned to a domain"); 59static void irq_domain_add(struct irq_domain *domain)
39 return; 60{
61 mutex_lock(&irq_domain_mutex);
62 list_add(&domain->link, &irq_domain_list);
63 mutex_unlock(&irq_domain_mutex);
64 pr_debug("irq: Allocated domain of type %d @0x%p\n",
65 domain->revmap_type, domain);
66}
67
68static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain,
69 irq_hw_number_t hwirq)
70{
71 irq_hw_number_t first_hwirq = domain->revmap_data.legacy.first_hwirq;
72 int size = domain->revmap_data.legacy.size;
73
74 if (WARN_ON(hwirq < first_hwirq || hwirq >= first_hwirq + size))
75 return 0;
76 return hwirq - first_hwirq + domain->revmap_data.legacy.first_irq;
77}
78
79/**
80 * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain.
81 * @of_node: pointer to interrupt controller's device tree node.
82 * @size: total number of irqs in legacy mapping
83 * @first_irq: first number of irq block assigned to the domain
84 * @first_hwirq: first hwirq number to use for the translation. Should normally
85 * be '0', but a positive integer can be used if the effective
86 * hwirqs numbering does not begin at zero.
87 * @ops: map/unmap domain callbacks
88 * @host_data: Controller private data pointer
89 *
90 * Note: the map() callback will be called before this function returns
91 * for all legacy interrupts except 0 (which is always the invalid irq for
92 * a legacy controller).
93 */
94struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
95 unsigned int size,
96 unsigned int first_irq,
97 irq_hw_number_t first_hwirq,
98 const struct irq_domain_ops *ops,
99 void *host_data)
100{
101 struct irq_domain *domain;
102 unsigned int i;
103
104 domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LEGACY, ops, host_data);
105 if (!domain)
106 return NULL;
107
108 domain->revmap_data.legacy.first_irq = first_irq;
109 domain->revmap_data.legacy.first_hwirq = first_hwirq;
110 domain->revmap_data.legacy.size = size;
111
112 mutex_lock(&irq_domain_mutex);
113 /* Verify that all the irqs are available */
114 for (i = 0; i < size; i++) {
115 int irq = first_irq + i;
116 struct irq_data *irq_data = irq_get_irq_data(irq);
117
118 if (WARN_ON(!irq_data || irq_data->domain)) {
119 mutex_unlock(&irq_domain_mutex);
120 of_node_put(domain->of_node);
121 kfree(domain);
122 return NULL;
40 } 123 }
41 d->domain = domain;
42 d->hwirq = hwirq;
43 } 124 }
44 125
45 mutex_lock(&irq_domain_mutex); 126 /* Claim all of the irqs before registering a legacy domain */
46 list_add(&domain->list, &irq_domain_list); 127 for (i = 0; i < size; i++) {
128 struct irq_data *irq_data = irq_get_irq_data(first_irq + i);
129 irq_data->hwirq = first_hwirq + i;
130 irq_data->domain = domain;
131 }
47 mutex_unlock(&irq_domain_mutex); 132 mutex_unlock(&irq_domain_mutex);
133
134 for (i = 0; i < size; i++) {
135 int irq = first_irq + i;
136 int hwirq = first_hwirq + i;
137
138 /* IRQ0 gets ignored */
139 if (!irq)
140 continue;
141
142 /* Legacy flags are left to default at this point,
143 * one can then use irq_create_mapping() to
144 * explicitly change them
145 */
146 ops->map(domain, irq, hwirq);
147
148 /* Clear norequest flags */
149 irq_clear_status_flags(irq, IRQ_NOREQUEST);
150 }
151
152 irq_domain_add(domain);
153 return domain;
154}
155
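A hypothetical controller probe using the legacy API just added; the foo_* names, the block of 16 interrupts starting at Linux irq 32, and the use of dummy_irq_chip are illustrative only. The legacy variant assumes the irq descriptors for that range already exist, which is what the verification loop above enforces.

#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

static int foo_map(struct irq_domain *d, unsigned int virq,
		   irq_hw_number_t hwirq)
{
	irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq);
	return 0;
}

static const struct irq_domain_ops foo_ops = {
	.map	= foo_map,
	.xlate	= irq_domain_xlate_onecell,
};

static struct irq_domain *foo_probe(struct device_node *np)
{
	/* 16 hwirqs (0..15) mapped onto the pre-allocated irqs 32..47 */
	return irq_domain_add_legacy(np, 16, 32, 0, &foo_ops, NULL);
}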
156/**
 157 * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain.
158 * @of_node: pointer to interrupt controller's device tree node.
159 * @ops: map/unmap domain callbacks
160 * @host_data: Controller private data pointer
161 */
162struct irq_domain *irq_domain_add_linear(struct device_node *of_node,
163 unsigned int size,
164 const struct irq_domain_ops *ops,
165 void *host_data)
166{
167 struct irq_domain *domain;
168 unsigned int *revmap;
169
170 revmap = kzalloc(sizeof(*revmap) * size, GFP_KERNEL);
171 if (WARN_ON(!revmap))
172 return NULL;
173
174 domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LINEAR, ops, host_data);
175 if (!domain) {
176 kfree(revmap);
177 return NULL;
178 }
179 domain->revmap_data.linear.size = size;
180 domain->revmap_data.linear.revmap = revmap;
181 irq_domain_add(domain);
182 return domain;
183}
184
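The linear variant is the common case for new drivers: the reverse map is a simple array and Linux irq numbers are created lazily with irq_create_mapping(). A sketch with invented bar_* names and an arbitrary 64-interrupt size:

#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

static int bar_map(struct irq_domain *d, unsigned int virq,
		   irq_hw_number_t hwirq)
{
	irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_level_irq);
	return 0;
}

static const struct irq_domain_ops bar_ops = {
	.map	= bar_map,
	.xlate	= irq_domain_xlate_twocell,
};

static struct irq_domain *bar_probe(struct device_node *np)
{
	/* virqs are allocated later, e.g. via irq_create_mapping(d, hwirq) */
	return irq_domain_add_linear(np, 64, &bar_ops, NULL);
}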
185struct irq_domain *irq_domain_add_nomap(struct device_node *of_node,
186 unsigned int max_irq,
187 const struct irq_domain_ops *ops,
188 void *host_data)
189{
190 struct irq_domain *domain = irq_domain_alloc(of_node,
191 IRQ_DOMAIN_MAP_NOMAP, ops, host_data);
192 if (domain) {
193 domain->revmap_data.nomap.max_irq = max_irq ? max_irq : ~0;
194 irq_domain_add(domain);
195 }
196 return domain;
197}
198
199/**
200 * irq_domain_add_tree()
201 * @of_node: pointer to interrupt controller's device tree node.
202 * @ops: map/unmap domain callbacks
203 *
204 * Note: The radix tree will be allocated later during boot automatically
205 * (the reverse mapping will use the slow path until that happens).
206 */
207struct irq_domain *irq_domain_add_tree(struct device_node *of_node,
208 const struct irq_domain_ops *ops,
209 void *host_data)
210{
211 struct irq_domain *domain = irq_domain_alloc(of_node,
212 IRQ_DOMAIN_MAP_TREE, ops, host_data);
213 if (domain) {
214 INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL);
215 irq_domain_add(domain);
216 }
217 return domain;
48} 218}
49 219
50/** 220/**
51 * irq_domain_del() - Unregister an irq_domain 221 * irq_find_host() - Locates a domain for a given device node
52 * @domain: ptr to registered irq_domain. 222 * @node: device-tree node of the interrupt controller
53 */ 223 */
54void irq_domain_del(struct irq_domain *domain) 224struct irq_domain *irq_find_host(struct device_node *node)
55{ 225{
56 struct irq_data *d; 226 struct irq_domain *h, *found = NULL;
57 int hwirq, irq; 227 int rc;
58 228
229 /* We might want to match the legacy controller last since
230 * it might potentially be set to match all interrupts in
231 * the absence of a device node. This isn't a problem so far
232 * yet though...
233 */
59 mutex_lock(&irq_domain_mutex); 234 mutex_lock(&irq_domain_mutex);
60 list_del(&domain->list); 235 list_for_each_entry(h, &irq_domain_list, link) {
236 if (h->ops->match)
237 rc = h->ops->match(h, node);
238 else
239 rc = (h->of_node != NULL) && (h->of_node == node);
240
241 if (rc) {
242 found = h;
243 break;
244 }
245 }
61 mutex_unlock(&irq_domain_mutex); 246 mutex_unlock(&irq_domain_mutex);
247 return found;
248}
249EXPORT_SYMBOL_GPL(irq_find_host);
250
251/**
252 * irq_set_default_host() - Set a "default" irq domain
253 * @domain: default domain pointer
254 *
255 * For convenience, it's possible to set a "default" domain that will be used
256 * whenever NULL is passed to irq_create_mapping(). It makes life easier for
257 * platforms that want to manipulate a few hard coded interrupt numbers that
258 * aren't properly represented in the device-tree.
259 */
260void irq_set_default_host(struct irq_domain *domain)
261{
262 pr_debug("irq: Default domain set to @0x%p\n", domain);
263
264 irq_default_domain = domain;
265}
266
267static int irq_setup_virq(struct irq_domain *domain, unsigned int virq,
268 irq_hw_number_t hwirq)
269{
270 struct irq_data *irq_data = irq_get_irq_data(virq);
62 271
63 /* Clear the irq_domain assignments */ 272 irq_data->hwirq = hwirq;
64 irq_domain_for_each_irq(domain, hwirq, irq) { 273 irq_data->domain = domain;
65 d = irq_get_irq_data(irq); 274 if (domain->ops->map(domain, virq, hwirq)) {
66 d->domain = NULL; 275 pr_debug("irq: -> mapping failed, freeing\n");
276 irq_data->domain = NULL;
277 irq_data->hwirq = 0;
278 return -1;
67 } 279 }
280
281 irq_clear_status_flags(virq, IRQ_NOREQUEST);
282
283 return 0;
68} 284}
69 285
70#if defined(CONFIG_OF_IRQ)
71/** 286/**
72 * irq_create_of_mapping() - Map a linux irq number from a DT interrupt spec 287 * irq_create_direct_mapping() - Allocate an irq for direct mapping
288 * @domain: domain to allocate the irq for or NULL for default domain
73 * 289 *
74 * Used by the device tree interrupt mapping code to translate a device tree 290 * This routine is used for irq controllers which can choose the hardware
75 * interrupt specifier to a valid linux irq number. Returns either a valid 291 * interrupt numbers they generate. In such a case it's simplest to use
76 * linux IRQ number or 0. 292 * the linux irq as the hardware interrupt number.
293 */
294unsigned int irq_create_direct_mapping(struct irq_domain *domain)
295{
296 unsigned int virq;
297
298 if (domain == NULL)
299 domain = irq_default_domain;
300
301 BUG_ON(domain == NULL);
302 WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP);
303
304 virq = irq_alloc_desc_from(1, 0);
305 if (!virq) {
306 pr_debug("irq: create_direct virq allocation failed\n");
307 return 0;
308 }
309 if (virq >= domain->revmap_data.nomap.max_irq) {
310 pr_err("ERROR: no free irqs available below %i maximum\n",
311 domain->revmap_data.nomap.max_irq);
312 irq_free_desc(virq);
313 return 0;
314 }
315 pr_debug("irq: create_direct obtained virq %d\n", virq);
316
317 if (irq_setup_virq(domain, virq, virq)) {
318 irq_free_desc(virq);
319 return 0;
320 }
321
322 return virq;
323}
324
325/**
326 * irq_create_mapping() - Map a hardware interrupt into linux irq space
327 * @domain: domain owning this hardware interrupt or NULL for default domain
328 * @hwirq: hardware irq number in that domain space
77 * 329 *
78 * When the caller no longer need the irq number returned by this function it 330 * Only one mapping per hardware interrupt is permitted. Returns a linux
79 * should arrange to call irq_dispose_mapping(). 331 * irq number.
332 * If the sense/trigger is to be specified, set_irq_type() should be called
333 * on the number returned from that call.
80 */ 334 */
335unsigned int irq_create_mapping(struct irq_domain *domain,
336 irq_hw_number_t hwirq)
337{
338 unsigned int hint;
339 int virq;
340
341 pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
342
 343 /* Look for default domain if necessary */
344 if (domain == NULL)
345 domain = irq_default_domain;
346 if (domain == NULL) {
347 printk(KERN_WARNING "irq_create_mapping called for"
348 " NULL domain, hwirq=%lx\n", hwirq);
349 WARN_ON(1);
350 return 0;
351 }
352 pr_debug("irq: -> using domain @%p\n", domain);
353
354 /* Check if mapping already exists */
355 virq = irq_find_mapping(domain, hwirq);
356 if (virq) {
357 pr_debug("irq: -> existing mapping on virq %d\n", virq);
358 return virq;
359 }
360
361 /* Get a virtual interrupt number */
362 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
363 return irq_domain_legacy_revmap(domain, hwirq);
364
365 /* Allocate a virtual interrupt number */
366 hint = hwirq % nr_irqs;
367 if (hint == 0)
368 hint++;
369 virq = irq_alloc_desc_from(hint, 0);
370 if (virq <= 0)
371 virq = irq_alloc_desc_from(1, 0);
372 if (virq <= 0) {
373 pr_debug("irq: -> virq allocation failed\n");
374 return 0;
375 }
376
377 if (irq_setup_virq(domain, virq, hwirq)) {
378 if (domain->revmap_type != IRQ_DOMAIN_MAP_LEGACY)
379 irq_free_desc(virq);
380 return 0;
381 }
382
383 pr_debug("irq: irq %lu on domain %s mapped to virtual irq %u\n",
384 hwirq, domain->of_node ? domain->of_node->full_name : "null", virq);
385
386 return virq;
387}
388EXPORT_SYMBOL_GPL(irq_create_mapping);
389
81unsigned int irq_create_of_mapping(struct device_node *controller, 390unsigned int irq_create_of_mapping(struct device_node *controller,
82 const u32 *intspec, unsigned int intsize) 391 const u32 *intspec, unsigned int intsize)
83{ 392{
84 struct irq_domain *domain; 393 struct irq_domain *domain;
85 unsigned long hwirq; 394 irq_hw_number_t hwirq;
86 unsigned int irq, type; 395 unsigned int type = IRQ_TYPE_NONE;
87 int rc = -EINVAL; 396 unsigned int virq;
88 397
89 /* Find a domain which can translate the irq spec */ 398 domain = controller ? irq_find_host(controller) : irq_default_domain;
90 mutex_lock(&irq_domain_mutex); 399 if (!domain) {
91 list_for_each_entry(domain, &irq_domain_list, list) { 400#ifdef CONFIG_MIPS
92 if (!domain->ops->dt_translate) 401 /*
93 continue; 402 * Workaround to avoid breaking interrupt controller drivers
94 rc = domain->ops->dt_translate(domain, controller, 403 * that don't yet register an irq_domain. This is temporary
95 intspec, intsize, &hwirq, &type); 404 * code. ~~~gcl, Feb 24, 2012
96 if (rc == 0) 405 *
97 break; 406 * Scheduled for removal in Linux v3.6. That should be enough
407 * time.
408 */
409 if (intsize > 0)
410 return intspec[0];
411#endif
412 printk(KERN_WARNING "irq: no irq domain found for %s !\n",
413 controller->full_name);
414 return 0;
98 } 415 }
99 mutex_unlock(&irq_domain_mutex);
100 416
101 if (rc != 0) 417 /* If domain has no translation, then we assume interrupt line */
102 return 0; 418 if (domain->ops->xlate == NULL)
419 hwirq = intspec[0];
420 else {
421 if (domain->ops->xlate(domain, controller, intspec, intsize,
422 &hwirq, &type))
423 return 0;
424 }
425
426 /* Create mapping */
427 virq = irq_create_mapping(domain, hwirq);
428 if (!virq)
429 return virq;
103 430
104 irq = irq_domain_to_irq(domain, hwirq); 431 /* Set type if specified and different than the current one */
105 if (type != IRQ_TYPE_NONE) 432 if (type != IRQ_TYPE_NONE &&
106 irq_set_irq_type(irq, type); 433 type != (irqd_get_trigger_type(irq_get_irq_data(virq))))
107 pr_debug("%s: mapped hwirq=%i to irq=%i, flags=%x\n", 434 irq_set_irq_type(virq, type);
108 controller->full_name, (int)hwirq, irq, type); 435 return virq;
109 return irq;
110} 436}
111EXPORT_SYMBOL_GPL(irq_create_of_mapping); 437EXPORT_SYMBOL_GPL(irq_create_of_mapping);
112 438
113/** 439/**
114 * irq_dispose_mapping() - Discard a mapping created by irq_create_of_mapping() 440 * irq_dispose_mapping() - Unmap an interrupt
115 * @irq: linux irq number to be discarded 441 * @virq: linux irq number of the interrupt to unmap
442 */
443void irq_dispose_mapping(unsigned int virq)
444{
445 struct irq_data *irq_data = irq_get_irq_data(virq);
446 struct irq_domain *domain;
447 irq_hw_number_t hwirq;
448
449 if (!virq || !irq_data)
450 return;
451
452 domain = irq_data->domain;
453 if (WARN_ON(domain == NULL))
454 return;
455
456 /* Never unmap legacy interrupts */
457 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
458 return;
459
460 irq_set_status_flags(virq, IRQ_NOREQUEST);
461
462 /* remove chip and handler */
463 irq_set_chip_and_handler(virq, NULL, NULL);
464
465 /* Make sure it's completed */
466 synchronize_irq(virq);
467
468 /* Tell the PIC about it */
469 if (domain->ops->unmap)
470 domain->ops->unmap(domain, virq);
471 smp_mb();
472
473 /* Clear reverse map */
474 hwirq = irq_data->hwirq;
475 switch(domain->revmap_type) {
476 case IRQ_DOMAIN_MAP_LINEAR:
477 if (hwirq < domain->revmap_data.linear.size)
478 domain->revmap_data.linear.revmap[hwirq] = 0;
479 break;
480 case IRQ_DOMAIN_MAP_TREE:
481 mutex_lock(&revmap_trees_mutex);
482 radix_tree_delete(&domain->revmap_data.tree, hwirq);
483 mutex_unlock(&revmap_trees_mutex);
484 break;
485 }
486
487 irq_free_desc(virq);
488}
489EXPORT_SYMBOL_GPL(irq_dispose_mapping);
490
491/**
492 * irq_find_mapping() - Find a linux irq from an hw irq number.
493 * @domain: domain owning this hardware interrupt
494 * @hwirq: hardware irq number in that domain space
495 *
496 * This is a slow path, for use by generic code. It's expected that an
497 * irq controller implementation directly calls the appropriate low level
498 * mapping function.
499 */
500unsigned int irq_find_mapping(struct irq_domain *domain,
501 irq_hw_number_t hwirq)
502{
503 unsigned int i;
504 unsigned int hint = hwirq % nr_irqs;
505
 506 /* Look for default domain if necessary */
507 if (domain == NULL)
508 domain = irq_default_domain;
509 if (domain == NULL)
510 return 0;
511
512 /* legacy -> bail early */
513 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
514 return irq_domain_legacy_revmap(domain, hwirq);
515
516 /* Slow path does a linear search of the map */
517 if (hint == 0)
518 hint = 1;
519 i = hint;
520 do {
521 struct irq_data *data = irq_get_irq_data(i);
522 if (data && (data->domain == domain) && (data->hwirq == hwirq))
523 return i;
524 i++;
525 if (i >= nr_irqs)
526 i = 1;
527 } while(i != hint);
528 return 0;
529}
530EXPORT_SYMBOL_GPL(irq_find_mapping);
531
532/**
533 * irq_radix_revmap_lookup() - Find a linux irq from a hw irq number.
534 * @domain: domain owning this hardware interrupt
535 * @hwirq: hardware irq number in that domain space
116 * 536 *
117 * Calling this function indicates the caller no longer needs a reference to 537 * This is a fast path, for use by irq controller code that uses radix tree
118 * the linux irq number returned by a prior call to irq_create_of_mapping(). 538 * revmaps
119 */ 539 */
120void irq_dispose_mapping(unsigned int irq) 540unsigned int irq_radix_revmap_lookup(struct irq_domain *domain,
541 irq_hw_number_t hwirq)
121{ 542{
543 struct irq_data *irq_data;
544
545 if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_TREE))
546 return irq_find_mapping(domain, hwirq);
547
548 /*
549 * Freeing an irq can delete nodes along the path to
550 * do the lookup via call_rcu.
551 */
552 rcu_read_lock();
553 irq_data = radix_tree_lookup(&domain->revmap_data.tree, hwirq);
554 rcu_read_unlock();
555
122 /* 556 /*
123 * nothing yet; will be filled when support for dynamic allocation of 557 * If found in radix tree, then fine.
124 * irq_descs is added to irq_domain 558 * Else fallback to linear lookup - this should not happen in practice
559 * as it means that we failed to insert the node in the radix tree.
125 */ 560 */
561 return irq_data ? irq_data->irq : irq_find_mapping(domain, hwirq);
126} 562}
127EXPORT_SYMBOL_GPL(irq_dispose_mapping);
128 563
129int irq_domain_simple_dt_translate(struct irq_domain *d, 564/**
130 struct device_node *controller, 565 * irq_radix_revmap_insert() - Insert a hw irq to linux irq number mapping.
131 const u32 *intspec, unsigned int intsize, 566 * @domain: domain owning this hardware interrupt
132 unsigned long *out_hwirq, unsigned int *out_type) 567 * @virq: linux irq number
568 * @hwirq: hardware irq number in that domain space
569 *
570 * This is for use by irq controllers that use a radix tree reverse
571 * mapping for fast lookup.
572 */
573void irq_radix_revmap_insert(struct irq_domain *domain, unsigned int virq,
574 irq_hw_number_t hwirq)
133{ 575{
134 if (d->of_node != controller) 576 struct irq_data *irq_data = irq_get_irq_data(virq);
135 return -EINVAL; 577
136 if (intsize < 1) 578 if (WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_TREE))
137 return -EINVAL; 579 return;
138 if (d->nr_irq && ((intspec[0] < d->hwirq_base) || 580
139 (intspec[0] >= d->hwirq_base + d->nr_irq))) 581 if (virq) {
140 return -EINVAL; 582 mutex_lock(&revmap_trees_mutex);
583 radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data);
584 mutex_unlock(&revmap_trees_mutex);
585 }
586}
587
588/**
589 * irq_linear_revmap() - Find a linux irq from a hw irq number.
590 * @domain: domain owning this hardware interrupt
591 * @hwirq: hardware irq number in that domain space
592 *
593 * This is a fast path, for use by irq controller code that uses linear
594 * revmaps. It does fallback to the slow path if the revmap doesn't exist
595 * yet and will create the revmap entry with appropriate locking
596 */
597unsigned int irq_linear_revmap(struct irq_domain *domain,
598 irq_hw_number_t hwirq)
599{
600 unsigned int *revmap;
601
602 if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR))
603 return irq_find_mapping(domain, hwirq);
604
605 /* Check revmap bounds */
606 if (unlikely(hwirq >= domain->revmap_data.linear.size))
607 return irq_find_mapping(domain, hwirq);
608
609 /* Check if revmap was allocated */
610 revmap = domain->revmap_data.linear.revmap;
611 if (unlikely(revmap == NULL))
612 return irq_find_mapping(domain, hwirq);
613
614 /* Fill up revmap with slow path if no mapping found */
615 if (unlikely(!revmap[hwirq]))
616 revmap[hwirq] = irq_find_mapping(domain, hwirq);
617
618 return revmap[hwirq];
619}
620
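irq_linear_revmap() is intended for hot paths such as a chained handler demultiplexing a bank of interrupts. A sketch of that loop; baz_domain and the pending-status read are hypothetical stand-ins:

#include <linux/bitops.h>
#include <linux/irq.h>
#include <linux/irqdesc.h>
#include <linux/irqdomain.h>

static struct irq_domain *baz_domain;

static unsigned long baz_pending_read(void)
{
	/* Hypothetical: read the controller's pending-status register. */
	return 0;
}

static void baz_demux(unsigned int irq, struct irq_desc *desc)
{
	unsigned long pending = baz_pending_read();
	unsigned int hwirq;

	for_each_set_bit(hwirq, &pending, 32) {
		unsigned int virq = irq_linear_revmap(baz_domain, hwirq);

		if (virq)
			generic_handle_irq(virq);
	}
}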
621#ifdef CONFIG_IRQ_DOMAIN_DEBUG
622static int virq_debug_show(struct seq_file *m, void *private)
623{
624 unsigned long flags;
625 struct irq_desc *desc;
626 const char *p;
627 static const char none[] = "none";
628 void *data;
629 int i;
630
631 seq_printf(m, "%-5s %-7s %-15s %-*s %s\n", "irq", "hwirq",
632 "chip name", (int)(2 * sizeof(void *) + 2), "chip data",
633 "domain name");
141 634
635 for (i = 1; i < nr_irqs; i++) {
636 desc = irq_to_desc(i);
637 if (!desc)
638 continue;
639
640 raw_spin_lock_irqsave(&desc->lock, flags);
641
642 if (desc->action && desc->action->handler) {
643 struct irq_chip *chip;
644
645 seq_printf(m, "%5d ", i);
646 seq_printf(m, "0x%05lx ", desc->irq_data.hwirq);
647
648 chip = irq_desc_get_chip(desc);
649 if (chip && chip->name)
650 p = chip->name;
651 else
652 p = none;
653 seq_printf(m, "%-15s ", p);
654
655 data = irq_desc_get_chip_data(desc);
656 seq_printf(m, data ? "0x%p " : " %p ", data);
657
658 if (desc->irq_data.domain && desc->irq_data.domain->of_node)
659 p = desc->irq_data.domain->of_node->full_name;
660 else
661 p = none;
662 seq_printf(m, "%s\n", p);
663 }
664
665 raw_spin_unlock_irqrestore(&desc->lock, flags);
666 }
667
668 return 0;
669}
670
671static int virq_debug_open(struct inode *inode, struct file *file)
672{
673 return single_open(file, virq_debug_show, inode->i_private);
674}
675
676static const struct file_operations virq_debug_fops = {
677 .open = virq_debug_open,
678 .read = seq_read,
679 .llseek = seq_lseek,
680 .release = single_release,
681};
682
683static int __init irq_debugfs_init(void)
684{
685 if (debugfs_create_file("irq_domain_mapping", S_IRUGO, NULL,
686 NULL, &virq_debug_fops) == NULL)
687 return -ENOMEM;
688
689 return 0;
690}
691__initcall(irq_debugfs_init);
692#endif /* CONFIG_IRQ_DOMAIN_DEBUG */
693
694int irq_domain_simple_map(struct irq_domain *d, unsigned int irq,
695 irq_hw_number_t hwirq)
696{
697 return 0;
698}
699
700/**
701 * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings
702 *
703 * Device Tree IRQ specifier translation function which works with one cell
704 * bindings where the cell value maps directly to the hwirq number.
705 */
706int irq_domain_xlate_onecell(struct irq_domain *d, struct device_node *ctrlr,
707 const u32 *intspec, unsigned int intsize,
708 unsigned long *out_hwirq, unsigned int *out_type)
709{
710 if (WARN_ON(intsize < 1))
711 return -EINVAL;
142 *out_hwirq = intspec[0]; 712 *out_hwirq = intspec[0];
143 *out_type = IRQ_TYPE_NONE; 713 *out_type = IRQ_TYPE_NONE;
144 if (intsize > 1)
145 *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
146 return 0; 714 return 0;
147} 715}
716EXPORT_SYMBOL_GPL(irq_domain_xlate_onecell);
148 717
149/** 718/**
150 * irq_domain_create_simple() - Set up a 'simple' translation range 719 * irq_domain_xlate_twocell() - Generic xlate for direct two cell bindings
720 *
721 * Device Tree IRQ specifier translation function which works with two cell
722 * bindings where the cell values map directly to the hwirq number
723 * and linux irq flags.
151 */ 724 */
152void irq_domain_add_simple(struct device_node *controller, int irq_base) 725int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr,
726 const u32 *intspec, unsigned int intsize,
727 irq_hw_number_t *out_hwirq, unsigned int *out_type)
153{ 728{
154 struct irq_domain *domain; 729 if (WARN_ON(intsize < 2))
155 730 return -EINVAL;
156 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 731 *out_hwirq = intspec[0];
157 if (!domain) { 732 *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
158 WARN_ON(1); 733 return 0;
159 return; 734}
160 } 735EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell);
161 736
162 domain->irq_base = irq_base; 737/**
163 domain->of_node = of_node_get(controller); 738 * irq_domain_xlate_onetwocell() - Generic xlate for one or two cell bindings
164 domain->ops = &irq_domain_simple_ops; 739 *
165 irq_domain_add(domain); 740 * Device Tree IRQ specifier translation function which works with either one
741 * or two cell bindings where the cell values map directly to the hwirq number
742 * and linux irq flags.
743 *
744 * Note: don't use this function unless your interrupt controller explicitly
745 * supports both one and two cell bindings. For the majority of controllers
746 * the _onecell() or _twocell() variants above should be used.
747 */
748int irq_domain_xlate_onetwocell(struct irq_domain *d,
749 struct device_node *ctrlr,
750 const u32 *intspec, unsigned int intsize,
751 unsigned long *out_hwirq, unsigned int *out_type)
752{
753 if (WARN_ON(intsize < 1))
754 return -EINVAL;
755 *out_hwirq = intspec[0];
756 *out_type = (intsize > 1) ? intspec[1] : IRQ_TYPE_NONE;
757 return 0;
166} 758}
167EXPORT_SYMBOL_GPL(irq_domain_add_simple); 759EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell);
168 760
761const struct irq_domain_ops irq_domain_simple_ops = {
762 .map = irq_domain_simple_map,
763 .xlate = irq_domain_xlate_onetwocell,
764};
765EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
766
767#ifdef CONFIG_OF_IRQ
169void irq_domain_generate_simple(const struct of_device_id *match, 768void irq_domain_generate_simple(const struct of_device_id *match,
170 u64 phys_base, unsigned int irq_start) 769 u64 phys_base, unsigned int irq_start)
171{ 770{
172 struct device_node *node; 771 struct device_node *node;
173 pr_info("looking for phys_base=%llx, irq_start=%i\n", 772 pr_debug("looking for phys_base=%llx, irq_start=%i\n",
174 (unsigned long long) phys_base, (int) irq_start); 773 (unsigned long long) phys_base, (int) irq_start);
175 node = of_find_matching_node_by_address(NULL, match, phys_base); 774 node = of_find_matching_node_by_address(NULL, match, phys_base);
176 if (node) 775 if (node)
177 irq_domain_add_simple(node, irq_start); 776 irq_domain_add_legacy(node, 32, irq_start, 0,
178 else 777 &irq_domain_simple_ops, NULL);
179 pr_info("no node found\n");
180} 778}
181EXPORT_SYMBOL_GPL(irq_domain_generate_simple); 779EXPORT_SYMBOL_GPL(irq_domain_generate_simple);
182#endif /* CONFIG_OF_IRQ */ 780#endif
183
184struct irq_domain_ops irq_domain_simple_ops = {
185#ifdef CONFIG_OF_IRQ
186 .dt_translate = irq_domain_simple_dt_translate,
187#endif /* CONFIG_OF_IRQ */
188};
189EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
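On the consumer side none of this is visible: a driver typically resolves its device-tree interrupt with irq_of_parse_and_map(), which ends up in irq_create_of_mapping() above. A sketch with an invented qux_ prefix:

#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/of.h>
#include <linux/of_irq.h>

static int qux_setup_irq(struct device_node *np, irq_handler_t handler,
			 void *dev)
{
	unsigned int virq = irq_of_parse_and_map(np, 0);

	if (!virq)
		return -EINVAL;
	return request_irq(virq, handler, 0, "qux", dev);
}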
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index b0ccd1ac2d6a..89a3ea82569b 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -282,7 +282,7 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
282{ 282{
283 struct irq_chip *chip = irq_desc_get_chip(desc); 283 struct irq_chip *chip = irq_desc_get_chip(desc);
284 struct cpumask *set = irq_default_affinity; 284 struct cpumask *set = irq_default_affinity;
285 int ret; 285 int ret, node = desc->irq_data.node;
286 286
287 /* Excludes PER_CPU and NO_BALANCE interrupts */ 287 /* Excludes PER_CPU and NO_BALANCE interrupts */
288 if (!irq_can_set_affinity(irq)) 288 if (!irq_can_set_affinity(irq))
@@ -301,6 +301,13 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
301 } 301 }
302 302
303 cpumask_and(mask, cpu_online_mask, set); 303 cpumask_and(mask, cpu_online_mask, set);
304 if (node != NUMA_NO_NODE) {
305 const struct cpumask *nodemask = cpumask_of_node(node);
306
307 /* make sure at least one of the cpus in nodemask is online */
308 if (cpumask_intersects(mask, nodemask))
309 cpumask_and(mask, mask, nodemask);
310 }
304 ret = chip->irq_set_affinity(&desc->irq_data, mask, false); 311 ret = chip->irq_set_affinity(&desc->irq_data, mask, false);
305 switch (ret) { 312 switch (ret) {
306 case IRQ_SET_MASK_OK: 313 case IRQ_SET_MASK_OK:
@@ -645,7 +652,7 @@ static int irq_wait_for_interrupt(struct irqaction *action)
645 * is marked MASKED. 652 * is marked MASKED.
646 */ 653 */
647static void irq_finalize_oneshot(struct irq_desc *desc, 654static void irq_finalize_oneshot(struct irq_desc *desc,
648 struct irqaction *action, bool force) 655 struct irqaction *action)
649{ 656{
650 if (!(desc->istate & IRQS_ONESHOT)) 657 if (!(desc->istate & IRQS_ONESHOT))
651 return; 658 return;
@@ -679,7 +686,7 @@ again:
679 * we would clear the threads_oneshot bit of this thread which 686 * we would clear the threads_oneshot bit of this thread which
680 * was just set. 687 * was just set.
681 */ 688 */
682 if (!force && test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) 689 if (test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
683 goto out_unlock; 690 goto out_unlock;
684 691
685 desc->threads_oneshot &= ~action->thread_mask; 692 desc->threads_oneshot &= ~action->thread_mask;
@@ -739,7 +746,7 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
739 746
740 local_bh_disable(); 747 local_bh_disable();
741 ret = action->thread_fn(action->irq, action->dev_id); 748 ret = action->thread_fn(action->irq, action->dev_id);
742 irq_finalize_oneshot(desc, action, false); 749 irq_finalize_oneshot(desc, action);
743 local_bh_enable(); 750 local_bh_enable();
744 return ret; 751 return ret;
745} 752}
@@ -755,7 +762,7 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc,
755 irqreturn_t ret; 762 irqreturn_t ret;
756 763
757 ret = action->thread_fn(action->irq, action->dev_id); 764 ret = action->thread_fn(action->irq, action->dev_id);
758 irq_finalize_oneshot(desc, action, false); 765 irq_finalize_oneshot(desc, action);
759 return ret; 766 return ret;
760} 767}
761 768
@@ -844,7 +851,7 @@ void exit_irq_thread(void)
844 wake_threads_waitq(desc); 851 wake_threads_waitq(desc);
845 852
846 /* Prevent a stale desc->threads_oneshot */ 853 /* Prevent a stale desc->threads_oneshot */
847 irq_finalize_oneshot(desc, action, true); 854 irq_finalize_oneshot(desc, action);
848} 855}
849 856
850static void irq_setup_forced_threading(struct irqaction *new) 857static void irq_setup_forced_threading(struct irqaction *new)
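The oneshot changes above concern threaded handlers registered with IRQF_ONESHOT, where the line stays masked until the thread function finishes and irq_finalize_oneshot() unmasks it. A typical registration, with invented quux_* names:

#include <linux/interrupt.h>

static irqreturn_t quux_hardirq(int irq, void *dev)
{
	/* Acknowledge and defer; the line remains masked (ONESHOT). */
	return IRQ_WAKE_THREAD;
}

static irqreturn_t quux_thread(int irq, void *dev)
{
	/* Sleepable work (e.g. bus transfers) happens here. */
	return IRQ_HANDLED;
}

static int quux_request(unsigned int irq, void *dev)
{
	return request_threaded_irq(irq, quux_hardirq, quux_thread,
				    IRQF_ONESHOT, "quux", dev);
}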
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 47420908fba0..c3c89751b327 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -43,12 +43,16 @@ void irq_move_masked_irq(struct irq_data *idata)
43 * masking the irqs. 43 * masking the irqs.
44 */ 44 */
45 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) 45 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
46 < nr_cpu_ids)) 46 < nr_cpu_ids)) {
47 if (!chip->irq_set_affinity(&desc->irq_data, 47 int ret = chip->irq_set_affinity(&desc->irq_data,
48 desc->pending_mask, false)) { 48 desc->pending_mask, false);
49 switch (ret) {
50 case IRQ_SET_MASK_OK:
49 cpumask_copy(desc->irq_data.affinity, desc->pending_mask); 51 cpumask_copy(desc->irq_data.affinity, desc->pending_mask);
52 case IRQ_SET_MASK_OK_NOCOPY:
50 irq_set_thread_affinity(desc); 53 irq_set_thread_affinity(desc);
51 } 54 }
55 }
52 56
53 cpumask_clear(desc->pending_mask); 57 cpumask_clear(desc->pending_mask);
54} 58}
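The hunk makes the migration path honour the same return-code convention as setup_affinity(): IRQ_SET_MASK_OK means the core should copy the mask into irq_data->affinity, IRQ_SET_MASK_OK_NOCOPY means the chip already did. A sketch of a chip callback using the convention; corge_write_route() is a placeholder for the hardware programming:

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/irq.h>

static void corge_write_route(irq_hw_number_t hwirq, unsigned int cpu)
{
	/* Hypothetical: program the controller's routing register. */
}

static int corge_set_affinity(struct irq_data *d, const struct cpumask *mask,
			      bool force)
{
	unsigned int cpu = cpumask_first(mask);

	if (cpu >= nr_cpu_ids)
		return -EINVAL;

	corge_write_route(d->hwirq, cpu);

	/* Return IRQ_SET_MASK_OK_NOCOPY instead if this callback already
	 * updated d->affinity itself. */
	return IRQ_SET_MASK_OK;
}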
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index c3c46c72046e..1588e3b2871b 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -5,11 +5,13 @@
5 * context. The enqueueing is NMI-safe. 5 * context. The enqueueing is NMI-safe.
6 */ 6 */
7 7
8#include <linux/bug.h>
8#include <linux/kernel.h> 9#include <linux/kernel.h>
9#include <linux/export.h> 10#include <linux/export.h>
10#include <linux/irq_work.h> 11#include <linux/irq_work.h>
11#include <linux/percpu.h> 12#include <linux/percpu.h>
12#include <linux/hardirq.h> 13#include <linux/hardirq.h>
14#include <linux/irqflags.h>
13#include <asm/processor.h> 15#include <asm/processor.h>
14 16
15/* 17/*
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 22000c3db0dd..8d262b467573 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -284,8 +284,12 @@ SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value,
284 if (value) { 284 if (value) {
285 if(copy_from_user(&set_buffer, value, sizeof(set_buffer))) 285 if(copy_from_user(&set_buffer, value, sizeof(set_buffer)))
286 return -EFAULT; 286 return -EFAULT;
287 } else 287 } else {
288 memset((char *) &set_buffer, 0, sizeof(set_buffer)); 288 memset(&set_buffer, 0, sizeof(set_buffer));
289 printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer."
290 " Misfeature support will be removed\n",
291 current->comm);
292 }
289 293
290 error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL); 294 error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL);
291 if (error || !ovalue) 295 if (error || !ovalue)
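The new printk_once flags callers that pass a NULL new_value to setitimer(); the supported convention is to always pass a real struct itimerval (and, if the old value is wanted, a separate ovalue pointer). Userspace illustration:

#include <stdio.h>
#include <sys/time.h>

int main(void)
{
	struct itimerval val = {
		.it_interval	= { .tv_sec = 1, .tv_usec = 0 },
		.it_value	= { .tv_sec = 1, .tv_usec = 0 },
	};

	if (setitimer(ITIMER_REAL, &val, NULL) != 0) {
		perror("setitimer");
		return 1;
	}
	/* ... install a SIGALRM handler and do the real work ... */
	return 0;
}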
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 7b0886786701..4e2e472f6aeb 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -37,7 +37,6 @@
37#include <asm/page.h> 37#include <asm/page.h>
38#include <asm/uaccess.h> 38#include <asm/uaccess.h>
39#include <asm/io.h> 39#include <asm/io.h>
40#include <asm/system.h>
41#include <asm/sections.h> 40#include <asm/sections.h>
42 41
43/* Per cpu memory for storing cpu states in case of system crash. */ 42/* Per cpu memory for storing cpu states in case of system crash. */
@@ -1359,6 +1358,10 @@ static int __init parse_crashkernel_simple(char *cmdline,
1359 1358
1360 if (*cur == '@') 1359 if (*cur == '@')
1361 *crash_base = memparse(cur+1, &cur); 1360 *crash_base = memparse(cur+1, &cur);
1361 else if (*cur != ' ' && *cur != '\0') {
1362 pr_warning("crashkernel: unrecognized char\n");
1363 return -EINVAL;
1364 }
1362 1365
1363 return 0; 1366 return 0;
1364} 1367}
@@ -1462,7 +1465,9 @@ static int __init crash_save_vmcoreinfo_init(void)
1462 1465
1463 VMCOREINFO_SYMBOL(init_uts_ns); 1466 VMCOREINFO_SYMBOL(init_uts_ns);
1464 VMCOREINFO_SYMBOL(node_online_map); 1467 VMCOREINFO_SYMBOL(node_online_map);
1468#ifdef CONFIG_MMU
1465 VMCOREINFO_SYMBOL(swapper_pg_dir); 1469 VMCOREINFO_SYMBOL(swapper_pg_dir);
1470#endif
1466 VMCOREINFO_SYMBOL(_stext); 1471 VMCOREINFO_SYMBOL(_stext);
1467 VMCOREINFO_SYMBOL(vmlist); 1472 VMCOREINFO_SYMBOL(vmlist);
1468 1473
@@ -1546,13 +1551,13 @@ int kernel_kexec(void)
1546 if (error) 1551 if (error)
1547 goto Resume_console; 1552 goto Resume_console;
1548 /* At this point, dpm_suspend_start() has been called, 1553 /* At this point, dpm_suspend_start() has been called,
1549 * but *not* dpm_suspend_noirq(). We *must* call 1554 * but *not* dpm_suspend_end(). We *must* call
1550 * dpm_suspend_noirq() now. Otherwise, drivers for 1555 * dpm_suspend_end() now. Otherwise, drivers for
1551 * some devices (e.g. interrupt controllers) become 1556 * some devices (e.g. interrupt controllers) become
1552 * desynchronized with the actual state of the 1557 * desynchronized with the actual state of the
1553 * hardware at resume time, and evil weirdness ensues. 1558 * hardware at resume time, and evil weirdness ensues.
1554 */ 1559 */
1555 error = dpm_suspend_noirq(PMSG_FREEZE); 1560 error = dpm_suspend_end(PMSG_FREEZE);
1556 if (error) 1561 if (error)
1557 goto Resume_devices; 1562 goto Resume_devices;
1558 error = disable_nonboot_cpus(); 1563 error = disable_nonboot_cpus();
@@ -1579,7 +1584,7 @@ int kernel_kexec(void)
1579 local_irq_enable(); 1584 local_irq_enable();
1580 Enable_cpus: 1585 Enable_cpus:
1581 enable_nonboot_cpus(); 1586 enable_nonboot_cpus();
1582 dpm_resume_noirq(PMSG_RESTORE); 1587 dpm_resume_start(PMSG_RESTORE);
1583 Resume_devices: 1588 Resume_devices:
1584 dpm_resume_end(PMSG_RESTORE); 1589 dpm_resume_end(PMSG_RESTORE);
1585 Resume_console: 1590 Resume_console:
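parse_crashkernel_simple() now rejects trailing garbage after the size (and optional @offset). A rough userspace approximation of the accepted "size[KMG][@offset[KMG]]" form, useful for experimenting with command lines; it only mimics the kernel's memparse()-based logic:

#include <stdio.h>
#include <stdlib.h>

static unsigned long long parse_size(const char *s, char **end)
{
	unsigned long long v = strtoull(s, end, 0);

	switch (**end) {
	case 'K': case 'k': v <<= 10; (*end)++; break;
	case 'M': case 'm': v <<= 20; (*end)++; break;
	case 'G': case 'g': v <<= 30; (*end)++; break;
	}
	return v;
}

int main(void)
{
	char arg[] = "128M@16M";		/* e.g. crashkernel=128M@16M */
	char *cur = arg;
	unsigned long long size, base = 0;

	size = parse_size(cur, &cur);
	if (*cur == '@')
		base = parse_size(cur + 1, &cur);
	else if (*cur != ' ' && *cur != '\0') {
		fprintf(stderr, "crashkernel: unrecognized char\n");
		return 1;
	}
	printf("size=%llu base=%llu\n", size, base);
	return 0;
}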
diff --git a/kernel/kmod.c b/kernel/kmod.c
index a0a88543934e..05698a7415fe 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -60,6 +60,43 @@ static DECLARE_RWSEM(umhelper_sem);
60*/ 60*/
61char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; 61char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
62 62
63static void free_modprobe_argv(struct subprocess_info *info)
64{
65 kfree(info->argv[3]); /* check call_modprobe() */
66 kfree(info->argv);
67}
68
69static int call_modprobe(char *module_name, int wait)
70{
71 static char *envp[] = {
72 "HOME=/",
73 "TERM=linux",
74 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
75 NULL
76 };
77
78 char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL);
79 if (!argv)
80 goto out;
81
82 module_name = kstrdup(module_name, GFP_KERNEL);
83 if (!module_name)
84 goto free_argv;
85
86 argv[0] = modprobe_path;
87 argv[1] = "-q";
88 argv[2] = "--";
89 argv[3] = module_name; /* check free_modprobe_argv() */
90 argv[4] = NULL;
91
92 return call_usermodehelper_fns(modprobe_path, argv, envp,
93 wait | UMH_KILLABLE, NULL, free_modprobe_argv, NULL);
94free_argv:
95 kfree(argv);
96out:
97 return -ENOMEM;
98}
99
63/** 100/**
64 * __request_module - try to load a kernel module 101 * __request_module - try to load a kernel module
65 * @wait: wait (or not) for the operation to complete 102 * @wait: wait (or not) for the operation to complete
@@ -81,11 +118,6 @@ int __request_module(bool wait, const char *fmt, ...)
81 char module_name[MODULE_NAME_LEN]; 118 char module_name[MODULE_NAME_LEN];
82 unsigned int max_modprobes; 119 unsigned int max_modprobes;
83 int ret; 120 int ret;
84 char *argv[] = { modprobe_path, "-q", "--", module_name, NULL };
85 static char *envp[] = { "HOME=/",
86 "TERM=linux",
87 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
88 NULL };
89 static atomic_t kmod_concurrent = ATOMIC_INIT(0); 121 static atomic_t kmod_concurrent = ATOMIC_INIT(0);
90#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ 122#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
91 static int kmod_loop_msg; 123 static int kmod_loop_msg;
@@ -128,9 +160,7 @@ int __request_module(bool wait, const char *fmt, ...)
128 160
129 trace_module_request(module_name, wait, _RET_IP_); 161 trace_module_request(module_name, wait, _RET_IP_);
130 162
131 ret = call_usermodehelper_fns(modprobe_path, argv, envp, 163 ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
132 wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC,
133 NULL, NULL, NULL);
134 164
135 atomic_dec(&kmod_concurrent); 165 atomic_dec(&kmod_concurrent);
136 return ret; 166 return ret;
@@ -188,7 +218,7 @@ static int ____call_usermodehelper(void *data)
188 /* Exec failed? */ 218 /* Exec failed? */
189fail: 219fail:
190 sub_info->retval = retval; 220 sub_info->retval = retval;
191 do_exit(0); 221 return 0;
192} 222}
193 223
194void call_usermodehelper_freeinfo(struct subprocess_info *info) 224void call_usermodehelper_freeinfo(struct subprocess_info *info)
@@ -199,6 +229,19 @@ void call_usermodehelper_freeinfo(struct subprocess_info *info)
199} 229}
200EXPORT_SYMBOL(call_usermodehelper_freeinfo); 230EXPORT_SYMBOL(call_usermodehelper_freeinfo);
201 231
232static void umh_complete(struct subprocess_info *sub_info)
233{
234 struct completion *comp = xchg(&sub_info->complete, NULL);
235 /*
236 * See call_usermodehelper_exec(). If xchg() returns NULL
237 * we own sub_info, the UMH_KILLABLE caller has gone away.
238 */
239 if (comp)
240 complete(comp);
241 else
242 call_usermodehelper_freeinfo(sub_info);
243}
244
202/* Keventd can't block, but this (a child) can. */ 245/* Keventd can't block, but this (a child) can. */
203static int wait_for_helper(void *data) 246static int wait_for_helper(void *data)
204{ 247{
@@ -235,7 +278,7 @@ static int wait_for_helper(void *data)
235 sub_info->retval = ret; 278 sub_info->retval = ret;
236 } 279 }
237 280
238 complete(sub_info->complete); 281 umh_complete(sub_info);
239 return 0; 282 return 0;
240} 283}
241 284
@@ -244,7 +287,7 @@ static void __call_usermodehelper(struct work_struct *work)
244{ 287{
245 struct subprocess_info *sub_info = 288 struct subprocess_info *sub_info =
246 container_of(work, struct subprocess_info, work); 289 container_of(work, struct subprocess_info, work);
247 enum umh_wait wait = sub_info->wait; 290 int wait = sub_info->wait & ~UMH_KILLABLE;
248 pid_t pid; 291 pid_t pid;
249 292
250 /* CLONE_VFORK: wait until the usermode helper has execve'd 293 /* CLONE_VFORK: wait until the usermode helper has execve'd
@@ -269,7 +312,7 @@ static void __call_usermodehelper(struct work_struct *work)
269 case UMH_WAIT_EXEC: 312 case UMH_WAIT_EXEC:
270 if (pid < 0) 313 if (pid < 0)
271 sub_info->retval = pid; 314 sub_info->retval = pid;
272 complete(sub_info->complete); 315 umh_complete(sub_info);
273 } 316 }
274} 317}
275 318
@@ -279,7 +322,7 @@ static void __call_usermodehelper(struct work_struct *work)
279 * land has been frozen during a system-wide hibernation or suspend operation). 322 * land has been frozen during a system-wide hibernation or suspend operation).
280 * Should always be manipulated under umhelper_sem acquired for write. 323 * Should always be manipulated under umhelper_sem acquired for write.
281 */ 324 */
282static int usermodehelper_disabled = 1; 325static enum umh_disable_depth usermodehelper_disabled = UMH_DISABLED;
283 326
284/* Number of helpers running */ 327/* Number of helpers running */
285static atomic_t running_helpers = ATOMIC_INIT(0); 328static atomic_t running_helpers = ATOMIC_INIT(0);
@@ -291,32 +334,110 @@ static atomic_t running_helpers = ATOMIC_INIT(0);
291static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); 334static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
292 335
293/* 336/*
337 * Used by usermodehelper_read_lock_wait() to wait for usermodehelper_disabled
338 * to become 'false'.
339 */
340static DECLARE_WAIT_QUEUE_HEAD(usermodehelper_disabled_waitq);
341
342/*
294 * Time to wait for running_helpers to become zero before the setting of 343 * Time to wait for running_helpers to become zero before the setting of
295 * usermodehelper_disabled in usermodehelper_disable() fails 344 * usermodehelper_disabled in usermodehelper_disable() fails
296 */ 345 */
297#define RUNNING_HELPERS_TIMEOUT (5 * HZ) 346#define RUNNING_HELPERS_TIMEOUT (5 * HZ)
298 347
299void read_lock_usermodehelper(void) 348int usermodehelper_read_trylock(void)
349{
350 DEFINE_WAIT(wait);
351 int ret = 0;
352
353 down_read(&umhelper_sem);
354 for (;;) {
355 prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
356 TASK_INTERRUPTIBLE);
357 if (!usermodehelper_disabled)
358 break;
359
360 if (usermodehelper_disabled == UMH_DISABLED)
361 ret = -EAGAIN;
362
363 up_read(&umhelper_sem);
364
365 if (ret)
366 break;
367
368 schedule();
369 try_to_freeze();
370
371 down_read(&umhelper_sem);
372 }
373 finish_wait(&usermodehelper_disabled_waitq, &wait);
374 return ret;
375}
376EXPORT_SYMBOL_GPL(usermodehelper_read_trylock);
377
378long usermodehelper_read_lock_wait(long timeout)
300{ 379{
380 DEFINE_WAIT(wait);
381
382 if (timeout < 0)
383 return -EINVAL;
384
301 down_read(&umhelper_sem); 385 down_read(&umhelper_sem);
386 for (;;) {
387 prepare_to_wait(&usermodehelper_disabled_waitq, &wait,
388 TASK_UNINTERRUPTIBLE);
389 if (!usermodehelper_disabled)
390 break;
391
392 up_read(&umhelper_sem);
393
394 timeout = schedule_timeout(timeout);
395 if (!timeout)
396 break;
397
398 down_read(&umhelper_sem);
399 }
400 finish_wait(&usermodehelper_disabled_waitq, &wait);
401 return timeout;
302} 402}
303EXPORT_SYMBOL_GPL(read_lock_usermodehelper); 403EXPORT_SYMBOL_GPL(usermodehelper_read_lock_wait);
304 404
305void read_unlock_usermodehelper(void) 405void usermodehelper_read_unlock(void)
306{ 406{
307 up_read(&umhelper_sem); 407 up_read(&umhelper_sem);
308} 408}
309EXPORT_SYMBOL_GPL(read_unlock_usermodehelper); 409EXPORT_SYMBOL_GPL(usermodehelper_read_unlock);
310 410
311/** 411/**
312 * usermodehelper_disable - prevent new helpers from being started 412 * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled.
413 * depth: New value to assign to usermodehelper_disabled.
414 *
415 * Change the value of usermodehelper_disabled (under umhelper_sem locked for
416 * writing) and wakeup tasks waiting for it to change.
313 */ 417 */
314int usermodehelper_disable(void) 418void __usermodehelper_set_disable_depth(enum umh_disable_depth depth)
419{
420 down_write(&umhelper_sem);
421 usermodehelper_disabled = depth;
422 wake_up(&usermodehelper_disabled_waitq);
423 up_write(&umhelper_sem);
424}
425
426/**
427 * __usermodehelper_disable - Prevent new helpers from being started.
428 * @depth: New value to assign to usermodehelper_disabled.
429 *
430 * Set usermodehelper_disabled to @depth and wait for running helpers to exit.
431 */
432int __usermodehelper_disable(enum umh_disable_depth depth)
315{ 433{
316 long retval; 434 long retval;
317 435
436 if (!depth)
437 return -EINVAL;
438
318 down_write(&umhelper_sem); 439 down_write(&umhelper_sem);
319 usermodehelper_disabled = 1; 440 usermodehelper_disabled = depth;
320 up_write(&umhelper_sem); 441 up_write(&umhelper_sem);
321 442
322 /* 443 /*
@@ -331,31 +452,10 @@ int usermodehelper_disable(void)
331 if (retval) 452 if (retval)
332 return 0; 453 return 0;
333 454
334 down_write(&umhelper_sem); 455 __usermodehelper_set_disable_depth(UMH_ENABLED);
335 usermodehelper_disabled = 0;
336 up_write(&umhelper_sem);
337 return -EAGAIN; 456 return -EAGAIN;
338} 457}
339 458
340/**
341 * usermodehelper_enable - allow new helpers to be started again
342 */
343void usermodehelper_enable(void)
344{
345 down_write(&umhelper_sem);
346 usermodehelper_disabled = 0;
347 up_write(&umhelper_sem);
348}
349
350/**
351 * usermodehelper_is_disabled - check if new helpers are allowed to be started
352 */
353bool usermodehelper_is_disabled(void)
354{
355 return usermodehelper_disabled;
356}
357EXPORT_SYMBOL_GPL(usermodehelper_is_disabled);
358
359static void helper_lock(void) 459static void helper_lock(void)
360{ 460{
361 atomic_inc(&running_helpers); 461 atomic_inc(&running_helpers);
@@ -435,8 +535,7 @@ EXPORT_SYMBOL(call_usermodehelper_setfns);
435 * asynchronously if wait is not set, and runs as a child of keventd. 535 * asynchronously if wait is not set, and runs as a child of keventd.
436 * (ie. it runs with full root capabilities). 536 * (ie. it runs with full root capabilities).
437 */ 537 */
438int call_usermodehelper_exec(struct subprocess_info *sub_info, 538int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
439 enum umh_wait wait)
440{ 539{
441 DECLARE_COMPLETION_ONSTACK(done); 540 DECLARE_COMPLETION_ONSTACK(done);
442 int retval = 0; 541 int retval = 0;
@@ -456,9 +555,21 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
456 queue_work(khelper_wq, &sub_info->work); 555 queue_work(khelper_wq, &sub_info->work);
457 if (wait == UMH_NO_WAIT) /* task has freed sub_info */ 556 if (wait == UMH_NO_WAIT) /* task has freed sub_info */
458 goto unlock; 557 goto unlock;
558
559 if (wait & UMH_KILLABLE) {
560 retval = wait_for_completion_killable(&done);
561 if (!retval)
562 goto wait_done;
563
564 /* umh_complete() will see NULL and free sub_info */
565 if (xchg(&sub_info->complete, NULL))
566 goto unlock;
567 /* fallthrough, umh_complete() was already called */
568 }
569
459 wait_for_completion(&done); 570 wait_for_completion(&done);
571wait_done:
460 retval = sub_info->retval; 572 retval = sub_info->retval;
461
462out: 573out:
463 call_usermodehelper_freeinfo(sub_info); 574 call_usermodehelper_freeinfo(sub_info);
464unlock: 575unlock:
diff --git a/kernel/module.c b/kernel/module.c
index 2c932760fd33..78ac6ec1e425 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -105,6 +105,7 @@ struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
105 105
106/* Block module loading/unloading? */ 106/* Block module loading/unloading? */
107int modules_disabled = 0; 107int modules_disabled = 0;
108core_param(nomodule, modules_disabled, bint, 0);
108 109
109/* Waiting for a module to finish initializing? */ 110/* Waiting for a module to finish initializing? */
110static DECLARE_WAIT_QUEUE_HEAD(module_wq); 111static DECLARE_WAIT_QUEUE_HEAD(module_wq);
@@ -903,6 +904,36 @@ static ssize_t show_refcnt(struct module_attribute *mattr,
903static struct module_attribute modinfo_refcnt = 904static struct module_attribute modinfo_refcnt =
904 __ATTR(refcnt, 0444, show_refcnt, NULL); 905 __ATTR(refcnt, 0444, show_refcnt, NULL);
905 906
907void __module_get(struct module *module)
908{
909 if (module) {
910 preempt_disable();
911 __this_cpu_inc(module->refptr->incs);
912 trace_module_get(module, _RET_IP_);
913 preempt_enable();
914 }
915}
916EXPORT_SYMBOL(__module_get);
917
918bool try_module_get(struct module *module)
919{
920 bool ret = true;
921
922 if (module) {
923 preempt_disable();
924
925 if (likely(module_is_live(module))) {
926 __this_cpu_inc(module->refptr->incs);
927 trace_module_get(module, _RET_IP_);
928 } else
929 ret = false;
930
931 preempt_enable();
932 }
933 return ret;
934}
935EXPORT_SYMBOL(try_module_get);
936
906void module_put(struct module *module) 937void module_put(struct module *module)
907{ 938{
908 if (module) { 939 if (module) {
@@ -2380,8 +2411,7 @@ static int copy_and_check(struct load_info *info,
2380 return -ENOEXEC; 2411 return -ENOEXEC;
2381 2412
2382 /* Suck in entire file: we'll want most of it. */ 2413 /* Suck in entire file: we'll want most of it. */
2383 /* vmalloc barfs on "unusual" numbers. Check here */ 2414 if ((hdr = vmalloc(len)) == NULL)
2384 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
2385 return -ENOMEM; 2415 return -ENOMEM;
2386 2416
2387 if (copy_from_user(hdr, umod, len) != 0) { 2417 if (copy_from_user(hdr, umod, len) != 0) {
@@ -2922,7 +2952,8 @@ static struct module *load_module(void __user *umod,
2922 mutex_unlock(&module_mutex); 2952 mutex_unlock(&module_mutex);
2923 2953
2924 /* Module is ready to execute: parsing args may do that. */ 2954 /* Module is ready to execute: parsing args may do that. */
2925 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); 2955 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
2956 -32768, 32767, NULL);
2926 if (err < 0) 2957 if (err < 0)
2927 goto unlink; 2958 goto unlink;
2928 2959
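With __module_get() and try_module_get() now out of line, the usage pattern is unchanged: pin the owner before calling into it, drop the reference afterwards. The garply_use() wrapper is illustrative; 'owner' would usually come from some ops structure's .owner field.

#include <linux/errno.h>
#include <linux/module.h>

static int garply_use(struct module *owner)
{
	if (!try_module_get(owner))
		return -ENODEV;		/* module is on its way out */

	/* ... safe to call into the module here ... */

	module_put(owner);
	return 0;
}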
diff --git a/kernel/padata.c b/kernel/padata.c
index b45259931512..89fe3d1b9efb 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -1,6 +1,8 @@
1/* 1/*
2 * padata.c - generic interface to process data streams in parallel 2 * padata.c - generic interface to process data streams in parallel
3 * 3 *
 4 * See Documentation/padata.txt for API documentation.
5 *
4 * Copyright (C) 2008, 2009 secunet Security Networks AG 6 * Copyright (C) 2008, 2009 secunet Security Networks AG
5 * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com> 7 * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com>
6 * 8 *
@@ -29,7 +31,6 @@
29#include <linux/sysfs.h> 31#include <linux/sysfs.h>
30#include <linux/rcupdate.h> 32#include <linux/rcupdate.h>
31 33
32#define MAX_SEQ_NR (INT_MAX - NR_CPUS)
33#define MAX_OBJ_NUM 1000 34#define MAX_OBJ_NUM 1000
34 35
35static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) 36static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
@@ -43,18 +44,19 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
43 return target_cpu; 44 return target_cpu;
44} 45}
45 46
46static int padata_cpu_hash(struct padata_priv *padata) 47static int padata_cpu_hash(struct parallel_data *pd)
47{ 48{
48 int cpu_index; 49 int cpu_index;
49 struct parallel_data *pd;
50
51 pd = padata->pd;
52 50
53 /* 51 /*
54 * Hash the sequence numbers to the cpus by taking 52 * Hash the sequence numbers to the cpus by taking
55 * seq_nr mod. number of cpus in use. 53 * seq_nr mod. number of cpus in use.
56 */ 54 */
57 cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask.pcpu); 55
56 spin_lock(&pd->seq_lock);
57 cpu_index = pd->seq_nr % cpumask_weight(pd->cpumask.pcpu);
58 pd->seq_nr++;
59 spin_unlock(&pd->seq_lock);
58 60
59 return padata_index_to_cpu(pd, cpu_index); 61 return padata_index_to_cpu(pd, cpu_index);
60} 62}
@@ -132,12 +134,7 @@ int padata_do_parallel(struct padata_instance *pinst,
132 padata->pd = pd; 134 padata->pd = pd;
133 padata->cb_cpu = cb_cpu; 135 padata->cb_cpu = cb_cpu;
134 136
135 if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr)) 137 target_cpu = padata_cpu_hash(pd);
136 atomic_set(&pd->seq_nr, -1);
137
138 padata->seq_nr = atomic_inc_return(&pd->seq_nr);
139
140 target_cpu = padata_cpu_hash(padata);
141 queue = per_cpu_ptr(pd->pqueue, target_cpu); 138 queue = per_cpu_ptr(pd->pqueue, target_cpu);
142 139
143 spin_lock(&queue->parallel.lock); 140 spin_lock(&queue->parallel.lock);
@@ -173,7 +170,7 @@ EXPORT_SYMBOL(padata_do_parallel);
173static struct padata_priv *padata_get_next(struct parallel_data *pd) 170static struct padata_priv *padata_get_next(struct parallel_data *pd)
174{ 171{
175 int cpu, num_cpus; 172 int cpu, num_cpus;
176 int next_nr, next_index; 173 unsigned int next_nr, next_index;
177 struct padata_parallel_queue *queue, *next_queue; 174 struct padata_parallel_queue *queue, *next_queue;
178 struct padata_priv *padata; 175 struct padata_priv *padata;
179 struct padata_list *reorder; 176 struct padata_list *reorder;
@@ -189,14 +186,6 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
189 cpu = padata_index_to_cpu(pd, next_index); 186 cpu = padata_index_to_cpu(pd, next_index);
190 next_queue = per_cpu_ptr(pd->pqueue, cpu); 187 next_queue = per_cpu_ptr(pd->pqueue, cpu);
191 188
192 if (unlikely(next_nr > pd->max_seq_nr)) {
193 next_nr = next_nr - pd->max_seq_nr - 1;
194 next_index = next_nr % num_cpus;
195 cpu = padata_index_to_cpu(pd, next_index);
196 next_queue = per_cpu_ptr(pd->pqueue, cpu);
197 pd->processed = 0;
198 }
199
200 padata = NULL; 189 padata = NULL;
201 190
202 reorder = &next_queue->reorder; 191 reorder = &next_queue->reorder;
@@ -205,8 +194,6 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
205 padata = list_entry(reorder->list.next, 194 padata = list_entry(reorder->list.next,
206 struct padata_priv, list); 195 struct padata_priv, list);
207 196
208 BUG_ON(next_nr != padata->seq_nr);
209
210 spin_lock(&reorder->lock); 197 spin_lock(&reorder->lock);
211 list_del_init(&padata->list); 198 list_del_init(&padata->list);
212 atomic_dec(&pd->reorder_objects); 199 atomic_dec(&pd->reorder_objects);
@@ -230,6 +217,7 @@ out:
230 217
231static void padata_reorder(struct parallel_data *pd) 218static void padata_reorder(struct parallel_data *pd)
232{ 219{
220 int cb_cpu;
233 struct padata_priv *padata; 221 struct padata_priv *padata;
234 struct padata_serial_queue *squeue; 222 struct padata_serial_queue *squeue;
235 struct padata_instance *pinst = pd->pinst; 223 struct padata_instance *pinst = pd->pinst;
@@ -270,13 +258,14 @@ static void padata_reorder(struct parallel_data *pd)
270 return; 258 return;
271 } 259 }
272 260
273 squeue = per_cpu_ptr(pd->squeue, padata->cb_cpu); 261 cb_cpu = padata->cb_cpu;
262 squeue = per_cpu_ptr(pd->squeue, cb_cpu);
274 263
275 spin_lock(&squeue->serial.lock); 264 spin_lock(&squeue->serial.lock);
276 list_add_tail(&padata->list, &squeue->serial.list); 265 list_add_tail(&padata->list, &squeue->serial.list);
277 spin_unlock(&squeue->serial.lock); 266 spin_unlock(&squeue->serial.lock);
278 267
279 queue_work_on(padata->cb_cpu, pinst->wq, &squeue->work); 268 queue_work_on(cb_cpu, pinst->wq, &squeue->work);
280 } 269 }
281 270
282 spin_unlock_bh(&pd->lock); 271 spin_unlock_bh(&pd->lock);
@@ -367,13 +356,13 @@ static int padata_setup_cpumasks(struct parallel_data *pd,
367 if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL)) 356 if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL))
368 return -ENOMEM; 357 return -ENOMEM;
369 358
370 cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_active_mask); 359 cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask);
371 if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) { 360 if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) {
372 free_cpumask_var(pd->cpumask.cbcpu); 361 free_cpumask_var(pd->cpumask.cbcpu);
373 return -ENOMEM; 362 return -ENOMEM;
374 } 363 }
375 364
376 cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_active_mask); 365 cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_online_mask);
377 return 0; 366 return 0;
378} 367}
379 368
@@ -400,7 +389,7 @@ static void padata_init_squeues(struct parallel_data *pd)
400/* Initialize all percpu queues used by parallel workers */ 389/* Initialize all percpu queues used by parallel workers */
401static void padata_init_pqueues(struct parallel_data *pd) 390static void padata_init_pqueues(struct parallel_data *pd)
402{ 391{
403 int cpu_index, num_cpus, cpu; 392 int cpu_index, cpu;
404 struct padata_parallel_queue *pqueue; 393 struct padata_parallel_queue *pqueue;
405 394
406 cpu_index = 0; 395 cpu_index = 0;
@@ -415,9 +404,6 @@ static void padata_init_pqueues(struct parallel_data *pd)
415 INIT_WORK(&pqueue->work, padata_parallel_worker); 404 INIT_WORK(&pqueue->work, padata_parallel_worker);
416 atomic_set(&pqueue->num_obj, 0); 405 atomic_set(&pqueue->num_obj, 0);
417 } 406 }
418
419 num_cpus = cpumask_weight(pd->cpumask.pcpu);
420 pd->max_seq_nr = num_cpus ? (MAX_SEQ_NR / num_cpus) * num_cpus - 1 : 0;
421} 407}
422 408
423/* Allocate and initialize the internal cpumask dependent resources. */ 409
@@ -444,7 +430,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
444 padata_init_pqueues(pd); 430 padata_init_pqueues(pd);
445 padata_init_squeues(pd); 431 padata_init_squeues(pd);
446 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); 432 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
447 atomic_set(&pd->seq_nr, -1); 433 pd->seq_nr = 0;
448 atomic_set(&pd->reorder_objects, 0); 434 atomic_set(&pd->reorder_objects, 0);
449 atomic_set(&pd->refcnt, 0); 435 atomic_set(&pd->refcnt, 0);
450 pd->pinst = pinst; 436 pd->pinst = pinst;
@@ -580,7 +566,7 @@ EXPORT_SYMBOL(padata_unregister_cpumask_notifier);
580static bool padata_validate_cpumask(struct padata_instance *pinst, 566static bool padata_validate_cpumask(struct padata_instance *pinst,
581 const struct cpumask *cpumask) 567 const struct cpumask *cpumask)
582{ 568{
583 if (!cpumask_intersects(cpumask, cpu_active_mask)) { 569 if (!cpumask_intersects(cpumask, cpu_online_mask)) {
584 pinst->flags |= PADATA_INVALID; 570 pinst->flags |= PADATA_INVALID;
585 return false; 571 return false;
586 } 572 }
@@ -694,7 +680,7 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
694{ 680{
695 struct parallel_data *pd; 681 struct parallel_data *pd;
696 682
697 if (cpumask_test_cpu(cpu, cpu_active_mask)) { 683 if (cpumask_test_cpu(cpu, cpu_online_mask)) {
698 pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu, 684 pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
699 pinst->cpumask.cbcpu); 685 pinst->cpumask.cbcpu);
700 if (!pd) 686 if (!pd)
@@ -762,6 +748,9 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
762 return -ENOMEM; 748 return -ENOMEM;
763 749
764 padata_replace(pinst, pd); 750 padata_replace(pinst, pd);
751
752 cpumask_clear_cpu(cpu, pd->cpumask.cbcpu);
753 cpumask_clear_cpu(cpu, pd->cpumask.pcpu);
765 } 754 }
766 755
767 return 0; 756 return 0;
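
Editor's note: the padata.c changes replace the per-object atomic seq_nr (and its MAX_SEQ_NR wrap handling) with a single counter in struct parallel_data, protected by seq_lock and hashed onto the parallel cpumask by plain modulo. The following userspace model is only meant to illustrate that round-robin distribution; the names mirror the patch, but none of this is kernel code (build with -lpthread).

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t seq_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned int seq_nr;

/* Hash the next sequence number onto one of nr_online_cpus worker slots. */
static int cpu_hash(int nr_online_cpus)
{
        unsigned int nr;

        pthread_mutex_lock(&seq_lock);
        nr = seq_nr++;                  /* wraps naturally, no MAX_SEQ_NR cap */
        pthread_mutex_unlock(&seq_lock);

        return nr % nr_online_cpus;     /* index into the pcpu cpumask */
}

int main(void)
{
        for (int i = 0; i < 8; i++)
                printf("object %d -> cpu index %d\n", i, cpu_hash(3));
        return 0;
}
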
diff --git a/kernel/panic.c b/kernel/panic.c
index 80aed44e345a..8ed89a175d79 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -97,7 +97,7 @@ void panic(const char *fmt, ...)
97 /* 97 /*
98 * Avoid nested stack-dumping if a panic occurs during oops processing 98 * Avoid nested stack-dumping if a panic occurs during oops processing
99 */ 99 */
100 if (!oops_in_progress) 100 if (!test_taint(TAINT_DIE) && oops_in_progress <= 1)
101 dump_stack(); 101 dump_stack();
102#endif 102#endif
103 103
diff --git a/kernel/params.c b/kernel/params.c
index 4bc965d8a1fe..f37d82631347 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -15,7 +15,6 @@
15 along with this program; if not, write to the Free Software 15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/ 17*/
18#include <linux/module.h>
19#include <linux/kernel.h> 18#include <linux/kernel.h>
20#include <linux/string.h> 19#include <linux/string.h>
21#include <linux/errno.h> 20#include <linux/errno.h>
@@ -88,6 +87,8 @@ static int parse_one(char *param,
88 char *val, 87 char *val,
89 const struct kernel_param *params, 88 const struct kernel_param *params,
90 unsigned num_params, 89 unsigned num_params,
90 s16 min_level,
91 s16 max_level,
91 int (*handle_unknown)(char *param, char *val)) 92 int (*handle_unknown)(char *param, char *val))
92{ 93{
93 unsigned int i; 94 unsigned int i;
@@ -96,6 +97,9 @@ static int parse_one(char *param,
96 /* Find parameter */ 97 /* Find parameter */
97 for (i = 0; i < num_params; i++) { 98 for (i = 0; i < num_params; i++) {
98 if (parameq(param, params[i].name)) { 99 if (parameq(param, params[i].name)) {
100 if (params[i].level < min_level
101 || params[i].level > max_level)
102 return 0;
99 /* No one handled NULL, so do it here. */ 103 /* No one handled NULL, so do it here. */
100 if (!val && params[i].ops->set != param_set_bool 104 if (!val && params[i].ops->set != param_set_bool
101 && params[i].ops->set != param_set_bint) 105 && params[i].ops->set != param_set_bint)
@@ -175,6 +179,8 @@ int parse_args(const char *name,
175 char *args, 179 char *args,
176 const struct kernel_param *params, 180 const struct kernel_param *params,
177 unsigned num, 181 unsigned num,
182 s16 min_level,
183 s16 max_level,
178 int (*unknown)(char *param, char *val)) 184 int (*unknown)(char *param, char *val))
179{ 185{
180 char *param, *val; 186 char *param, *val;
@@ -190,7 +196,8 @@ int parse_args(const char *name,
190 196
191 args = next_arg(args, &param, &val); 197 args = next_arg(args, &param, &val);
192 irq_was_disabled = irqs_disabled(); 198 irq_was_disabled = irqs_disabled();
193 ret = parse_one(param, val, params, num, unknown); 199 ret = parse_one(param, val, params, num,
200 min_level, max_level, unknown);
194 if (irq_was_disabled && !irqs_disabled()) { 201 if (irq_was_disabled && !irqs_disabled()) {
195 printk(KERN_WARNING "parse_args(): option '%s' enabled " 202 printk(KERN_WARNING "parse_args(): option '%s' enabled "
196 "irq's!\n", param); 203 "irq's!\n", param);
@@ -298,35 +305,18 @@ EXPORT_SYMBOL(param_ops_charp);
298/* Actually could be a bool or an int, for historical reasons. */ 305/* Actually could be a bool or an int, for historical reasons. */
299int param_set_bool(const char *val, const struct kernel_param *kp) 306int param_set_bool(const char *val, const struct kernel_param *kp)
300{ 307{
301 bool v;
302 int ret;
303
304 /* No equals means "set"... */ 308 /* No equals means "set"... */
305 if (!val) val = "1"; 309 if (!val) val = "1";
306 310
307 /* One of =[yYnN01] */ 311 /* One of =[yYnN01] */
308 ret = strtobool(val, &v); 312 return strtobool(val, kp->arg);
309 if (ret)
310 return ret;
311
312 if (kp->flags & KPARAM_ISBOOL)
313 *(bool *)kp->arg = v;
314 else
315 *(int *)kp->arg = v;
316 return 0;
317} 313}
318EXPORT_SYMBOL(param_set_bool); 314EXPORT_SYMBOL(param_set_bool);
319 315
320int param_get_bool(char *buffer, const struct kernel_param *kp) 316int param_get_bool(char *buffer, const struct kernel_param *kp)
321{ 317{
322 bool val;
323 if (kp->flags & KPARAM_ISBOOL)
324 val = *(bool *)kp->arg;
325 else
326 val = *(int *)kp->arg;
327
328 /* Y and N chosen as being relatively non-coder friendly */ 318 /* Y and N chosen as being relatively non-coder friendly */
329 return sprintf(buffer, "%c", val ? 'Y' : 'N'); 319 return sprintf(buffer, "%c", *(bool *)kp->arg ? 'Y' : 'N');
330} 320}
331EXPORT_SYMBOL(param_get_bool); 321EXPORT_SYMBOL(param_get_bool);
332 322
@@ -344,7 +334,6 @@ int param_set_invbool(const char *val, const struct kernel_param *kp)
344 struct kernel_param dummy; 334 struct kernel_param dummy;
345 335
346 dummy.arg = &boolval; 336 dummy.arg = &boolval;
347 dummy.flags = KPARAM_ISBOOL;
348 ret = param_set_bool(val, &dummy); 337 ret = param_set_bool(val, &dummy);
349 if (ret == 0) 338 if (ret == 0)
350 *(bool *)kp->arg = !boolval; 339 *(bool *)kp->arg = !boolval;
@@ -373,7 +362,6 @@ int param_set_bint(const char *val, const struct kernel_param *kp)
373 /* Match bool exactly, by re-using it. */ 362 /* Match bool exactly, by re-using it. */
374 boolkp = *kp; 363 boolkp = *kp;
375 boolkp.arg = &v; 364 boolkp.arg = &v;
376 boolkp.flags |= KPARAM_ISBOOL;
377 365
378 ret = param_set_bool(val, &boolkp); 366 ret = param_set_bool(val, &boolkp);
379 if (ret == 0) 367 if (ret == 0)
@@ -394,7 +382,7 @@ static int param_array(const char *name,
394 unsigned int min, unsigned int max, 382 unsigned int min, unsigned int max,
395 void *elem, int elemsize, 383 void *elem, int elemsize,
396 int (*set)(const char *, const struct kernel_param *kp), 384 int (*set)(const char *, const struct kernel_param *kp),
397 u16 flags, 385 s16 level,
398 unsigned int *num) 386 unsigned int *num)
399{ 387{
400 int ret; 388 int ret;
@@ -404,7 +392,7 @@ static int param_array(const char *name,
404 /* Get the name right for errors. */ 392 /* Get the name right for errors. */
405 kp.name = name; 393 kp.name = name;
406 kp.arg = elem; 394 kp.arg = elem;
407 kp.flags = flags; 395 kp.level = level;
408 396
409 *num = 0; 397 *num = 0;
410 /* We expect a comma-separated list of values. */ 398 /* We expect a comma-separated list of values. */
@@ -445,7 +433,7 @@ static int param_array_set(const char *val, const struct kernel_param *kp)
445 unsigned int temp_num; 433 unsigned int temp_num;
446 434
447 return param_array(kp->name, val, 1, arr->max, arr->elem, 435 return param_array(kp->name, val, 1, arr->max, arr->elem,
448 arr->elemsize, arr->ops->set, kp->flags, 436 arr->elemsize, arr->ops->set, kp->level,
449 arr->num ?: &temp_num); 437 arr->num ?: &temp_num);
450} 438}
451 439
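
Editor's note: parse_one() and parse_args() now carry a [min_level, max_level] window, and parameters whose level falls outside it are silently skipped rather than applied; module arguments are parsed with the full s16 range (-32768..32767), as the module.c hunk above shows. A simplified userspace model of that filter, for illustration only:

#include <stdio.h>

struct kparam {
        const char *name;
        short level;
};

/* Apply a parameter only if its level lies inside the requested window. */
static int parse_one(const struct kparam *p, short min_level, short max_level)
{
        if (p->level < min_level || p->level > max_level)
                return 0;               /* out of range: ignore, no error */
        printf("applying %s (level %d)\n", p->name, p->level);
        return 1;
}

int main(void)
{
        struct kparam early  = { "early_param",  -1 };
        struct kparam normal = { "normal_param",  0 };

        parse_one(&early, -32768, 32767);   /* full range, like module args */
        parse_one(&normal, 1, 32767);       /* skipped: level 0 < 1 */
        return 0;
}
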
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a8968396046d..57bc1fd35b3c 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -15,6 +15,7 @@
15#include <linux/acct.h> 15#include <linux/acct.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/proc_fs.h> 17#include <linux/proc_fs.h>
18#include <linux/reboot.h>
18 19
19#define BITS_PER_PAGE (PAGE_SIZE*8) 20#define BITS_PER_PAGE (PAGE_SIZE*8)
20 21
@@ -168,13 +169,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
168 while (nr > 0) { 169 while (nr > 0) {
169 rcu_read_lock(); 170 rcu_read_lock();
170 171
171 /*
172 * Any nested-container's init processes won't ignore the
173 * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser().
174 */
175 task = pid_task(find_vpid(nr), PIDTYPE_PID); 172 task = pid_task(find_vpid(nr), PIDTYPE_PID);
176 if (task) 173 if (task && !__fatal_signal_pending(task))
177 send_sig_info(SIGKILL, SEND_SIG_NOINFO, task); 174 send_sig_info(SIGKILL, SEND_SIG_FORCED, task);
178 175
179 rcu_read_unlock(); 176 rcu_read_unlock();
180 177
@@ -187,6 +184,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
187 rc = sys_wait4(-1, NULL, __WALL, NULL); 184 rc = sys_wait4(-1, NULL, __WALL, NULL);
188 } while (rc != -ECHILD); 185 } while (rc != -ECHILD);
189 186
187 if (pid_ns->reboot)
188 current->signal->group_exit_code = pid_ns->reboot;
189
190 acct_exit_ns(pid_ns); 190 acct_exit_ns(pid_ns);
191 return; 191 return;
192} 192}
@@ -221,6 +221,35 @@ static struct ctl_table pid_ns_ctl_table[] = {
221 221
222static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; 222static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
223 223
224int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
225{
226 if (pid_ns == &init_pid_ns)
227 return 0;
228
229 switch (cmd) {
230 case LINUX_REBOOT_CMD_RESTART2:
231 case LINUX_REBOOT_CMD_RESTART:
232 pid_ns->reboot = SIGHUP;
233 break;
234
235 case LINUX_REBOOT_CMD_POWER_OFF:
236 case LINUX_REBOOT_CMD_HALT:
237 pid_ns->reboot = SIGINT;
238 break;
239 default:
240 return -EINVAL;
241 }
242
243 read_lock(&tasklist_lock);
244 force_sig(SIGKILL, pid_ns->child_reaper);
245 read_unlock(&tasklist_lock);
246
247 do_exit(0);
248
249 /* Not reached */
250 return 0;
251}
252
224static __init int pid_namespaces_init(void) 253static __init int pid_namespaces_init(void)
225{ 254{
226 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); 255 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
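
Editor's note: with reboot_pid_ns() in place, reboot(2) issued by the init task of a child PID namespace no longer reboots the host; the namespace init is killed and the parent can tell restart from halt/power-off by the signal reported in the child's exit status (SIGHUP vs SIGINT). A hedged userspace demonstration, assuming CAP_SYS_ADMIN and a downward-growing stack, with error handling trimmed:

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <sys/reboot.h>
#include <sys/wait.h>
#include <unistd.h>

static char stack[64 * 1024];

static int ns_init(void *arg)
{
        /* We are PID 1 of the new namespace; this call does not return here. */
        reboot(RB_AUTOBOOT);
        return 0;
}

int main(void)
{
        int status;
        pid_t pid = clone(ns_init, stack + sizeof(stack),
                          CLONE_NEWPID | SIGCHLD, NULL);

        if (pid < 0) {
                perror("clone");
                return 1;
        }
        waitpid(pid, &status, 0);
        if (WIFSIGNALED(status))
                printf("namespace init terminated by signal %d (%s)\n",
                       WTERMSIG(status),
                       WTERMSIG(status) == SIGHUP ? "restart" : "halt/power-off");
        return 0;
}
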
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 07e0e28ffba7..66d808ec5252 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,7 +1,8 @@
1 1
2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG 2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
3 3
4obj-$(CONFIG_PM) += main.o qos.o 4obj-y += qos.o
5obj-$(CONFIG_PM) += main.o
5obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o 6obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o
6obj-$(CONFIG_FREEZER) += process.o 7obj-$(CONFIG_FREEZER) += process.o
7obj-$(CONFIG_SUSPEND) += suspend.o 8obj-$(CONFIG_SUSPEND) += suspend.o
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 6d6d28870335..e09dfbfeecee 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -16,7 +16,6 @@
16#include <linux/string.h> 16#include <linux/string.h>
17#include <linux/device.h> 17#include <linux/device.h>
18#include <linux/async.h> 18#include <linux/async.h>
19#include <linux/kmod.h>
20#include <linux/delay.h> 19#include <linux/delay.h>
21#include <linux/fs.h> 20#include <linux/fs.h>
22#include <linux/mount.h> 21#include <linux/mount.h>
@@ -245,8 +244,8 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop,
245 * create_image - Create a hibernation image. 244 * create_image - Create a hibernation image.
246 * @platform_mode: Whether or not to use the platform driver. 245 * @platform_mode: Whether or not to use the platform driver.
247 * 246 *
248 * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image 247 * Execute device drivers' "late" and "noirq" freeze callbacks, create a
249 * and execute the drivers' .thaw_noirq() callbacks. 248 * hibernation image and run the drivers' "noirq" and "early" thaw callbacks.
250 * 249 *
251 * Control reappears in this routine after the subsequent restore. 250 * Control reappears in this routine after the subsequent restore.
252 */ 251 */
@@ -254,7 +253,7 @@ static int create_image(int platform_mode)
254{ 253{
255 int error; 254 int error;
256 255
257 error = dpm_suspend_noirq(PMSG_FREEZE); 256 error = dpm_suspend_end(PMSG_FREEZE);
258 if (error) { 257 if (error) {
259 printk(KERN_ERR "PM: Some devices failed to power down, " 258 printk(KERN_ERR "PM: Some devices failed to power down, "
260 "aborting hibernation\n"); 259 "aborting hibernation\n");
@@ -306,7 +305,7 @@ static int create_image(int platform_mode)
306 Platform_finish: 305 Platform_finish:
307 platform_finish(platform_mode); 306 platform_finish(platform_mode);
308 307
309 dpm_resume_noirq(in_suspend ? 308 dpm_resume_start(in_suspend ?
310 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 309 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
311 310
312 return error; 311 return error;
@@ -343,13 +342,13 @@ int hibernation_snapshot(int platform_mode)
343 * successful freezer test. 342 * successful freezer test.
344 */ 343 */
345 freezer_test_done = true; 344 freezer_test_done = true;
346 goto Cleanup; 345 goto Thaw;
347 } 346 }
348 347
349 error = dpm_prepare(PMSG_FREEZE); 348 error = dpm_prepare(PMSG_FREEZE);
350 if (error) { 349 if (error) {
351 dpm_complete(PMSG_RECOVER); 350 dpm_complete(PMSG_RECOVER);
352 goto Cleanup; 351 goto Thaw;
353 } 352 }
354 353
355 suspend_console(); 354 suspend_console();
@@ -385,6 +384,8 @@ int hibernation_snapshot(int platform_mode)
385 platform_end(platform_mode); 384 platform_end(platform_mode);
386 return error; 385 return error;
387 386
387 Thaw:
388 thaw_kernel_threads();
388 Cleanup: 389 Cleanup:
389 swsusp_free(); 390 swsusp_free();
390 goto Close; 391 goto Close;
@@ -394,16 +395,16 @@ int hibernation_snapshot(int platform_mode)
394 * resume_target_kernel - Restore system state from a hibernation image. 395 * resume_target_kernel - Restore system state from a hibernation image.
395 * @platform_mode: Whether or not to use the platform driver. 396 * @platform_mode: Whether or not to use the platform driver.
396 * 397 *
397 * Execute device drivers' .freeze_noirq() callbacks, restore the contents of 398 * Execute device drivers' "noirq" and "late" freeze callbacks, restore the
398 * highmem that have not been restored yet from the image and run the low-level 399 * contents of highmem that have not been restored yet from the image and run
399 * code that will restore the remaining contents of memory and switch to the 400 * the low-level code that will restore the remaining contents of memory and
400 * just restored target kernel. 401 * switch to the just restored target kernel.
401 */ 402 */
402static int resume_target_kernel(bool platform_mode) 403static int resume_target_kernel(bool platform_mode)
403{ 404{
404 int error; 405 int error;
405 406
406 error = dpm_suspend_noirq(PMSG_QUIESCE); 407 error = dpm_suspend_end(PMSG_QUIESCE);
407 if (error) { 408 if (error) {
408 printk(KERN_ERR "PM: Some devices failed to power down, " 409 printk(KERN_ERR "PM: Some devices failed to power down, "
409 "aborting resume\n"); 410 "aborting resume\n");
@@ -460,7 +461,7 @@ static int resume_target_kernel(bool platform_mode)
460 Cleanup: 461 Cleanup:
461 platform_restore_cleanup(platform_mode); 462 platform_restore_cleanup(platform_mode);
462 463
463 dpm_resume_noirq(PMSG_RECOVER); 464 dpm_resume_start(PMSG_RECOVER);
464 465
465 return error; 466 return error;
466} 467}
@@ -518,7 +519,7 @@ int hibernation_platform_enter(void)
518 goto Resume_devices; 519 goto Resume_devices;
519 } 520 }
520 521
521 error = dpm_suspend_noirq(PMSG_HIBERNATE); 522 error = dpm_suspend_end(PMSG_HIBERNATE);
522 if (error) 523 if (error)
523 goto Resume_devices; 524 goto Resume_devices;
524 525
@@ -549,7 +550,7 @@ int hibernation_platform_enter(void)
549 Platform_finish: 550 Platform_finish:
550 hibernation_ops->finish(); 551 hibernation_ops->finish();
551 552
552 dpm_resume_noirq(PMSG_RESTORE); 553 dpm_resume_start(PMSG_RESTORE);
553 554
554 Resume_devices: 555 Resume_devices:
555 entering_platform_hibernation = false; 556 entering_platform_hibernation = false;
@@ -609,10 +610,6 @@ int hibernate(void)
609 if (error) 610 if (error)
610 goto Exit; 611 goto Exit;
611 612
612 error = usermodehelper_disable();
613 if (error)
614 goto Exit;
615
616 /* Allocate memory management structures */ 613 /* Allocate memory management structures */
617 error = create_basic_memory_bitmaps(); 614 error = create_basic_memory_bitmaps();
618 if (error) 615 if (error)
@@ -624,15 +621,11 @@ int hibernate(void)
624 621
625 error = freeze_processes(); 622 error = freeze_processes();
626 if (error) 623 if (error)
627 goto Finish; 624 goto Free_bitmaps;
628 625
629 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); 626 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
630 if (error) 627 if (error || freezer_test_done)
631 goto Thaw;
632 if (freezer_test_done) {
633 freezer_test_done = false;
634 goto Thaw; 628 goto Thaw;
635 }
636 629
637 if (in_suspend) { 630 if (in_suspend) {
638 unsigned int flags = 0; 631 unsigned int flags = 0;
@@ -657,9 +650,12 @@ int hibernate(void)
657 650
658 Thaw: 651 Thaw:
659 thaw_processes(); 652 thaw_processes();
660 Finish: 653
654 /* Don't bother checking whether freezer_test_done is true */
655 freezer_test_done = false;
656
657 Free_bitmaps:
661 free_basic_memory_bitmaps(); 658 free_basic_memory_bitmaps();
662 usermodehelper_enable();
663 Exit: 659 Exit:
664 pm_notifier_call_chain(PM_POST_HIBERNATION); 660 pm_notifier_call_chain(PM_POST_HIBERNATION);
665 pm_restore_console(); 661 pm_restore_console();
@@ -774,15 +770,9 @@ static int software_resume(void)
774 if (error) 770 if (error)
775 goto close_finish; 771 goto close_finish;
776 772
777 error = usermodehelper_disable();
778 if (error)
779 goto close_finish;
780
781 error = create_basic_memory_bitmaps(); 773 error = create_basic_memory_bitmaps();
782 if (error) { 774 if (error)
783 usermodehelper_enable();
784 goto close_finish; 775 goto close_finish;
785 }
786 776
787 pr_debug("PM: Preparing processes for restore.\n"); 777 pr_debug("PM: Preparing processes for restore.\n");
788 error = freeze_processes(); 778 error = freeze_processes();
@@ -803,7 +793,6 @@ static int software_resume(void)
803 thaw_processes(); 793 thaw_processes();
804 Done: 794 Done:
805 free_basic_memory_bitmaps(); 795 free_basic_memory_bitmaps();
806 usermodehelper_enable();
807 Finish: 796 Finish:
808 pm_notifier_call_chain(PM_POST_RESTORE); 797 pm_notifier_call_chain(PM_POST_RESTORE);
809 pm_restore_console(); 798 pm_restore_console();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 9824b41e5a18..1c12581f1c62 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -165,16 +165,20 @@ static int suspend_stats_show(struct seq_file *s, void *unused)
165 last_errno %= REC_FAILED_NUM; 165 last_errno %= REC_FAILED_NUM;
166 last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; 166 last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1;
167 last_step %= REC_FAILED_NUM; 167 last_step %= REC_FAILED_NUM;
168 seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n" 168 seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n"
169 "%s: %d\n%s: %d\n%s: %d\n%s: %d\n", 169 "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n",
170 "success", suspend_stats.success, 170 "success", suspend_stats.success,
171 "fail", suspend_stats.fail, 171 "fail", suspend_stats.fail,
172 "failed_freeze", suspend_stats.failed_freeze, 172 "failed_freeze", suspend_stats.failed_freeze,
173 "failed_prepare", suspend_stats.failed_prepare, 173 "failed_prepare", suspend_stats.failed_prepare,
174 "failed_suspend", suspend_stats.failed_suspend, 174 "failed_suspend", suspend_stats.failed_suspend,
175 "failed_suspend_late",
176 suspend_stats.failed_suspend_late,
175 "failed_suspend_noirq", 177 "failed_suspend_noirq",
176 suspend_stats.failed_suspend_noirq, 178 suspend_stats.failed_suspend_noirq,
177 "failed_resume", suspend_stats.failed_resume, 179 "failed_resume", suspend_stats.failed_resume,
180 "failed_resume_early",
181 suspend_stats.failed_resume_early,
178 "failed_resume_noirq", 182 "failed_resume_noirq",
179 suspend_stats.failed_resume_noirq); 183 suspend_stats.failed_resume_noirq);
180 seq_printf(s, "failures:\n last_failed_dev:\t%-s\n", 184 seq_printf(s, "failures:\n last_failed_dev:\t%-s\n",
@@ -287,16 +291,10 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
287 291
288#ifdef CONFIG_SUSPEND 292#ifdef CONFIG_SUSPEND
289 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { 293 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) {
290 if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) 294 if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) {
295 error = pm_suspend(state);
291 break; 296 break;
292 } 297 }
293 if (state < PM_SUSPEND_MAX && *s) {
294 error = enter_state(state);
295 if (error) {
296 suspend_stats.fail++;
297 dpm_save_failed_errno(error);
298 } else
299 suspend_stats.success++;
300 } 298 }
301#endif 299#endif
302 300
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 21724eee5206..98f3622d7407 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -177,13 +177,11 @@ extern const char *const pm_states[];
177 177
178extern bool valid_state(suspend_state_t state); 178extern bool valid_state(suspend_state_t state);
179extern int suspend_devices_and_enter(suspend_state_t state); 179extern int suspend_devices_and_enter(suspend_state_t state);
180extern int enter_state(suspend_state_t state);
181#else /* !CONFIG_SUSPEND */ 180#else /* !CONFIG_SUSPEND */
182static inline int suspend_devices_and_enter(suspend_state_t state) 181static inline int suspend_devices_and_enter(suspend_state_t state)
183{ 182{
184 return -ENOSYS; 183 return -ENOSYS;
185} 184}
186static inline int enter_state(suspend_state_t state) { return -ENOSYS; }
187static inline bool valid_state(suspend_state_t state) { return false; } 185static inline bool valid_state(suspend_state_t state) { return false; }
188#endif /* !CONFIG_SUSPEND */ 186#endif /* !CONFIG_SUSPEND */
189 187
@@ -234,16 +232,14 @@ static inline int suspend_freeze_processes(void)
234 int error; 232 int error;
235 233
236 error = freeze_processes(); 234 error = freeze_processes();
237
238 /* 235 /*
239 * freeze_processes() automatically thaws every task if freezing 236 * freeze_processes() automatically thaws every task if freezing
240 * fails. So we need not do anything extra upon error. 237 * fails. So we need not do anything extra upon error.
241 */ 238 */
242 if (error) 239 if (error)
243 goto Finish; 240 return error;
244 241
245 error = freeze_kernel_threads(); 242 error = freeze_kernel_threads();
246
247 /* 243 /*
248 * freeze_kernel_threads() thaws only kernel threads upon freezing 244 * freeze_kernel_threads() thaws only kernel threads upon freezing
249 * failure. So we have to thaw the userspace tasks ourselves. 245 * failure. So we have to thaw the userspace tasks ourselves.
@@ -251,7 +247,6 @@ static inline int suspend_freeze_processes(void)
251 if (error) 247 if (error)
252 thaw_processes(); 248 thaw_processes();
253 249
254 Finish:
255 return error; 250 return error;
256} 251}
257 252
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 7e426459e60a..19db29f67558 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -16,6 +16,7 @@
16#include <linux/freezer.h> 16#include <linux/freezer.h>
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/workqueue.h> 18#include <linux/workqueue.h>
19#include <linux/kmod.h>
19 20
20/* 21/*
21 * Timeout for stopping processes 22 * Timeout for stopping processes
@@ -53,11 +54,9 @@ static int try_to_freeze_tasks(bool user_only)
53 * It is "frozen enough". If the task does wake 54 * It is "frozen enough". If the task does wake
54 * up, it will immediately call try_to_freeze. 55 * up, it will immediately call try_to_freeze.
55 * 56 *
56 * Because freeze_task() goes through p's 57 * Because freeze_task() goes through p's scheduler lock, it's
57 * scheduler lock after setting TIF_FREEZE, it's 58 * guaranteed that TASK_STOPPED/TRACED -> TASK_RUNNING
58 * guaranteed that either we see TASK_RUNNING or 59 * transition can't race with task state testing here.
59 * try_to_stop() after schedule() in ptrace/signal
60 * stop sees TIF_FREEZE.
61 */ 60 */
62 if (!task_is_stopped_or_traced(p) && 61 if (!task_is_stopped_or_traced(p) &&
63 !freezer_should_skip(p)) 62 !freezer_should_skip(p))
@@ -98,13 +97,15 @@ static int try_to_freeze_tasks(bool user_only)
98 elapsed_csecs / 100, elapsed_csecs % 100, 97 elapsed_csecs / 100, elapsed_csecs % 100,
99 todo - wq_busy, wq_busy); 98 todo - wq_busy, wq_busy);
100 99
101 read_lock(&tasklist_lock); 100 if (!wakeup) {
102 do_each_thread(g, p) { 101 read_lock(&tasklist_lock);
103 if (!wakeup && !freezer_should_skip(p) && 102 do_each_thread(g, p) {
104 p != current && freezing(p) && !frozen(p)) 103 if (p != current && !freezer_should_skip(p)
105 sched_show_task(p); 104 && freezing(p) && !frozen(p))
106 } while_each_thread(g, p); 105 sched_show_task(p);
107 read_unlock(&tasklist_lock); 106 } while_each_thread(g, p);
107 read_unlock(&tasklist_lock);
108 }
108 } else { 109 } else {
109 printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100, 110 printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100,
110 elapsed_csecs % 100); 111 elapsed_csecs % 100);
@@ -122,6 +123,10 @@ int freeze_processes(void)
122{ 123{
123 int error; 124 int error;
124 125
126 error = __usermodehelper_disable(UMH_FREEZING);
127 if (error)
128 return error;
129
125 if (!pm_freezing) 130 if (!pm_freezing)
126 atomic_inc(&system_freezing_cnt); 131 atomic_inc(&system_freezing_cnt);
127 132
@@ -130,6 +135,7 @@ int freeze_processes(void)
130 error = try_to_freeze_tasks(true); 135 error = try_to_freeze_tasks(true);
131 if (!error) { 136 if (!error) {
132 printk("done."); 137 printk("done.");
138 __usermodehelper_set_disable_depth(UMH_DISABLED);
133 oom_killer_disable(); 139 oom_killer_disable();
134 } 140 }
135 printk("\n"); 141 printk("\n");
@@ -187,6 +193,8 @@ void thaw_processes(void)
187 } while_each_thread(g, p); 193 } while_each_thread(g, p);
188 read_unlock(&tasklist_lock); 194 read_unlock(&tasklist_lock);
189 195
196 usermodehelper_enable();
197
190 schedule(); 198 schedule();
191 printk("done.\n"); 199 printk("done.\n");
192} 200}
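
Editor's note: freeze_processes() now owns the usermode-helper gate: helpers are put into UMH_FREEZING before user space is frozen, switched to UMH_DISABLED once freezing succeeds, and re-enabled from thaw_processes(). That is why the explicit usermodehelper_disable()/usermodehelper_enable() calls disappear from the hibernate, suspend and snapshot-ioctl paths in the surrounding hunks. A toy userspace state machine, purely to make that ordering explicit:

#include <stdio.h>

enum umh_state { UMH_ENABLED, UMH_FREEZING, UMH_DISABLED };
static enum umh_state umh = UMH_ENABLED;

static int freeze_processes_model(void)
{
        umh = UMH_FREEZING;     /* __usermodehelper_disable(UMH_FREEZING) */
        /* ... freeze user space tasks here ... */
        umh = UMH_DISABLED;     /* __usermodehelper_set_disable_depth(UMH_DISABLED) */
        return 0;
}

static void thaw_processes_model(void)
{
        /* ... thaw tasks here ... */
        umh = UMH_ENABLED;      /* usermodehelper_enable() */
}

int main(void)
{
        freeze_processes_model();
        printf("helpers while frozen: state %d\n", umh);
        thaw_processes_model();
        printf("helpers after thaw:  state %d\n", umh);
        return 0;
}
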
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 995e3bd3417b..6a031e684026 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -230,6 +230,21 @@ int pm_qos_request_active(struct pm_qos_request *req)
230EXPORT_SYMBOL_GPL(pm_qos_request_active); 230EXPORT_SYMBOL_GPL(pm_qos_request_active);
231 231
232/** 232/**
233 * pm_qos_work_fn - the timeout handler of pm_qos_update_request_timeout
234 * @work: work struct for the delayed work (timeout)
235 *
236 * This cancels the timeout request by falling back to the default at timeout.
237 */
238static void pm_qos_work_fn(struct work_struct *work)
239{
240 struct pm_qos_request *req = container_of(to_delayed_work(work),
241 struct pm_qos_request,
242 work);
243
244 pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE);
245}
246
247/**
233 * pm_qos_add_request - inserts new qos request into the list 248 * pm_qos_add_request - inserts new qos request into the list
234 * @req: pointer to a preallocated handle 249 * @req: pointer to a preallocated handle
235 * @pm_qos_class: identifies which list of qos request to use 250 * @pm_qos_class: identifies which list of qos request to use
@@ -253,6 +268,7 @@ void pm_qos_add_request(struct pm_qos_request *req,
253 return; 268 return;
254 } 269 }
255 req->pm_qos_class = pm_qos_class; 270 req->pm_qos_class = pm_qos_class;
271 INIT_DELAYED_WORK(&req->work, pm_qos_work_fn);
256 pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints, 272 pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints,
257 &req->node, PM_QOS_ADD_REQ, value); 273 &req->node, PM_QOS_ADD_REQ, value);
258} 274}
@@ -279,6 +295,9 @@ void pm_qos_update_request(struct pm_qos_request *req,
279 return; 295 return;
280 } 296 }
281 297
298 if (delayed_work_pending(&req->work))
299 cancel_delayed_work_sync(&req->work);
300
282 if (new_value != req->node.prio) 301 if (new_value != req->node.prio)
283 pm_qos_update_target( 302 pm_qos_update_target(
284 pm_qos_array[req->pm_qos_class]->constraints, 303 pm_qos_array[req->pm_qos_class]->constraints,
@@ -287,6 +306,34 @@ void pm_qos_update_request(struct pm_qos_request *req,
287EXPORT_SYMBOL_GPL(pm_qos_update_request); 306EXPORT_SYMBOL_GPL(pm_qos_update_request);
288 307
289/** 308/**
309 * pm_qos_update_request_timeout - modifies an existing qos request temporarily.
310 * @req : handle to list element holding a pm_qos request to use
311 * @new_value: defines the temporal qos request
312 * @timeout_us: the effective duration of this qos request in usecs.
313 *
314 * After timeout_us, this qos request is cancelled automatically.
315 */
316void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value,
317 unsigned long timeout_us)
318{
319 if (!req)
320 return;
321 if (WARN(!pm_qos_request_active(req),
322 "%s called for unknown object.", __func__))
323 return;
324
325 if (delayed_work_pending(&req->work))
326 cancel_delayed_work_sync(&req->work);
327
328 if (new_value != req->node.prio)
329 pm_qos_update_target(
330 pm_qos_array[req->pm_qos_class]->constraints,
331 &req->node, PM_QOS_UPDATE_REQ, new_value);
332
333 schedule_delayed_work(&req->work, usecs_to_jiffies(timeout_us));
334}
335
336/**
290 * pm_qos_remove_request - modifies an existing qos request 337 * pm_qos_remove_request - modifies an existing qos request
291 * @req: handle to request list element 338 * @req: handle to request list element
292 * 339 *
@@ -305,6 +352,9 @@ void pm_qos_remove_request(struct pm_qos_request *req)
305 return; 352 return;
306 } 353 }
307 354
355 if (delayed_work_pending(&req->work))
356 cancel_delayed_work_sync(&req->work);
357
308 pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, 358 pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,
309 &req->node, PM_QOS_REMOVE_REQ, 359 &req->node, PM_QOS_REMOVE_REQ,
310 PM_QOS_DEFAULT_VALUE); 360 PM_QOS_DEFAULT_VALUE);
@@ -469,21 +519,18 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
469static int __init pm_qos_power_init(void) 519static int __init pm_qos_power_init(void)
470{ 520{
471 int ret = 0; 521 int ret = 0;
522 int i;
472 523
473 ret = register_pm_qos_misc(&cpu_dma_pm_qos); 524 BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES);
474 if (ret < 0) { 525
475 printk(KERN_ERR "pm_qos_param: cpu_dma_latency setup failed\n"); 526 for (i = 1; i < PM_QOS_NUM_CLASSES; i++) {
476 return ret; 527 ret = register_pm_qos_misc(pm_qos_array[i]);
477 } 528 if (ret < 0) {
478 ret = register_pm_qos_misc(&network_lat_pm_qos); 529 printk(KERN_ERR "pm_qos_param: %s setup failed\n",
479 if (ret < 0) { 530 pm_qos_array[i]->name);
480 printk(KERN_ERR "pm_qos_param: network_latency setup failed\n"); 531 return ret;
481 return ret; 532 }
482 } 533 }
483 ret = register_pm_qos_misc(&network_throughput_pm_qos);
484 if (ret < 0)
485 printk(KERN_ERR
486 "pm_qos_param: network_throughput setup failed\n");
487 534
488 return ret; 535 return ret;
489} 536}
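
Editor's note: pm_qos_update_request_timeout() gives callers a constraint that cancels itself; the delayed work added above drops the request back to PM_QOS_DEFAULT_VALUE when the timeout fires. A hedged driver-style sketch of how it could be used (the request name and the numeric values below are made up for illustration):

#include <linux/pm_qos.h>

static struct pm_qos_request burst_req;  /* hypothetical per-device request */

static void burst_init(void)
{
        pm_qos_add_request(&burst_req, PM_QOS_CPU_DMA_LATENCY,
                           PM_QOS_DEFAULT_VALUE);
}

static void burst_begin(void)
{
        /* ask for <= 20 us CPU DMA latency for the next 2 ms, then auto-cancel */
        pm_qos_update_request_timeout(&burst_req, 20, 2000);
}

static void burst_exit(void)
{
        pm_qos_remove_request(&burst_req);
}
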
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 6a768e537001..0de28576807d 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -711,9 +711,10 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
711 list_for_each_entry(region, &nosave_regions, list) { 711 list_for_each_entry(region, &nosave_regions, list) {
712 unsigned long pfn; 712 unsigned long pfn;
713 713
714 pr_debug("PM: Marking nosave pages: %016lx - %016lx\n", 714 pr_debug("PM: Marking nosave pages: [mem %#010llx-%#010llx]\n",
715 region->start_pfn << PAGE_SHIFT, 715 (unsigned long long) region->start_pfn << PAGE_SHIFT,
716 region->end_pfn << PAGE_SHIFT); 716 ((unsigned long long) region->end_pfn << PAGE_SHIFT)
717 - 1);
717 718
718 for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) 719 for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)
719 if (pfn_valid(pfn)) { 720 if (pfn_valid(pfn)) {
@@ -1000,20 +1001,20 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
1000 s_page = pfn_to_page(src_pfn); 1001 s_page = pfn_to_page(src_pfn);
1001 d_page = pfn_to_page(dst_pfn); 1002 d_page = pfn_to_page(dst_pfn);
1002 if (PageHighMem(s_page)) { 1003 if (PageHighMem(s_page)) {
1003 src = kmap_atomic(s_page, KM_USER0); 1004 src = kmap_atomic(s_page);
1004 dst = kmap_atomic(d_page, KM_USER1); 1005 dst = kmap_atomic(d_page);
1005 do_copy_page(dst, src); 1006 do_copy_page(dst, src);
1006 kunmap_atomic(dst, KM_USER1); 1007 kunmap_atomic(dst);
1007 kunmap_atomic(src, KM_USER0); 1008 kunmap_atomic(src);
1008 } else { 1009 } else {
1009 if (PageHighMem(d_page)) { 1010 if (PageHighMem(d_page)) {
1010 /* Page pointed to by src may contain some kernel 1011 /* Page pointed to by src may contain some kernel
1011 * data modified by kmap_atomic() 1012 * data modified by kmap_atomic()
1012 */ 1013 */
1013 safe_copy_page(buffer, s_page); 1014 safe_copy_page(buffer, s_page);
1014 dst = kmap_atomic(d_page, KM_USER0); 1015 dst = kmap_atomic(d_page);
1015 copy_page(dst, buffer); 1016 copy_page(dst, buffer);
1016 kunmap_atomic(dst, KM_USER0); 1017 kunmap_atomic(dst);
1017 } else { 1018 } else {
1018 safe_copy_page(page_address(d_page), s_page); 1019 safe_copy_page(page_address(d_page), s_page);
1019 } 1020 }
@@ -1728,9 +1729,9 @@ int snapshot_read_next(struct snapshot_handle *handle)
1728 */ 1729 */
1729 void *kaddr; 1730 void *kaddr;
1730 1731
1731 kaddr = kmap_atomic(page, KM_USER0); 1732 kaddr = kmap_atomic(page);
1732 copy_page(buffer, kaddr); 1733 copy_page(buffer, kaddr);
1733 kunmap_atomic(kaddr, KM_USER0); 1734 kunmap_atomic(kaddr);
1734 handle->buffer = buffer; 1735 handle->buffer = buffer;
1735 } else { 1736 } else {
1736 handle->buffer = page_address(page); 1737 handle->buffer = page_address(page);
@@ -2014,9 +2015,9 @@ static void copy_last_highmem_page(void)
2014 if (last_highmem_page) { 2015 if (last_highmem_page) {
2015 void *dst; 2016 void *dst;
2016 2017
2017 dst = kmap_atomic(last_highmem_page, KM_USER0); 2018 dst = kmap_atomic(last_highmem_page);
2018 copy_page(dst, buffer); 2019 copy_page(dst, buffer);
2019 kunmap_atomic(dst, KM_USER0); 2020 kunmap_atomic(dst);
2020 last_highmem_page = NULL; 2021 last_highmem_page = NULL;
2021 } 2022 }
2022} 2023}
@@ -2309,13 +2310,13 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
2309{ 2310{
2310 void *kaddr1, *kaddr2; 2311 void *kaddr1, *kaddr2;
2311 2312
2312 kaddr1 = kmap_atomic(p1, KM_USER0); 2313 kaddr1 = kmap_atomic(p1);
2313 kaddr2 = kmap_atomic(p2, KM_USER1); 2314 kaddr2 = kmap_atomic(p2);
2314 copy_page(buf, kaddr1); 2315 copy_page(buf, kaddr1);
2315 copy_page(kaddr1, kaddr2); 2316 copy_page(kaddr1, kaddr2);
2316 copy_page(kaddr2, buf); 2317 copy_page(kaddr2, buf);
2317 kunmap_atomic(kaddr2, KM_USER1); 2318 kunmap_atomic(kaddr2);
2318 kunmap_atomic(kaddr1, KM_USER0); 2319 kunmap_atomic(kaddr1);
2319} 2320}
2320 2321
2321/** 2322/**
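
Editor's note: the snapshot.c changes are a mechanical conversion to the slot-less kmap_atomic()/kunmap_atomic() API: the KM_USER0/KM_USER1 arguments are dropped, and unmaps are simply issued in reverse order of the maps. A minimal kernel-style sketch of the new convention (not taken from the patch):

#include <linux/highmem.h>
#include <linux/mm.h>

/* Copy one (possibly highmem) page into another using stacked atomic maps. */
static void copy_one_page(struct page *dst_page, struct page *src_page)
{
        void *src = kmap_atomic(src_page);
        void *dst = kmap_atomic(dst_page);

        copy_page(dst, src);

        kunmap_atomic(dst);     /* unmap in reverse order of mapping */
        kunmap_atomic(src);
}
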
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 4fd51beed879..396d262b8fd0 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -12,7 +12,6 @@
12#include <linux/delay.h> 12#include <linux/delay.h>
13#include <linux/errno.h> 13#include <linux/errno.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/kmod.h>
16#include <linux/console.h> 15#include <linux/console.h>
17#include <linux/cpu.h> 16#include <linux/cpu.h>
18#include <linux/syscalls.h> 17#include <linux/syscalls.h>
@@ -37,8 +36,8 @@ const char *const pm_states[PM_SUSPEND_MAX] = {
37static const struct platform_suspend_ops *suspend_ops; 36static const struct platform_suspend_ops *suspend_ops;
38 37
39/** 38/**
40 * suspend_set_ops - Set the global suspend method table. 39 * suspend_set_ops - Set the global suspend method table.
41 * @ops: Pointer to ops structure. 40 * @ops: Suspend operations to use.
42 */ 41 */
43void suspend_set_ops(const struct platform_suspend_ops *ops) 42void suspend_set_ops(const struct platform_suspend_ops *ops)
44{ 43{
@@ -58,11 +57,11 @@ bool valid_state(suspend_state_t state)
58} 57}
59 58
60/** 59/**
61 * suspend_valid_only_mem - generic memory-only valid callback 60 * suspend_valid_only_mem - Generic memory-only valid callback.
62 * 61 *
63 * Platform drivers that implement mem suspend only and only need 62 * Platform drivers that implement mem suspend only and only need to check for
64 * to check for that in their .valid callback can use this instead 63 * that in their .valid() callback can use this instead of rolling their own
65 * of rolling their own .valid callback. 64 * .valid() callback.
66 */ 65 */
67int suspend_valid_only_mem(suspend_state_t state) 66int suspend_valid_only_mem(suspend_state_t state)
68{ 67{
@@ -83,10 +82,11 @@ static int suspend_test(int level)
83} 82}
84 83
85/** 84/**
86 * suspend_prepare - Do prep work before entering low-power state. 85 * suspend_prepare - Prepare for entering system sleep state.
87 * 86 *
88 * This is common code that is called for each state that we're entering. 87 * Common code run for every system sleep state that can be entered (except for
89 * Run suspend notifiers, allocate a console and stop all processes. 88 * hibernation). Run suspend notifiers, allocate the "suspend" console and
89 * freeze processes.
90 */ 90 */
91static int suspend_prepare(void) 91static int suspend_prepare(void)
92{ 92{
@@ -101,17 +101,12 @@ static int suspend_prepare(void)
101 if (error) 101 if (error)
102 goto Finish; 102 goto Finish;
103 103
104 error = usermodehelper_disable();
105 if (error)
106 goto Finish;
107
108 error = suspend_freeze_processes(); 104 error = suspend_freeze_processes();
109 if (!error) 105 if (!error)
110 return 0; 106 return 0;
111 107
112 suspend_stats.failed_freeze++; 108 suspend_stats.failed_freeze++;
113 dpm_save_failed_step(SUSPEND_FREEZE); 109 dpm_save_failed_step(SUSPEND_FREEZE);
114 usermodehelper_enable();
115 Finish: 110 Finish:
116 pm_notifier_call_chain(PM_POST_SUSPEND); 111 pm_notifier_call_chain(PM_POST_SUSPEND);
117 pm_restore_console(); 112 pm_restore_console();
@@ -131,9 +126,9 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
131} 126}
132 127
133/** 128/**
134 * suspend_enter - enter the desired system sleep state. 129 * suspend_enter - Make the system enter the given sleep state.
135 * @state: State to enter 130 * @state: System sleep state to enter.
136 * @wakeup: Returns information that suspend should not be entered again. 131 * @wakeup: Returns information that the sleep state should not be re-entered.
137 * 132 *
138 * This function should be called after devices have been suspended. 133 * This function should be called after devices have been suspended.
139 */ 134 */
@@ -147,7 +142,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
147 goto Platform_finish; 142 goto Platform_finish;
148 } 143 }
149 144
150 error = dpm_suspend_noirq(PMSG_SUSPEND); 145 error = dpm_suspend_end(PMSG_SUSPEND);
151 if (error) { 146 if (error) {
152 printk(KERN_ERR "PM: Some devices failed to power down\n"); 147 printk(KERN_ERR "PM: Some devices failed to power down\n");
153 goto Platform_finish; 148 goto Platform_finish;
@@ -189,7 +184,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
189 if (suspend_ops->wake) 184 if (suspend_ops->wake)
190 suspend_ops->wake(); 185 suspend_ops->wake();
191 186
192 dpm_resume_noirq(PMSG_RESUME); 187 dpm_resume_start(PMSG_RESUME);
193 188
194 Platform_finish: 189 Platform_finish:
195 if (suspend_ops->finish) 190 if (suspend_ops->finish)
@@ -199,9 +194,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
199} 194}
200 195
201/** 196/**
202 * suspend_devices_and_enter - suspend devices and enter the desired system 197 * suspend_devices_and_enter - Suspend devices and enter system sleep state.
203 * sleep state. 198 * @state: System sleep state to enter.
204 * @state: state to enter
205 */ 199 */
206int suspend_devices_and_enter(suspend_state_t state) 200int suspend_devices_and_enter(suspend_state_t state)
207{ 201{
@@ -251,30 +245,27 @@ int suspend_devices_and_enter(suspend_state_t state)
251} 245}
252 246
253/** 247/**
254 * suspend_finish - Do final work before exiting suspend sequence. 248 * suspend_finish - Clean up before finishing the suspend sequence.
255 * 249 *
256 * Call platform code to clean up, restart processes, and free the 250 * Call platform code to clean up, restart processes, and free the console that
257 * console that we've allocated. This is not called for suspend-to-disk. 251 * we've allocated. This routine is not called for hibernation.
258 */ 252 */
259static void suspend_finish(void) 253static void suspend_finish(void)
260{ 254{
261 suspend_thaw_processes(); 255 suspend_thaw_processes();
262 usermodehelper_enable();
263 pm_notifier_call_chain(PM_POST_SUSPEND); 256 pm_notifier_call_chain(PM_POST_SUSPEND);
264 pm_restore_console(); 257 pm_restore_console();
265} 258}
266 259
267/** 260/**
268 * enter_state - Do common work of entering low-power state. 261 * enter_state - Do common work needed to enter system sleep state.
269 * @state: pm_state structure for state we're entering. 262 * @state: System sleep state to enter.
270 * 263 *
271 * Make sure we're the only ones trying to enter a sleep state. Fail 264 * Make sure that no one else is trying to put the system into a sleep state.
272 * if someone has beat us to it, since we don't want anything weird to 265 * Fail if that's not the case. Otherwise, prepare for system suspend, make the
273 * happen when we wake up. 266 * system enter the given sleep state and clean up after wakeup.
274 * Then, do the setup for suspend, enter the state, and cleaup (after
275 * we've woken up).
276 */ 267 */
277int enter_state(suspend_state_t state) 268static int enter_state(suspend_state_t state)
278{ 269{
279 int error; 270 int error;
280 271
@@ -310,24 +301,26 @@ int enter_state(suspend_state_t state)
310} 301}
311 302
312/** 303/**
313 * pm_suspend - Externally visible function for suspending system. 304 * pm_suspend - Externally visible function for suspending the system.
314 * @state: Enumerated value of state to enter. 305 * @state: System sleep state to enter.
315 * 306 *
316 * Determine whether or not value is within range, get state 307 * Check if the value of @state represents one of the supported states,
317 * structure, and enter (above). 308 * execute enter_state() and update system suspend statistics.
318 */ 309 */
319int pm_suspend(suspend_state_t state) 310int pm_suspend(suspend_state_t state)
320{ 311{
321 int ret; 312 int error;
322 if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) { 313
323 ret = enter_state(state); 314 if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
324 if (ret) { 315 return -EINVAL;
325 suspend_stats.fail++; 316
326 dpm_save_failed_errno(ret); 317 error = enter_state(state);
327 } else 318 if (error) {
328 suspend_stats.success++; 319 suspend_stats.fail++;
329 return ret; 320 dpm_save_failed_errno(error);
321 } else {
322 suspend_stats.success++;
330 } 323 }
331 return -EINVAL; 324 return error;
332} 325}
333EXPORT_SYMBOL(pm_suspend); 326EXPORT_SYMBOL(pm_suspend);
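
Editor's note: after this rework, state_store() in main.c simply calls pm_suspend(), which rejects invalid states up front and updates suspend_stats in one place. A compact userspace model of the new control flow, included only to make the error and statistics handling explicit:

#include <stdio.h>

enum { PM_SUSPEND_ON, PM_SUSPEND_STANDBY, PM_SUSPEND_MEM, PM_SUSPEND_MAX };

static struct { int success, fail; } suspend_stats;

/* stand-in for the real enter_state(); pretend only "mem" works */
static int enter_state_model(int state)
{
        return state == PM_SUSPEND_MEM ? 0 : -5 /* -EIO */;
}

static int pm_suspend_model(int state)
{
        int error;

        if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
                return -22;     /* -EINVAL: rejected before touching the stats */

        error = enter_state_model(state);
        if (error)
                suspend_stats.fail++;
        else
                suspend_stats.success++;
        return error;
}

int main(void)
{
        pm_suspend_model(PM_SUSPEND_MEM);
        pm_suspend_model(PM_SUSPEND_MAX);   /* invalid, not counted */
        printf("success=%d fail=%d\n", suspend_stats.success, suspend_stats.fail);
        return 0;
}
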
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 3e100075b13c..91b0fd021a95 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -12,7 +12,6 @@
12#include <linux/suspend.h> 12#include <linux/suspend.h>
13#include <linux/syscalls.h> 13#include <linux/syscalls.h>
14#include <linux/reboot.h> 14#include <linux/reboot.h>
15#include <linux/kmod.h>
16#include <linux/string.h> 15#include <linux/string.h>
17#include <linux/device.h> 16#include <linux/device.h>
18#include <linux/miscdevice.h> 17#include <linux/miscdevice.h>
@@ -222,14 +221,8 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
222 sys_sync(); 221 sys_sync();
223 printk("done.\n"); 222 printk("done.\n");
224 223
225 error = usermodehelper_disable();
226 if (error)
227 break;
228
229 error = freeze_processes(); 224 error = freeze_processes();
230 if (error) 225 if (!error)
231 usermodehelper_enable();
232 else
233 data->frozen = 1; 226 data->frozen = 1;
234 break; 227 break;
235 228
@@ -238,7 +231,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
238 break; 231 break;
239 pm_restore_gfp_mask(); 232 pm_restore_gfp_mask();
240 thaw_processes(); 233 thaw_processes();
241 usermodehelper_enable();
242 data->frozen = 0; 234 data->frozen = 0;
243 break; 235 break;
244 236
@@ -249,16 +241,10 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
249 } 241 }
250 pm_restore_gfp_mask(); 242 pm_restore_gfp_mask();
251 error = hibernation_snapshot(data->platform_support); 243 error = hibernation_snapshot(data->platform_support);
252 if (error) { 244 if (!error) {
253 thaw_kernel_threads();
254 } else {
255 error = put_user(in_suspend, (int __user *)arg); 245 error = put_user(in_suspend, (int __user *)arg);
256 if (!error && !freezer_test_done) 246 data->ready = !freezer_test_done && !error;
257 data->ready = 1; 247 freezer_test_done = false;
258 if (freezer_test_done) {
259 freezer_test_done = false;
260 thaw_kernel_threads();
261 }
262 } 248 }
263 break; 249 break;
264 250
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 00ab2ca5ed11..ee8d49b9c309 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -231,26 +231,22 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
231} 231}
232 232
233static int ptrace_attach(struct task_struct *task, long request, 233static int ptrace_attach(struct task_struct *task, long request,
234 unsigned long addr,
234 unsigned long flags) 235 unsigned long flags)
235{ 236{
236 bool seize = (request == PTRACE_SEIZE); 237 bool seize = (request == PTRACE_SEIZE);
237 int retval; 238 int retval;
238 239
239 /*
240 * SEIZE will enable new ptrace behaviors which will be implemented
241 * gradually. SEIZE_DEVEL is used to prevent applications
242 * expecting full SEIZE behaviors trapping on kernel commits which
243 * are still in the process of implementing them.
244 *
245 * Only test programs for new ptrace behaviors being implemented
246 * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO.
247 *
248 * Once SEIZE behaviors are completely implemented, this flag and
249 * the following test will be removed.
250 */
251 retval = -EIO; 240 retval = -EIO;
252 if (seize && !(flags & PTRACE_SEIZE_DEVEL)) 241 if (seize) {
253 goto out; 242 if (addr != 0)
243 goto out;
244 if (flags & ~(unsigned long)PTRACE_O_MASK)
245 goto out;
246 flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT);
247 } else {
248 flags = PT_PTRACED;
249 }
254 250
255 audit_ptrace(task); 251 audit_ptrace(task);
256 252
@@ -262,7 +258,7 @@ static int ptrace_attach(struct task_struct *task, long request,
262 258
263 /* 259 /*
264 * Protect exec's credential calculations against our interference; 260 * Protect exec's credential calculations against our interference;
265 * interference; SUID, SGID and LSM creds get determined differently 261 * SUID, SGID and LSM creds get determined differently
266 * under ptrace. 262 * under ptrace.
267 */ 263 */
268 retval = -ERESTARTNOINTR; 264 retval = -ERESTARTNOINTR;
@@ -282,11 +278,11 @@ static int ptrace_attach(struct task_struct *task, long request,
282 if (task->ptrace) 278 if (task->ptrace)
283 goto unlock_tasklist; 279 goto unlock_tasklist;
284 280
285 task->ptrace = PT_PTRACED;
286 if (seize) 281 if (seize)
287 task->ptrace |= PT_SEIZED; 282 flags |= PT_SEIZED;
288 if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) 283 if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE))
289 task->ptrace |= PT_PTRACE_CAP; 284 flags |= PT_PTRACE_CAP;
285 task->ptrace = flags;
290 286
291 __ptrace_link(task, current); 287 __ptrace_link(task, current);
292 288
@@ -528,30 +524,18 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
528 524
529static int ptrace_setoptions(struct task_struct *child, unsigned long data) 525static int ptrace_setoptions(struct task_struct *child, unsigned long data)
530{ 526{
531 child->ptrace &= ~PT_TRACE_MASK; 527 unsigned flags;
532 528
533 if (data & PTRACE_O_TRACESYSGOOD) 529 if (data & ~(unsigned long)PTRACE_O_MASK)
534 child->ptrace |= PT_TRACESYSGOOD; 530 return -EINVAL;
535
536 if (data & PTRACE_O_TRACEFORK)
537 child->ptrace |= PT_TRACE_FORK;
538
539 if (data & PTRACE_O_TRACEVFORK)
540 child->ptrace |= PT_TRACE_VFORK;
541
542 if (data & PTRACE_O_TRACECLONE)
543 child->ptrace |= PT_TRACE_CLONE;
544
545 if (data & PTRACE_O_TRACEEXEC)
546 child->ptrace |= PT_TRACE_EXEC;
547
548 if (data & PTRACE_O_TRACEVFORKDONE)
549 child->ptrace |= PT_TRACE_VFORK_DONE;
550 531
551 if (data & PTRACE_O_TRACEEXIT) 532 /* Avoid intermediate state when all opts are cleared */
552 child->ptrace |= PT_TRACE_EXIT; 533 flags = child->ptrace;
534 flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT);
535 flags |= (data << PT_OPT_FLAG_SHIFT);
536 child->ptrace = flags;
553 537
554 return (data & ~PTRACE_O_MASK) ? -EINVAL : 0; 538 return 0;
555} 539}
556 540
557static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) 541static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info)
@@ -891,7 +875,7 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
891 } 875 }
892 876
893 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { 877 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
894 ret = ptrace_attach(child, request, data); 878 ret = ptrace_attach(child, request, addr, data);
895 /* 879 /*
896 * Some architectures need to do book-keeping after 880 * Some architectures need to do book-keeping after
897 * a ptrace attach. 881 * a ptrace attach.
@@ -1034,7 +1018,7 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
1034 } 1018 }
1035 1019
1036 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { 1020 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
1037 ret = ptrace_attach(child, request, data); 1021 ret = ptrace_attach(child, request, addr, data);
1038 /* 1022 /*
1039 * Some architectures need to do book-keeping after 1023 * Some architectures need to do book-keeping after
1040 * a ptrace attach. 1024 * a ptrace attach.
diff --git a/kernel/resource.c b/kernel/resource.c
index 7640b3a947d0..7e8ea66a8c01 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -749,6 +749,7 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
749 write_unlock(&resource_lock); 749 write_unlock(&resource_lock);
750 return result; 750 return result;
751} 751}
752EXPORT_SYMBOL(adjust_resource);
752 753
753static void __init __reserve_region_with_split(struct resource *root, 754static void __init __reserve_region_with_split(struct resource *root,
754 resource_size_t start, resource_size_t end, 755 resource_size_t start, resource_size_t end,
@@ -792,8 +793,6 @@ void __init reserve_region_with_split(struct resource *root,
792 write_unlock(&resource_lock); 793 write_unlock(&resource_lock);
793} 794}
794 795
795EXPORT_SYMBOL(adjust_resource);
796
797/** 796/**
798 * resource_alignment - calculate resource's alignment 797 * resource_alignment - calculate resource's alignment
799 * @res: resource pointer 798 * @res: resource pointer
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index b152f74f02de..6850f53e02d8 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -10,7 +10,6 @@
10#include <linux/export.h> 10#include <linux/export.h>
11#include <linux/rwsem.h> 11#include <linux/rwsem.h>
12 12
13#include <asm/system.h>
14#include <linux/atomic.h> 13#include <linux/atomic.h>
15 14
16/* 15/*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a35cb8dbd8c4..4603b9d8f30a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -71,7 +71,9 @@
71#include <linux/ftrace.h> 71#include <linux/ftrace.h>
72#include <linux/slab.h> 72#include <linux/slab.h>
73#include <linux/init_task.h> 73#include <linux/init_task.h>
74#include <linux/binfmts.h>
74 75
76#include <asm/switch_to.h>
75#include <asm/tlb.h> 77#include <asm/tlb.h>
76#include <asm/irq_regs.h> 78#include <asm/irq_regs.h>
77#include <asm/mutex.h> 79#include <asm/mutex.h>
@@ -1263,29 +1265,59 @@ EXPORT_SYMBOL_GPL(kick_process);
1263 */ 1265 */
1264static int select_fallback_rq(int cpu, struct task_struct *p) 1266static int select_fallback_rq(int cpu, struct task_struct *p)
1265{ 1267{
1266 int dest_cpu;
1267 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); 1268 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
1269 enum { cpuset, possible, fail } state = cpuset;
1270 int dest_cpu;
1268 1271
1269 /* Look for allowed, online CPU in same node. */ 1272 /* Look for allowed, online CPU in same node. */
1270 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) 1273 for_each_cpu(dest_cpu, nodemask) {
1274 if (!cpu_online(dest_cpu))
1275 continue;
1276 if (!cpu_active(dest_cpu))
1277 continue;
1271 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 1278 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
1272 return dest_cpu; 1279 return dest_cpu;
1280 }
1281
1282 for (;;) {
1283 /* Any allowed, online CPU? */
1284 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
1285 if (!cpu_online(dest_cpu))
1286 continue;
1287 if (!cpu_active(dest_cpu))
1288 continue;
1289 goto out;
1290 }
1273 1291
1274 /* Any allowed, online CPU? */ 1292 switch (state) {
1275 dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask); 1293 case cpuset:
1276 if (dest_cpu < nr_cpu_ids) 1294 /* No more Mr. Nice Guy. */
1277 return dest_cpu; 1295 cpuset_cpus_allowed_fallback(p);
1296 state = possible;
1297 break;
1278 1298
1279 /* No more Mr. Nice Guy. */ 1299 case possible:
1280 dest_cpu = cpuset_cpus_allowed_fallback(p); 1300 do_set_cpus_allowed(p, cpu_possible_mask);
1281 /* 1301 state = fail;
1282 * Don't tell them about moving exiting tasks or 1302 break;
1283 * kernel threads (both mm NULL), since they never 1303
1284 * leave kernel. 1304 case fail:
1285 */ 1305 BUG();
1286 if (p->mm && printk_ratelimit()) { 1306 break;
1287 printk_sched("process %d (%s) no longer affine to cpu%d\n", 1307 }
1288 task_pid_nr(p), p->comm, cpu); 1308 }
1309
1310out:
1311 if (state != cpuset) {
1312 /*
1313 * Don't tell them about moving exiting tasks or
1314 * kernel threads (both mm NULL), since they never
1315 * leave kernel.
1316 */
1317 if (p->mm && printk_ratelimit()) {
1318 printk_sched("process %d (%s) no longer affine to cpu%d\n",
1319 task_pid_nr(p), p->comm, cpu);
1320 }
1289 } 1321 }
1290 1322
1291 return dest_cpu; 1323 return dest_cpu;
@@ -1932,6 +1964,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1932 local_irq_enable(); 1964 local_irq_enable();
1933#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 1965#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1934 finish_lock_switch(rq, prev); 1966 finish_lock_switch(rq, prev);
1967 finish_arch_post_lock_switch();
1935 1968
1936 fire_sched_in_preempt_notifiers(current); 1969 fire_sched_in_preempt_notifiers(current);
1937 if (mm) 1970 if (mm)
@@ -3069,8 +3102,6 @@ EXPORT_SYMBOL(sub_preempt_count);
3069 */ 3102 */
3070static noinline void __schedule_bug(struct task_struct *prev) 3103static noinline void __schedule_bug(struct task_struct *prev)
3071{ 3104{
3072 struct pt_regs *regs = get_irq_regs();
3073
3074 if (oops_in_progress) 3105 if (oops_in_progress)
3075 return; 3106 return;
3076 3107
@@ -3081,11 +3112,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
3081 print_modules(); 3112 print_modules();
3082 if (irqs_disabled()) 3113 if (irqs_disabled())
3083 print_irqtrace_events(prev); 3114 print_irqtrace_events(prev);
3084 3115 dump_stack();
3085 if (regs)
3086 show_regs(regs);
3087 else
3088 dump_stack();
3089} 3116}
3090 3117
3091/* 3118/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 94340c7544a9..0d97ebdc58f0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -416,8 +416,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
416 416
417#endif /* CONFIG_FAIR_GROUP_SCHED */ 417#endif /* CONFIG_FAIR_GROUP_SCHED */
418 418
419static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 419static __always_inline
420 unsigned long delta_exec); 420void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
421 421
422/************************************************************** 422/**************************************************************
423 * Scheduling class tree data structure manipulation methods: 423 * Scheduling class tree data structure manipulation methods:
@@ -1162,7 +1162,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1162 __clear_buddies_skip(se); 1162 __clear_buddies_skip(se);
1163} 1163}
1164 1164
1165static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); 1165static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1166 1166
1167static void 1167static void
1168dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 1168dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
@@ -1546,8 +1546,8 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1546 resched_task(rq_of(cfs_rq)->curr); 1546 resched_task(rq_of(cfs_rq)->curr);
1547} 1547}
1548 1548
1549static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 1549static __always_inline
1550 unsigned long delta_exec) 1550void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
1551{ 1551{
1552 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) 1552 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
1553 return; 1553 return;
@@ -2073,11 +2073,11 @@ void unthrottle_offline_cfs_rqs(struct rq *rq)
2073} 2073}
2074 2074
2075#else /* CONFIG_CFS_BANDWIDTH */ 2075#else /* CONFIG_CFS_BANDWIDTH */
2076static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 2076static __always_inline
2077 unsigned long delta_exec) {} 2077void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {}
2078static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 2078static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2079static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} 2079static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
2080static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 2080static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2081 2081
2082static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) 2082static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2083{ 2083{
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index b60dad720173..44af55e6d5d0 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1428,7 +1428,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1428next_idx: 1428next_idx:
1429 if (idx >= MAX_RT_PRIO) 1429 if (idx >= MAX_RT_PRIO)
1430 continue; 1430 continue;
1431 if (next && next->prio < idx) 1431 if (next && next->prio <= idx)
1432 continue; 1432 continue;
1433 list_for_each_entry(rt_se, array->queue + idx, run_list) { 1433 list_for_each_entry(rt_se, array->queue + idx, run_list) {
1434 struct task_struct *p; 1434 struct task_struct *p;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 42b1f304b044..fb3acba4d52e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -681,6 +681,9 @@ static inline int task_running(struct rq *rq, struct task_struct *p)
681#ifndef finish_arch_switch 681#ifndef finish_arch_switch
682# define finish_arch_switch(prev) do { } while (0) 682# define finish_arch_switch(prev) do { } while (0)
683#endif 683#endif
684#ifndef finish_arch_post_lock_switch
685# define finish_arch_post_lock_switch() do { } while (0)
686#endif
684 687
685#ifndef __ARCH_WANT_UNLOCKED_CTXSW 688#ifndef __ARCH_WANT_UNLOCKED_CTXSW
686static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 689static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
diff --git a/kernel/signal.c b/kernel/signal.c
index e76001ccf5cd..17afcaf582d0 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -36,6 +36,7 @@
36#include <asm/uaccess.h> 36#include <asm/uaccess.h>
37#include <asm/unistd.h> 37#include <asm/unistd.h>
38#include <asm/siginfo.h> 38#include <asm/siginfo.h>
39#include <asm/cacheflush.h>
39#include "audit.h" /* audit_signal_info() */ 40#include "audit.h" /* audit_signal_info() */
40 41
41/* 42/*
@@ -58,21 +59,20 @@ static int sig_handler_ignored(void __user *handler, int sig)
58 (handler == SIG_DFL && sig_kernel_ignore(sig)); 59 (handler == SIG_DFL && sig_kernel_ignore(sig));
59} 60}
60 61
61static int sig_task_ignored(struct task_struct *t, int sig, 62static int sig_task_ignored(struct task_struct *t, int sig, bool force)
62 int from_ancestor_ns)
63{ 63{
64 void __user *handler; 64 void __user *handler;
65 65
66 handler = sig_handler(t, sig); 66 handler = sig_handler(t, sig);
67 67
68 if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && 68 if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
69 handler == SIG_DFL && !from_ancestor_ns) 69 handler == SIG_DFL && !force)
70 return 1; 70 return 1;
71 71
72 return sig_handler_ignored(handler, sig); 72 return sig_handler_ignored(handler, sig);
73} 73}
74 74
75static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns) 75static int sig_ignored(struct task_struct *t, int sig, bool force)
76{ 76{
77 /* 77 /*
78 * Blocked signals are never ignored, since the 78 * Blocked signals are never ignored, since the
@@ -82,7 +82,7 @@ static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)
82 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) 82 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
83 return 0; 83 return 0;
84 84
85 if (!sig_task_ignored(t, sig, from_ancestor_ns)) 85 if (!sig_task_ignored(t, sig, force))
86 return 0; 86 return 0;
87 87
88 /* 88 /*
@@ -855,7 +855,7 @@ static void ptrace_trap_notify(struct task_struct *t)
855 * Returns true if the signal should be actually delivered, otherwise 855 * Returns true if the signal should be actually delivered, otherwise
856 * it should be dropped. 856 * it should be dropped.
857 */ 857 */
858static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) 858static int prepare_signal(int sig, struct task_struct *p, bool force)
859{ 859{
860 struct signal_struct *signal = p->signal; 860 struct signal_struct *signal = p->signal;
861 struct task_struct *t; 861 struct task_struct *t;
@@ -915,7 +915,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
915 } 915 }
916 } 916 }
917 917
918 return !sig_ignored(p, sig, from_ancestor_ns); 918 return !sig_ignored(p, sig, force);
919} 919}
920 920
921/* 921/*
@@ -1059,7 +1059,8 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1059 assert_spin_locked(&t->sighand->siglock); 1059 assert_spin_locked(&t->sighand->siglock);
1060 1060
1061 result = TRACE_SIGNAL_IGNORED; 1061 result = TRACE_SIGNAL_IGNORED;
1062 if (!prepare_signal(sig, t, from_ancestor_ns)) 1062 if (!prepare_signal(sig, t,
1063 from_ancestor_ns || (info == SEND_SIG_FORCED)))
1063 goto ret; 1064 goto ret;
1064 1065
1065 pending = group ? &t->signal->shared_pending : &t->pending; 1066 pending = group ? &t->signal->shared_pending : &t->pending;
@@ -1601,7 +1602,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1601 1602
1602 ret = 1; /* the signal is ignored */ 1603 ret = 1; /* the signal is ignored */
1603 result = TRACE_SIGNAL_IGNORED; 1604 result = TRACE_SIGNAL_IGNORED;
1604 if (!prepare_signal(sig, t, 0)) 1605 if (!prepare_signal(sig, t, false))
1605 goto out; 1606 goto out;
1606 1607
1607 ret = 0; 1608 ret = 0;
diff --git a/kernel/smp.c b/kernel/smp.c
index db197d60489b..2f8b10ecf759 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -701,3 +701,93 @@ int on_each_cpu(void (*func) (void *info), void *info, int wait)
701 return ret; 701 return ret;
702} 702}
703EXPORT_SYMBOL(on_each_cpu); 703EXPORT_SYMBOL(on_each_cpu);
704
705/**
706 * on_each_cpu_mask(): Run a function on processors specified by
707 * cpumask, which may include the local processor.
708 * @mask: The set of cpus to run on (only runs on online subset).
709 * @func: The function to run. This must be fast and non-blocking.
710 * @info: An arbitrary pointer to pass to the function.
711 * @wait: If true, wait (atomically) until function has completed
712 * on other CPUs.
713 *
714 * If @wait is true, then returns once @func has returned.
715 *
716 * You must not call this function with disabled interrupts or
717 * from a hardware interrupt handler or from a bottom half handler.
718 */
719void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
720 void *info, bool wait)
721{
722 int cpu = get_cpu();
723
724 smp_call_function_many(mask, func, info, wait);
725 if (cpumask_test_cpu(cpu, mask)) {
726 local_irq_disable();
727 func(info);
728 local_irq_enable();
729 }
730 put_cpu();
731}
732EXPORT_SYMBOL(on_each_cpu_mask);
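The kernel-doc above spells out the contract for the new on_each_cpu_mask() helper: process context, interrupts enabled, and a fast, non-blocking callback. A minimal sketch of a caller, with hypothetical function and counter names, could be:

    #include <linux/smp.h>
    #include <linux/cpumask.h>
    #include <linux/topology.h>
    #include <linux/atomic.h>

    static void bump_counter(void *info)
    {
            atomic_inc(info);               /* runs on each CPU with IRQs off */
    }

    static void count_node0_cpus(atomic_t *counter)
    {
            /* Only the online subset of the mask is actually sent an IPI. */
            on_each_cpu_mask(cpumask_of_node(0), bump_counter, counter, true);
    }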
733
734/*
735 * on_each_cpu_cond(): Call a function on each processor for which
736 * the supplied function cond_func returns true, optionally waiting
737 * for all the required CPUs to finish. This may include the local
738 * processor.
739 * @cond_func: A callback function that is passed a cpu id and
740 * the the info parameter. The function is called
741 * with preemption disabled. The function should
742 * return a blooean value indicating whether to IPI
743 * the specified CPU.
744 * @func: The function to run on all applicable CPUs.
745 * This must be fast and non-blocking.
746 * @info: An arbitrary pointer to pass to both functions.
747 * @wait: If true, wait (atomically) until function has
748 * completed on other CPUs.
749 * @gfp_flags: GFP flags to use when allocating the cpumask
750 * used internally by the function.
751 *
752 * The function might sleep if the GFP flags indicate a non-atomic
753 * allocation is allowed.
754 *
755 * Preemption is disabled to protect against CPUs going offline, but not against
756 * CPUs coming online; those will not be seen or sent an IPI during the call.
757 *
758 * You must not call this function with disabled interrupts or
759 * from a hardware interrupt handler or from a bottom half handler.
760 */
761void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
762 smp_call_func_t func, void *info, bool wait,
763 gfp_t gfp_flags)
764{
765 cpumask_var_t cpus;
766 int cpu, ret;
767
768 might_sleep_if(gfp_flags & __GFP_WAIT);
769
770 if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) {
771 preempt_disable();
772 for_each_online_cpu(cpu)
773 if (cond_func(cpu, info))
774 cpumask_set_cpu(cpu, cpus);
775 on_each_cpu_mask(cpus, func, info, wait);
776 preempt_enable();
777 free_cpumask_var(cpus);
778 } else {
779 /*
780 * No free cpumask, bother. No matter, we'll
781 * just have to IPI them one by one.
782 */
783 preempt_disable();
784 for_each_online_cpu(cpu)
785 if (cond_func(cpu, info)) {
786 ret = smp_call_function_single(cpu, func,
787 info, wait);
788					WARN_ON_ONCE(ret); /* warn if the cross-call failed */
789 }
790 preempt_enable();
791 }
792}
793EXPORT_SYMBOL(on_each_cpu_cond);
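on_each_cpu_cond() splits the work into a predicate, run with preemption disabled for every online CPU, and the actual cross-call. A hedged sketch with a hypothetical per-CPU flag:

    #include <linux/smp.h>
    #include <linux/percpu.h>
    #include <linux/gfp.h>

    static DEFINE_PER_CPU(bool, needs_flush);

    static bool cpu_needs_flush(int cpu, void *info)
    {
            return per_cpu(needs_flush, cpu);       /* predicate, preemption disabled */
    }

    static void do_flush(void *info)
    {
            this_cpu_write(needs_flush, false);     /* runs on each selected CPU */
    }

    static void flush_marked_cpus(void)
    {
            /* GFP_KERNEL: may sleep while allocating the internal cpumask. */
            on_each_cpu_cond(cpu_needs_flush, do_flush, NULL, true, GFP_KERNEL);
    }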
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 84c7d96918bf..5cdd8065a3ce 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -163,7 +163,7 @@ void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
163EXPORT_SYMBOL(_raw_spin_lock_bh); 163EXPORT_SYMBOL(_raw_spin_lock_bh);
164#endif 164#endif
165 165
166#ifndef CONFIG_INLINE_SPIN_UNLOCK 166#ifdef CONFIG_UNINLINE_SPIN_UNLOCK
167void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) 167void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
168{ 168{
169 __raw_spin_unlock(lock); 169 __raw_spin_unlock(lock);
diff --git a/kernel/sys.c b/kernel/sys.c
index 888d227fd195..e7006eb6c1e4 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -444,6 +444,15 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
444 magic2 != LINUX_REBOOT_MAGIC2C)) 444 magic2 != LINUX_REBOOT_MAGIC2C))
445 return -EINVAL; 445 return -EINVAL;
446 446
447 /*
448 * If pid namespaces are enabled and the current task is in a child
449 * pid_namespace, the command is handled by reboot_pid_ns() which will
450 * call do_exit().
451 */
452 ret = reboot_pid_ns(task_active_pid_ns(current), cmd);
453 if (ret)
454 return ret;
455
447 /* Instead of trying to make the power_off code look like 456 /* Instead of trying to make the power_off code look like
448 * halt when pm_power_off is not set do it the easy way. 457 * halt when pm_power_off is not set do it the easy way.
449 */ 458 */
@@ -1962,6 +1971,14 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1962 case PR_SET_MM: 1971 case PR_SET_MM:
1963 error = prctl_set_mm(arg2, arg3, arg4, arg5); 1972 error = prctl_set_mm(arg2, arg3, arg4, arg5);
1964 break; 1973 break;
1974 case PR_SET_CHILD_SUBREAPER:
1975 me->signal->is_child_subreaper = !!arg2;
1976 error = 0;
1977 break;
1978 case PR_GET_CHILD_SUBREAPER:
1979 error = put_user(me->signal->is_child_subreaper,
1980 (int __user *) arg2);
1981 break;
1965 default: 1982 default:
1966 error = -EINVAL; 1983 error = -EINVAL;
1967 break; 1984 break;
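The new PR_SET_CHILD_SUBREAPER / PR_GET_CHILD_SUBREAPER pair is driven from userspace through prctl(2). A minimal sketch (the PR_* constants come from a <linux/prctl.h> that already carries this patch; error handling trimmed):

    #include <sys/prctl.h>
    #include <stdio.h>

    int main(void)
    {
            int is_subreaper = 0;

            /* Orphaned descendants are reparented to this process instead of init. */
            prctl(PR_SET_CHILD_SUBREAPER, 1L, 0L, 0L, 0L);

            /* The kernel writes the current setting through the int pointer in arg2. */
            prctl(PR_GET_CHILD_SUBREAPER, (unsigned long)&is_subreaper, 0L, 0L, 0L);
            printf("child subreaper: %d\n", is_subreaper);
            return 0;
    }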
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f487f257e05e..4ab11879aeb4 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -23,6 +23,7 @@
23#include <linux/swap.h> 23#include <linux/swap.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/sysctl.h> 25#include <linux/sysctl.h>
26#include <linux/bitmap.h>
26#include <linux/signal.h> 27#include <linux/signal.h>
27#include <linux/printk.h> 28#include <linux/printk.h>
28#include <linux/proc_fs.h> 29#include <linux/proc_fs.h>
@@ -58,6 +59,7 @@
58#include <linux/oom.h> 59#include <linux/oom.h>
59#include <linux/kmod.h> 60#include <linux/kmod.h>
60#include <linux/capability.h> 61#include <linux/capability.h>
62#include <linux/binfmts.h>
61 63
62#include <asm/uaccess.h> 64#include <asm/uaccess.h>
63#include <asm/processor.h> 65#include <asm/processor.h>
@@ -67,6 +69,9 @@
67#include <asm/stacktrace.h> 69#include <asm/stacktrace.h>
68#include <asm/io.h> 70#include <asm/io.h>
69#endif 71#endif
72#ifdef CONFIG_SPARC
73#include <asm/setup.h>
74#endif
70#ifdef CONFIG_BSD_PROCESS_ACCT 75#ifdef CONFIG_BSD_PROCESS_ACCT
71#include <linux/acct.h> 76#include <linux/acct.h>
72#endif 77#endif
@@ -141,7 +146,6 @@ static const int cap_last_cap = CAP_LAST_CAP;
141#include <linux/inotify.h> 146#include <linux/inotify.h>
142#endif 147#endif
143#ifdef CONFIG_SPARC 148#ifdef CONFIG_SPARC
144#include <asm/system.h>
145#endif 149#endif
146 150
147#ifdef CONFIG_SPARC64 151#ifdef CONFIG_SPARC64
@@ -166,7 +170,7 @@ static int proc_taint(struct ctl_table *table, int write,
166#endif 170#endif
167 171
168#ifdef CONFIG_PRINTK 172#ifdef CONFIG_PRINTK
169static int proc_dmesg_restrict(struct ctl_table *table, int write, 173static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
170 void __user *buffer, size_t *lenp, loff_t *ppos); 174 void __user *buffer, size_t *lenp, loff_t *ppos);
171#endif 175#endif
172 176
@@ -192,20 +196,6 @@ static int sysrq_sysctl_handler(ctl_table *table, int write,
192 196
193#endif 197#endif
194 198
195static struct ctl_table root_table[];
196static struct ctl_table_root sysctl_table_root;
197static struct ctl_table_header root_table_header = {
198 {{.count = 1,
199 .ctl_table = root_table,
200 .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}},
201 .root = &sysctl_table_root,
202 .set = &sysctl_table_root.default_set,
203};
204static struct ctl_table_root sysctl_table_root = {
205 .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list),
206 .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry),
207};
208
209static struct ctl_table kern_table[]; 199static struct ctl_table kern_table[];
210static struct ctl_table vm_table[]; 200static struct ctl_table vm_table[];
211static struct ctl_table fs_table[]; 201static struct ctl_table fs_table[];
@@ -222,7 +212,7 @@ int sysctl_legacy_va_layout;
222 212
223/* The default sysctl tables: */ 213/* The default sysctl tables: */
224 214
225static struct ctl_table root_table[] = { 215static struct ctl_table sysctl_base_table[] = {
226 { 216 {
227 .procname = "kernel", 217 .procname = "kernel",
228 .mode = 0555, 218 .mode = 0555,
@@ -713,7 +703,7 @@ static struct ctl_table kern_table[] = {
713 .data = &dmesg_restrict, 703 .data = &dmesg_restrict,
714 .maxlen = sizeof(int), 704 .maxlen = sizeof(int),
715 .mode = 0644, 705 .mode = 0644,
716 .proc_handler = proc_dointvec_minmax, 706 .proc_handler = proc_dointvec_minmax_sysadmin,
717 .extra1 = &zero, 707 .extra1 = &zero,
718 .extra2 = &one, 708 .extra2 = &one,
719 }, 709 },
@@ -722,7 +712,7 @@ static struct ctl_table kern_table[] = {
722 .data = &kptr_restrict, 712 .data = &kptr_restrict,
723 .maxlen = sizeof(int), 713 .maxlen = sizeof(int),
724 .mode = 0644, 714 .mode = 0644,
725 .proc_handler = proc_dmesg_restrict, 715 .proc_handler = proc_dointvec_minmax_sysadmin,
726 .extra1 = &zero, 716 .extra1 = &zero,
727 .extra2 = &two, 717 .extra2 = &two,
728 }, 718 },
@@ -1559,490 +1549,12 @@ static struct ctl_table dev_table[] = {
1559 { } 1549 { }
1560}; 1550};
1561 1551
1562static DEFINE_SPINLOCK(sysctl_lock); 1552int __init sysctl_init(void)
1563
1564/* called under sysctl_lock */
1565static int use_table(struct ctl_table_header *p)
1566{
1567 if (unlikely(p->unregistering))
1568 return 0;
1569 p->used++;
1570 return 1;
1571}
1572
1573/* called under sysctl_lock */
1574static void unuse_table(struct ctl_table_header *p)
1575{
1576 if (!--p->used)
1577 if (unlikely(p->unregistering))
1578 complete(p->unregistering);
1579}
1580
1581/* called under sysctl_lock, will reacquire if has to wait */
1582static void start_unregistering(struct ctl_table_header *p)
1583{
1584 /*
1585 * if p->used is 0, nobody will ever touch that entry again;
1586 * we'll eliminate all paths to it before dropping sysctl_lock
1587 */
1588 if (unlikely(p->used)) {
1589 struct completion wait;
1590 init_completion(&wait);
1591 p->unregistering = &wait;
1592 spin_unlock(&sysctl_lock);
1593 wait_for_completion(&wait);
1594 spin_lock(&sysctl_lock);
1595 } else {
1596 /* anything non-NULL; we'll never dereference it */
1597 p->unregistering = ERR_PTR(-EINVAL);
1598 }
1599 /*
1600 * do not remove from the list until nobody holds it; walking the
1601 * list in do_sysctl() relies on that.
1602 */
1603 list_del_init(&p->ctl_entry);
1604}
1605
1606void sysctl_head_get(struct ctl_table_header *head)
1607{
1608 spin_lock(&sysctl_lock);
1609 head->count++;
1610 spin_unlock(&sysctl_lock);
1611}
1612
1613void sysctl_head_put(struct ctl_table_header *head)
1614{
1615 spin_lock(&sysctl_lock);
1616 if (!--head->count)
1617 kfree_rcu(head, rcu);
1618 spin_unlock(&sysctl_lock);
1619}
1620
1621struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
1622{
1623 if (!head)
1624 BUG();
1625 spin_lock(&sysctl_lock);
1626 if (!use_table(head))
1627 head = ERR_PTR(-ENOENT);
1628 spin_unlock(&sysctl_lock);
1629 return head;
1630}
1631
1632void sysctl_head_finish(struct ctl_table_header *head)
1633{
1634 if (!head)
1635 return;
1636 spin_lock(&sysctl_lock);
1637 unuse_table(head);
1638 spin_unlock(&sysctl_lock);
1639}
1640
1641static struct ctl_table_set *
1642lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
1643{
1644 struct ctl_table_set *set = &root->default_set;
1645 if (root->lookup)
1646 set = root->lookup(root, namespaces);
1647 return set;
1648}
1649
1650static struct list_head *
1651lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces)
1652{
1653 struct ctl_table_set *set = lookup_header_set(root, namespaces);
1654 return &set->list;
1655}
1656
1657struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces,
1658 struct ctl_table_header *prev)
1659{ 1553{
1660 struct ctl_table_root *root; 1554 register_sysctl_table(sysctl_base_table);
1661 struct list_head *header_list;
1662 struct ctl_table_header *head;
1663 struct list_head *tmp;
1664
1665 spin_lock(&sysctl_lock);
1666 if (prev) {
1667 head = prev;
1668 tmp = &prev->ctl_entry;
1669 unuse_table(prev);
1670 goto next;
1671 }
1672 tmp = &root_table_header.ctl_entry;
1673 for (;;) {
1674 head = list_entry(tmp, struct ctl_table_header, ctl_entry);
1675
1676 if (!use_table(head))
1677 goto next;
1678 spin_unlock(&sysctl_lock);
1679 return head;
1680 next:
1681 root = head->root;
1682 tmp = tmp->next;
1683 header_list = lookup_header_list(root, namespaces);
1684 if (tmp != header_list)
1685 continue;
1686
1687 do {
1688 root = list_entry(root->root_list.next,
1689 struct ctl_table_root, root_list);
1690 if (root == &sysctl_table_root)
1691 goto out;
1692 header_list = lookup_header_list(root, namespaces);
1693 } while (list_empty(header_list));
1694 tmp = header_list->next;
1695 }
1696out:
1697 spin_unlock(&sysctl_lock);
1698 return NULL;
1699}
1700
1701struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev)
1702{
1703 return __sysctl_head_next(current->nsproxy, prev);
1704}
1705
1706void register_sysctl_root(struct ctl_table_root *root)
1707{
1708 spin_lock(&sysctl_lock);
1709 list_add_tail(&root->root_list, &sysctl_table_root.root_list);
1710 spin_unlock(&sysctl_lock);
1711}
1712
1713/*
1714 * sysctl_perm does NOT grant the superuser all rights automatically, because
1715 * some sysctl variables are readonly even to root.
1716 */
1717
1718static int test_perm(int mode, int op)
1719{
1720 if (!current_euid())
1721 mode >>= 6;
1722 else if (in_egroup_p(0))
1723 mode >>= 3;
1724 if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0)
1725 return 0;
1726 return -EACCES;
1727}
1728
1729int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
1730{
1731 int mode;
1732
1733 if (root->permissions)
1734 mode = root->permissions(root, current->nsproxy, table);
1735 else
1736 mode = table->mode;
1737
1738 return test_perm(mode, op);
1739}
1740
1741static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
1742{
1743 for (; table->procname; table++) {
1744 table->parent = parent;
1745 if (table->child)
1746 sysctl_set_parent(table, table->child);
1747 }
1748}
1749
1750static __init int sysctl_init(void)
1751{
1752 sysctl_set_parent(NULL, root_table);
1753#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
1754 sysctl_check_table(current->nsproxy, root_table);
1755#endif
1756 return 0; 1555 return 0;
1757} 1556}
1758 1557
1759core_initcall(sysctl_init);
1760
1761static struct ctl_table *is_branch_in(struct ctl_table *branch,
1762 struct ctl_table *table)
1763{
1764 struct ctl_table *p;
1765 const char *s = branch->procname;
1766
1767 /* branch should have named subdirectory as its first element */
1768 if (!s || !branch->child)
1769 return NULL;
1770
1771 /* ... and nothing else */
1772 if (branch[1].procname)
1773 return NULL;
1774
1775 /* table should contain subdirectory with the same name */
1776 for (p = table; p->procname; p++) {
1777 if (!p->child)
1778 continue;
1779 if (p->procname && strcmp(p->procname, s) == 0)
1780 return p;
1781 }
1782 return NULL;
1783}
1784
1785/* see if attaching q to p would be an improvement */
1786static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
1787{
1788 struct ctl_table *to = p->ctl_table, *by = q->ctl_table;
1789 struct ctl_table *next;
1790 int is_better = 0;
1791 int not_in_parent = !p->attached_by;
1792
1793 while ((next = is_branch_in(by, to)) != NULL) {
1794 if (by == q->attached_by)
1795 is_better = 1;
1796 if (to == p->attached_by)
1797 not_in_parent = 1;
1798 by = by->child;
1799 to = next->child;
1800 }
1801
1802 if (is_better && not_in_parent) {
1803 q->attached_by = by;
1804 q->attached_to = to;
1805 q->parent = p;
1806 }
1807}
1808
1809/**
1810 * __register_sysctl_paths - register a sysctl hierarchy
1811 * @root: List of sysctl headers to register on
1812 * @namespaces: Data to compute which lists of sysctl entries are visible
1813 * @path: The path to the directory the sysctl table is in.
1814 * @table: the top-level table structure
1815 *
1816 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1817 * array. A completely 0 filled entry terminates the table.
1818 *
1819 * The members of the &struct ctl_table structure are used as follows:
1820 *
1821 * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
1822 * enter a sysctl file
1823 *
1824 * data - a pointer to data for use by proc_handler
1825 *
1826 * maxlen - the maximum size in bytes of the data
1827 *
1828 * mode - the file permissions for the /proc/sys file, and for sysctl(2)
1829 *
1830 * child - a pointer to the child sysctl table if this entry is a directory, or
1831 * %NULL.
1832 *
1833 * proc_handler - the text handler routine (described below)
1834 *
1835 * de - for internal use by the sysctl routines
1836 *
1837 * extra1, extra2 - extra pointers usable by the proc handler routines
1838 *
1839 * Leaf nodes in the sysctl tree will be represented by a single file
1840 * under /proc; non-leaf nodes will be represented by directories.
1841 *
1842 * sysctl(2) can automatically manage read and write requests through
1843 * the sysctl table. The data and maxlen fields of the ctl_table
1844 * struct enable minimal validation of the values being written to be
1845 * performed, and the mode field allows minimal authentication.
1846 *
1847 * There must be a proc_handler routine for any terminal nodes
1848 * mirrored under /proc/sys (non-terminals are handled by a built-in
1849 * directory handler). Several default handlers are available to
1850 * cover common cases -
1851 *
1852 * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(),
1853 * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(),
1854 * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax()
1855 *
1856 * It is the handler's job to read the input buffer from user memory
1857 * and process it. The handler should return 0 on success.
1858 *
1859 * This routine returns %NULL on a failure to register, and a pointer
1860 * to the table header on success.
1861 */
1862struct ctl_table_header *__register_sysctl_paths(
1863 struct ctl_table_root *root,
1864 struct nsproxy *namespaces,
1865 const struct ctl_path *path, struct ctl_table *table)
1866{
1867 struct ctl_table_header *header;
1868 struct ctl_table *new, **prevp;
1869 unsigned int n, npath;
1870 struct ctl_table_set *set;
1871
1872 /* Count the path components */
1873 for (npath = 0; path[npath].procname; ++npath)
1874 ;
1875
1876 /*
1877 * For each path component, allocate a 2-element ctl_table array.
1878 * The first array element will be filled with the sysctl entry
1879 * for this, the second will be the sentinel (procname == 0).
1880 *
1881 * We allocate everything in one go so that we don't have to
1882 * worry about freeing additional memory in unregister_sysctl_table.
1883 */
1884 header = kzalloc(sizeof(struct ctl_table_header) +
1885 (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL);
1886 if (!header)
1887 return NULL;
1888
1889 new = (struct ctl_table *) (header + 1);
1890
1891 /* Now connect the dots */
1892 prevp = &header->ctl_table;
1893 for (n = 0; n < npath; ++n, ++path) {
1894 /* Copy the procname */
1895 new->procname = path->procname;
1896 new->mode = 0555;
1897
1898 *prevp = new;
1899 prevp = &new->child;
1900
1901 new += 2;
1902 }
1903 *prevp = table;
1904 header->ctl_table_arg = table;
1905
1906 INIT_LIST_HEAD(&header->ctl_entry);
1907 header->used = 0;
1908 header->unregistering = NULL;
1909 header->root = root;
1910 sysctl_set_parent(NULL, header->ctl_table);
1911 header->count = 1;
1912#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
1913 if (sysctl_check_table(namespaces, header->ctl_table)) {
1914 kfree(header);
1915 return NULL;
1916 }
1917#endif
1918 spin_lock(&sysctl_lock);
1919 header->set = lookup_header_set(root, namespaces);
1920 header->attached_by = header->ctl_table;
1921 header->attached_to = root_table;
1922 header->parent = &root_table_header;
1923 for (set = header->set; set; set = set->parent) {
1924 struct ctl_table_header *p;
1925 list_for_each_entry(p, &set->list, ctl_entry) {
1926 if (p->unregistering)
1927 continue;
1928 try_attach(p, header);
1929 }
1930 }
1931 header->parent->count++;
1932 list_add_tail(&header->ctl_entry, &header->set->list);
1933 spin_unlock(&sysctl_lock);
1934
1935 return header;
1936}
1937
1938/**
1939 * register_sysctl_table_path - register a sysctl table hierarchy
1940 * @path: The path to the directory the sysctl table is in.
1941 * @table: the top-level table structure
1942 *
1943 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1944 * array. A completely 0 filled entry terminates the table.
1945 *
1946 * See __register_sysctl_paths for more details.
1947 */
1948struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
1949 struct ctl_table *table)
1950{
1951 return __register_sysctl_paths(&sysctl_table_root, current->nsproxy,
1952 path, table);
1953}
1954
1955/**
1956 * register_sysctl_table - register a sysctl table hierarchy
1957 * @table: the top-level table structure
1958 *
1959 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1960 * array. A completely 0 filled entry terminates the table.
1961 *
1962 * See register_sysctl_paths for more details.
1963 */
1964struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
1965{
1966 static const struct ctl_path null_path[] = { {} };
1967
1968 return register_sysctl_paths(null_path, table);
1969}
1970
1971/**
1972 * unregister_sysctl_table - unregister a sysctl table hierarchy
1973 * @header: the header returned from register_sysctl_table
1974 *
1975 * Unregisters the sysctl table and all children. proc entries may not
1976 * actually be removed until they are no longer used by anyone.
1977 */
1978void unregister_sysctl_table(struct ctl_table_header * header)
1979{
1980 might_sleep();
1981
1982 if (header == NULL)
1983 return;
1984
1985 spin_lock(&sysctl_lock);
1986 start_unregistering(header);
1987 if (!--header->parent->count) {
1988 WARN_ON(1);
1989 kfree_rcu(header->parent, rcu);
1990 }
1991 if (!--header->count)
1992 kfree_rcu(header, rcu);
1993 spin_unlock(&sysctl_lock);
1994}
1995
1996int sysctl_is_seen(struct ctl_table_header *p)
1997{
1998 struct ctl_table_set *set = p->set;
1999 int res;
2000 spin_lock(&sysctl_lock);
2001 if (p->unregistering)
2002 res = 0;
2003 else if (!set->is_seen)
2004 res = 1;
2005 else
2006 res = set->is_seen(set);
2007 spin_unlock(&sysctl_lock);
2008 return res;
2009}
2010
2011void setup_sysctl_set(struct ctl_table_set *p,
2012 struct ctl_table_set *parent,
2013 int (*is_seen)(struct ctl_table_set *))
2014{
2015 INIT_LIST_HEAD(&p->list);
2016 p->parent = parent ? parent : &sysctl_table_root.default_set;
2017 p->is_seen = is_seen;
2018}
2019
2020#else /* !CONFIG_SYSCTL */
2021struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
2022{
2023 return NULL;
2024}
2025
2026struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
2027 struct ctl_table *table)
2028{
2029 return NULL;
2030}
2031
2032void unregister_sysctl_table(struct ctl_table_header * table)
2033{
2034}
2035
2036void setup_sysctl_set(struct ctl_table_set *p,
2037 struct ctl_table_set *parent,
2038 int (*is_seen)(struct ctl_table_set *))
2039{
2040}
2041
2042void sysctl_head_put(struct ctl_table_header *head)
2043{
2044}
2045
2046#endif /* CONFIG_SYSCTL */ 1558#endif /* CONFIG_SYSCTL */
2047 1559
2048/* 1560/*
@@ -2431,7 +1943,7 @@ static int proc_taint(struct ctl_table *table, int write,
2431} 1943}
2432 1944
2433#ifdef CONFIG_PRINTK 1945#ifdef CONFIG_PRINTK
2434static int proc_dmesg_restrict(struct ctl_table *table, int write, 1946static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
2435 void __user *buffer, size_t *lenp, loff_t *ppos) 1947 void __user *buffer, size_t *lenp, loff_t *ppos)
2436{ 1948{
2437 if (write && !capable(CAP_SYS_ADMIN)) 1949 if (write && !capable(CAP_SYS_ADMIN))
@@ -2884,9 +2396,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
2884 } 2396 }
2885 } 2397 }
2886 2398
2887 while (val_a <= val_b) 2399 bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1);
2888 set_bit(val_a++, tmp_bitmap);
2889
2890 first = 0; 2400 first = 0;
2891 proc_skip_char(&kbuf, &left, '\n'); 2401 proc_skip_char(&kbuf, &left, '\n');
2892 } 2402 }
@@ -2929,8 +2439,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
2929 if (*ppos) 2439 if (*ppos)
2930 bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len); 2440 bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len);
2931 else 2441 else
2932 memcpy(bitmap, tmp_bitmap, 2442 bitmap_copy(bitmap, tmp_bitmap, bitmap_len);
2933 BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long));
2934 } 2443 }
2935 kfree(tmp_bitmap); 2444 kfree(tmp_bitmap);
2936 *lenp -= left; 2445 *lenp -= left;
@@ -3008,6 +2517,3 @@ EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
3008EXPORT_SYMBOL(proc_dostring); 2517EXPORT_SYMBOL(proc_dostring);
3009EXPORT_SYMBOL(proc_doulongvec_minmax); 2518EXPORT_SYMBOL(proc_doulongvec_minmax);
3010EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); 2519EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
3011EXPORT_SYMBOL(register_sysctl_table);
3012EXPORT_SYMBOL(register_sysctl_paths);
3013EXPORT_SYMBOL(unregister_sysctl_table);
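register_sysctl_table(), register_sysctl_paths() and unregister_sysctl_table() survive this patch; only their implementation (and the EXPORT_SYMBOLs above) moves out of kernel/sysctl.c as part of this series, and the ctl_table conventions described in the removed kernel-doc still hold. A minimal sketch of an in-kernel user, with hypothetical knob and header names:

    #include <linux/sysctl.h>
    #include <linux/init.h>
    #include <linux/errno.h>

    static int example_knob;

    static struct ctl_table example_table[] = {
            {
                    .procname       = "example_knob",
                    .data           = &example_knob,
                    .maxlen         = sizeof(int),
                    .mode           = 0644,
                    .proc_handler   = proc_dointvec,
            },
            { }     /* zero-filled sentinel terminates the table */
    };

    static const struct ctl_path example_path[] = {
            { .procname = "kernel" },
            { }
    };

    static struct ctl_table_header *example_header;

    static int __init example_sysctl_init(void)
    {
            /* Creates /proc/sys/kernel/example_knob */
            example_header = register_sysctl_paths(example_path, example_table);
            return example_header ? 0 : -ENOMEM;
    }

In a real user this would be wired up with an initcall or a module init hook and torn down with unregister_sysctl_table(example_header).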
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
deleted file mode 100644
index 362da653813d..000000000000
--- a/kernel/sysctl_check.c
+++ /dev/null
@@ -1,160 +0,0 @@
1#include <linux/stat.h>
2#include <linux/sysctl.h>
3#include "../fs/xfs/xfs_sysctl.h"
4#include <linux/sunrpc/debug.h>
5#include <linux/string.h>
6#include <net/ip_vs.h>
7
8
9static int sysctl_depth(struct ctl_table *table)
10{
11 struct ctl_table *tmp;
12 int depth;
13
14 depth = 0;
15 for (tmp = table; tmp->parent; tmp = tmp->parent)
16 depth++;
17
18 return depth;
19}
20
21static struct ctl_table *sysctl_parent(struct ctl_table *table, int n)
22{
23 int i;
24
25 for (i = 0; table && i < n; i++)
26 table = table->parent;
27
28 return table;
29}
30
31
32static void sysctl_print_path(struct ctl_table *table)
33{
34 struct ctl_table *tmp;
35 int depth, i;
36 depth = sysctl_depth(table);
37 if (table->procname) {
38 for (i = depth; i >= 0; i--) {
39 tmp = sysctl_parent(table, i);
40 printk("/%s", tmp->procname?tmp->procname:"");
41 }
42 }
43 printk(" ");
44}
45
46static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces,
47 struct ctl_table *table)
48{
49 struct ctl_table_header *head;
50 struct ctl_table *ref, *test;
51 int depth, cur_depth;
52
53 depth = sysctl_depth(table);
54
55 for (head = __sysctl_head_next(namespaces, NULL); head;
56 head = __sysctl_head_next(namespaces, head)) {
57 cur_depth = depth;
58 ref = head->ctl_table;
59repeat:
60 test = sysctl_parent(table, cur_depth);
61 for (; ref->procname; ref++) {
62 int match = 0;
63 if (cur_depth && !ref->child)
64 continue;
65
66 if (test->procname && ref->procname &&
67 (strcmp(test->procname, ref->procname) == 0))
68 match++;
69
70 if (match) {
71 if (cur_depth != 0) {
72 cur_depth--;
73 ref = ref->child;
74 goto repeat;
75 }
76 goto out;
77 }
78 }
79 }
80 ref = NULL;
81out:
82 sysctl_head_finish(head);
83 return ref;
84}
85
86static void set_fail(const char **fail, struct ctl_table *table, const char *str)
87{
88 if (*fail) {
89 printk(KERN_ERR "sysctl table check failed: ");
90 sysctl_print_path(table);
91 printk(" %s\n", *fail);
92 dump_stack();
93 }
94 *fail = str;
95}
96
97static void sysctl_check_leaf(struct nsproxy *namespaces,
98 struct ctl_table *table, const char **fail)
99{
100 struct ctl_table *ref;
101
102 ref = sysctl_check_lookup(namespaces, table);
103 if (ref && (ref != table))
104 set_fail(fail, table, "Sysctl already exists");
105}
106
107int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
108{
109 int error = 0;
110 for (; table->procname; table++) {
111 const char *fail = NULL;
112
113 if (table->parent) {
114 if (!table->parent->procname)
115 set_fail(&fail, table, "Parent without procname");
116 }
117 if (table->child) {
118 if (table->data)
119 set_fail(&fail, table, "Directory with data?");
120 if (table->maxlen)
121 set_fail(&fail, table, "Directory with maxlen?");
122 if ((table->mode & (S_IRUGO|S_IXUGO)) != table->mode)
123 set_fail(&fail, table, "Writable sysctl directory");
124 if (table->proc_handler)
125 set_fail(&fail, table, "Directory with proc_handler");
126 if (table->extra1)
127 set_fail(&fail, table, "Directory with extra1");
128 if (table->extra2)
129 set_fail(&fail, table, "Directory with extra2");
130 } else {
131 if ((table->proc_handler == proc_dostring) ||
132 (table->proc_handler == proc_dointvec) ||
133 (table->proc_handler == proc_dointvec_minmax) ||
134 (table->proc_handler == proc_dointvec_jiffies) ||
135 (table->proc_handler == proc_dointvec_userhz_jiffies) ||
136 (table->proc_handler == proc_dointvec_ms_jiffies) ||
137 (table->proc_handler == proc_doulongvec_minmax) ||
138 (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
139 if (!table->data)
140 set_fail(&fail, table, "No data");
141 if (!table->maxlen)
142 set_fail(&fail, table, "No maxlen");
143 }
144#ifdef CONFIG_PROC_SYSCTL
145 if (!table->proc_handler)
146 set_fail(&fail, table, "No proc_handler");
147#endif
148 sysctl_check_leaf(namespaces, table, &fail);
149 }
150 if (table->mode > 0777)
151 set_fail(&fail, table, "bogus .mode");
152 if (fail) {
153 set_fail(&fail, table, NULL);
154 error = -EINVAL;
155 }
156 if (table->child)
157 error |= sysctl_check_table(namespaces, table->child);
158 }
159 return error;
160}
diff --git a/kernel/time.c b/kernel/time.c
index 73e416db0a1e..ba744cf80696 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -163,7 +163,6 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
163 return error; 163 return error;
164 164
165 if (tz) { 165 if (tz) {
166 /* SMP safe, global irq locking makes it work. */
167 sys_tz = *tz; 166 sys_tz = *tz;
168 update_vsyscall_tz(); 167 update_vsyscall_tz();
169 if (firsttime) { 168 if (firsttime) {
@@ -173,12 +172,7 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
173 } 172 }
174 } 173 }
175 if (tv) 174 if (tv)
176 {
177 /* SMP safe, again the code in arch/foo/time.c should
178 * globally block out interrupts when it runs.
179 */
180 return do_settimeofday(tv); 175 return do_settimeofday(tv);
181 }
182 return 0; 176 return 0;
183} 177}
184 178
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 2cf9cc7aa103..a20dc8a3c949 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -1,6 +1,10 @@
1# 1#
2# Timer subsystem related configuration options 2# Timer subsystem related configuration options
3# 3#
4
5# Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is
6# only related to the tick functionality. Oneshot clockevent devices
7# are supported independent of this.
4config TICK_ONESHOT 8config TICK_ONESHOT
5 bool 9 bool
6 10
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 8a46f5d64504..8a538c55fc7b 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -96,6 +96,11 @@ static int alarmtimer_rtc_add_device(struct device *dev,
96 return 0; 96 return 0;
97} 97}
98 98
99static inline void alarmtimer_rtc_timer_init(void)
100{
101 rtc_timer_init(&rtctimer, NULL, NULL);
102}
103
99static struct class_interface alarmtimer_rtc_interface = { 104static struct class_interface alarmtimer_rtc_interface = {
100 .add_dev = &alarmtimer_rtc_add_device, 105 .add_dev = &alarmtimer_rtc_add_device,
101}; 106};
@@ -117,6 +122,7 @@ static inline struct rtc_device *alarmtimer_get_rtcdev(void)
117#define rtcdev (NULL) 122#define rtcdev (NULL)
118static inline int alarmtimer_rtc_interface_setup(void) { return 0; } 123static inline int alarmtimer_rtc_interface_setup(void) { return 0; }
119static inline void alarmtimer_rtc_interface_remove(void) { } 124static inline void alarmtimer_rtc_interface_remove(void) { }
125static inline void alarmtimer_rtc_timer_init(void) { }
120#endif 126#endif
121 127
122/** 128/**
@@ -783,6 +789,8 @@ static int __init alarmtimer_init(void)
783 .nsleep = alarm_timer_nsleep, 789 .nsleep = alarm_timer_nsleep,
784 }; 790 };
785 791
792 alarmtimer_rtc_timer_init();
793
786 posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); 794 posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock);
787 posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); 795 posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock);
788 796
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index a45ca167ab24..c9583382141a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -500,7 +500,7 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)
500{ 500{
501 u64 ret; 501 u64 ret;
502 /* 502 /*
503 * We won't try to correct for more then 11% adjustments (110,000 ppm), 503 * We won't try to correct for more than 11% adjustments (110,000 ppm),
504 */ 504 */
505 ret = (u64)cs->mult * 11; 505 ret = (u64)cs->mult * 11;
506 do_div(ret,100); 506 do_div(ret,100);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 6e039b144daf..f03fd83b170b 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -34,8 +34,6 @@ unsigned long tick_nsec;
34static u64 tick_length; 34static u64 tick_length;
35static u64 tick_length_base; 35static u64 tick_length_base;
36 36
37static struct hrtimer leap_timer;
38
39#define MAX_TICKADJ 500LL /* usecs */ 37#define MAX_TICKADJ 500LL /* usecs */
40#define MAX_TICKADJ_SCALED \ 38#define MAX_TICKADJ_SCALED \
41 (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) 39 (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
@@ -381,70 +379,63 @@ u64 ntp_tick_length(void)
381 379
382 380
383/* 381/*
384 * Leap second processing. If in leap-insert state at the end of the 382 * this routine handles the overflow of the microsecond field
385 * day, the system clock is set back one second; if in leap-delete 383 *
386 * state, the system clock is set ahead one second. 384 * The tricky bits of code to handle the accurate clock support
385 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
386 * They were originally developed for SUN and DEC kernels.
387 * All the kudos should go to Dave for this stuff.
388 *
389 * Also handles leap second processing, and returns leap offset
387 */ 390 */
388static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) 391int second_overflow(unsigned long secs)
389{ 392{
390 enum hrtimer_restart res = HRTIMER_NORESTART; 393 s64 delta;
391 unsigned long flags;
392 int leap = 0; 394 int leap = 0;
395 unsigned long flags;
393 396
394 spin_lock_irqsave(&ntp_lock, flags); 397 spin_lock_irqsave(&ntp_lock, flags);
398
399 /*
400 * Leap second processing. If in leap-insert state at the end of the
401 * day, the system clock is set back one second; if in leap-delete
402 * state, the system clock is set ahead one second.
403 */
395 switch (time_state) { 404 switch (time_state) {
396 case TIME_OK: 405 case TIME_OK:
406 if (time_status & STA_INS)
407 time_state = TIME_INS;
408 else if (time_status & STA_DEL)
409 time_state = TIME_DEL;
397 break; 410 break;
398 case TIME_INS: 411 case TIME_INS:
399 leap = -1; 412 if (secs % 86400 == 0) {
400 time_state = TIME_OOP; 413 leap = -1;
401 printk(KERN_NOTICE 414 time_state = TIME_OOP;
402 "Clock: inserting leap second 23:59:60 UTC\n"); 415 printk(KERN_NOTICE
403 hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC); 416 "Clock: inserting leap second 23:59:60 UTC\n");
404 res = HRTIMER_RESTART; 417 }
405 break; 418 break;
406 case TIME_DEL: 419 case TIME_DEL:
407 leap = 1; 420 if ((secs + 1) % 86400 == 0) {
408 time_tai--; 421 leap = 1;
409 time_state = TIME_WAIT; 422 time_tai--;
410 printk(KERN_NOTICE 423 time_state = TIME_WAIT;
411 "Clock: deleting leap second 23:59:59 UTC\n"); 424 printk(KERN_NOTICE
425 "Clock: deleting leap second 23:59:59 UTC\n");
426 }
412 break; 427 break;
413 case TIME_OOP: 428 case TIME_OOP:
414 time_tai++; 429 time_tai++;
415 time_state = TIME_WAIT; 430 time_state = TIME_WAIT;
416 /* fall through */ 431 break;
432
417 case TIME_WAIT: 433 case TIME_WAIT:
418 if (!(time_status & (STA_INS | STA_DEL))) 434 if (!(time_status & (STA_INS | STA_DEL)))
419 time_state = TIME_OK; 435 time_state = TIME_OK;
420 break; 436 break;
421 } 437 }
422 spin_unlock_irqrestore(&ntp_lock, flags);
423 438
424 /*
425 * We have to call this outside of the ntp_lock to keep
426 * the proper locking hierarchy
427 */
428 if (leap)
429 timekeeping_leap_insert(leap);
430
431 return res;
432}
433
434/*
435 * this routine handles the overflow of the microsecond field
436 *
437 * The tricky bits of code to handle the accurate clock support
438 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
439 * They were originally developed for SUN and DEC kernels.
440 * All the kudos should go to Dave for this stuff.
441 */
442void second_overflow(void)
443{
444 s64 delta;
445 unsigned long flags;
446
447 spin_lock_irqsave(&ntp_lock, flags);
448 439
449 /* Bump the maxerror field */ 440 /* Bump the maxerror field */
450 time_maxerror += MAXFREQ / NSEC_PER_USEC; 441 time_maxerror += MAXFREQ / NSEC_PER_USEC;
@@ -481,15 +472,17 @@ void second_overflow(void)
481 tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) 472 tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ)
482 << NTP_SCALE_SHIFT; 473 << NTP_SCALE_SHIFT;
483 time_adjust = 0; 474 time_adjust = 0;
475
476
477
484out: 478out:
485 spin_unlock_irqrestore(&ntp_lock, flags); 479 spin_unlock_irqrestore(&ntp_lock, flags);
480
481 return leap;
486} 482}
487 483
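second_overflow() now takes the current second and hands back the leap offset instead of arming an hrtimer, so applying the leap second becomes the caller's job. A hedged sketch of the expected calling pattern, mirroring what the removed timekeeping_leap_insert() used to do (the real call site lives in kernel/time/timekeeping.c and is not part of this hunk):

    /* Sketch only: roughly what the per-second accumulation path is expected to do. */
    static void accumulate_one_second(void)
    {
            int leap;

            timekeeper.xtime.tv_sec++;

            leap = second_overflow(timekeeper.xtime.tv_sec);
            timekeeper.xtime.tv_sec += leap;
            timekeeper.wall_to_monotonic.tv_sec -= leap;
    }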
488#ifdef CONFIG_GENERIC_CMOS_UPDATE 484#ifdef CONFIG_GENERIC_CMOS_UPDATE
489 485
490/* Disable the cmos update - used by virtualization and embedded */
491int no_sync_cmos_clock __read_mostly;
492
493static void sync_cmos_clock(struct work_struct *work); 486static void sync_cmos_clock(struct work_struct *work);
494 487
495static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); 488static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
@@ -536,35 +529,13 @@ static void sync_cmos_clock(struct work_struct *work)
536 529
537static void notify_cmos_timer(void) 530static void notify_cmos_timer(void)
538{ 531{
539 if (!no_sync_cmos_clock) 532 schedule_delayed_work(&sync_cmos_work, 0);
540 schedule_delayed_work(&sync_cmos_work, 0);
541} 533}
542 534
543#else 535#else
544static inline void notify_cmos_timer(void) { } 536static inline void notify_cmos_timer(void) { }
545#endif 537#endif
546 538
547/*
548 * Start the leap seconds timer:
549 */
550static inline void ntp_start_leap_timer(struct timespec *ts)
551{
552 long now = ts->tv_sec;
553
554 if (time_status & STA_INS) {
555 time_state = TIME_INS;
556 now += 86400 - now % 86400;
557 hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
558
559 return;
560 }
561
562 if (time_status & STA_DEL) {
563 time_state = TIME_DEL;
564 now += 86400 - (now + 1) % 86400;
565 hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
566 }
567}
568 539
569/* 540/*
570 * Propagate a new txc->status value into the NTP state: 541 * Propagate a new txc->status value into the NTP state:
@@ -589,22 +560,6 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
589 time_status &= STA_RONLY; 560 time_status &= STA_RONLY;
590 time_status |= txc->status & ~STA_RONLY; 561 time_status |= txc->status & ~STA_RONLY;
591 562
592 switch (time_state) {
593 case TIME_OK:
594 ntp_start_leap_timer(ts);
595 break;
596 case TIME_INS:
597 case TIME_DEL:
598 time_state = TIME_OK;
599 ntp_start_leap_timer(ts);
600 case TIME_WAIT:
601 if (!(time_status & (STA_INS | STA_DEL)))
602 time_state = TIME_OK;
603 break;
604 case TIME_OOP:
605 hrtimer_restart(&leap_timer);
606 break;
607 }
608} 563}
609/* 564/*
610 * Called with the xtime lock held, so we can access and modify 565 * Called with the xtime lock held, so we can access and modify
@@ -686,9 +641,6 @@ int do_adjtimex(struct timex *txc)
686 (txc->tick < 900000/USER_HZ || 641 (txc->tick < 900000/USER_HZ ||
687 txc->tick > 1100000/USER_HZ)) 642 txc->tick > 1100000/USER_HZ))
688 return -EINVAL; 643 return -EINVAL;
689
690 if (txc->modes & ADJ_STATUS && time_state != TIME_OK)
691 hrtimer_cancel(&leap_timer);
692 } 644 }
693 645
694 if (txc->modes & ADJ_SETOFFSET) { 646 if (txc->modes & ADJ_SETOFFSET) {
@@ -1010,6 +962,4 @@ __setup("ntp_tick_adj=", ntp_tick_adj_setup);
1010void __init ntp_init(void) 962void __init ntp_init(void)
1011{ 963{
1012 ntp_clear(); 964 ntp_clear();
1013 hrtimer_init(&leap_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
1014 leap_timer.function = ntp_leap_second;
1015} 965}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index e883f57a3cd3..bf57abdc7bd0 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -575,10 +575,12 @@ void tick_broadcast_switch_to_oneshot(void)
575 unsigned long flags; 575 unsigned long flags;
576 576
577 raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 577 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
578
579 tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT;
580
578 if (cpumask_empty(tick_get_broadcast_mask())) 581 if (cpumask_empty(tick_get_broadcast_mask()))
579 goto end; 582 goto end;
580 583
581 tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT;
582 bc = tick_broadcast_device.evtdev; 584 bc = tick_broadcast_device.evtdev;
583 if (bc) 585 if (bc)
584 tick_broadcast_setup_oneshot(bc); 586 tick_broadcast_setup_oneshot(bc);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3526038f2836..6a3a5b9ff561 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -534,9 +534,9 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
534 hrtimer_get_expires(&ts->sched_timer), 0)) 534 hrtimer_get_expires(&ts->sched_timer), 0))
535 break; 535 break;
536 } 536 }
537 /* Update jiffies and reread time */ 537 /* Reread time and update jiffies */
538 tick_do_update_jiffies64(now);
539 now = ktime_get(); 538 now = ktime_get();
539 tick_do_update_jiffies64(now);
540 } 540 }
541} 541}
542 542
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 403c2a092830..d66b21308f7c 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -184,18 +184,6 @@ static void timekeeping_update(bool clearntp)
184} 184}
185 185
186 186
187void timekeeping_leap_insert(int leapsecond)
188{
189 unsigned long flags;
190
191 write_seqlock_irqsave(&timekeeper.lock, flags);
192 timekeeper.xtime.tv_sec += leapsecond;
193 timekeeper.wall_to_monotonic.tv_sec -= leapsecond;
194 timekeeping_update(false);
195 write_sequnlock_irqrestore(&timekeeper.lock, flags);
196
197}
198
199/** 187/**
200 * timekeeping_forward_now - update clock to the current time 188 * timekeeping_forward_now - update clock to the current time
201 * 189 *
@@ -448,9 +436,12 @@ EXPORT_SYMBOL(timekeeping_inject_offset);
448static int change_clocksource(void *data) 436static int change_clocksource(void *data)
449{ 437{
450 struct clocksource *new, *old; 438 struct clocksource *new, *old;
439 unsigned long flags;
451 440
452 new = (struct clocksource *) data; 441 new = (struct clocksource *) data;
453 442
443 write_seqlock_irqsave(&timekeeper.lock, flags);
444
454 timekeeping_forward_now(); 445 timekeeping_forward_now();
455 if (!new->enable || new->enable(new) == 0) { 446 if (!new->enable || new->enable(new) == 0) {
456 old = timekeeper.clock; 447 old = timekeeper.clock;
@@ -458,6 +449,10 @@ static int change_clocksource(void *data)
458 if (old->disable) 449 if (old->disable)
459 old->disable(old); 450 old->disable(old);
460 } 451 }
452 timekeeping_update(true);
453
454 write_sequnlock_irqrestore(&timekeeper.lock, flags);
455
461 return 0; 456 return 0;
462} 457}
463 458
@@ -827,7 +822,7 @@ static void timekeeping_adjust(s64 offset)
827 int adj; 822 int adj;
828 823
829 /* 824 /*
830 * The point of this is to check if the error is greater then half 825 * The point of this is to check if the error is greater than half
831 * an interval. 826 * an interval.
832 * 827 *
833 * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. 828 * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs.
@@ -835,7 +830,7 @@ static void timekeeping_adjust(s64 offset)
835 * Note we subtract one in the shift, so that error is really error*2. 830 * Note we subtract one in the shift, so that error is really error*2.
836 * This "saves" dividing(shifting) interval twice, but keeps the 831 * This "saves" dividing(shifting) interval twice, but keeps the
837 * (error > interval) comparison as still measuring if error is 832 * (error > interval) comparison as still measuring if error is
838 * larger then half an interval. 833 * larger than half an interval.
839 * 834 *
840 * Note: It does not "save" on aggravation when reading the code. 835 * Note: It does not "save" on aggravation when reading the code.
841 */ 836 */
@@ -843,7 +838,7 @@ static void timekeeping_adjust(s64 offset)
843 if (error > interval) { 838 if (error > interval) {
844 /* 839 /*
845 * We now divide error by 4(via shift), which checks if 840 * We now divide error by 4(via shift), which checks if
846 * the error is greater then twice the interval. 841 * the error is greater than twice the interval.
847 * If it is greater, we need a bigadjust, if its smaller, 842 * If it is greater, we need a bigadjust, if its smaller,
848 * we can adjust by 1. 843 * we can adjust by 1.
849 */ 844 */
@@ -874,13 +869,15 @@ static void timekeeping_adjust(s64 offset)
874 } else /* No adjustment needed */ 869 } else /* No adjustment needed */
875 return; 870 return;
876 871
877 WARN_ONCE(timekeeper.clock->maxadj && 872 if (unlikely(timekeeper.clock->maxadj &&
878 (timekeeper.mult + adj > timekeeper.clock->mult + 873 (timekeeper.mult + adj >
879 timekeeper.clock->maxadj), 874 timekeeper.clock->mult + timekeeper.clock->maxadj))) {
880 "Adjusting %s more then 11%% (%ld vs %ld)\n", 875 printk_once(KERN_WARNING
876 "Adjusting %s more than 11%% (%ld vs %ld)\n",
881 timekeeper.clock->name, (long)timekeeper.mult + adj, 877 timekeeper.clock->name, (long)timekeeper.mult + adj,
882 (long)timekeeper.clock->mult + 878 (long)timekeeper.clock->mult +
883 timekeeper.clock->maxadj); 879 timekeeper.clock->maxadj);
880 }
884 /* 881 /*
885 * So the following can be confusing. 882 * So the following can be confusing.
886 * 883 *
@@ -952,7 +949,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
952 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; 949 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
953 u64 raw_nsecs; 950 u64 raw_nsecs;
954 951
955 /* If the offset is smaller then a shifted interval, do nothing */ 952 /* If the offset is smaller than a shifted interval, do nothing */
956 if (offset < timekeeper.cycle_interval<<shift) 953 if (offset < timekeeper.cycle_interval<<shift)
957 return offset; 954 return offset;
958 955
@@ -962,9 +959,11 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
962 959
963 timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; 960 timekeeper.xtime_nsec += timekeeper.xtime_interval << shift;
964 while (timekeeper.xtime_nsec >= nsecps) { 961 while (timekeeper.xtime_nsec >= nsecps) {
962 int leap;
965 timekeeper.xtime_nsec -= nsecps; 963 timekeeper.xtime_nsec -= nsecps;
966 timekeeper.xtime.tv_sec++; 964 timekeeper.xtime.tv_sec++;
967 second_overflow(); 965 leap = second_overflow(timekeeper.xtime.tv_sec);
966 timekeeper.xtime.tv_sec += leap;
968 } 967 }
969 968
970 /* Accumulate raw time */ 969 /* Accumulate raw time */
@@ -1018,13 +1017,13 @@ static void update_wall_time(void)
1018 * With NO_HZ we may have to accumulate many cycle_intervals 1017 * With NO_HZ we may have to accumulate many cycle_intervals
1019 * (think "ticks") worth of time at once. To do this efficiently, 1018 * (think "ticks") worth of time at once. To do this efficiently,
1020 * we calculate the largest doubling multiple of cycle_intervals 1019 * we calculate the largest doubling multiple of cycle_intervals
1021 * that is smaller then the offset. We then accumulate that 1020 * that is smaller than the offset. We then accumulate that
1022 * chunk in one go, and then try to consume the next smaller 1021 * chunk in one go, and then try to consume the next smaller
1023 * doubled multiple. 1022 * doubled multiple.
1024 */ 1023 */
1025 shift = ilog2(offset) - ilog2(timekeeper.cycle_interval); 1024 shift = ilog2(offset) - ilog2(timekeeper.cycle_interval);
1026 shift = max(0, shift); 1025 shift = max(0, shift);
1027 /* Bound shift to one less then what overflows tick_length */ 1026 /* Bound shift to one less than what overflows tick_length */
1028 maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; 1027 maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
1029 shift = min(shift, maxshift); 1028 shift = min(shift, maxshift);
1030 while (offset >= timekeeper.cycle_interval) { 1029 while (offset >= timekeeper.cycle_interval) {
@@ -1072,12 +1071,14 @@ static void update_wall_time(void)
1072 1071
1073 /* 1072 /*
1074 * Finally, make sure that after the rounding 1073 * Finally, make sure that after the rounding
1075 * xtime.tv_nsec isn't larger then NSEC_PER_SEC 1074 * xtime.tv_nsec isn't larger than NSEC_PER_SEC
1076 */ 1075 */
1077 if (unlikely(timekeeper.xtime.tv_nsec >= NSEC_PER_SEC)) { 1076 if (unlikely(timekeeper.xtime.tv_nsec >= NSEC_PER_SEC)) {
1077 int leap;
1078 timekeeper.xtime.tv_nsec -= NSEC_PER_SEC; 1078 timekeeper.xtime.tv_nsec -= NSEC_PER_SEC;
1079 timekeeper.xtime.tv_sec++; 1079 timekeeper.xtime.tv_sec++;
1080 second_overflow(); 1080 leap = second_overflow(timekeeper.xtime.tv_sec);
1081 timekeeper.xtime.tv_sec += leap;
1081 } 1082 }
1082 1083
1083 timekeeping_update(false); 1084 timekeeping_update(false);
@@ -1260,6 +1261,8 @@ ktime_t ktime_get_monotonic_offset(void)
1260 1261
1261 return timespec_to_ktime(wtom); 1262 return timespec_to_ktime(wtom);
1262} 1263}
1264EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
1265
1263 1266
1264/** 1267/**
1265 * xtime_update() - advances the timekeeping infrastructure 1268 * xtime_update() - advances the timekeeping infrastructure
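[Editorial sketch, not part of the patch] The ntp.c and timekeeping.c hunks above drop the dedicated leap_timer: each whole second accumulated by the timekeeping core now asks NTP directly whether a leap adjustment is due. A minimal sketch of that flow, assuming (as the call sites suggest) that second_overflow() returns a small signed correction, presumably -1 to insert a leap second, +1 to delete one, and 0 otherwise:

    /* Sketch only: per-second accumulation with in-line leap handling. */
    while (timekeeper.xtime_nsec >= nsecps) {
            int leap;

            timekeeper.xtime_nsec -= nsecps;
            timekeeper.xtime.tv_sec++;

            /* second_overflow() now reports any pending leap second,
             * so no hrtimer has to apply it asynchronously. */
            leap = second_overflow(timekeeper.xtime.tv_sec);
            timekeeper.xtime.tv_sec += leap;
    }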
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index cd3134510f3d..a1d2849f2473 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -141,7 +141,7 @@ if FTRACE
141config FUNCTION_TRACER 141config FUNCTION_TRACER
142 bool "Kernel Function Tracer" 142 bool "Kernel Function Tracer"
143 depends on HAVE_FUNCTION_TRACER 143 depends on HAVE_FUNCTION_TRACER
144 select FRAME_POINTER if !ARM_UNWIND && !S390 && !MICROBLAZE 144 select FRAME_POINTER if !ARM_UNWIND && !PPC && !S390 && !MICROBLAZE
145 select KALLSYMS 145 select KALLSYMS
146 select GENERIC_TRACER 146 select GENERIC_TRACER
147 select CONTEXT_SWITCH_TRACER 147 select CONTEXT_SWITCH_TRACER
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index cdea7b56b0c9..c0bd0308741c 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -311,13 +311,6 @@ int blk_trace_remove(struct request_queue *q)
311} 311}
312EXPORT_SYMBOL_GPL(blk_trace_remove); 312EXPORT_SYMBOL_GPL(blk_trace_remove);
313 313
314static int blk_dropped_open(struct inode *inode, struct file *filp)
315{
316 filp->private_data = inode->i_private;
317
318 return 0;
319}
320
321static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, 314static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
322 size_t count, loff_t *ppos) 315 size_t count, loff_t *ppos)
323{ 316{
@@ -331,18 +324,11 @@ static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
331 324
332static const struct file_operations blk_dropped_fops = { 325static const struct file_operations blk_dropped_fops = {
333 .owner = THIS_MODULE, 326 .owner = THIS_MODULE,
334 .open = blk_dropped_open, 327 .open = simple_open,
335 .read = blk_dropped_read, 328 .read = blk_dropped_read,
336 .llseek = default_llseek, 329 .llseek = default_llseek,
337}; 330};
338 331
339static int blk_msg_open(struct inode *inode, struct file *filp)
340{
341 filp->private_data = inode->i_private;
342
343 return 0;
344}
345
346static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, 332static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
347 size_t count, loff_t *ppos) 333 size_t count, loff_t *ppos)
348{ 334{
@@ -371,7 +357,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
371 357
372static const struct file_operations blk_msg_fops = { 358static const struct file_operations blk_msg_fops = {
373 .owner = THIS_MODULE, 359 .owner = THIS_MODULE,
374 .open = blk_msg_open, 360 .open = simple_open,
375 .write = blk_msg_write, 361 .write = blk_msg_write,
376 .llseek = noop_llseek, 362 .llseek = noop_llseek,
377}; 363};
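[Editorial note] Both deleted open helpers in blktrace.c were local copies of the same boilerplate; simple_open() is the libfs helper that replaces them, which (roughly, per fs/libfs.c) amounts to:

    int simple_open(struct inode *inode, struct file *file)
    {
            if (inode->i_private)
                    file->private_data = inode->i_private;
            return 0;
    }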
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 867bd1dd2dd0..0fa92f677c92 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -249,7 +249,8 @@ static void update_ftrace_function(void)
249#else 249#else
250 __ftrace_trace_function = func; 250 __ftrace_trace_function = func;
251#endif 251#endif
252 ftrace_trace_function = ftrace_test_stop_func; 252 ftrace_trace_function =
253 (func == ftrace_stub) ? func : ftrace_test_stop_func;
253#endif 254#endif
254} 255}
255 256
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f5b7b5c1195b..cf8d11e91efd 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -154,33 +154,10 @@ enum {
154 154
155static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; 155static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
156 156
157#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) 157/* Used for individual buffers (after the counter) */
158 158#define RB_BUFFER_OFF (1 << 20)
159/**
160 * tracing_on - enable all tracing buffers
161 *
162 * This function enables all tracing buffers that may have been
163 * disabled with tracing_off.
164 */
165void tracing_on(void)
166{
167 set_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
168}
169EXPORT_SYMBOL_GPL(tracing_on);
170 159
171/** 160#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data)
172 * tracing_off - turn off all tracing buffers
173 *
174 * This function stops all tracing buffers from recording data.
175 * It does not disable any overhead the tracers themselves may
176 * be causing. This function simply causes all recording to
177 * the ring buffers to fail.
178 */
179void tracing_off(void)
180{
181 clear_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
182}
183EXPORT_SYMBOL_GPL(tracing_off);
184 161
185/** 162/**
186 * tracing_off_permanent - permanently disable ring buffers 163 * tracing_off_permanent - permanently disable ring buffers
@@ -193,15 +170,6 @@ void tracing_off_permanent(void)
193 set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags); 170 set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags);
194} 171}
195 172
196/**
197 * tracing_is_on - show state of ring buffers enabled
198 */
199int tracing_is_on(void)
200{
201 return ring_buffer_flags == RB_BUFFERS_ON;
202}
203EXPORT_SYMBOL_GPL(tracing_is_on);
204
205#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) 173#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
206#define RB_ALIGNMENT 4U 174#define RB_ALIGNMENT 4U
207#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 175#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
@@ -2619,6 +2587,63 @@ void ring_buffer_record_enable(struct ring_buffer *buffer)
2619EXPORT_SYMBOL_GPL(ring_buffer_record_enable); 2587EXPORT_SYMBOL_GPL(ring_buffer_record_enable);
2620 2588
2621/** 2589/**
2590 * ring_buffer_record_off - stop all writes into the buffer
2591 * @buffer: The ring buffer to stop writes to.
2592 *
2593 * This prevents all writes to the buffer. Any attempt to write
2594 * to the buffer after this will fail and return NULL.
2595 *
 2596 * This is different from ring_buffer_record_disable() as
 2597 * it works like an on/off switch, whereas the disable() version
 2598 * must be paired with an enable().
2599 */
2600void ring_buffer_record_off(struct ring_buffer *buffer)
2601{
2602 unsigned int rd;
2603 unsigned int new_rd;
2604
2605 do {
2606 rd = atomic_read(&buffer->record_disabled);
2607 new_rd = rd | RB_BUFFER_OFF;
2608 } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd);
2609}
2610EXPORT_SYMBOL_GPL(ring_buffer_record_off);
2611
2612/**
2613 * ring_buffer_record_on - restart writes into the buffer
2614 * @buffer: The ring buffer to start writes to.
2615 *
2616 * This enables all writes to the buffer that was disabled by
2617 * ring_buffer_record_off().
2618 *
 2619 * This is different from ring_buffer_record_enable() as
 2620 * it works like an on/off switch, whereas the enable() version
 2621 * must be paired with a disable().
2622 */
2623void ring_buffer_record_on(struct ring_buffer *buffer)
2624{
2625 unsigned int rd;
2626 unsigned int new_rd;
2627
2628 do {
2629 rd = atomic_read(&buffer->record_disabled);
2630 new_rd = rd & ~RB_BUFFER_OFF;
2631 } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd);
2632}
2633EXPORT_SYMBOL_GPL(ring_buffer_record_on);
2634
2635/**
2636 * ring_buffer_record_is_on - return true if the ring buffer can write
2637 * @buffer: The ring buffer to see if write is enabled
2638 *
2639 * Returns true if the ring buffer is in a state that it accepts writes.
2640 */
2641int ring_buffer_record_is_on(struct ring_buffer *buffer)
2642{
2643 return !atomic_read(&buffer->record_disabled);
2644}
2645
2646/**
2622 * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer 2647 * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
2623 * @buffer: The ring buffer to stop writes to. 2648 * @buffer: The ring buffer to stop writes to.
2624 * @cpu: The CPU buffer to stop 2649 * @cpu: The CPU buffer to stop
@@ -4039,68 +4064,6 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
4039} 4064}
4040EXPORT_SYMBOL_GPL(ring_buffer_read_page); 4065EXPORT_SYMBOL_GPL(ring_buffer_read_page);
4041 4066
4042#ifdef CONFIG_TRACING
4043static ssize_t
4044rb_simple_read(struct file *filp, char __user *ubuf,
4045 size_t cnt, loff_t *ppos)
4046{
4047 unsigned long *p = filp->private_data;
4048 char buf[64];
4049 int r;
4050
4051 if (test_bit(RB_BUFFERS_DISABLED_BIT, p))
4052 r = sprintf(buf, "permanently disabled\n");
4053 else
4054 r = sprintf(buf, "%d\n", test_bit(RB_BUFFERS_ON_BIT, p));
4055
4056 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
4057}
4058
4059static ssize_t
4060rb_simple_write(struct file *filp, const char __user *ubuf,
4061 size_t cnt, loff_t *ppos)
4062{
4063 unsigned long *p = filp->private_data;
4064 unsigned long val;
4065 int ret;
4066
4067 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
4068 if (ret)
4069 return ret;
4070
4071 if (val)
4072 set_bit(RB_BUFFERS_ON_BIT, p);
4073 else
4074 clear_bit(RB_BUFFERS_ON_BIT, p);
4075
4076 (*ppos)++;
4077
4078 return cnt;
4079}
4080
4081static const struct file_operations rb_simple_fops = {
4082 .open = tracing_open_generic,
4083 .read = rb_simple_read,
4084 .write = rb_simple_write,
4085 .llseek = default_llseek,
4086};
4087
4088
4089static __init int rb_init_debugfs(void)
4090{
4091 struct dentry *d_tracer;
4092
4093 d_tracer = tracing_init_dentry();
4094
4095 trace_create_file("tracing_on", 0644, d_tracer,
4096 &ring_buffer_flags, &rb_simple_fops);
4097
4098 return 0;
4099}
4100
4101fs_initcall(rb_init_debugfs);
4102#endif
4103
4104#ifdef CONFIG_HOTPLUG_CPU 4067#ifdef CONFIG_HOTPLUG_CPU
4105static int rb_cpu_notify(struct notifier_block *self, 4068static int rb_cpu_notify(struct notifier_block *self,
4106 unsigned long action, void *hcpu) 4069 unsigned long action, void *hcpu)
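[Editorial sketch, not part of the patch] An illustration of the split introduced above: ring_buffer_record_disable()/enable() keep their nesting-counter semantics, while the new ring_buffer_record_off()/on() pair acts as a sticky switch on the RB_BUFFER_OFF bit. Sketch only, using the functions added in this hunk (rb_switch_demo is a hypothetical caller):

    static void rb_switch_demo(struct ring_buffer *buffer)
    {
            ring_buffer_record_disable(buffer);     /* count 1: writes blocked */
            ring_buffer_record_disable(buffer);     /* count 2 */
            ring_buffer_record_enable(buffer);      /* count 1: still blocked */
            ring_buffer_record_enable(buffer);      /* count 0: writes allowed again */

            ring_buffer_record_off(buffer);         /* sets RB_BUFFER_OFF */
            ring_buffer_record_off(buffer);         /* idempotent, still off */
            ring_buffer_record_on(buffer);          /* clears RB_BUFFER_OFF */

            WARN_ON(!ring_buffer_record_is_on(buffer));
    }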
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 10d5503f0d04..ed7b5d1e12f4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -36,6 +36,7 @@
36#include <linux/ctype.h> 36#include <linux/ctype.h>
37#include <linux/init.h> 37#include <linux/init.h>
38#include <linux/poll.h> 38#include <linux/poll.h>
39#include <linux/nmi.h>
39#include <linux/fs.h> 40#include <linux/fs.h>
40 41
41#include "trace.h" 42#include "trace.h"
@@ -352,6 +353,59 @@ static void wakeup_work_handler(struct work_struct *work)
352static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler); 353static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler);
353 354
354/** 355/**
356 * tracing_on - enable tracing buffers
357 *
358 * This function enables tracing buffers that may have been
359 * disabled with tracing_off.
360 */
361void tracing_on(void)
362{
363 if (global_trace.buffer)
364 ring_buffer_record_on(global_trace.buffer);
365 /*
366 * This flag is only looked at when buffers haven't been
367 * allocated yet. We don't really care about the race
368 * between setting this flag and actually turning
369 * on the buffer.
370 */
371 global_trace.buffer_disabled = 0;
372}
373EXPORT_SYMBOL_GPL(tracing_on);
374
375/**
376 * tracing_off - turn off tracing buffers
377 *
378 * This function stops the tracing buffers from recording data.
379 * It does not disable any overhead the tracers themselves may
380 * be causing. This function simply causes all recording to
381 * the ring buffers to fail.
382 */
383void tracing_off(void)
384{
385 if (global_trace.buffer)
 386 ring_buffer_record_off(global_trace.buffer);
387 /*
388 * This flag is only looked at when buffers haven't been
389 * allocated yet. We don't really care about the race
390 * between setting this flag and actually turning
391 * on the buffer.
392 */
393 global_trace.buffer_disabled = 1;
394}
395EXPORT_SYMBOL_GPL(tracing_off);
396
397/**
398 * tracing_is_on - show state of ring buffers enabled
399 */
400int tracing_is_on(void)
401{
402 if (global_trace.buffer)
403 return ring_buffer_record_is_on(global_trace.buffer);
404 return !global_trace.buffer_disabled;
405}
406EXPORT_SYMBOL_GPL(tracing_is_on);
407
408/**
355 * trace_wake_up - wake up tasks waiting for trace input 409 * trace_wake_up - wake up tasks waiting for trace input
356 * 410 *
357 * Schedules a delayed work to wake up any task that is blocked on the 411 * Schedules a delayed work to wake up any task that is blocked on the
@@ -1644,6 +1698,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,
1644 int cpu_file = iter->cpu_file; 1698 int cpu_file = iter->cpu_file;
1645 u64 next_ts = 0, ts; 1699 u64 next_ts = 0, ts;
1646 int next_cpu = -1; 1700 int next_cpu = -1;
1701 int next_size = 0;
1647 int cpu; 1702 int cpu;
1648 1703
1649 /* 1704 /*
@@ -1675,9 +1730,12 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,
1675 next_cpu = cpu; 1730 next_cpu = cpu;
1676 next_ts = ts; 1731 next_ts = ts;
1677 next_lost = lost_events; 1732 next_lost = lost_events;
1733 next_size = iter->ent_size;
1678 } 1734 }
1679 } 1735 }
1680 1736
1737 iter->ent_size = next_size;
1738
1681 if (ent_cpu) 1739 if (ent_cpu)
1682 *ent_cpu = next_cpu; 1740 *ent_cpu = next_cpu;
1683 1741
@@ -4567,6 +4625,55 @@ static __init void create_trace_options_dir(void)
4567 create_trace_option_core_file(trace_options[i], i); 4625 create_trace_option_core_file(trace_options[i], i);
4568} 4626}
4569 4627
4628static ssize_t
4629rb_simple_read(struct file *filp, char __user *ubuf,
4630 size_t cnt, loff_t *ppos)
4631{
4632 struct ring_buffer *buffer = filp->private_data;
4633 char buf[64];
4634 int r;
4635
4636 if (buffer)
4637 r = ring_buffer_record_is_on(buffer);
4638 else
4639 r = 0;
4640
4641 r = sprintf(buf, "%d\n", r);
4642
4643 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
4644}
4645
4646static ssize_t
4647rb_simple_write(struct file *filp, const char __user *ubuf,
4648 size_t cnt, loff_t *ppos)
4649{
4650 struct ring_buffer *buffer = filp->private_data;
4651 unsigned long val;
4652 int ret;
4653
4654 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
4655 if (ret)
4656 return ret;
4657
4658 if (buffer) {
4659 if (val)
4660 ring_buffer_record_on(buffer);
4661 else
4662 ring_buffer_record_off(buffer);
4663 }
4664
4665 (*ppos)++;
4666
4667 return cnt;
4668}
4669
4670static const struct file_operations rb_simple_fops = {
4671 .open = tracing_open_generic,
4672 .read = rb_simple_read,
4673 .write = rb_simple_write,
4674 .llseek = default_llseek,
4675};
4676
4570static __init int tracer_init_debugfs(void) 4677static __init int tracer_init_debugfs(void)
4571{ 4678{
4572 struct dentry *d_tracer; 4679 struct dentry *d_tracer;
@@ -4626,6 +4733,9 @@ static __init int tracer_init_debugfs(void)
4626 trace_create_file("trace_clock", 0644, d_tracer, NULL, 4733 trace_create_file("trace_clock", 0644, d_tracer, NULL,
4627 &trace_clock_fops); 4734 &trace_clock_fops);
4628 4735
4736 trace_create_file("tracing_on", 0644, d_tracer,
4737 global_trace.buffer, &rb_simple_fops);
4738
4629#ifdef CONFIG_DYNAMIC_FTRACE 4739#ifdef CONFIG_DYNAMIC_FTRACE
4630 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, 4740 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
4631 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 4741 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
@@ -4798,6 +4908,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4798 if (ret != TRACE_TYPE_NO_CONSUME) 4908 if (ret != TRACE_TYPE_NO_CONSUME)
4799 trace_consume(&iter); 4909 trace_consume(&iter);
4800 } 4910 }
4911 touch_nmi_watchdog();
4801 4912
4802 trace_printk_seq(&iter.seq); 4913 trace_printk_seq(&iter.seq);
4803 } 4914 }
@@ -4863,6 +4974,8 @@ __init static int tracer_alloc_buffers(void)
4863 goto out_free_cpumask; 4974 goto out_free_cpumask;
4864 } 4975 }
4865 global_trace.entries = ring_buffer_size(global_trace.buffer); 4976 global_trace.entries = ring_buffer_size(global_trace.buffer);
4977 if (global_trace.buffer_disabled)
4978 tracing_off();
4866 4979
4867 4980
4868#ifdef CONFIG_TRACER_MAX_TRACE 4981#ifdef CONFIG_TRACER_MAX_TRACE
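[Editorial sketch, not part of the patch] In-kernel callers are unchanged by the move above: tracing_on()/tracing_off()/tracing_is_on() now route to the ring-buffer switch, falling back to the new buffer_disabled flag before buffers are allocated. A hedged usage sketch (my_debug_checkpoint is a hypothetical helper; the tracing declarations live in linux/kernel.h):

    #include <linux/kernel.h>   /* tracing_off(), tracing_is_on(), trace_printk() */

    static void my_debug_checkpoint(void)
    {
            if (tracing_is_on()) {
                    trace_printk("freezing trace at checkpoint\n");
                    tracing_off();  /* ends up in ring_buffer_record_off(global_trace.buffer) */
            }
    }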
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 54faec790bc1..95059f091a24 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -154,6 +154,7 @@ struct trace_array {
154 struct ring_buffer *buffer; 154 struct ring_buffer *buffer;
155 unsigned long entries; 155 unsigned long entries;
156 int cpu; 156 int cpu;
157 int buffer_disabled;
157 cycle_t time_start; 158 cycle_t time_start;
158 struct task_struct *waiter; 159 struct task_struct *waiter;
159 struct trace_array_cpu *data[NR_CPUS]; 160 struct trace_array_cpu *data[NR_CPUS];
@@ -835,13 +836,11 @@ extern const char *__stop___trace_bprintk_fmt[];
835 filter) 836 filter)
836#include "trace_entries.h" 837#include "trace_entries.h"
837 838
838#ifdef CONFIG_PERF_EVENTS
839#ifdef CONFIG_FUNCTION_TRACER 839#ifdef CONFIG_FUNCTION_TRACER
840int perf_ftrace_event_register(struct ftrace_event_call *call, 840int perf_ftrace_event_register(struct ftrace_event_call *call,
841 enum trace_reg type, void *data); 841 enum trace_reg type, void *data);
842#else 842#else
843#define perf_ftrace_event_register NULL 843#define perf_ftrace_event_register NULL
844#endif /* CONFIG_FUNCTION_TRACER */ 844#endif /* CONFIG_FUNCTION_TRACER */
845#endif /* CONFIG_PERF_EVENTS */
846 845
847#endif /* _LINUX_KERNEL_TRACE_H */ 846#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index d91eb0541b3a..4108e1250ca2 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -166,6 +166,12 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
166 166
167#define FTRACE_STACK_ENTRIES 8 167#define FTRACE_STACK_ENTRIES 8
168 168
169#ifndef CONFIG_64BIT
170# define IP_FMT "%08lx"
171#else
172# define IP_FMT "%016lx"
173#endif
174
169FTRACE_ENTRY(kernel_stack, stack_entry, 175FTRACE_ENTRY(kernel_stack, stack_entry,
170 176
171 TRACE_STACK, 177 TRACE_STACK,
@@ -175,8 +181,9 @@ FTRACE_ENTRY(kernel_stack, stack_entry,
175 __dynamic_array(unsigned long, caller ) 181 __dynamic_array(unsigned long, caller )
176 ), 182 ),
177 183
178 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" 184 F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n"
179 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", 185 "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n"
186 "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n",
180 __entry->caller[0], __entry->caller[1], __entry->caller[2], 187 __entry->caller[0], __entry->caller[1], __entry->caller[2],
181 __entry->caller[3], __entry->caller[4], __entry->caller[5], 188 __entry->caller[3], __entry->caller[4], __entry->caller[5],
182 __entry->caller[6], __entry->caller[7]), 189 __entry->caller[6], __entry->caller[7]),
@@ -193,8 +200,9 @@ FTRACE_ENTRY(user_stack, userstack_entry,
193 __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) 200 __array( unsigned long, caller, FTRACE_STACK_ENTRIES )
194 ), 201 ),
195 202
196 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" 203 F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n"
197 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", 204 "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n"
205 "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n",
198 __entry->caller[0], __entry->caller[1], __entry->caller[2], 206 __entry->caller[0], __entry->caller[1], __entry->caller[2],
199 __entry->caller[3], __entry->caller[4], __entry->caller[5], 207 __entry->caller[3], __entry->caller[4], __entry->caller[5],
200 __entry->caller[6], __entry->caller[7]), 208 __entry->caller[6], __entry->caller[7]),
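[Editorial note] The IP_FMT macro above just selects the word-size format at compile time; adjacent string literals are pasted, so (illustrative expansion, not from the patch):

    /* "\t=> (" IP_FMT ")\n" becomes "\t=> (%08lx)\n" on 32-bit kernels
     * and "\t=> (%016lx)\n" on 64-bit kernels. */
    printk("\t=> (" IP_FMT ")\n", __entry->caller[0]);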
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 7b46c9bd22ae..3dd15e8bc856 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -162,7 +162,7 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
162#define __dynamic_array(type, item) 162#define __dynamic_array(type, item)
163 163
164#undef F_printk 164#undef F_printk
165#define F_printk(fmt, args...) #fmt ", " __stringify(args) 165#define F_printk(fmt, args...) __stringify(fmt) ", " __stringify(args)
166 166
167#undef FTRACE_ENTRY_REG 167#undef FTRACE_ENTRY_REG
168#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\ 168#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index c5a01873567d..859fae6b1825 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -264,7 +264,7 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)
264 return ret; 264 return ret;
265} 265}
266 266
267int trace_seq_path(struct trace_seq *s, struct path *path) 267int trace_seq_path(struct trace_seq *s, const struct path *path)
268{ 268{
269 unsigned char *p; 269 unsigned char *p;
270 270
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 14bc092fb12c..df30ee08bdd4 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -9,6 +9,8 @@
9 * to those contributors as well. 9 * to those contributors as well.
10 */ 10 */
11 11
12#define pr_fmt(fmt) "NMI watchdog: " fmt
13
12#include <linux/mm.h> 14#include <linux/mm.h>
13#include <linux/cpu.h> 15#include <linux/cpu.h>
14#include <linux/nmi.h> 16#include <linux/nmi.h>
@@ -319,11 +321,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
319 */ 321 */
320static int watchdog(void *unused) 322static int watchdog(void *unused)
321{ 323{
322 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 324 struct sched_param param = { .sched_priority = 0 };
323 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 325 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
324 326
325 sched_setscheduler(current, SCHED_FIFO, &param);
326
327 /* initialize timestamp */ 327 /* initialize timestamp */
328 __touch_watchdog(); 328 __touch_watchdog();
329 329
@@ -349,8 +349,11 @@ static int watchdog(void *unused)
349 349
350 set_current_state(TASK_INTERRUPTIBLE); 350 set_current_state(TASK_INTERRUPTIBLE);
351 } 351 }
352 /*
353 * Drop the policy/priority elevation during thread exit to avoid a
354 * scheduling latency spike.
355 */
352 __set_current_state(TASK_RUNNING); 356 __set_current_state(TASK_RUNNING);
353 param.sched_priority = 0;
354 sched_setscheduler(current, SCHED_NORMAL, &param); 357 sched_setscheduler(current, SCHED_NORMAL, &param);
355 return 0; 358 return 0;
356} 359}
@@ -376,18 +379,20 @@ static int watchdog_nmi_enable(int cpu)
376 /* Try to register using hardware perf events */ 379 /* Try to register using hardware perf events */
377 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); 380 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
378 if (!IS_ERR(event)) { 381 if (!IS_ERR(event)) {
379 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); 382 pr_info("enabled, takes one hw-pmu counter.\n");
380 goto out_save; 383 goto out_save;
381 } 384 }
382 385
383 386
384 /* vary the KERN level based on the returned errno */ 387 /* vary the KERN level based on the returned errno */
385 if (PTR_ERR(event) == -EOPNOTSUPP) 388 if (PTR_ERR(event) == -EOPNOTSUPP)
386 printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu); 389 pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
387 else if (PTR_ERR(event) == -ENOENT) 390 else if (PTR_ERR(event) == -ENOENT)
388 printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu); 391 pr_warning("disabled (cpu%i): hardware events not enabled\n",
392 cpu);
389 else 393 else
390 printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event)); 394 pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
395 cpu, PTR_ERR(event));
391 return PTR_ERR(event); 396 return PTR_ERR(event);
392 397
393 /* success path */ 398 /* success path */
@@ -439,9 +444,10 @@ static int watchdog_enable(int cpu)
439 444
440 /* create the watchdog thread */ 445 /* create the watchdog thread */
441 if (!p) { 446 if (!p) {
447 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
442 p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu); 448 p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu);
443 if (IS_ERR(p)) { 449 if (IS_ERR(p)) {
444 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); 450 pr_err("softlockup watchdog for %i failed\n", cpu);
445 if (!err) { 451 if (!err) {
446 /* if hardlockup hasn't already set this */ 452 /* if hardlockup hasn't already set this */
447 err = PTR_ERR(p); 453 err = PTR_ERR(p);
@@ -450,6 +456,7 @@ static int watchdog_enable(int cpu)
450 } 456 }
451 goto out; 457 goto out;
452 } 458 }
459 sched_setscheduler(p, SCHED_FIFO, &param);
453 kthread_bind(p, cpu); 460 kthread_bind(p, cpu);
454 per_cpu(watchdog_touch_ts, cpu) = 0; 461 per_cpu(watchdog_touch_ts, cpu) = 0;
455 per_cpu(softlockup_watchdog, cpu) = p; 462 per_cpu(softlockup_watchdog, cpu) = p;
@@ -496,7 +503,7 @@ static void watchdog_enable_all_cpus(void)
496 watchdog_enabled = 1; 503 watchdog_enabled = 1;
497 504
498 if (!watchdog_enabled) 505 if (!watchdog_enabled)
499 printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); 506 pr_err("failed to be enabled on some cpus\n");
500 507
501} 508}
502 509