Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.hz | 2
-rw-r--r--  kernel/Makefile | 1
-rw-r--r--  kernel/audit.c | 13
-rw-r--r--  kernel/auditfilter.c | 10
-rw-r--r--  kernel/auditsc.c | 14
-rw-r--r--  kernel/capability.c | 21
-rw-r--r--  kernel/cgroup.c | 47
-rw-r--r--  kernel/cpu.c | 72
-rw-r--r--  kernel/cpuset.c | 418
-rw-r--r--  kernel/dma-coherent.c | 155
-rw-r--r--  kernel/exec_domain.c | 1
-rw-r--r--  kernel/exit.c | 185
-rw-r--r--  kernel/fork.c | 101
-rw-r--r--  kernel/hrtimer.c | 95
-rw-r--r--  kernel/irq/chip.c | 12
-rw-r--r--  kernel/irq/manage.c | 15
-rw-r--r--  kernel/irq/proc.c | 96
-rw-r--r--  kernel/kexec.c | 140
-rw-r--r--  kernel/kgdb.c | 107
-rw-r--r--  kernel/kthread.c | 2
-rw-r--r--  kernel/lockdep.c | 309
-rw-r--r--  kernel/lockdep_internals.h | 19
-rw-r--r--  kernel/lockdep_proc.c | 48
-rw-r--r--  kernel/marker.c | 12
-rw-r--r--  kernel/module.c | 37
-rw-r--r--  kernel/mutex.c | 1
-rw-r--r--  kernel/nsproxy.c | 1
-rw-r--r--  kernel/pid_namespace.c | 3
-rw-r--r--  kernel/pm_qos_params.c | 41
-rw-r--r--  kernel/posix-timers.c | 21
-rw-r--r--  kernel/power/disk.c | 13
-rw-r--r--  kernel/power/main.c | 12
-rw-r--r--  kernel/power/power.h | 2
-rw-r--r--  kernel/power/swap.c | 1
-rw-r--r--  kernel/printk.c | 8
-rw-r--r--  kernel/ptrace.c | 7
-rw-r--r--  kernel/rcuclassic.c | 341
-rw-r--r--  kernel/rcupdate.c | 1
-rw-r--r--  kernel/rcupreempt.c | 8
-rw-r--r--  kernel/rcupreempt_trace.c | 7
-rw-r--r--  kernel/relay.c | 182
-rw-r--r--  kernel/resource.c | 158
-rw-r--r--  kernel/sched.c | 610
-rw-r--r--  kernel/sched_clock.c | 224
-rw-r--r--  kernel/sched_fair.c | 235
-rw-r--r--  kernel/sched_features.h | 3
-rw-r--r--  kernel/sched_idletask.c | 6
-rw-r--r--  kernel/sched_rt.c | 81
-rw-r--r--  kernel/semaphore.c | 4
-rw-r--r--  kernel/signal.c | 103
-rw-r--r--  kernel/smp.c | 72
-rw-r--r--  kernel/softirq.c | 3
-rw-r--r--  kernel/softlockup.c | 28
-rw-r--r--  kernel/spinlock.c | 12
-rw-r--r--  kernel/stop_machine.c | 287
-rw-r--r--  kernel/sys.c | 41
-rw-r--r--  kernel/sysctl.c | 177
-rw-r--r--  kernel/time/clockevents.c | 15
-rw-r--r--  kernel/time/ntp.c | 2
-rw-r--r--  kernel/time/tick-broadcast.c | 99
-rw-r--r--  kernel/time/tick-common.c | 23
-rw-r--r--  kernel/time/tick-internal.h | 11
-rw-r--r--  kernel/time/tick-oneshot.c | 44
-rw-r--r--  kernel/time/tick-sched.c | 35
-rw-r--r--  kernel/trace/ftrace.c | 6
-rw-r--r--  kernel/trace/trace.c | 4
-rw-r--r--  kernel/trace/trace_irqsoff.c | 8
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 27
-rw-r--r--  kernel/trace/trace_sysprof.c | 6
-rw-r--r--  kernel/tsacct.c | 8
-rw-r--r--  kernel/user.c | 4
-rw-r--r--  kernel/user_namespace.c | 1
-rw-r--r--  kernel/utsname.c | 1
-rw-r--r--  kernel/utsname_sysctl.c | 1
-rw-r--r--  kernel/workqueue.c | 37
75 files changed, 3201 insertions, 1756 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 382dd5a8b2d7..94fabd534b03 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -55,4 +55,4 @@ config HZ
55 default 1000 if HZ_1000 55 default 1000 if HZ_1000
56 56
57config SCHED_HRTICK 57config SCHED_HRTICK
58 def_bool HIGH_RES_TIMERS && USE_GENERIC_SMP_HELPERS 58 def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS)
diff --git a/kernel/Makefile b/kernel/Makefile
index 54f69837d35a..4e1d7df7c3e2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -84,6 +84,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
84obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o 84obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
85obj-$(CONFIG_MARKERS) += marker.o 85obj-$(CONFIG_MARKERS) += marker.o
86obj-$(CONFIG_LATENCYTOP) += latencytop.o 86obj-$(CONFIG_LATENCYTOP) += latencytop.o
87obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
87obj-$(CONFIG_FTRACE) += trace/ 88obj-$(CONFIG_FTRACE) += trace/
88obj-$(CONFIG_TRACING) += trace/ 89obj-$(CONFIG_TRACING) += trace/
89obj-$(CONFIG_SMP) += sched_cpupri.o 90obj-$(CONFIG_SMP) += sched_cpupri.o
diff --git a/kernel/audit.c b/kernel/audit.c
index e092f1c0ce30..4414e93d8750 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -707,12 +707,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
707 if (status_get->mask & AUDIT_STATUS_ENABLED) { 707 if (status_get->mask & AUDIT_STATUS_ENABLED) {
708 err = audit_set_enabled(status_get->enabled, 708 err = audit_set_enabled(status_get->enabled,
709 loginuid, sessionid, sid); 709 loginuid, sessionid, sid);
710 if (err < 0) return err; 710 if (err < 0)
711 return err;
711 } 712 }
712 if (status_get->mask & AUDIT_STATUS_FAILURE) { 713 if (status_get->mask & AUDIT_STATUS_FAILURE) {
713 err = audit_set_failure(status_get->failure, 714 err = audit_set_failure(status_get->failure,
714 loginuid, sessionid, sid); 715 loginuid, sessionid, sid);
715 if (err < 0) return err; 716 if (err < 0)
717 return err;
716 } 718 }
717 if (status_get->mask & AUDIT_STATUS_PID) { 719 if (status_get->mask & AUDIT_STATUS_PID) {
718 int new_pid = status_get->pid; 720 int new_pid = status_get->pid;
@@ -725,9 +727,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
725 audit_pid = new_pid; 727 audit_pid = new_pid;
726 audit_nlk_pid = NETLINK_CB(skb).pid; 728 audit_nlk_pid = NETLINK_CB(skb).pid;
727 } 729 }
728 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) 730 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) {
729 err = audit_set_rate_limit(status_get->rate_limit, 731 err = audit_set_rate_limit(status_get->rate_limit,
730 loginuid, sessionid, sid); 732 loginuid, sessionid, sid);
733 if (err < 0)
734 return err;
735 }
731 if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) 736 if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
732 err = audit_set_backlog_limit(status_get->backlog_limit, 737 err = audit_set_backlog_limit(status_get->backlog_limit,
733 loginuid, sessionid, sid); 738 loginuid, sessionid, sid);
@@ -1366,7 +1371,7 @@ int audit_string_contains_control(const char *string, size_t len)
1366{ 1371{
1367 const unsigned char *p; 1372 const unsigned char *p;
1368 for (p = string; p < (const unsigned char *)string + len && *p; p++) { 1373 for (p = string; p < (const unsigned char *)string + len && *p; p++) {
1369 if (*p == '"' || *p < 0x21 || *p > 0x7f) 1374 if (*p == '"' || *p < 0x21 || *p > 0x7e)
1370 return 1; 1375 return 1;
1371 } 1376 }
1372 return 0; 1377 return 0;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 98c50cc671bb..b7d354e2b0ef 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1022,8 +1022,11 @@ static void audit_update_watch(struct audit_parent *parent,
1022 struct audit_buffer *ab; 1022 struct audit_buffer *ab;
1023 ab = audit_log_start(NULL, GFP_KERNEL, 1023 ab = audit_log_start(NULL, GFP_KERNEL,
1024 AUDIT_CONFIG_CHANGE); 1024 AUDIT_CONFIG_CHANGE);
1025 audit_log_format(ab, "auid=%u ses=%u",
1026 audit_get_loginuid(current),
1027 audit_get_sessionid(current));
1025 audit_log_format(ab, 1028 audit_log_format(ab,
1026 "op=updated rules specifying path="); 1029 " op=updated rules specifying path=");
1027 audit_log_untrustedstring(ab, owatch->path); 1030 audit_log_untrustedstring(ab, owatch->path);
1028 audit_log_format(ab, " with dev=%u ino=%lu\n", 1031 audit_log_format(ab, " with dev=%u ino=%lu\n",
1029 dev, ino); 1032 dev, ino);
@@ -1058,7 +1061,10 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
1058 struct audit_buffer *ab; 1061 struct audit_buffer *ab;
1059 ab = audit_log_start(NULL, GFP_KERNEL, 1062 ab = audit_log_start(NULL, GFP_KERNEL,
1060 AUDIT_CONFIG_CHANGE); 1063 AUDIT_CONFIG_CHANGE);
1061 audit_log_format(ab, "op=remove rule path="); 1064 audit_log_format(ab, "auid=%u ses=%u",
1065 audit_get_loginuid(current),
1066 audit_get_sessionid(current));
1067 audit_log_format(ab, " op=remove rule path=");
1062 audit_log_untrustedstring(ab, w->path); 1068 audit_log_untrustedstring(ab, w->path);
1063 if (r->filterkey) { 1069 if (r->filterkey) {
1064 audit_log_format(ab, " key="); 1070 audit_log_format(ab, " key=");
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 4699950e65bd..59cedfb040e7 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -243,7 +243,11 @@ static inline int open_arg(int flags, int mask)
243 243
244static int audit_match_perm(struct audit_context *ctx, int mask) 244static int audit_match_perm(struct audit_context *ctx, int mask)
245{ 245{
246 unsigned n = ctx->major; 246 unsigned n;
247 if (unlikely(!ctx))
248 return 0;
249
250 n = ctx->major;
247 switch (audit_classify_syscall(ctx->arch, n)) { 251 switch (audit_classify_syscall(ctx->arch, n)) {
248 case 0: /* native */ 252 case 0: /* native */
249 if ((mask & AUDIT_PERM_WRITE) && 253 if ((mask & AUDIT_PERM_WRITE) &&
@@ -284,6 +288,10 @@ static int audit_match_filetype(struct audit_context *ctx, int which)
284{ 288{
285 unsigned index = which & ~S_IFMT; 289 unsigned index = which & ~S_IFMT;
286 mode_t mode = which & S_IFMT; 290 mode_t mode = which & S_IFMT;
291
292 if (unlikely(!ctx))
293 return 0;
294
287 if (index >= ctx->name_count) 295 if (index >= ctx->name_count)
288 return 0; 296 return 0;
289 if (ctx->names[index].ino == -1) 297 if (ctx->names[index].ino == -1)
@@ -610,7 +618,7 @@ static int audit_filter_rules(struct task_struct *tsk,
610 if (!result) 618 if (!result)
611 return 0; 619 return 0;
612 } 620 }
613 if (rule->filterkey) 621 if (rule->filterkey && ctx)
614 ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); 622 ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC);
615 switch (rule->action) { 623 switch (rule->action) {
616 case AUDIT_NEVER: *state = AUDIT_DISABLED; break; 624 case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
@@ -2375,7 +2383,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
2375 struct audit_context *ctx = tsk->audit_context; 2383 struct audit_context *ctx = tsk->audit_context;
2376 2384
2377 if (audit_pid && t->tgid == audit_pid) { 2385 if (audit_pid && t->tgid == audit_pid) {
2378 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) { 2386 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
2379 audit_sig_pid = tsk->pid; 2387 audit_sig_pid = tsk->pid;
2380 if (tsk->loginuid != -1) 2388 if (tsk->loginuid != -1)
2381 audit_sig_uid = tsk->loginuid; 2389 audit_sig_uid = tsk->loginuid;
diff --git a/kernel/capability.c b/kernel/capability.c
index 0101e847603e..33e51e78c2d8 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -486,17 +486,22 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
486 return ret; 486 return ret;
487} 487}
488 488
489int __capable(struct task_struct *t, int cap) 489/**
490 * capable - Determine if the current task has a superior capability in effect
491 * @cap: The capability to be tested for
492 *
493 * Return true if the current task has the given superior capability currently
494 * available for use, false if not.
495 *
496 * This sets PF_SUPERPRIV on the task if the capability is available on the
497 * assumption that it's about to be used.
498 */
499int capable(int cap)
490{ 500{
491 if (security_capable(t, cap) == 0) { 501 if (has_capability(current, cap)) {
492 t->flags |= PF_SUPERPRIV; 502 current->flags |= PF_SUPERPRIV;
493 return 1; 503 return 1;
494 } 504 }
495 return 0; 505 return 0;
496} 506}
497
498int capable(int cap)
499{
500 return __capable(current, cap);
501}
502EXPORT_SYMBOL(capable); 507EXPORT_SYMBOL(capable);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 66ec9fd21e0c..a0123d75ec9a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -45,6 +45,7 @@
45#include <linux/delayacct.h> 45#include <linux/delayacct.h>
46#include <linux/cgroupstats.h> 46#include <linux/cgroupstats.h>
47#include <linux/hash.h> 47#include <linux/hash.h>
48#include <linux/namei.h>
48 49
49#include <asm/atomic.h> 50#include <asm/atomic.h>
50 51
@@ -354,6 +355,17 @@ static struct css_set *find_existing_css_set(
354 return NULL; 355 return NULL;
355} 356}
356 357
358static void free_cg_links(struct list_head *tmp)
359{
360 struct cg_cgroup_link *link;
361 struct cg_cgroup_link *saved_link;
362
363 list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
364 list_del(&link->cgrp_link_list);
365 kfree(link);
366 }
367}
368
357/* 369/*
358 * allocate_cg_links() allocates "count" cg_cgroup_link structures 370 * allocate_cg_links() allocates "count" cg_cgroup_link structures
359 * and chains them on tmp through their cgrp_link_list fields. Returns 0 on 371 * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
@@ -362,17 +374,12 @@ static struct css_set *find_existing_css_set(
362static int allocate_cg_links(int count, struct list_head *tmp) 374static int allocate_cg_links(int count, struct list_head *tmp)
363{ 375{
364 struct cg_cgroup_link *link; 376 struct cg_cgroup_link *link;
365 struct cg_cgroup_link *saved_link;
366 int i; 377 int i;
367 INIT_LIST_HEAD(tmp); 378 INIT_LIST_HEAD(tmp);
368 for (i = 0; i < count; i++) { 379 for (i = 0; i < count; i++) {
369 link = kmalloc(sizeof(*link), GFP_KERNEL); 380 link = kmalloc(sizeof(*link), GFP_KERNEL);
370 if (!link) { 381 if (!link) {
371 list_for_each_entry_safe(link, saved_link, tmp, 382 free_cg_links(tmp);
372 cgrp_link_list) {
373 list_del(&link->cgrp_link_list);
374 kfree(link);
375 }
376 return -ENOMEM; 383 return -ENOMEM;
377 } 384 }
378 list_add(&link->cgrp_link_list, tmp); 385 list_add(&link->cgrp_link_list, tmp);
@@ -380,17 +387,6 @@ static int allocate_cg_links(int count, struct list_head *tmp)
380 return 0; 387 return 0;
381} 388}
382 389
383static void free_cg_links(struct list_head *tmp)
384{
385 struct cg_cgroup_link *link;
386 struct cg_cgroup_link *saved_link;
387
388 list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
389 list_del(&link->cgrp_link_list);
390 kfree(link);
391 }
392}
393
394/* 390/*
395 * find_css_set() takes an existing cgroup group and a 391 * find_css_set() takes an existing cgroup group and a
396 * cgroup object, and returns a css_set object that's 392 * cgroup object, and returns a css_set object that's
@@ -955,7 +951,6 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
955 struct super_block *sb; 951 struct super_block *sb;
956 struct cgroupfs_root *root; 952 struct cgroupfs_root *root;
957 struct list_head tmp_cg_links; 953 struct list_head tmp_cg_links;
958 INIT_LIST_HEAD(&tmp_cg_links);
959 954
960 /* First find the desired set of subsystems */ 955 /* First find the desired set of subsystems */
961 ret = parse_cgroupfs_options(data, &opts); 956 ret = parse_cgroupfs_options(data, &opts);
@@ -1423,14 +1418,17 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
1423 if (buffer == NULL) 1418 if (buffer == NULL)
1424 return -ENOMEM; 1419 return -ENOMEM;
1425 } 1420 }
1426 if (nbytes && copy_from_user(buffer, userbuf, nbytes)) 1421 if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
1427 return -EFAULT; 1422 retval = -EFAULT;
1423 goto out;
1424 }
1428 1425
1429 buffer[nbytes] = 0; /* nul-terminate */ 1426 buffer[nbytes] = 0; /* nul-terminate */
1430 strstrip(buffer); 1427 strstrip(buffer);
1431 retval = cft->write_string(cgrp, cft, buffer); 1428 retval = cft->write_string(cgrp, cft, buffer);
1432 if (!retval) 1429 if (!retval)
1433 retval = nbytes; 1430 retval = nbytes;
1431out:
1434 if (buffer != local_buffer) 1432 if (buffer != local_buffer)
1435 kfree(buffer); 1433 kfree(buffer);
1436 return retval; 1434 return retval;
@@ -1529,7 +1527,7 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg)
1529 return cft->read_seq_string(state->cgroup, cft, m); 1527 return cft->read_seq_string(state->cgroup, cft, m);
1530} 1528}
1531 1529
1532int cgroup_seqfile_release(struct inode *inode, struct file *file) 1530static int cgroup_seqfile_release(struct inode *inode, struct file *file)
1533{ 1531{
1534 struct seq_file *seq = file->private_data; 1532 struct seq_file *seq = file->private_data;
1535 kfree(seq->private); 1533 kfree(seq->private);
@@ -2370,7 +2368,7 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
2370 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 2368 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
2371} 2369}
2372 2370
2373static inline int cgroup_has_css_refs(struct cgroup *cgrp) 2371static int cgroup_has_css_refs(struct cgroup *cgrp)
2374{ 2372{
2375 /* Check the reference count on each subsystem. Since we 2373 /* Check the reference count on each subsystem. Since we
2376 * already established that there are no tasks in the 2374 * already established that there are no tasks in the
@@ -2740,14 +2738,15 @@ void cgroup_fork_callbacks(struct task_struct *child)
2740 */ 2738 */
2741void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new) 2739void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
2742{ 2740{
2743 struct cgroup *oldcgrp, *newcgrp; 2741 struct cgroup *oldcgrp, *newcgrp = NULL;
2744 2742
2745 if (need_mm_owner_callback) { 2743 if (need_mm_owner_callback) {
2746 int i; 2744 int i;
2747 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 2745 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2748 struct cgroup_subsys *ss = subsys[i]; 2746 struct cgroup_subsys *ss = subsys[i];
2749 oldcgrp = task_cgroup(old, ss->subsys_id); 2747 oldcgrp = task_cgroup(old, ss->subsys_id);
2750 newcgrp = task_cgroup(new, ss->subsys_id); 2748 if (new)
2749 newcgrp = task_cgroup(new, ss->subsys_id);
2751 if (oldcgrp == newcgrp) 2750 if (oldcgrp == newcgrp)
2752 continue; 2751 continue;
2753 if (ss->mm_owner_changed) 2752 if (ss->mm_owner_changed)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 10ba5f1004a5..86d49045daed 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -199,13 +199,14 @@ static int __ref take_cpu_down(void *_param)
199 struct take_cpu_down_param *param = _param; 199 struct take_cpu_down_param *param = _param;
200 int err; 200 int err;
201 201
202 raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
203 param->hcpu);
204 /* Ensure this CPU doesn't handle any more interrupts. */ 202 /* Ensure this CPU doesn't handle any more interrupts. */
205 err = __cpu_disable(); 203 err = __cpu_disable();
206 if (err < 0) 204 if (err < 0)
207 return err; 205 return err;
208 206
207 raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
208 param->hcpu);
209
209 /* Force idle task to run as soon as we yield: it should 210 /* Force idle task to run as soon as we yield: it should
210 immediately notice cpu is offline and die quickly. */ 211 immediately notice cpu is offline and die quickly. */
211 sched_idle_next(); 212 sched_idle_next();
@@ -216,7 +217,6 @@ static int __ref take_cpu_down(void *_param)
216static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) 217static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
217{ 218{
218 int err, nr_calls = 0; 219 int err, nr_calls = 0;
219 struct task_struct *p;
220 cpumask_t old_allowed, tmp; 220 cpumask_t old_allowed, tmp;
221 void *hcpu = (void *)(long)cpu; 221 void *hcpu = (void *)(long)cpu;
222 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 222 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
@@ -249,21 +249,18 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
249 cpus_setall(tmp); 249 cpus_setall(tmp);
250 cpu_clear(cpu, tmp); 250 cpu_clear(cpu, tmp);
251 set_cpus_allowed_ptr(current, &tmp); 251 set_cpus_allowed_ptr(current, &tmp);
252 tmp = cpumask_of_cpu(cpu);
252 253
253 p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); 254 err = __stop_machine(take_cpu_down, &tcd_param, &tmp);
254 255 if (err) {
255 if (IS_ERR(p) || cpu_online(cpu)) {
256 /* CPU didn't die: tell everyone. Can't complain. */ 256 /* CPU didn't die: tell everyone. Can't complain. */
257 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 257 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
258 hcpu) == NOTIFY_BAD) 258 hcpu) == NOTIFY_BAD)
259 BUG(); 259 BUG();
260 260
261 if (IS_ERR(p)) { 261 goto out_allowed;
262 err = PTR_ERR(p);
263 goto out_allowed;
264 }
265 goto out_thread;
266 } 262 }
263 BUG_ON(cpu_online(cpu));
267 264
268 /* Wait for it to sleep (leaving idle task). */ 265 /* Wait for it to sleep (leaving idle task). */
269 while (!idle_cpu(cpu)) 266 while (!idle_cpu(cpu))
@@ -279,8 +276,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
279 276
280 check_for_tasks(cpu); 277 check_for_tasks(cpu);
281 278
282out_thread:
283 err = kthread_stop(p);
284out_allowed: 279out_allowed:
285 set_cpus_allowed_ptr(current, &old_allowed); 280 set_cpus_allowed_ptr(current, &old_allowed);
286out_release: 281out_release:
@@ -355,6 +350,8 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
355 goto out_notify; 350 goto out_notify;
356 BUG_ON(!cpu_online(cpu)); 351 BUG_ON(!cpu_online(cpu));
357 352
353 cpu_set(cpu, cpu_active_map);
354
358 /* Now call notifier in preparation. */ 355 /* Now call notifier in preparation. */
359 raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu); 356 raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu);
360 357
@@ -373,7 +370,7 @@ int __cpuinit cpu_up(unsigned int cpu)
373 if (!cpu_isset(cpu, cpu_possible_map)) { 370 if (!cpu_isset(cpu, cpu_possible_map)) {
374 printk(KERN_ERR "can't online cpu %d because it is not " 371 printk(KERN_ERR "can't online cpu %d because it is not "
375 "configured as may-hotadd at boot time\n", cpu); 372 "configured as may-hotadd at boot time\n", cpu);
376#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) || defined(CONFIG_S390) 373#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
377 printk(KERN_ERR "please check additional_cpus= boot " 374 printk(KERN_ERR "please check additional_cpus= boot "
378 "parameter\n"); 375 "parameter\n");
379#endif 376#endif
@@ -389,9 +386,6 @@ int __cpuinit cpu_up(unsigned int cpu)
389 386
390 err = _cpu_up(cpu, 0); 387 err = _cpu_up(cpu, 0);
391 388
392 if (cpu_online(cpu))
393 cpu_set(cpu, cpu_active_map);
394
395out: 389out:
396 cpu_maps_update_done(); 390 cpu_maps_update_done();
397 return err; 391 return err;
@@ -460,4 +454,48 @@ out:
460} 454}
461#endif /* CONFIG_PM_SLEEP_SMP */ 455#endif /* CONFIG_PM_SLEEP_SMP */
462 456
457/**
458 * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers
459 * @cpu: cpu that just started
460 *
461 * This function calls the cpu_chain notifiers with CPU_STARTING.
462 * It must be called by the arch code on the new cpu, before the new cpu
463 * enables interrupts and before the "boot" cpu returns from __cpu_up().
464 */
465void notify_cpu_starting(unsigned int cpu)
466{
467 unsigned long val = CPU_STARTING;
468
469#ifdef CONFIG_PM_SLEEP_SMP
470 if (cpu_isset(cpu, frozen_cpus))
471 val = CPU_STARTING_FROZEN;
472#endif /* CONFIG_PM_SLEEP_SMP */
473 raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu);
474}
475
463#endif /* CONFIG_SMP */ 476#endif /* CONFIG_SMP */
477
478/*
479 * cpu_bit_bitmap[] is a special, "compressed" data structure that
480 * represents all NR_CPUS bits binary values of 1<<nr.
481 *
482 * It is used by cpumask_of_cpu() to get a constant address to a CPU
483 * mask value that has a single bit set only.
484 */
485
486/* cpu_bit_bitmap[0] is empty - so we can back into it */
487#define MASK_DECLARE_1(x) [x+1][0] = 1UL << (x)
488#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
489#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
490#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
491
492const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {
493
494 MASK_DECLARE_8(0), MASK_DECLARE_8(8),
495 MASK_DECLARE_8(16), MASK_DECLARE_8(24),
496#if BITS_PER_LONG > 32
497 MASK_DECLARE_8(32), MASK_DECLARE_8(40),
498 MASK_DECLARE_8(48), MASK_DECLARE_8(56),
499#endif
500};
501EXPORT_SYMBOL_GPL(cpu_bit_bitmap);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 91cf85b36dd5..eab7bd6628e0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -14,6 +14,8 @@
14 * 2003-10-22 Updates by Stephen Hemminger. 14 * 2003-10-22 Updates by Stephen Hemminger.
15 * 2004 May-July Rework by Paul Jackson. 15 * 2004 May-July Rework by Paul Jackson.
16 * 2006 Rework by Paul Menage to use generic cgroups 16 * 2006 Rework by Paul Menage to use generic cgroups
17 * 2008 Rework of the scheduler domains and CPU hotplug handling
18 * by Max Krasnyansky
17 * 19 *
18 * This file is subject to the terms and conditions of the GNU General Public 20 * This file is subject to the terms and conditions of the GNU General Public
19 * License. See the file COPYING in the main directory of the Linux 21 * License. See the file COPYING in the main directory of the Linux
@@ -54,7 +56,6 @@
54#include <asm/uaccess.h> 56#include <asm/uaccess.h>
55#include <asm/atomic.h> 57#include <asm/atomic.h>
56#include <linux/mutex.h> 58#include <linux/mutex.h>
57#include <linux/kfifo.h>
58#include <linux/workqueue.h> 59#include <linux/workqueue.h>
59#include <linux/cgroup.h> 60#include <linux/cgroup.h>
60 61
@@ -237,9 +238,11 @@ static struct cpuset top_cpuset = {
237 238
238static DEFINE_MUTEX(callback_mutex); 239static DEFINE_MUTEX(callback_mutex);
239 240
240/* This is ugly, but preserves the userspace API for existing cpuset 241/*
242 * This is ugly, but preserves the userspace API for existing cpuset
241 * users. If someone tries to mount the "cpuset" filesystem, we 243 * users. If someone tries to mount the "cpuset" filesystem, we
242 * silently switch it to mount "cgroup" instead */ 244 * silently switch it to mount "cgroup" instead
245 */
243static int cpuset_get_sb(struct file_system_type *fs_type, 246static int cpuset_get_sb(struct file_system_type *fs_type,
244 int flags, const char *unused_dev_name, 247 int flags, const char *unused_dev_name,
245 void *data, struct vfsmount *mnt) 248 void *data, struct vfsmount *mnt)
@@ -474,10 +477,9 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
474} 477}
475 478
476/* 479/*
477 * Helper routine for rebuild_sched_domains(). 480 * Helper routine for generate_sched_domains().
478 * Do cpusets a, b have overlapping cpus_allowed masks? 481 * Do cpusets a, b have overlapping cpus_allowed masks?
479 */ 482 */
480
481static int cpusets_overlap(struct cpuset *a, struct cpuset *b) 483static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
482{ 484{
483 return cpus_intersects(a->cpus_allowed, b->cpus_allowed); 485 return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
@@ -486,34 +488,48 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
486static void 488static void
487update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) 489update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
488{ 490{
489 if (!dattr)
490 return;
491 if (dattr->relax_domain_level < c->relax_domain_level) 491 if (dattr->relax_domain_level < c->relax_domain_level)
492 dattr->relax_domain_level = c->relax_domain_level; 492 dattr->relax_domain_level = c->relax_domain_level;
493 return; 493 return;
494} 494}
495 495
496static void
497update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
498{
499 LIST_HEAD(q);
500
501 list_add(&c->stack_list, &q);
502 while (!list_empty(&q)) {
503 struct cpuset *cp;
504 struct cgroup *cont;
505 struct cpuset *child;
506
507 cp = list_first_entry(&q, struct cpuset, stack_list);
508 list_del(q.next);
509
510 if (cpus_empty(cp->cpus_allowed))
511 continue;
512
513 if (is_sched_load_balance(cp))
514 update_domain_attr(dattr, cp);
515
516 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
517 child = cgroup_cs(cont);
518 list_add_tail(&child->stack_list, &q);
519 }
520 }
521}
522
496/* 523/*
497 * rebuild_sched_domains() 524 * generate_sched_domains()
498 * 525 *
499 * This routine will be called to rebuild the scheduler's dynamic 526 * This function builds a partial partition of the systems CPUs
500 * sched domains: 527 * A 'partial partition' is a set of non-overlapping subsets whose
501 * - if the flag 'sched_load_balance' of any cpuset with non-empty 528 * union is a subset of that set.
502 * 'cpus' changes, 529 * The output of this function needs to be passed to kernel/sched.c
503 * - or if the 'cpus' allowed changes in any cpuset which has that 530 * partition_sched_domains() routine, which will rebuild the scheduler's
504 * flag enabled, 531 * load balancing domains (sched domains) as specified by that partial
505 * - or if the 'sched_relax_domain_level' of any cpuset which has 532 * partition.
506 * that flag enabled and with non-empty 'cpus' changes,
507 * - or if any cpuset with non-empty 'cpus' is removed,
508 * - or if a cpu gets offlined.
509 *
510 * This routine builds a partial partition of the systems CPUs
511 * (the set of non-overlappping cpumask_t's in the array 'part'
512 * below), and passes that partial partition to the kernel/sched.c
513 * partition_sched_domains() routine, which will rebuild the
514 * schedulers load balancing domains (sched domains) as specified
515 * by that partial partition. A 'partial partition' is a set of
516 * non-overlapping subsets whose union is a subset of that set.
517 * 533 *
518 * See "What is sched_load_balance" in Documentation/cpusets.txt 534 * See "What is sched_load_balance" in Documentation/cpusets.txt
519 * for a background explanation of this. 535 * for a background explanation of this.
@@ -523,16 +539,10 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
523 * domains when operating in the severe memory shortage situations 539 * domains when operating in the severe memory shortage situations
524 * that could cause allocation failures below. 540 * that could cause allocation failures below.
525 * 541 *
526 * Call with cgroup_mutex held. May take callback_mutex during 542 * Must be called with cgroup_lock held.
527 * call due to the kfifo_alloc() and kmalloc() calls. May nest
528 * a call to the get_online_cpus()/put_online_cpus() pair.
529 * Must not be called holding callback_mutex, because we must not
530 * call get_online_cpus() while holding callback_mutex. Elsewhere
531 * the kernel nests callback_mutex inside get_online_cpus() calls.
532 * So the reverse nesting would risk an ABBA deadlock.
533 * 543 *
534 * The three key local variables below are: 544 * The three key local variables below are:
535 * q - a kfifo queue of cpuset pointers, used to implement a 545 * q - a linked-list queue of cpuset pointers, used to implement a
536 * top-down scan of all cpusets. This scan loads a pointer 546 * top-down scan of all cpusets. This scan loads a pointer
537 * to each cpuset marked is_sched_load_balance into the 547 * to each cpuset marked is_sched_load_balance into the
538 * array 'csa'. For our purposes, rebuilding the schedulers 548 * array 'csa'. For our purposes, rebuilding the schedulers
@@ -564,10 +574,10 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
564 * element of the partition (one sched domain) to be passed to 574 * element of the partition (one sched domain) to be passed to
565 * partition_sched_domains(). 575 * partition_sched_domains().
566 */ 576 */
567 577static int generate_sched_domains(cpumask_t **domains,
568void rebuild_sched_domains(void) 578 struct sched_domain_attr **attributes)
569{ 579{
570 struct kfifo *q; /* queue of cpusets to be scanned */ 580 LIST_HEAD(q); /* queue of cpusets to be scanned */
571 struct cpuset *cp; /* scans q */ 581 struct cpuset *cp; /* scans q */
572 struct cpuset **csa; /* array of all cpuset ptrs */ 582 struct cpuset **csa; /* array of all cpuset ptrs */
573 int csn; /* how many cpuset ptrs in csa so far */ 583 int csn; /* how many cpuset ptrs in csa so far */
@@ -577,49 +587,58 @@ void rebuild_sched_domains(void)
577 int ndoms; /* number of sched domains in result */ 587 int ndoms; /* number of sched domains in result */
578 int nslot; /* next empty doms[] cpumask_t slot */ 588 int nslot; /* next empty doms[] cpumask_t slot */
579 589
580 q = NULL; 590 ndoms = 0;
581 csa = NULL;
582 doms = NULL; 591 doms = NULL;
583 dattr = NULL; 592 dattr = NULL;
593 csa = NULL;
584 594
585 /* Special case for the 99% of systems with one, full, sched domain */ 595 /* Special case for the 99% of systems with one, full, sched domain */
586 if (is_sched_load_balance(&top_cpuset)) { 596 if (is_sched_load_balance(&top_cpuset)) {
587 ndoms = 1;
588 doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 597 doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
589 if (!doms) 598 if (!doms)
590 goto rebuild; 599 goto done;
600
591 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); 601 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
592 if (dattr) { 602 if (dattr) {
593 *dattr = SD_ATTR_INIT; 603 *dattr = SD_ATTR_INIT;
594 update_domain_attr(dattr, &top_cpuset); 604 update_domain_attr_tree(dattr, &top_cpuset);
595 } 605 }
596 *doms = top_cpuset.cpus_allowed; 606 *doms = top_cpuset.cpus_allowed;
597 goto rebuild;
598 }
599 607
600 q = kfifo_alloc(number_of_cpusets * sizeof(cp), GFP_KERNEL, NULL); 608 ndoms = 1;
601 if (IS_ERR(q))
602 goto done; 609 goto done;
610 }
611
603 csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); 612 csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
604 if (!csa) 613 if (!csa)
605 goto done; 614 goto done;
606 csn = 0; 615 csn = 0;
607 616
608 cp = &top_cpuset; 617 list_add(&top_cpuset.stack_list, &q);
609 __kfifo_put(q, (void *)&cp, sizeof(cp)); 618 while (!list_empty(&q)) {
610 while (__kfifo_get(q, (void *)&cp, sizeof(cp))) {
611 struct cgroup *cont; 619 struct cgroup *cont;
612 struct cpuset *child; /* scans child cpusets of cp */ 620 struct cpuset *child; /* scans child cpusets of cp */
613 621
622 cp = list_first_entry(&q, struct cpuset, stack_list);
623 list_del(q.next);
624
614 if (cpus_empty(cp->cpus_allowed)) 625 if (cpus_empty(cp->cpus_allowed))
615 continue; 626 continue;
616 627
617 if (is_sched_load_balance(cp)) 628 /*
629 * All child cpusets contain a subset of the parent's cpus, so
630 * just skip them, and then we call update_domain_attr_tree()
631 * to calc relax_domain_level of the corresponding sched
632 * domain.
633 */
634 if (is_sched_load_balance(cp)) {
618 csa[csn++] = cp; 635 csa[csn++] = cp;
636 continue;
637 }
619 638
620 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { 639 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
621 child = cgroup_cs(cont); 640 child = cgroup_cs(cont);
622 __kfifo_put(q, (void *)&child, sizeof(cp)); 641 list_add_tail(&child->stack_list, &q);
623 } 642 }
624 } 643 }
625 644
@@ -650,63 +669,141 @@ restart:
650 } 669 }
651 } 670 }
652 671
653 /* Convert <csn, csa> to <ndoms, doms> */ 672 /*
673 * Now we know how many domains to create.
674 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
675 */
654 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); 676 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
655 if (!doms) 677 if (!doms) {
656 goto rebuild; 678 ndoms = 0;
679 goto done;
680 }
681
682 /*
683 * The rest of the code, including the scheduler, can deal with
684 * dattr==NULL case. No need to abort if alloc fails.
685 */
657 dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); 686 dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
658 687
659 for (nslot = 0, i = 0; i < csn; i++) { 688 for (nslot = 0, i = 0; i < csn; i++) {
660 struct cpuset *a = csa[i]; 689 struct cpuset *a = csa[i];
690 cpumask_t *dp;
661 int apn = a->pn; 691 int apn = a->pn;
662 692
663 if (apn >= 0) { 693 if (apn < 0) {
664 cpumask_t *dp = doms + nslot; 694 /* Skip completed partitions */
665 695 continue;
666 if (nslot == ndoms) { 696 }
667 static int warnings = 10; 697
668 if (warnings) { 698 dp = doms + nslot;
669 printk(KERN_WARNING 699
670 "rebuild_sched_domains confused:" 700 if (nslot == ndoms) {
671 " nslot %d, ndoms %d, csn %d, i %d," 701 static int warnings = 10;
672 " apn %d\n", 702 if (warnings) {
673 nslot, ndoms, csn, i, apn); 703 printk(KERN_WARNING
674 warnings--; 704 "rebuild_sched_domains confused:"
675 } 705 " nslot %d, ndoms %d, csn %d, i %d,"
676 continue; 706 " apn %d\n",
707 nslot, ndoms, csn, i, apn);
708 warnings--;
677 } 709 }
710 continue;
711 }
678 712
679 cpus_clear(*dp); 713 cpus_clear(*dp);
680 if (dattr) 714 if (dattr)
681 *(dattr + nslot) = SD_ATTR_INIT; 715 *(dattr + nslot) = SD_ATTR_INIT;
682 for (j = i; j < csn; j++) { 716 for (j = i; j < csn; j++) {
683 struct cpuset *b = csa[j]; 717 struct cpuset *b = csa[j];
684 718
685 if (apn == b->pn) { 719 if (apn == b->pn) {
686 cpus_or(*dp, *dp, b->cpus_allowed); 720 cpus_or(*dp, *dp, b->cpus_allowed);
687 b->pn = -1; 721 if (dattr)
688 if (dattr) 722 update_domain_attr_tree(dattr + nslot, b);
689 update_domain_attr(dattr 723
690 + nslot, b); 724 /* Done with this partition */
691 } 725 b->pn = -1;
692 } 726 }
693 nslot++;
694 } 727 }
728 nslot++;
695 } 729 }
696 BUG_ON(nslot != ndoms); 730 BUG_ON(nslot != ndoms);
697 731
698rebuild: 732done:
699 /* Have scheduler rebuild sched domains */ 733 kfree(csa);
734
735 *domains = doms;
736 *attributes = dattr;
737 return ndoms;
738}
739
740/*
741 * Rebuild scheduler domains.
742 *
743 * Call with neither cgroup_mutex held nor within get_online_cpus().
744 * Takes both cgroup_mutex and get_online_cpus().
745 *
746 * Cannot be directly called from cpuset code handling changes
747 * to the cpuset pseudo-filesystem, because it cannot be called
748 * from code that already holds cgroup_mutex.
749 */
750static void do_rebuild_sched_domains(struct work_struct *unused)
751{
752 struct sched_domain_attr *attr;
753 cpumask_t *doms;
754 int ndoms;
755
700 get_online_cpus(); 756 get_online_cpus();
701 partition_sched_domains(ndoms, doms, dattr); 757
758 /* Generate domain masks and attrs */
759 cgroup_lock();
760 ndoms = generate_sched_domains(&doms, &attr);
761 cgroup_unlock();
762
763 /* Have scheduler rebuild the domains */
764 partition_sched_domains(ndoms, doms, attr);
765
702 put_online_cpus(); 766 put_online_cpus();
767}
703 768
704done: 769static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
705 if (q && !IS_ERR(q)) 770
706 kfifo_free(q); 771/*
707 kfree(csa); 772 * Rebuild scheduler domains, asynchronously via workqueue.
708 /* Don't kfree(doms) -- partition_sched_domains() does that. */ 773 *
709 /* Don't kfree(dattr) -- partition_sched_domains() does that. */ 774 * If the flag 'sched_load_balance' of any cpuset with non-empty
775 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
776 * which has that flag enabled, or if any cpuset with a non-empty
777 * 'cpus' is removed, then call this routine to rebuild the
778 * scheduler's dynamic sched domains.
779 *
780 * The rebuild_sched_domains() and partition_sched_domains()
781 * routines must nest cgroup_lock() inside get_online_cpus(),
782 * but such cpuset changes as these must nest that locking the
783 * other way, holding cgroup_lock() for much of the code.
784 *
785 * So in order to avoid an ABBA deadlock, the cpuset code handling
786 * these user changes delegates the actual sched domain rebuilding
787 * to a separate workqueue thread, which ends up processing the
788 * above do_rebuild_sched_domains() function.
789 */
790static void async_rebuild_sched_domains(void)
791{
792 schedule_work(&rebuild_sched_domains_work);
793}
794
795/*
796 * Accomplishes the same scheduler domain rebuild as the above
797 * async_rebuild_sched_domains(), however it directly calls the
798 * rebuild routine synchronously rather than calling it via an
799 * asynchronous work thread.
800 *
801 * This can only be called from code that is not holding
802 * cgroup_mutex (not nested in a cgroup_lock() call.)
803 */
804void rebuild_sched_domains(void)
805{
806 do_rebuild_sched_domains(NULL);
710} 807}
711 808
712/** 809/**
@@ -746,37 +843,25 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
746/** 843/**
747 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. 844 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
748 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed 845 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
846 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
749 * 847 *
750 * Called with cgroup_mutex held 848 * Called with cgroup_mutex held
751 * 849 *
752 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 850 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
753 * calling callback functions for each. 851 * calling callback functions for each.
754 * 852 *
755 * Return 0 if successful, -errno if not. 853 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
854 * if @heap != NULL.
756 */ 855 */
757static int update_tasks_cpumask(struct cpuset *cs) 856static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
758{ 857{
759 struct cgroup_scanner scan; 858 struct cgroup_scanner scan;
760 struct ptr_heap heap;
761 int retval;
762
763 /*
764 * cgroup_scan_tasks() will initialize heap->gt for us.
765 * heap_init() is still needed here for we should not change
766 * cs->cpus_allowed when heap_init() fails.
767 */
768 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
769 if (retval)
770 return retval;
771 859
772 scan.cg = cs->css.cgroup; 860 scan.cg = cs->css.cgroup;
773 scan.test_task = cpuset_test_cpumask; 861 scan.test_task = cpuset_test_cpumask;
774 scan.process_task = cpuset_change_cpumask; 862 scan.process_task = cpuset_change_cpumask;
775 scan.heap = &heap; 863 scan.heap = heap;
776 retval = cgroup_scan_tasks(&scan); 864 cgroup_scan_tasks(&scan);
777
778 heap_free(&heap);
779 return retval;
780} 865}
781 866
782/** 867/**
@@ -786,6 +871,7 @@ static int update_tasks_cpumask(struct cpuset *cs)
786 */ 871 */
787static int update_cpumask(struct cpuset *cs, const char *buf) 872static int update_cpumask(struct cpuset *cs, const char *buf)
788{ 873{
874 struct ptr_heap heap;
789 struct cpuset trialcs; 875 struct cpuset trialcs;
790 int retval; 876 int retval;
791 int is_load_balanced; 877 int is_load_balanced;
@@ -820,6 +906,10 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
820 if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) 906 if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
821 return 0; 907 return 0;
822 908
909 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
910 if (retval)
911 return retval;
912
823 is_load_balanced = is_sched_load_balance(&trialcs); 913 is_load_balanced = is_sched_load_balance(&trialcs);
824 914
825 mutex_lock(&callback_mutex); 915 mutex_lock(&callback_mutex);
@@ -830,12 +920,12 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
830 * Scan tasks in the cpuset, and update the cpumasks of any 920 * Scan tasks in the cpuset, and update the cpumasks of any
831 * that need an update. 921 * that need an update.
832 */ 922 */
833 retval = update_tasks_cpumask(cs); 923 update_tasks_cpumask(cs, &heap);
834 if (retval < 0) 924
835 return retval; 925 heap_free(&heap);
836 926
837 if (is_load_balanced) 927 if (is_load_balanced)
838 rebuild_sched_domains(); 928 async_rebuild_sched_domains();
839 return 0; 929 return 0;
840} 930}
841 931
@@ -1062,7 +1152,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1062 if (val != cs->relax_domain_level) { 1152 if (val != cs->relax_domain_level) {
1063 cs->relax_domain_level = val; 1153 cs->relax_domain_level = val;
1064 if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) 1154 if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
1065 rebuild_sched_domains(); 1155 async_rebuild_sched_domains();
1066 } 1156 }
1067 1157
1068 return 0; 1158 return 0;
@@ -1103,7 +1193,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1103 mutex_unlock(&callback_mutex); 1193 mutex_unlock(&callback_mutex);
1104 1194
1105 if (cpus_nonempty && balance_flag_changed) 1195 if (cpus_nonempty && balance_flag_changed)
1106 rebuild_sched_domains(); 1196 async_rebuild_sched_domains();
1107 1197
1108 return 0; 1198 return 0;
1109} 1199}
@@ -1464,6 +1554,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
1464 default: 1554 default:
1465 BUG(); 1555 BUG();
1466 } 1556 }
1557
1558 /* Unreachable but makes gcc happy */
1559 return 0;
1467} 1560}
1468 1561
1469static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) 1562static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
@@ -1476,6 +1569,9 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
1476 default: 1569 default:
1477 BUG(); 1570 BUG();
1478 } 1571 }
1572
 1573 /* Unreachable but makes gcc happy */
1574 return 0;
1479} 1575}
1480 1576
1481 1577
@@ -1664,15 +1760,9 @@ static struct cgroup_subsys_state *cpuset_create(
1664} 1760}
1665 1761
1666/* 1762/*
1667 * Locking note on the strange update_flag() call below:
1668 *
1669 * If the cpuset being removed has its flag 'sched_load_balance' 1763 * If the cpuset being removed has its flag 'sched_load_balance'
1670 * enabled, then simulate turning sched_load_balance off, which 1764 * enabled, then simulate turning sched_load_balance off, which
1671 * will call rebuild_sched_domains(). The get_online_cpus() 1765 * will call async_rebuild_sched_domains().
1672 * call in rebuild_sched_domains() must not be made while holding
1673 * callback_mutex. Elsewhere the kernel nests callback_mutex inside
1674 * get_online_cpus() calls. So the reverse nesting would risk an
1675 * ABBA deadlock.
1676 */ 1766 */
1677 1767
1678static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) 1768static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
@@ -1691,7 +1781,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1691struct cgroup_subsys cpuset_subsys = { 1781struct cgroup_subsys cpuset_subsys = {
1692 .name = "cpuset", 1782 .name = "cpuset",
1693 .create = cpuset_create, 1783 .create = cpuset_create,
1694 .destroy = cpuset_destroy, 1784 .destroy = cpuset_destroy,
1695 .can_attach = cpuset_can_attach, 1785 .can_attach = cpuset_can_attach,
1696 .attach = cpuset_attach, 1786 .attach = cpuset_attach,
1697 .populate = cpuset_populate, 1787 .populate = cpuset_populate,
@@ -1783,7 +1873,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
1783} 1873}
1784 1874
1785/* 1875/*
1786 * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs 1876 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
1787 * or memory nodes, we need to walk over the cpuset hierarchy, 1877 * or memory nodes, we need to walk over the cpuset hierarchy,
1788 * removing that CPU or node from all cpusets. If this removes the 1878 * removing that CPU or node from all cpusets. If this removes the
1789 * last CPU or node from a cpuset, then move the tasks in the empty 1879 * last CPU or node from a cpuset, then move the tasks in the empty
@@ -1831,26 +1921,23 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
1831 * that has tasks along with an empty 'mems'. But if we did see such 1921 * that has tasks along with an empty 'mems'. But if we did see such
1832 * a cpuset, we'd handle it just like we do if its 'cpus' was empty. 1922 * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
1833 */ 1923 */
1834static void scan_for_empty_cpusets(const struct cpuset *root) 1924static void scan_for_empty_cpusets(struct cpuset *root)
1835{ 1925{
1926 LIST_HEAD(queue);
1836 struct cpuset *cp; /* scans cpusets being updated */ 1927 struct cpuset *cp; /* scans cpusets being updated */
1837 struct cpuset *child; /* scans child cpusets of cp */ 1928 struct cpuset *child; /* scans child cpusets of cp */
1838 struct list_head queue;
1839 struct cgroup *cont; 1929 struct cgroup *cont;
1840 nodemask_t oldmems; 1930 nodemask_t oldmems;
1841 1931
1842 INIT_LIST_HEAD(&queue);
1843
1844 list_add_tail((struct list_head *)&root->stack_list, &queue); 1932 list_add_tail((struct list_head *)&root->stack_list, &queue);
1845 1933
1846 while (!list_empty(&queue)) { 1934 while (!list_empty(&queue)) {
1847 cp = container_of(queue.next, struct cpuset, stack_list); 1935 cp = list_first_entry(&queue, struct cpuset, stack_list);
1848 list_del(queue.next); 1936 list_del(queue.next);
1849 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { 1937 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
1850 child = cgroup_cs(cont); 1938 child = cgroup_cs(cont);
1851 list_add_tail(&child->stack_list, &queue); 1939 list_add_tail(&child->stack_list, &queue);
1852 } 1940 }
1853 cont = cp->css.cgroup;
1854 1941
1855 /* Continue past cpusets with all cpus, mems online */ 1942 /* Continue past cpusets with all cpus, mems online */
1856 if (cpus_subset(cp->cpus_allowed, cpu_online_map) && 1943 if (cpus_subset(cp->cpus_allowed, cpu_online_map) &&
@@ -1871,42 +1958,13 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
1871 nodes_empty(cp->mems_allowed)) 1958 nodes_empty(cp->mems_allowed))
1872 remove_tasks_in_empty_cpuset(cp); 1959 remove_tasks_in_empty_cpuset(cp);
1873 else { 1960 else {
1874 update_tasks_cpumask(cp); 1961 update_tasks_cpumask(cp, NULL);
1875 update_tasks_nodemask(cp, &oldmems); 1962 update_tasks_nodemask(cp, &oldmems);
1876 } 1963 }
1877 } 1964 }
1878} 1965}
1879 1966
1880/* 1967/*
1881 * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
1882 * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to
1883 * track what's online after any CPU or memory node hotplug or unplug event.
1884 *
1885 * Since there are two callers of this routine, one for CPU hotplug
1886 * events and one for memory node hotplug events, we could have coded
1887 * two separate routines here. We code it as a single common routine
1888 * in order to minimize text size.
1889 */
1890
1891static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
1892{
1893 cgroup_lock();
1894
1895 top_cpuset.cpus_allowed = cpu_online_map;
1896 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
1897 scan_for_empty_cpusets(&top_cpuset);
1898
1899 /*
1900 * Scheduler destroys domains on hotplug events.
1901 * Rebuild them based on the current settings.
1902 */
1903 if (rebuild_sd)
1904 rebuild_sched_domains();
1905
1906 cgroup_unlock();
1907}
1908
1909/*
1910 * The top_cpuset tracks what CPUs and Memory Nodes are online, 1968 * The top_cpuset tracks what CPUs and Memory Nodes are online,
1911 * period. This is necessary in order to make cpusets transparent 1969 * period. This is necessary in order to make cpusets transparent
1912 * (of no affect) on systems that are actively using CPU hotplug 1970 * (of no affect) on systems that are actively using CPU hotplug
@@ -1914,40 +1972,52 @@ static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
1914 * 1972 *
1915 * This routine ensures that top_cpuset.cpus_allowed tracks 1973 * This routine ensures that top_cpuset.cpus_allowed tracks
1916 * cpu_online_map on each CPU hotplug (cpuhp) event. 1974 * cpu_online_map on each CPU hotplug (cpuhp) event.
1975 *
1976 * Called within get_online_cpus(). Needs to call cgroup_lock()
1977 * before calling generate_sched_domains().
1917 */ 1978 */
1918 1979static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
1919static int cpuset_handle_cpuhp(struct notifier_block *unused_nb,
1920 unsigned long phase, void *unused_cpu) 1980 unsigned long phase, void *unused_cpu)
1921{ 1981{
1982 struct sched_domain_attr *attr;
1983 cpumask_t *doms;
1984 int ndoms;
1985
1922 switch (phase) { 1986 switch (phase) {
1923 case CPU_UP_CANCELED:
1924 case CPU_UP_CANCELED_FROZEN:
1925 case CPU_DOWN_FAILED:
1926 case CPU_DOWN_FAILED_FROZEN:
1927 case CPU_ONLINE: 1987 case CPU_ONLINE:
1928 case CPU_ONLINE_FROZEN: 1988 case CPU_ONLINE_FROZEN:
1929 case CPU_DEAD: 1989 case CPU_DEAD:
1930 case CPU_DEAD_FROZEN: 1990 case CPU_DEAD_FROZEN:
1931 common_cpu_mem_hotplug_unplug(1);
1932 break; 1991 break;
1992
1933 default: 1993 default:
1934 return NOTIFY_DONE; 1994 return NOTIFY_DONE;
1935 } 1995 }
1936 1996
1997 cgroup_lock();
1998 top_cpuset.cpus_allowed = cpu_online_map;
1999 scan_for_empty_cpusets(&top_cpuset);
2000 ndoms = generate_sched_domains(&doms, &attr);
2001 cgroup_unlock();
2002
2003 /* Have scheduler rebuild the domains */
2004 partition_sched_domains(ndoms, doms, attr);
2005
1937 return NOTIFY_OK; 2006 return NOTIFY_OK;
1938} 2007}
1939 2008
1940#ifdef CONFIG_MEMORY_HOTPLUG 2009#ifdef CONFIG_MEMORY_HOTPLUG
1941/* 2010/*
1942 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. 2011 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
1943 * Call this routine anytime after you change 2012 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
1944 * node_states[N_HIGH_MEMORY]. 2013 * See also the previous routine cpuset_track_online_cpus().
1945 * See also the previous routine cpuset_handle_cpuhp().
1946 */ 2014 */
1947
1948void cpuset_track_online_nodes(void) 2015void cpuset_track_online_nodes(void)
1949{ 2016{
1950 common_cpu_mem_hotplug_unplug(0); 2017 cgroup_lock();
2018 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2019 scan_for_empty_cpusets(&top_cpuset);
2020 cgroup_unlock();
1951} 2021}
1952#endif 2022#endif
1953 2023
@@ -1962,7 +2032,7 @@ void __init cpuset_init_smp(void)
1962 top_cpuset.cpus_allowed = cpu_online_map; 2032 top_cpuset.cpus_allowed = cpu_online_map;
1963 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2033 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
1964 2034
1965 hotcpu_notifier(cpuset_handle_cpuhp, 0); 2035 hotcpu_notifier(cpuset_track_online_cpus, 0);
1966} 2036}
1967 2037
1968/** 2038/**
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
new file mode 100644
index 000000000000..f013a0c2e111
--- /dev/null
+++ b/kernel/dma-coherent.c
@@ -0,0 +1,155 @@
1/*
2 * Coherent per-device memory handling.
3 * Borrowed from i386
4 */
5#include <linux/kernel.h>
6#include <linux/dma-mapping.h>
7
8struct dma_coherent_mem {
9 void *virt_base;
10 u32 device_base;
11 int size;
12 int flags;
13 unsigned long *bitmap;
14};
15
16int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
17 dma_addr_t device_addr, size_t size, int flags)
18{
19 void __iomem *mem_base = NULL;
20 int pages = size >> PAGE_SHIFT;
21 int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
22
23 if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
24 goto out;
25 if (!size)
26 goto out;
27 if (dev->dma_mem)
28 goto out;
29
30 /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
31
32 mem_base = ioremap(bus_addr, size);
33 if (!mem_base)
34 goto out;
35
36 dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
37 if (!dev->dma_mem)
38 goto out;
39 dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
40 if (!dev->dma_mem->bitmap)
41 goto free1_out;
42
43 dev->dma_mem->virt_base = mem_base;
44 dev->dma_mem->device_base = device_addr;
45 dev->dma_mem->size = pages;
46 dev->dma_mem->flags = flags;
47
48 if (flags & DMA_MEMORY_MAP)
49 return DMA_MEMORY_MAP;
50
51 return DMA_MEMORY_IO;
52
53 free1_out:
54 kfree(dev->dma_mem);
55 out:
56 if (mem_base)
57 iounmap(mem_base);
58 return 0;
59}
60EXPORT_SYMBOL(dma_declare_coherent_memory);
61
62void dma_release_declared_memory(struct device *dev)
63{
64 struct dma_coherent_mem *mem = dev->dma_mem;
65
66 if (!mem)
67 return;
68 dev->dma_mem = NULL;
69 iounmap(mem->virt_base);
70 kfree(mem->bitmap);
71 kfree(mem);
72}
73EXPORT_SYMBOL(dma_release_declared_memory);
74
75void *dma_mark_declared_memory_occupied(struct device *dev,
76 dma_addr_t device_addr, size_t size)
77{
78 struct dma_coherent_mem *mem = dev->dma_mem;
79 int pos, err;
80
81 size += device_addr & ~PAGE_MASK;
82
83 if (!mem)
84 return ERR_PTR(-EINVAL);
85
86 pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
87 err = bitmap_allocate_region(mem->bitmap, pos, get_order(size));
88 if (err != 0)
89 return ERR_PTR(err);
90 return mem->virt_base + (pos << PAGE_SHIFT);
91}
92EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
93
94/**
95 * dma_alloc_from_coherent() - try to allocate memory from the per-device coherent area
96 *
97 * @dev: device from which we allocate memory
98 * @size: size of requested memory area
99 * @dma_handle: This will be filled with the correct dma handle
100 * @ret: This pointer will be filled with the virtual address
101 * to allocated area.
102 *
103 * This function should be only called from per-arch dma_alloc_coherent()
104 * to support allocation from per-device coherent memory pools.
105 *
106 * Returns 0 if dma_alloc_coherent should continue with allocating from
107 * generic memory areas, or !0 if dma_alloc_coherent should return @ret.
108 */
109int dma_alloc_from_coherent(struct device *dev, ssize_t size,
110 dma_addr_t *dma_handle, void **ret)
111{
112 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
113 int order = get_order(size);
114
115 if (mem) {
116 int page = bitmap_find_free_region(mem->bitmap, mem->size,
117 order);
118 if (page >= 0) {
119 *dma_handle = mem->device_base + (page << PAGE_SHIFT);
120 *ret = mem->virt_base + (page << PAGE_SHIFT);
121 memset(*ret, 0, size);
122 } else if (mem->flags & DMA_MEMORY_EXCLUSIVE)
123 *ret = NULL;
124 }
125 return (mem != NULL);
126}
127EXPORT_SYMBOL(dma_alloc_from_coherent);
128
129/**
130 * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool
131 * @dev: device from which the memory was allocated
132 * @order: the order of pages allocated
133 * @vaddr: virtual address of allocated pages
134 *
135 * This checks whether the memory was allocated from the per-device
136 * coherent memory pool and if so, releases that memory.
137 *
138 * Returns 1 if we correctly released the memory, or 0 if
139 * dma_release_coherent() should proceed with releasing memory from
140 * generic pools.
141 */
142int dma_release_from_coherent(struct device *dev, int order, void *vaddr)
143{
144 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
145
146 if (mem && vaddr >= mem->virt_base && vaddr <
147 (mem->virt_base + (mem->size << PAGE_SHIFT))) {
148 int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
149
150 bitmap_release_region(mem->bitmap, page, order);
151 return 1;
152 }
153 return 0;
154}
155EXPORT_SYMBOL(dma_release_from_coherent);
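
The kernel-doc above describes dma_alloc_from_coherent()/dma_release_from_coherent() as helpers to be consulted from an architecture's dma_alloc_coherent()/dma_free_coherent(). A minimal sketch of such a caller, assuming a hypothetical arch_dma_alloc_coherent() with a page-allocator fallback (the function name and the fallback are illustrative, not part of this patch):

/* Sketch only: consult the per-device pool first, as the kernel-doc
 * above suggests, then fall back to a generic allocation. */
#include <linux/dma-mapping.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <asm/io.h>

void *arch_dma_alloc_coherent(struct device *dev, size_t size,
                              dma_addr_t *dma_handle, gfp_t gfp)
{
        void *ret;

        /* Non-zero return: the per-device pool handled the request
         * (ret may still be NULL if the pool is exclusive and full). */
        if (dma_alloc_from_coherent(dev, size, dma_handle, &ret))
                return ret;

        /* Illustrative fallback; real architectures do more here. */
        ret = (void *)__get_free_pages(gfp, get_order(size));
        if (ret) {
                memset(ret, 0, size);
                *dma_handle = virt_to_phys(ret);
        }
        return ret;
}

dma_release_from_coherent() is the mirror call on the free path: it returns 1 when the pool owned the memory, so the arch free routine knows not to hand the pages back to the generic allocator.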
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index c1ef192aa655..0d407e886735 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -168,7 +168,6 @@ __set_personality(u_long personality)
168 current->personality = personality; 168 current->personality = personality;
169 oep = current_thread_info()->exec_domain; 169 oep = current_thread_info()->exec_domain;
170 current_thread_info()->exec_domain = ep; 170 current_thread_info()->exec_domain = ep;
171 set_fs_altroot();
172 171
173 module_put(oep->module); 172 module_put(oep->module);
174 return 0; 173 return 0;
diff --git a/kernel/exit.c b/kernel/exit.c
index ad933bb29ec7..85a83c831856 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -46,6 +46,7 @@
46#include <linux/resource.h> 46#include <linux/resource.h>
47#include <linux/blkdev.h> 47#include <linux/blkdev.h>
48#include <linux/task_io_accounting_ops.h> 48#include <linux/task_io_accounting_ops.h>
49#include <linux/tracehook.h>
49 50
50#include <asm/uaccess.h> 51#include <asm/uaccess.h>
51#include <asm/unistd.h> 52#include <asm/unistd.h>
@@ -111,27 +112,16 @@ static void __exit_signal(struct task_struct *tsk)
111 * We won't ever get here for the group leader, since it 112 * We won't ever get here for the group leader, since it
112 * will have been the last reference on the signal_struct. 113 * will have been the last reference on the signal_struct.
113 */ 114 */
114 sig->utime = cputime_add(sig->utime, tsk->utime); 115 sig->utime = cputime_add(sig->utime, task_utime(tsk));
115 sig->stime = cputime_add(sig->stime, tsk->stime); 116 sig->stime = cputime_add(sig->stime, task_stime(tsk));
116 sig->gtime = cputime_add(sig->gtime, tsk->gtime); 117 sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
117 sig->min_flt += tsk->min_flt; 118 sig->min_flt += tsk->min_flt;
118 sig->maj_flt += tsk->maj_flt; 119 sig->maj_flt += tsk->maj_flt;
119 sig->nvcsw += tsk->nvcsw; 120 sig->nvcsw += tsk->nvcsw;
120 sig->nivcsw += tsk->nivcsw; 121 sig->nivcsw += tsk->nivcsw;
121 sig->inblock += task_io_get_inblock(tsk); 122 sig->inblock += task_io_get_inblock(tsk);
122 sig->oublock += task_io_get_oublock(tsk); 123 sig->oublock += task_io_get_oublock(tsk);
123#ifdef CONFIG_TASK_XACCT 124 task_io_accounting_add(&sig->ioac, &tsk->ioac);
124 sig->rchar += tsk->rchar;
125 sig->wchar += tsk->wchar;
126 sig->syscr += tsk->syscr;
127 sig->syscw += tsk->syscw;
128#endif /* CONFIG_TASK_XACCT */
129#ifdef CONFIG_TASK_IO_ACCOUNTING
130 sig->ioac.read_bytes += tsk->ioac.read_bytes;
131 sig->ioac.write_bytes += tsk->ioac.write_bytes;
132 sig->ioac.cancelled_write_bytes +=
133 tsk->ioac.cancelled_write_bytes;
134#endif /* CONFIG_TASK_IO_ACCOUNTING */
135 sig->sum_sched_runtime += tsk->se.sum_exec_runtime; 125 sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
136 sig = NULL; /* Marker for below. */ 126 sig = NULL; /* Marker for below. */
137 } 127 }
@@ -162,27 +152,17 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
162 put_task_struct(container_of(rhp, struct task_struct, rcu)); 152 put_task_struct(container_of(rhp, struct task_struct, rcu));
163} 153}
164 154
165/*
166 * Do final ptrace-related cleanup of a zombie being reaped.
167 *
168 * Called with write_lock(&tasklist_lock) held.
169 */
170static void ptrace_release_task(struct task_struct *p)
171{
172 BUG_ON(!list_empty(&p->ptraced));
173 ptrace_unlink(p);
174 BUG_ON(!list_empty(&p->ptrace_entry));
175}
176 155
177void release_task(struct task_struct * p) 156void release_task(struct task_struct * p)
178{ 157{
179 struct task_struct *leader; 158 struct task_struct *leader;
180 int zap_leader; 159 int zap_leader;
181repeat: 160repeat:
161 tracehook_prepare_release_task(p);
182 atomic_dec(&p->user->processes); 162 atomic_dec(&p->user->processes);
183 proc_flush_task(p); 163 proc_flush_task(p);
184 write_lock_irq(&tasklist_lock); 164 write_lock_irq(&tasklist_lock);
185 ptrace_release_task(p); 165 tracehook_finish_release_task(p);
186 __exit_signal(p); 166 __exit_signal(p);
187 167
188 /* 168 /*
@@ -204,6 +184,13 @@ repeat:
204 * that case. 184 * that case.
205 */ 185 */
206 zap_leader = task_detached(leader); 186 zap_leader = task_detached(leader);
187
188 /*
189 * This maintains the invariant that release_task()
190 * only runs on a task in EXIT_DEAD, just for sanity.
191 */
192 if (zap_leader)
193 leader->exit_state = EXIT_DEAD;
207 } 194 }
208 195
209 write_unlock_irq(&tasklist_lock); 196 write_unlock_irq(&tasklist_lock);
@@ -567,8 +554,6 @@ void put_fs_struct(struct fs_struct *fs)
567 if (atomic_dec_and_test(&fs->count)) { 554 if (atomic_dec_and_test(&fs->count)) {
568 path_put(&fs->root); 555 path_put(&fs->root);
569 path_put(&fs->pwd); 556 path_put(&fs->pwd);
570 if (fs->altroot.dentry)
571 path_put(&fs->altroot);
572 kmem_cache_free(fs_cachep, fs); 557 kmem_cache_free(fs_cachep, fs);
573 } 558 }
574} 559}
@@ -598,8 +583,6 @@ mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
598 * If there are other users of the mm and the owner (us) is exiting 583 * If there are other users of the mm and the owner (us) is exiting
599 * we need to find a new owner to take on the responsibility. 584 * we need to find a new owner to take on the responsibility.
600 */ 585 */
601 if (!mm)
602 return 0;
603 if (atomic_read(&mm->mm_users) <= 1) 586 if (atomic_read(&mm->mm_users) <= 1)
604 return 0; 587 return 0;
605 if (mm->owner != p) 588 if (mm->owner != p)
@@ -642,6 +625,16 @@ retry:
642 } while_each_thread(g, c); 625 } while_each_thread(g, c);
643 626
644 read_unlock(&tasklist_lock); 627 read_unlock(&tasklist_lock);
628 /*
629 * We found no owner yet mm_users > 1: this implies that we are
630 * most likely racing with swapoff (try_to_unuse()) or /proc or
631 * ptrace or page migration (get_task_mm()). Mark owner as NULL,
632 * so that subsystems can understand the callback and take action.
633 */
634 down_write(&mm->mmap_sem);
635 cgroup_mm_owner_callbacks(mm->owner, NULL);
636 mm->owner = NULL;
637 up_write(&mm->mmap_sem);
645 return; 638 return;
646 639
647assign_new_owner: 640assign_new_owner:
@@ -846,26 +839,50 @@ static void reparent_thread(struct task_struct *p, struct task_struct *father)
846 * the child reaper process (ie "init") in our pid 839 * the child reaper process (ie "init") in our pid
847 * space. 840 * space.
848 */ 841 */
842static struct task_struct *find_new_reaper(struct task_struct *father)
843{
844 struct pid_namespace *pid_ns = task_active_pid_ns(father);
845 struct task_struct *thread;
846
847 thread = father;
848 while_each_thread(father, thread) {
849 if (thread->flags & PF_EXITING)
850 continue;
851 if (unlikely(pid_ns->child_reaper == father))
852 pid_ns->child_reaper = thread;
853 return thread;
854 }
855
856 if (unlikely(pid_ns->child_reaper == father)) {
857 write_unlock_irq(&tasklist_lock);
858 if (unlikely(pid_ns == &init_pid_ns))
859 panic("Attempted to kill init!");
860
861 zap_pid_ns_processes(pid_ns);
862 write_lock_irq(&tasklist_lock);
863 /*
864 * We cannot clear ->child_reaper or leave it alone.
865 * There may be stealth EXIT_DEAD tasks on ->children,
866 * forget_original_parent() must move them somewhere.
867 */
868 pid_ns->child_reaper = init_pid_ns.child_reaper;
869 }
870
871 return pid_ns->child_reaper;
872}
873
849static void forget_original_parent(struct task_struct *father) 874static void forget_original_parent(struct task_struct *father)
850{ 875{
851 struct task_struct *p, *n, *reaper = father; 876 struct task_struct *p, *n, *reaper;
852 LIST_HEAD(ptrace_dead); 877 LIST_HEAD(ptrace_dead);
853 878
854 write_lock_irq(&tasklist_lock); 879 write_lock_irq(&tasklist_lock);
855 880 reaper = find_new_reaper(father);
856 /* 881 /*
857 * First clean up ptrace if we were using it. 882 * First clean up ptrace if we were using it.
858 */ 883 */
859 ptrace_exit(father, &ptrace_dead); 884 ptrace_exit(father, &ptrace_dead);
860 885
861 do {
862 reaper = next_thread(reaper);
863 if (reaper == father) {
864 reaper = task_child_reaper(father);
865 break;
866 }
867 } while (reaper->flags & PF_EXITING);
868
869 list_for_each_entry_safe(p, n, &father->children, sibling) { 886 list_for_each_entry_safe(p, n, &father->children, sibling) {
870 p->real_parent = reaper; 887 p->real_parent = reaper;
871 if (p->parent == father) { 888 if (p->parent == father) {
@@ -887,7 +904,8 @@ static void forget_original_parent(struct task_struct *father)
887 */ 904 */
888static void exit_notify(struct task_struct *tsk, int group_dead) 905static void exit_notify(struct task_struct *tsk, int group_dead)
889{ 906{
890 int state; 907 int signal;
908 void *cookie;
891 909
892 /* 910 /*
893 * This does two things: 911 * This does two things:
@@ -924,33 +942,24 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
924 !capable(CAP_KILL)) 942 !capable(CAP_KILL))
925 tsk->exit_signal = SIGCHLD; 943 tsk->exit_signal = SIGCHLD;
926 944
927 /* If something other than our normal parent is ptracing us, then 945 signal = tracehook_notify_death(tsk, &cookie, group_dead);
928 * send it a SIGCHLD instead of honoring exit_signal. exit_signal 946 if (signal >= 0)
929 * only has special meaning to our real parent. 947 signal = do_notify_parent(tsk, signal);
930 */
931 if (!task_detached(tsk) && thread_group_empty(tsk)) {
932 int signal = ptrace_reparented(tsk) ?
933 SIGCHLD : tsk->exit_signal;
934 do_notify_parent(tsk, signal);
935 } else if (tsk->ptrace) {
936 do_notify_parent(tsk, SIGCHLD);
937 }
938 948
939 state = EXIT_ZOMBIE; 949 tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
940 if (task_detached(tsk) && likely(!tsk->ptrace))
941 state = EXIT_DEAD;
942 tsk->exit_state = state;
943 950
944 /* mt-exec, de_thread() is waiting for us */ 951 /* mt-exec, de_thread() is waiting for us */
945 if (thread_group_leader(tsk) && 952 if (thread_group_leader(tsk) &&
946 tsk->signal->notify_count < 0 && 953 tsk->signal->group_exit_task &&
947 tsk->signal->group_exit_task) 954 tsk->signal->notify_count < 0)
948 wake_up_process(tsk->signal->group_exit_task); 955 wake_up_process(tsk->signal->group_exit_task);
949 956
950 write_unlock_irq(&tasklist_lock); 957 write_unlock_irq(&tasklist_lock);
951 958
959 tracehook_report_death(tsk, signal, cookie, group_dead);
960
952 /* If the process is dead, release it - nobody will wait for it */ 961 /* If the process is dead, release it - nobody will wait for it */
953 if (state == EXIT_DEAD) 962 if (signal == DEATH_REAP)
954 release_task(tsk); 963 release_task(tsk);
955} 964}
956 965
@@ -982,39 +991,6 @@ static void check_stack_usage(void)
982static inline void check_stack_usage(void) {} 991static inline void check_stack_usage(void) {}
983#endif 992#endif
984 993
985static inline void exit_child_reaper(struct task_struct *tsk)
986{
987 if (likely(tsk->group_leader != task_child_reaper(tsk)))
988 return;
989
990 if (tsk->nsproxy->pid_ns == &init_pid_ns)
991 panic("Attempted to kill init!");
992
993 /*
994 * @tsk is the last thread in the 'cgroup-init' and is exiting.
995 * Terminate all remaining processes in the namespace and reap them
996 * before exiting @tsk.
997 *
998 * Note that @tsk (last thread of cgroup-init) may not necessarily
999 * be the child-reaper (i.e main thread of cgroup-init) of the
1000 * namespace i.e the child_reaper may have already exited.
1001 *
1002 * Even after a child_reaper exits, we let it inherit orphaned children,
1003 * because, pid_ns->child_reaper remains valid as long as there is
1004 * at least one living sub-thread in the cgroup init.
1005
1006 * This living sub-thread of the cgroup-init will be notified when
1007 * a child inherited by the 'child-reaper' exits (do_notify_parent()
1008 * uses __group_send_sig_info()). Further, when reaping child processes,
1009 * do_wait() iterates over children of all living sub threads.
1010
1011 * i.e even though 'child_reaper' thread is listed as the parent of the
1012 * orphaned children, any living sub-thread in the cgroup-init can
1013 * perform the role of the child_reaper.
1014 */
1015 zap_pid_ns_processes(tsk->nsproxy->pid_ns);
1016}
1017
1018NORET_TYPE void do_exit(long code) 994NORET_TYPE void do_exit(long code)
1019{ 995{
1020 struct task_struct *tsk = current; 996 struct task_struct *tsk = current;
@@ -1029,10 +1005,7 @@ NORET_TYPE void do_exit(long code)
1029 if (unlikely(!tsk->pid)) 1005 if (unlikely(!tsk->pid))
1030 panic("Attempted to kill the idle task!"); 1006 panic("Attempted to kill the idle task!");
1031 1007
1032 if (unlikely(current->ptrace & PT_TRACE_EXIT)) { 1008 tracehook_report_exit(&code);
1033 current->ptrace_message = code;
1034 ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP);
1035 }
1036 1009
1037 /* 1010 /*
1038 * We're taking recursive faults here in do_exit. Safest is to just 1011 * We're taking recursive faults here in do_exit. Safest is to just
@@ -1077,7 +1050,6 @@ NORET_TYPE void do_exit(long code)
1077 } 1050 }
1078 group_dead = atomic_dec_and_test(&tsk->signal->live); 1051 group_dead = atomic_dec_and_test(&tsk->signal->live);
1079 if (group_dead) { 1052 if (group_dead) {
1080 exit_child_reaper(tsk);
1081 hrtimer_cancel(&tsk->signal->real_timer); 1053 hrtimer_cancel(&tsk->signal->real_timer);
1082 exit_itimers(tsk->signal); 1054 exit_itimers(tsk->signal);
1083 } 1055 }
@@ -1378,21 +1350,8 @@ static int wait_task_zombie(struct task_struct *p, int options,
1378 psig->coublock += 1350 psig->coublock +=
1379 task_io_get_oublock(p) + 1351 task_io_get_oublock(p) +
1380 sig->oublock + sig->coublock; 1352 sig->oublock + sig->coublock;
1381#ifdef CONFIG_TASK_XACCT 1353 task_io_accounting_add(&psig->ioac, &p->ioac);
1382 psig->rchar += p->rchar + sig->rchar; 1354 task_io_accounting_add(&psig->ioac, &sig->ioac);
1383 psig->wchar += p->wchar + sig->wchar;
1384 psig->syscr += p->syscr + sig->syscr;
1385 psig->syscw += p->syscw + sig->syscw;
1386#endif /* CONFIG_TASK_XACCT */
1387#ifdef CONFIG_TASK_IO_ACCOUNTING
1388 psig->ioac.read_bytes +=
1389 p->ioac.read_bytes + sig->ioac.read_bytes;
1390 psig->ioac.write_bytes +=
1391 p->ioac.write_bytes + sig->ioac.write_bytes;
1392 psig->ioac.cancelled_write_bytes +=
1393 p->ioac.cancelled_write_bytes +
1394 sig->ioac.cancelled_write_bytes;
1395#endif /* CONFIG_TASK_IO_ACCOUNTING */
1396 spin_unlock_irq(&p->parent->sighand->siglock); 1355 spin_unlock_irq(&p->parent->sighand->siglock);
1397 } 1356 }
1398 1357
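
Both exit.c hunks above collapse the open-coded CONFIG_TASK_XACCT / CONFIG_TASK_IO_ACCOUNTING blocks into a single task_io_accounting_add() call on the consolidated tsk->ioac structure. The real helper lives in include/linux/task_io_accounting_ops.h; the sketch below only illustrates the shape of such a helper, using the field names visible in the removed blocks (the struct layout is an assumption, not quoted from this patch):

/* Illustrative only: one helper sums every I/O accounting field, so the
 * call sites above no longer need two #ifdef blocks each. */
static inline void ioac_add_sketch(struct task_io_accounting *dst,
                                   const struct task_io_accounting *src)
{
#ifdef CONFIG_TASK_XACCT
        dst->rchar += src->rchar;       /* bytes read */
        dst->wchar += src->wchar;       /* bytes written */
        dst->syscr += src->syscr;       /* read syscalls */
        dst->syscw += src->syscw;       /* write syscalls */
#endif
#ifdef CONFIG_TASK_IO_ACCOUNTING
        dst->read_bytes += src->read_bytes;
        dst->write_bytes += src->write_bytes;
        dst->cancelled_write_bytes += src->cancelled_write_bytes;
#endif
}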
diff --git a/kernel/fork.c b/kernel/fork.c
index b99d73e971a4..7ce2ebe84796 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -27,6 +27,7 @@
27#include <linux/key.h> 27#include <linux/key.h>
28#include <linux/binfmts.h> 28#include <linux/binfmts.h>
29#include <linux/mman.h> 29#include <linux/mman.h>
30#include <linux/mmu_notifier.h>
30#include <linux/fs.h> 31#include <linux/fs.h>
31#include <linux/nsproxy.h> 32#include <linux/nsproxy.h>
32#include <linux/capability.h> 33#include <linux/capability.h>
@@ -37,6 +38,7 @@
37#include <linux/swap.h> 38#include <linux/swap.h>
38#include <linux/syscalls.h> 39#include <linux/syscalls.h>
39#include <linux/jiffies.h> 40#include <linux/jiffies.h>
41#include <linux/tracehook.h>
40#include <linux/futex.h> 42#include <linux/futex.h>
41#include <linux/task_io_accounting_ops.h> 43#include <linux/task_io_accounting_ops.h>
42#include <linux/rcupdate.h> 44#include <linux/rcupdate.h>
@@ -413,6 +415,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
413 415
414 if (likely(!mm_alloc_pgd(mm))) { 416 if (likely(!mm_alloc_pgd(mm))) {
415 mm->def_flags = 0; 417 mm->def_flags = 0;
418 mmu_notifier_mm_init(mm);
416 return mm; 419 return mm;
417 } 420 }
418 421
@@ -445,6 +448,7 @@ void __mmdrop(struct mm_struct *mm)
445 BUG_ON(mm == &init_mm); 448 BUG_ON(mm == &init_mm);
446 mm_free_pgd(mm); 449 mm_free_pgd(mm);
447 destroy_context(mm); 450 destroy_context(mm);
451 mmu_notifier_mm_destroy(mm);
448 free_mm(mm); 452 free_mm(mm);
449} 453}
450EXPORT_SYMBOL_GPL(__mmdrop); 454EXPORT_SYMBOL_GPL(__mmdrop);
@@ -656,13 +660,6 @@ static struct fs_struct *__copy_fs_struct(struct fs_struct *old)
656 path_get(&old->root); 660 path_get(&old->root);
657 fs->pwd = old->pwd; 661 fs->pwd = old->pwd;
658 path_get(&old->pwd); 662 path_get(&old->pwd);
659 if (old->altroot.dentry) {
660 fs->altroot = old->altroot;
661 path_get(&old->altroot);
662 } else {
663 fs->altroot.mnt = NULL;
664 fs->altroot.dentry = NULL;
665 }
666 read_unlock(&old->lock); 663 read_unlock(&old->lock);
667 } 664 }
668 return fs; 665 return fs;
@@ -812,12 +809,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
812 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; 809 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
813 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; 810 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
814 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; 811 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
815#ifdef CONFIG_TASK_XACCT 812 task_io_accounting_init(&sig->ioac);
816 sig->rchar = sig->wchar = sig->syscr = sig->syscw = 0;
817#endif
818#ifdef CONFIG_TASK_IO_ACCOUNTING
819 memset(&sig->ioac, 0, sizeof(sig->ioac));
820#endif
821 sig->sum_sched_runtime = 0; 813 sig->sum_sched_runtime = 0;
822 INIT_LIST_HEAD(&sig->cpu_timers[0]); 814 INIT_LIST_HEAD(&sig->cpu_timers[0]);
823 INIT_LIST_HEAD(&sig->cpu_timers[1]); 815 INIT_LIST_HEAD(&sig->cpu_timers[1]);
@@ -865,8 +857,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
865 857
866 new_flags &= ~PF_SUPERPRIV; 858 new_flags &= ~PF_SUPERPRIV;
867 new_flags |= PF_FORKNOEXEC; 859 new_flags |= PF_FORKNOEXEC;
868 if (!(clone_flags & CLONE_PTRACE)) 860 new_flags |= PF_STARTING;
869 p->ptrace = 0;
870 p->flags = new_flags; 861 p->flags = new_flags;
871 clear_freeze_flag(p); 862 clear_freeze_flag(p);
872} 863}
@@ -907,7 +898,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
907 struct pt_regs *regs, 898 struct pt_regs *regs,
908 unsigned long stack_size, 899 unsigned long stack_size,
909 int __user *child_tidptr, 900 int __user *child_tidptr,
910 struct pid *pid) 901 struct pid *pid,
902 int trace)
911{ 903{
912 int retval; 904 int retval;
913 struct task_struct *p; 905 struct task_struct *p;
@@ -1000,13 +992,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1000 p->last_switch_timestamp = 0; 992 p->last_switch_timestamp = 0;
1001#endif 993#endif
1002 994
1003#ifdef CONFIG_TASK_XACCT 995 task_io_accounting_init(&p->ioac);
1004 p->rchar = 0; /* I/O counter: bytes read */
1005 p->wchar = 0; /* I/O counter: bytes written */
1006 p->syscr = 0; /* I/O counter: read syscalls */
1007 p->syscw = 0; /* I/O counter: write syscalls */
1008#endif
1009 task_io_accounting_init(p);
1010 acct_clear_integrals(p); 996 acct_clear_integrals(p);
1011 997
1012 p->it_virt_expires = cputime_zero; 998 p->it_virt_expires = cputime_zero;
@@ -1163,8 +1149,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1163 */ 1149 */
1164 p->group_leader = p; 1150 p->group_leader = p;
1165 INIT_LIST_HEAD(&p->thread_group); 1151 INIT_LIST_HEAD(&p->thread_group);
1166 INIT_LIST_HEAD(&p->ptrace_entry);
1167 INIT_LIST_HEAD(&p->ptraced);
1168 1152
1169 /* Now that the task is set up, run cgroup callbacks if 1153 /* Now that the task is set up, run cgroup callbacks if
1170 * necessary. We need to run them before the task is visible 1154 * necessary. We need to run them before the task is visible
@@ -1195,7 +1179,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1195 p->real_parent = current->real_parent; 1179 p->real_parent = current->real_parent;
1196 else 1180 else
1197 p->real_parent = current; 1181 p->real_parent = current;
1198 p->parent = p->real_parent;
1199 1182
1200 spin_lock(&current->sighand->siglock); 1183 spin_lock(&current->sighand->siglock);
1201 1184
@@ -1237,8 +1220,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1237 1220
1238 if (likely(p->pid)) { 1221 if (likely(p->pid)) {
1239 list_add_tail(&p->sibling, &p->real_parent->children); 1222 list_add_tail(&p->sibling, &p->real_parent->children);
1240 if (unlikely(p->ptrace & PT_PTRACED)) 1223 tracehook_finish_clone(p, clone_flags, trace);
1241 __ptrace_link(p, current->parent);
1242 1224
1243 if (thread_group_leader(p)) { 1225 if (thread_group_leader(p)) {
1244 if (clone_flags & CLONE_NEWPID) 1226 if (clone_flags & CLONE_NEWPID)
@@ -1323,29 +1305,13 @@ struct task_struct * __cpuinit fork_idle(int cpu)
1323 struct pt_regs regs; 1305 struct pt_regs regs;
1324 1306
1325 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, 1307 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
1326 &init_struct_pid); 1308 &init_struct_pid, 0);
1327 if (!IS_ERR(task)) 1309 if (!IS_ERR(task))
1328 init_idle(task, cpu); 1310 init_idle(task, cpu);
1329 1311
1330 return task; 1312 return task;
1331} 1313}
1332 1314
1333static int fork_traceflag(unsigned clone_flags)
1334{
1335 if (clone_flags & CLONE_UNTRACED)
1336 return 0;
1337 else if (clone_flags & CLONE_VFORK) {
1338 if (current->ptrace & PT_TRACE_VFORK)
1339 return PTRACE_EVENT_VFORK;
1340 } else if ((clone_flags & CSIGNAL) != SIGCHLD) {
1341 if (current->ptrace & PT_TRACE_CLONE)
1342 return PTRACE_EVENT_CLONE;
1343 } else if (current->ptrace & PT_TRACE_FORK)
1344 return PTRACE_EVENT_FORK;
1345
1346 return 0;
1347}
1348
1349/* 1315/*
1350 * Ok, this is the main fork-routine. 1316 * Ok, this is the main fork-routine.
1351 * 1317 *
@@ -1380,14 +1346,14 @@ long do_fork(unsigned long clone_flags,
1380 } 1346 }
1381 } 1347 }
1382 1348
1383 if (unlikely(current->ptrace)) { 1349 /*
1384 trace = fork_traceflag (clone_flags); 1350 * When called from kernel_thread, don't do user tracing stuff.
1385 if (trace) 1351 */
1386 clone_flags |= CLONE_PTRACE; 1352 if (likely(user_mode(regs)))
1387 } 1353 trace = tracehook_prepare_clone(clone_flags);
1388 1354
1389 p = copy_process(clone_flags, stack_start, regs, stack_size, 1355 p = copy_process(clone_flags, stack_start, regs, stack_size,
1390 child_tidptr, NULL); 1356 child_tidptr, NULL, trace);
1391 /* 1357 /*
1392 * Do this prior waking up the new thread - the thread pointer 1358 * Do this prior waking up the new thread - the thread pointer
1393 * might get invalid after that point, if the thread exits quickly. 1359 * might get invalid after that point, if the thread exits quickly.
@@ -1405,32 +1371,35 @@ long do_fork(unsigned long clone_flags,
1405 init_completion(&vfork); 1371 init_completion(&vfork);
1406 } 1372 }
1407 1373
1408 if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) { 1374 tracehook_report_clone(trace, regs, clone_flags, nr, p);
1375
1376 /*
1377 * We set PF_STARTING at creation in case tracing wants to
1378 * use this to distinguish a fully live task from one that
1379 * hasn't gotten to tracehook_report_clone() yet. Now we
1380 * clear it and set the child going.
1381 */
1382 p->flags &= ~PF_STARTING;
1383
1384 if (unlikely(clone_flags & CLONE_STOPPED)) {
1409 /* 1385 /*
1410 * We'll start up with an immediate SIGSTOP. 1386 * We'll start up with an immediate SIGSTOP.
1411 */ 1387 */
1412 sigaddset(&p->pending.signal, SIGSTOP); 1388 sigaddset(&p->pending.signal, SIGSTOP);
1413 set_tsk_thread_flag(p, TIF_SIGPENDING); 1389 set_tsk_thread_flag(p, TIF_SIGPENDING);
1414 }
1415
1416 if (!(clone_flags & CLONE_STOPPED))
1417 wake_up_new_task(p, clone_flags);
1418 else
1419 __set_task_state(p, TASK_STOPPED); 1390 __set_task_state(p, TASK_STOPPED);
1420 1391 } else {
1421 if (unlikely (trace)) { 1392 wake_up_new_task(p, clone_flags);
1422 current->ptrace_message = nr;
1423 ptrace_notify ((trace << 8) | SIGTRAP);
1424 } 1393 }
1425 1394
1395 tracehook_report_clone_complete(trace, regs,
1396 clone_flags, nr, p);
1397
1426 if (clone_flags & CLONE_VFORK) { 1398 if (clone_flags & CLONE_VFORK) {
1427 freezer_do_not_count(); 1399 freezer_do_not_count();
1428 wait_for_completion(&vfork); 1400 wait_for_completion(&vfork);
1429 freezer_count(); 1401 freezer_count();
1430 if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) { 1402 tracehook_report_vfork_done(p, nr);
1431 current->ptrace_message = nr;
1432 ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
1433 }
1434 } 1403 }
1435 } else { 1404 } else {
1436 nr = PTR_ERR(p); 1405 nr = PTR_ERR(p);
@@ -1442,7 +1411,7 @@ long do_fork(unsigned long clone_flags,
1442#define ARCH_MIN_MMSTRUCT_ALIGN 0 1411#define ARCH_MIN_MMSTRUCT_ALIGN 0
1443#endif 1412#endif
1444 1413
1445static void sighand_ctor(struct kmem_cache *cachep, void *data) 1414static void sighand_ctor(void *data)
1446{ 1415{
1447 struct sighand_struct *sighand = data; 1416 struct sighand_struct *sighand = data;
1448 1417
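
Taken together, the fork.c hunks replace the old PT_PTRACED/fork_traceflag() handling with tracehook calls and a PF_STARTING window. A condensed restatement of the resulting do_fork() sequence, with error handling and the vfork setup elided (this summarizes the hunks above rather than adding new code):

        /* No user tracing for kernel_thread(): regs are kernel-mode there. */
        if (likely(user_mode(regs)))
                trace = tracehook_prepare_clone(clone_flags);

        /* The child is created with PF_STARTING set (see copy_flags()). */
        p = copy_process(clone_flags, stack_start, regs, stack_size,
                         child_tidptr, NULL, trace);

        tracehook_report_clone(trace, regs, clone_flags, nr, p);

        /* Child is fully set up; let tracers see it as live from here on. */
        p->flags &= ~PF_STARTING;

        if (unlikely(clone_flags & CLONE_STOPPED)) {
                sigaddset(&p->pending.signal, SIGSTOP);
                set_tsk_thread_flag(p, TIF_SIGPENDING);
                __set_task_state(p, TASK_STOPPED);
        } else {
                wake_up_new_task(p, clone_flags);
        }

        tracehook_report_clone_complete(trace, regs, clone_flags, nr, p);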
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index b8e4dce80a74..cdec83e722fa 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -672,13 +672,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
672 */ 672 */
673 BUG_ON(timer->function(timer) != HRTIMER_NORESTART); 673 BUG_ON(timer->function(timer) != HRTIMER_NORESTART);
674 return 1; 674 return 1;
675 case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ: 675 case HRTIMER_CB_IRQSAFE_PERCPU:
676 case HRTIMER_CB_IRQSAFE_UNLOCKED:
676 /* 677 /*
677 * This is solely for the sched tick emulation with 678 * This is solely for the sched tick emulation with
678 * dynamic tick support to ensure that we do not 679 * dynamic tick support to ensure that we do not
679 * restart the tick right on the edge and end up with 680 * restart the tick right on the edge and end up with
680 * the tick timer in the softirq ! The calling site 681 * the tick timer in the softirq ! The calling site
681 * takes care of this. 682 * takes care of this. Also used for hrtimer sleeper !
682 */ 683 */
683 debug_hrtimer_deactivate(timer); 684 debug_hrtimer_deactivate(timer);
684 return 1; 685 return 1;
@@ -1245,7 +1246,8 @@ static void __run_hrtimer(struct hrtimer *timer)
1245 timer_stats_account_hrtimer(timer); 1246 timer_stats_account_hrtimer(timer);
1246 1247
1247 fn = timer->function; 1248 fn = timer->function;
1248 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) { 1249 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
1250 timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) {
1249 /* 1251 /*
1250 * Used for scheduler timers, avoid lock inversion with 1252 * Used for scheduler timers, avoid lock inversion with
1251 * rq->lock and tasklist_lock. 1253 * rq->lock and tasklist_lock.
@@ -1452,7 +1454,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
1452 sl->timer.function = hrtimer_wakeup; 1454 sl->timer.function = hrtimer_wakeup;
1453 sl->task = task; 1455 sl->task = task;
1454#ifdef CONFIG_HIGH_RES_TIMERS 1456#ifdef CONFIG_HIGH_RES_TIMERS
1455 sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 1457 sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
1456#endif 1458#endif
1457} 1459}
1458 1460
@@ -1591,29 +1593,95 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1591 1593
1592#ifdef CONFIG_HOTPLUG_CPU 1594#ifdef CONFIG_HOTPLUG_CPU
1593 1595
1594static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, 1596static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1595 struct hrtimer_clock_base *new_base) 1597 struct hrtimer_clock_base *new_base, int dcpu)
1596{ 1598{
1597 struct hrtimer *timer; 1599 struct hrtimer *timer;
1598 struct rb_node *node; 1600 struct rb_node *node;
1601 int raise = 0;
1599 1602
1600 while ((node = rb_first(&old_base->active))) { 1603 while ((node = rb_first(&old_base->active))) {
1601 timer = rb_entry(node, struct hrtimer, node); 1604 timer = rb_entry(node, struct hrtimer, node);
1602 BUG_ON(hrtimer_callback_running(timer)); 1605 BUG_ON(hrtimer_callback_running(timer));
1603 debug_hrtimer_deactivate(timer); 1606 debug_hrtimer_deactivate(timer);
1604 __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0); 1607
1608 /*
1609 * Should not happen. Per CPU timers should be
1610 * canceled _before_ the migration code is called
1611 */
1612 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) {
1613 __remove_hrtimer(timer, old_base,
1614 HRTIMER_STATE_INACTIVE, 0);
1615 WARN(1, "hrtimer (%p %p)active but cpu %d dead\n",
1616 timer, timer->function, dcpu);
1617 continue;
1618 }
1619
1620 /*
1621 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
1622 * timer could be seen as !active and just vanish away
1623 * under us on another CPU
1624 */
1625 __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
1605 timer->base = new_base; 1626 timer->base = new_base;
1606 /* 1627 /*
1607 * Enqueue the timer. Allow reprogramming of the event device 1628 * Enqueue the timer. Allow reprogramming of the event device
1608 */ 1629 */
1609 enqueue_hrtimer(timer, new_base, 1); 1630 enqueue_hrtimer(timer, new_base, 1);
1631
1632#ifdef CONFIG_HIGH_RES_TIMERS
1633 /*
1634 * Happens with high res enabled when the timer was
1635 * already expired and the callback mode is
1636 * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The
1637 * enqueue code does not move them to the soft irq
1638 * pending list for performance/latency reasons, but
1639 * in the migration state, we need to do that
1640 * otherwise we end up with a stale timer.
1641 */
1642 if (timer->state == HRTIMER_STATE_MIGRATE) {
1643 timer->state = HRTIMER_STATE_PENDING;
1644 list_add_tail(&timer->cb_entry,
1645 &new_base->cpu_base->cb_pending);
1646 raise = 1;
1647 }
1648#endif
1649 /* Clear the migration state bit */
1650 timer->state &= ~HRTIMER_STATE_MIGRATE;
1651 }
1652 return raise;
1653}
1654
1655#ifdef CONFIG_HIGH_RES_TIMERS
1656static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
1657 struct hrtimer_cpu_base *new_base)
1658{
1659 struct hrtimer *timer;
1660 int raise = 0;
1661
1662 while (!list_empty(&old_base->cb_pending)) {
1663 timer = list_entry(old_base->cb_pending.next,
1664 struct hrtimer, cb_entry);
1665
1666 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0);
1667 timer->base = &new_base->clock_base[timer->base->index];
1668 list_add_tail(&timer->cb_entry, &new_base->cb_pending);
1669 raise = 1;
1610 } 1670 }
1671 return raise;
1672}
1673#else
1674static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
1675 struct hrtimer_cpu_base *new_base)
1676{
1677 return 0;
1611} 1678}
1679#endif
1612 1680
1613static void migrate_hrtimers(int cpu) 1681static void migrate_hrtimers(int cpu)
1614{ 1682{
1615 struct hrtimer_cpu_base *old_base, *new_base; 1683 struct hrtimer_cpu_base *old_base, *new_base;
1616 int i; 1684 int i, raise = 0;
1617 1685
1618 BUG_ON(cpu_online(cpu)); 1686 BUG_ON(cpu_online(cpu));
1619 old_base = &per_cpu(hrtimer_bases, cpu); 1687 old_base = &per_cpu(hrtimer_bases, cpu);
@@ -1626,14 +1694,21 @@ static void migrate_hrtimers(int cpu)
1626 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); 1694 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1627 1695
1628 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1696 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1629 migrate_hrtimer_list(&old_base->clock_base[i], 1697 if (migrate_hrtimer_list(&old_base->clock_base[i],
1630 &new_base->clock_base[i]); 1698 &new_base->clock_base[i], cpu))
1699 raise = 1;
1631 } 1700 }
1632 1701
1702 if (migrate_hrtimer_pending(old_base, new_base))
1703 raise = 1;
1704
1633 spin_unlock(&old_base->lock); 1705 spin_unlock(&old_base->lock);
1634 spin_unlock(&new_base->lock); 1706 spin_unlock(&new_base->lock);
1635 local_irq_enable(); 1707 local_irq_enable();
1636 put_cpu_var(hrtimer_bases); 1708 put_cpu_var(hrtimer_bases);
1709
1710 if (raise)
1711 hrtimer_raise_softirq();
1637} 1712}
1638#endif /* CONFIG_HOTPLUG_CPU */ 1713#endif /* CONFIG_HOTPLUG_CPU */
1639 1714
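
The comments in migrate_hrtimer_list() above carry the key invariant: a timer must never appear inactive while it is detached from both bases, so it is parked in HRTIMER_STATE_MIGRATE for the move. A condensed restatement of the per-timer decision, with locking and the surrounding loop elided (a summary of the hunk above, not new behaviour):

        if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) {
                /* Per-CPU timers must have been cancelled before the dead
                 * CPU's base is drained; drop them and warn. */
                __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0);
                WARN(1, "hrtimer (%p %p)active but cpu %d dead\n",
                     timer, timer->function, dcpu);
        } else {
                /* Park in MIGRATE state so concurrent readers never see the
                 * timer as inactive, then requeue it on the new base. */
                __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
                timer->base = new_base;
                enqueue_hrtimer(timer, new_base, 1);
                /* Already-expired HRTIMER_CB_IRQSAFE_UNLOCKED timers are
                 * additionally moved to the new base's cb_pending list
                 * (high-res only), which is why the caller may need to
                 * raise the hrtimer softirq afterwards. */
                timer->state &= ~HRTIMER_STATE_MIGRATE;
        }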
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 964964baefa2..3cd441ebf5d2 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -28,8 +28,7 @@ void dynamic_irq_init(unsigned int irq)
28 unsigned long flags; 28 unsigned long flags;
29 29
30 if (irq >= NR_IRQS) { 30 if (irq >= NR_IRQS) {
31 printk(KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); 31 WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
32 WARN_ON(1);
33 return; 32 return;
34 } 33 }
35 34
@@ -62,8 +61,7 @@ void dynamic_irq_cleanup(unsigned int irq)
62 unsigned long flags; 61 unsigned long flags;
63 62
64 if (irq >= NR_IRQS) { 63 if (irq >= NR_IRQS) {
65 printk(KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq); 64 WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq);
66 WARN_ON(1);
67 return; 65 return;
68 } 66 }
69 67
@@ -71,9 +69,8 @@ void dynamic_irq_cleanup(unsigned int irq)
71 spin_lock_irqsave(&desc->lock, flags); 69 spin_lock_irqsave(&desc->lock, flags);
72 if (desc->action) { 70 if (desc->action) {
73 spin_unlock_irqrestore(&desc->lock, flags); 71 spin_unlock_irqrestore(&desc->lock, flags);
74 printk(KERN_ERR "Destroying IRQ%d without calling free_irq\n", 72 WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n",
75 irq); 73 irq);
76 WARN_ON(1);
77 return; 74 return;
78 } 75 }
79 desc->msi_desc = NULL; 76 desc->msi_desc = NULL;
@@ -96,8 +93,7 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
96 unsigned long flags; 93 unsigned long flags;
97 94
98 if (irq >= NR_IRQS) { 95 if (irq >= NR_IRQS) {
99 printk(KERN_ERR "Trying to install chip for IRQ%d\n", irq); 96 WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq);
100 WARN_ON(1);
101 return -EINVAL; 97 return -EINVAL;
102 } 98 }
103 99
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index f8914b92b664..60c49e324390 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -89,7 +89,14 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
89 set_balance_irq_affinity(irq, cpumask); 89 set_balance_irq_affinity(irq, cpumask);
90 90
91#ifdef CONFIG_GENERIC_PENDING_IRQ 91#ifdef CONFIG_GENERIC_PENDING_IRQ
92 set_pending_irq(irq, cpumask); 92 if (desc->status & IRQ_MOVE_PCNTXT) {
93 unsigned long flags;
94
95 spin_lock_irqsave(&desc->lock, flags);
96 desc->chip->set_affinity(irq, cpumask);
97 spin_unlock_irqrestore(&desc->lock, flags);
98 } else
99 set_pending_irq(irq, cpumask);
93#else 100#else
94 desc->affinity = cpumask; 101 desc->affinity = cpumask;
95 desc->chip->set_affinity(irq, cpumask); 102 desc->chip->set_affinity(irq, cpumask);
@@ -177,8 +184,7 @@ static void __enable_irq(struct irq_desc *desc, unsigned int irq)
177{ 184{
178 switch (desc->depth) { 185 switch (desc->depth) {
179 case 0: 186 case 0:
180 printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); 187 WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
181 WARN_ON(1);
182 break; 188 break;
183 case 1: { 189 case 1: {
184 unsigned int status = desc->status & ~IRQ_DISABLED; 190 unsigned int status = desc->status & ~IRQ_DISABLED;
@@ -324,7 +330,8 @@ static int __irq_set_trigger(struct irq_chip *chip, unsigned int irq,
324 ret = chip->set_type(irq, flags & IRQF_TRIGGER_MASK); 330 ret = chip->set_type(irq, flags & IRQF_TRIGGER_MASK);
325 331
326 if (ret) 332 if (ret)
327 pr_err("setting flow type for irq %u failed (%pF)\n", 333 pr_err("setting trigger mode %d for irq %u failed (%pF)\n",
334 (int)(flags & IRQF_TRIGGER_MASK),
328 irq, chip->set_type); 335 irq, chip->set_type);
329 336
330 return ret; 337 return ret;
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 6c6d35d68ee9..a09dd29c2fd7 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/irq.h> 9#include <linux/irq.h>
10#include <linux/proc_fs.h> 10#include <linux/proc_fs.h>
11#include <linux/seq_file.h>
11#include <linux/interrupt.h> 12#include <linux/interrupt.h>
12 13
13#include "internals.h" 14#include "internals.h"
@@ -16,23 +17,18 @@ static struct proc_dir_entry *root_irq_dir;
16 17
17#ifdef CONFIG_SMP 18#ifdef CONFIG_SMP
18 19
19static int irq_affinity_read_proc(char *page, char **start, off_t off, 20static int irq_affinity_proc_show(struct seq_file *m, void *v)
20 int count, int *eof, void *data)
21{ 21{
22 struct irq_desc *desc = irq_desc + (long)data; 22 struct irq_desc *desc = irq_desc + (long)m->private;
23 cpumask_t *mask = &desc->affinity; 23 cpumask_t *mask = &desc->affinity;
24 int len;
25 24
26#ifdef CONFIG_GENERIC_PENDING_IRQ 25#ifdef CONFIG_GENERIC_PENDING_IRQ
27 if (desc->status & IRQ_MOVE_PENDING) 26 if (desc->status & IRQ_MOVE_PENDING)
28 mask = &desc->pending_mask; 27 mask = &desc->pending_mask;
29#endif 28#endif
30 len = cpumask_scnprintf(page, count, *mask); 29 seq_cpumask(m, mask);
31 30 seq_putc(m, '\n');
32 if (count - len < 2) 31 return 0;
33 return -EINVAL;
34 len += sprintf(page + len, "\n");
35 return len;
36} 32}
37 33
38#ifndef is_affinity_mask_valid 34#ifndef is_affinity_mask_valid
@@ -40,11 +36,12 @@ static int irq_affinity_read_proc(char *page, char **start, off_t off,
40#endif 36#endif
41 37
42int no_irq_affinity; 38int no_irq_affinity;
43static int irq_affinity_write_proc(struct file *file, const char __user *buffer, 39static ssize_t irq_affinity_proc_write(struct file *file,
44 unsigned long count, void *data) 40 const char __user *buffer, size_t count, loff_t *pos)
45{ 41{
46 unsigned int irq = (int)(long)data, full_count = count, err; 42 unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
47 cpumask_t new_value; 43 cpumask_t new_value;
44 int err;
48 45
49 if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || 46 if (!irq_desc[irq].chip->set_affinity || no_irq_affinity ||
50 irq_balancing_disabled(irq)) 47 irq_balancing_disabled(irq))
@@ -65,28 +62,38 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
65 if (!cpus_intersects(new_value, cpu_online_map)) 62 if (!cpus_intersects(new_value, cpu_online_map))
66 /* Special case for empty set - allow the architecture 63 /* Special case for empty set - allow the architecture
67 code to set default SMP affinity. */ 64 code to set default SMP affinity. */
68 return irq_select_affinity(irq) ? -EINVAL : full_count; 65 return irq_select_affinity(irq) ? -EINVAL : count;
69 66
70 irq_set_affinity(irq, new_value); 67 irq_set_affinity(irq, new_value);
71 68
72 return full_count; 69 return count;
73} 70}
74 71
75static int default_affinity_read(char *page, char **start, off_t off, 72static int irq_affinity_proc_open(struct inode *inode, struct file *file)
76 int count, int *eof, void *data)
77{ 73{
78 int len = cpumask_scnprintf(page, count, irq_default_affinity); 74 return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
79 if (count - len < 2)
80 return -EINVAL;
81 len += sprintf(page + len, "\n");
82 return len;
83} 75}
84 76
85static int default_affinity_write(struct file *file, const char __user *buffer, 77static const struct file_operations irq_affinity_proc_fops = {
86 unsigned long count, void *data) 78 .open = irq_affinity_proc_open,
79 .read = seq_read,
80 .llseek = seq_lseek,
81 .release = single_release,
82 .write = irq_affinity_proc_write,
83};
84
85static int default_affinity_show(struct seq_file *m, void *v)
86{
87 seq_cpumask(m, &irq_default_affinity);
88 seq_putc(m, '\n');
89 return 0;
90}
91
92static ssize_t default_affinity_write(struct file *file,
93 const char __user *buffer, size_t count, loff_t *ppos)
87{ 94{
88 unsigned int full_count = count, err;
89 cpumask_t new_value; 95 cpumask_t new_value;
96 int err;
90 97
91 err = cpumask_parse_user(buffer, count, new_value); 98 err = cpumask_parse_user(buffer, count, new_value);
92 if (err) 99 if (err)
@@ -105,8 +112,21 @@ static int default_affinity_write(struct file *file, const char __user *buffer,
105 112
106 irq_default_affinity = new_value; 113 irq_default_affinity = new_value;
107 114
108 return full_count; 115 return count;
109} 116}
117
118static int default_affinity_open(struct inode *inode, struct file *file)
119{
120 return single_open(file, default_affinity_show, NULL);
121}
122
123static const struct file_operations default_affinity_proc_fops = {
124 .open = default_affinity_open,
125 .read = seq_read,
126 .llseek = seq_lseek,
127 .release = single_release,
128 .write = default_affinity_write,
129};
110#endif 130#endif
111 131
112static int irq_spurious_read(char *page, char **start, off_t off, 132static int irq_spurious_read(char *page, char **start, off_t off,
@@ -178,16 +198,9 @@ void register_irq_proc(unsigned int irq)
178 irq_desc[irq].dir = proc_mkdir(name, root_irq_dir); 198 irq_desc[irq].dir = proc_mkdir(name, root_irq_dir);
179 199
180#ifdef CONFIG_SMP 200#ifdef CONFIG_SMP
181 { 201 /* create /proc/irq/<irq>/smp_affinity */
182 /* create /proc/irq/<irq>/smp_affinity */ 202 proc_create_data("smp_affinity", 0600, irq_desc[irq].dir,
183 entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir); 203 &irq_affinity_proc_fops, (void *)(long)irq);
184
185 if (entry) {
186 entry->data = (void *)(long)irq;
187 entry->read_proc = irq_affinity_read_proc;
188 entry->write_proc = irq_affinity_write_proc;
189 }
190 }
191#endif 204#endif
192 205
193 entry = create_proc_entry("spurious", 0444, irq_desc[irq].dir); 206 entry = create_proc_entry("spurious", 0444, irq_desc[irq].dir);
@@ -208,15 +221,8 @@ void unregister_handler_proc(unsigned int irq, struct irqaction *action)
208void register_default_affinity_proc(void) 221void register_default_affinity_proc(void)
209{ 222{
210#ifdef CONFIG_SMP 223#ifdef CONFIG_SMP
211 struct proc_dir_entry *entry; 224 proc_create("irq/default_smp_affinity", 0600, NULL,
212 225 &default_affinity_proc_fops);
213 /* create /proc/irq/default_smp_affinity */
214 entry = create_proc_entry("default_smp_affinity", 0600, root_irq_dir);
215 if (entry) {
216 entry->data = NULL;
217 entry->read_proc = default_affinity_read;
218 entry->write_proc = default_affinity_write;
219 }
220#endif 226#endif
221} 227}
222 228
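
The irq/proc.c hunks convert the old read_proc/write_proc handlers to the seq_file single_open() pattern and register them with proc_create_data(). A minimal self-contained sketch of that pattern for a made-up read-only entry (the name "example" and the data value are illustrative, not from this patch):

#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int example_proc_show(struct seq_file *m, void *v)
{
        /* single_open() stored the proc_create_data() cookie in m->private. */
        seq_printf(m, "%ld\n", (long)m->private);
        return 0;
}

static int example_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, example_proc_show, PDE(inode)->data);
}

static const struct file_operations example_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = example_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};

static int __init example_proc_init(void)
{
        /* Mirrors register_irq_proc() above: the last argument ends up in
         * PDE(inode)->data and, via single_open(), in m->private. */
        proc_create_data("example", 0444, NULL, &example_proc_fops,
                         (void *)42L);
        return 0;
}
module_init(example_proc_init);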
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 1c5fcacbcf33..aef265325cd3 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -12,7 +12,7 @@
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/kexec.h> 14#include <linux/kexec.h>
15#include <linux/spinlock.h> 15#include <linux/mutex.h>
16#include <linux/list.h> 16#include <linux/list.h>
17#include <linux/highmem.h> 17#include <linux/highmem.h>
18#include <linux/syscalls.h> 18#include <linux/syscalls.h>
@@ -24,6 +24,12 @@
24#include <linux/utsrelease.h> 24#include <linux/utsrelease.h>
25#include <linux/utsname.h> 25#include <linux/utsname.h>
26#include <linux/numa.h> 26#include <linux/numa.h>
27#include <linux/suspend.h>
28#include <linux/device.h>
29#include <linux/freezer.h>
30#include <linux/pm.h>
31#include <linux/cpu.h>
32#include <linux/console.h>
27 33
28#include <asm/page.h> 34#include <asm/page.h>
29#include <asm/uaccess.h> 35#include <asm/uaccess.h>
@@ -71,7 +77,7 @@ int kexec_should_crash(struct task_struct *p)
71 * 77 *
72 * The code for the transition from the current kernel to the 78 * The code for the transition from the current kernel to the
73 * the new kernel is placed in the control_code_buffer, whose size 79 * the new kernel is placed in the control_code_buffer, whose size
74 * is given by KEXEC_CONTROL_CODE_SIZE. In the best case only a single 80 * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single
75 * page of memory is necessary, but some architectures require more. 81 * page of memory is necessary, but some architectures require more.
76 * Because this memory must be identity mapped in the transition from 82 * Because this memory must be identity mapped in the transition from
77 * virtual to physical addresses it must live in the range 83 * virtual to physical addresses it must live in the range
@@ -236,12 +242,18 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
236 */ 242 */
237 result = -ENOMEM; 243 result = -ENOMEM;
238 image->control_code_page = kimage_alloc_control_pages(image, 244 image->control_code_page = kimage_alloc_control_pages(image,
239 get_order(KEXEC_CONTROL_CODE_SIZE)); 245 get_order(KEXEC_CONTROL_PAGE_SIZE));
240 if (!image->control_code_page) { 246 if (!image->control_code_page) {
241 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 247 printk(KERN_ERR "Could not allocate control_code_buffer\n");
242 goto out; 248 goto out;
243 } 249 }
244 250
251 image->swap_page = kimage_alloc_control_pages(image, 0);
252 if (!image->swap_page) {
253 printk(KERN_ERR "Could not allocate swap buffer\n");
254 goto out;
255 }
256
245 result = 0; 257 result = 0;
246 out: 258 out:
247 if (result == 0) 259 if (result == 0)
@@ -305,7 +317,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
305 */ 317 */
306 result = -ENOMEM; 318 result = -ENOMEM;
307 image->control_code_page = kimage_alloc_control_pages(image, 319 image->control_code_page = kimage_alloc_control_pages(image,
308 get_order(KEXEC_CONTROL_CODE_SIZE)); 320 get_order(KEXEC_CONTROL_PAGE_SIZE));
309 if (!image->control_code_page) { 321 if (!image->control_code_page) {
310 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 322 printk(KERN_ERR "Could not allocate control_code_buffer\n");
311 goto out; 323 goto out;
@@ -589,14 +601,12 @@ static void kimage_free_extra_pages(struct kimage *image)
589 kimage_free_page_list(&image->unuseable_pages); 601 kimage_free_page_list(&image->unuseable_pages);
590 602
591} 603}
592static int kimage_terminate(struct kimage *image) 604static void kimage_terminate(struct kimage *image)
593{ 605{
594 if (*image->entry != 0) 606 if (*image->entry != 0)
595 image->entry++; 607 image->entry++;
596 608
597 *image->entry = IND_DONE; 609 *image->entry = IND_DONE;
598
599 return 0;
600} 610}
601 611
602#define for_each_kimage_entry(image, ptr, entry) \ 612#define for_each_kimage_entry(image, ptr, entry) \
@@ -743,8 +753,14 @@ static struct page *kimage_alloc_page(struct kimage *image,
743 *old = addr | (*old & ~PAGE_MASK); 753 *old = addr | (*old & ~PAGE_MASK);
744 754
745 /* The old page I have found cannot be a 755 /* The old page I have found cannot be a
746 * destination page, so return it. 756 * destination page, so return it if its
757 * gfp_flags honor the ones passed in.
747 */ 758 */
759 if (!(gfp_mask & __GFP_HIGHMEM) &&
760 PageHighMem(old_page)) {
761 kimage_free_pages(old_page);
762 continue;
763 }
748 addr = old_addr; 764 addr = old_addr;
749 page = old_page; 765 page = old_page;
750 break; 766 break;
@@ -914,19 +930,14 @@ static int kimage_load_segment(struct kimage *image,
914 */ 930 */
915struct kimage *kexec_image; 931struct kimage *kexec_image;
916struct kimage *kexec_crash_image; 932struct kimage *kexec_crash_image;
917/* 933
918 * A home grown binary mutex. 934static DEFINE_MUTEX(kexec_mutex);
919 * Nothing can wait so this mutex is safe to use
920 * in interrupt context :)
921 */
922static int kexec_lock;
923 935
924asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, 936asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
925 struct kexec_segment __user *segments, 937 struct kexec_segment __user *segments,
926 unsigned long flags) 938 unsigned long flags)
927{ 939{
928 struct kimage **dest_image, *image; 940 struct kimage **dest_image, *image;
929 int locked;
930 int result; 941 int result;
931 942
932 /* We only trust the superuser with rebooting the system. */ 943 /* We only trust the superuser with rebooting the system. */
@@ -962,8 +973,7 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
962 * 973 *
963 * KISS: always take the mutex. 974 * KISS: always take the mutex.
964 */ 975 */
965 locked = xchg(&kexec_lock, 1); 976 if (!mutex_trylock(&kexec_mutex))
966 if (locked)
967 return -EBUSY; 977 return -EBUSY;
968 978
969 dest_image = &kexec_image; 979 dest_image = &kexec_image;
@@ -988,6 +998,8 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
988 if (result) 998 if (result)
989 goto out; 999 goto out;
990 1000
1001 if (flags & KEXEC_PRESERVE_CONTEXT)
1002 image->preserve_context = 1;
991 result = machine_kexec_prepare(image); 1003 result = machine_kexec_prepare(image);
992 if (result) 1004 if (result)
993 goto out; 1005 goto out;
@@ -997,16 +1009,13 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
997 if (result) 1009 if (result)
998 goto out; 1010 goto out;
999 } 1011 }
1000 result = kimage_terminate(image); 1012 kimage_terminate(image);
1001 if (result)
1002 goto out;
1003 } 1013 }
1004 /* Install the new kernel, and Uninstall the old */ 1014 /* Install the new kernel, and Uninstall the old */
1005 image = xchg(dest_image, image); 1015 image = xchg(dest_image, image);
1006 1016
1007out: 1017out:
1008 locked = xchg(&kexec_lock, 0); /* Release the mutex */ 1018 mutex_unlock(&kexec_mutex);
1009 BUG_ON(!locked);
1010 kimage_free(image); 1019 kimage_free(image);
1011 1020
1012 return result; 1021 return result;
@@ -1053,10 +1062,7 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry,
1053 1062
1054void crash_kexec(struct pt_regs *regs) 1063void crash_kexec(struct pt_regs *regs)
1055{ 1064{
1056 int locked; 1065 /* Take the kexec_mutex here to prevent sys_kexec_load
1057
1058
1059 /* Take the kexec_lock here to prevent sys_kexec_load
1060 * running on one cpu from replacing the crash kernel 1066 * running on one cpu from replacing the crash kernel
1061 * we are using after a panic on a different cpu. 1067 * we are using after a panic on a different cpu.
1062 * 1068 *
@@ -1064,8 +1070,7 @@ void crash_kexec(struct pt_regs *regs)
1064 * of memory the xchg(&kexec_crash_image) would be 1070 * of memory the xchg(&kexec_crash_image) would be
1065 * sufficient. But since I reuse the memory... 1071 * sufficient. But since I reuse the memory...
1066 */ 1072 */
1067 locked = xchg(&kexec_lock, 1); 1073 if (mutex_trylock(&kexec_mutex)) {
1068 if (!locked) {
1069 if (kexec_crash_image) { 1074 if (kexec_crash_image) {
1070 struct pt_regs fixed_regs; 1075 struct pt_regs fixed_regs;
1071 crash_setup_regs(&fixed_regs, regs); 1076 crash_setup_regs(&fixed_regs, regs);
@@ -1073,8 +1078,7 @@ void crash_kexec(struct pt_regs *regs)
1073 machine_crash_shutdown(&fixed_regs); 1078 machine_crash_shutdown(&fixed_regs);
1074 machine_kexec(kexec_crash_image); 1079 machine_kexec(kexec_crash_image);
1075 } 1080 }
1076 locked = xchg(&kexec_lock, 0); 1081 mutex_unlock(&kexec_mutex);
1077 BUG_ON(!locked);
1078 } 1082 }
1079} 1083}
1080 1084
@@ -1415,3 +1419,79 @@ static int __init crash_save_vmcoreinfo_init(void)
1415} 1419}
1416 1420
1417module_init(crash_save_vmcoreinfo_init) 1421module_init(crash_save_vmcoreinfo_init)
1422
1423/*
1424 * Move into place and start executing a preloaded standalone
1425 * executable. If nothing was preloaded return an error.
1426 */
1427int kernel_kexec(void)
1428{
1429 int error = 0;
1430
1431 if (!mutex_trylock(&kexec_mutex))
1432 return -EBUSY;
1433 if (!kexec_image) {
1434 error = -EINVAL;
1435 goto Unlock;
1436 }
1437
1438#ifdef CONFIG_KEXEC_JUMP
1439 if (kexec_image->preserve_context) {
1440 mutex_lock(&pm_mutex);
1441 pm_prepare_console();
1442 error = freeze_processes();
1443 if (error) {
1444 error = -EBUSY;
1445 goto Restore_console;
1446 }
1447 suspend_console();
1448 error = device_suspend(PMSG_FREEZE);
1449 if (error)
1450 goto Resume_console;
1451 error = disable_nonboot_cpus();
1452 if (error)
1453 goto Resume_devices;
1454 device_pm_lock();
1455 local_irq_disable();
1456 /* At this point, device_suspend() has been called,
1457 * but *not* device_power_down(). We *must*
1458 * device_power_down() now. Otherwise, drivers for
1459 * some devices (e.g. interrupt controllers) become
1460 * desynchronized with the actual state of the
1461 * hardware at resume time, and evil weirdness ensues.
1462 */
1463 error = device_power_down(PMSG_FREEZE);
1464 if (error)
1465 goto Enable_irqs;
1466 } else
1467#endif
1468 {
1469 kernel_restart_prepare(NULL);
1470 printk(KERN_EMERG "Starting new kernel\n");
1471 machine_shutdown();
1472 }
1473
1474 machine_kexec(kexec_image);
1475
1476#ifdef CONFIG_KEXEC_JUMP
1477 if (kexec_image->preserve_context) {
1478 device_power_up(PMSG_RESTORE);
1479 Enable_irqs:
1480 local_irq_enable();
1481 device_pm_unlock();
1482 enable_nonboot_cpus();
1483 Resume_devices:
1484 device_resume(PMSG_RESTORE);
1485 Resume_console:
1486 resume_console();
1487 thaw_processes();
1488 Restore_console:
1489 pm_restore_console();
1490 mutex_unlock(&pm_mutex);
1491 }
1492#endif
1493
1494 Unlock:
1495 mutex_unlock(&kexec_mutex);
1496 return error;
1497}
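
The kexec.c hunks retire the home-grown xchg()-based flag in favour of a real mutex. Every entry point uses mutex_trylock(), which never sleeps, so sys_kexec_load() and kernel_kexec() back off with -EBUSY while crash_kexec() simply skips the crash kernel if the load path holds the lock, instead of open-coding the old xchg()/BUG_ON() dance. The shape of the pattern, condensed from the hunks above:

        /* sys_kexec_load() and kernel_kexec(): */
        if (!mutex_trylock(&kexec_mutex))
                return -EBUSY;
        /* ... load segments or jump into the image ... */
        mutex_unlock(&kexec_mutex);

        /* crash_kexec(): best effort, never blocks the panic path. */
        if (mutex_trylock(&kexec_mutex)) {
                if (kexec_crash_image) {
                        /* ... crash_setup_regs(), crash_save_vmcoreinfo(),
                         * machine_crash_shutdown(), then ... */
                        machine_kexec(kexec_crash_image);
                }
                mutex_unlock(&kexec_mutex);
        }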
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 3ec23c3ec97f..e4dcfb2272a4 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -56,12 +56,14 @@
56 56
57static int kgdb_break_asap; 57static int kgdb_break_asap;
58 58
59#define KGDB_MAX_THREAD_QUERY 17
59struct kgdb_state { 60struct kgdb_state {
60 int ex_vector; 61 int ex_vector;
61 int signo; 62 int signo;
62 int err_code; 63 int err_code;
63 int cpu; 64 int cpu;
64 int pass_exception; 65 int pass_exception;
66 unsigned long thr_query;
65 unsigned long threadid; 67 unsigned long threadid;
66 long kgdb_usethreadid; 68 long kgdb_usethreadid;
67 struct pt_regs *linux_regs; 69 struct pt_regs *linux_regs;
@@ -166,13 +168,6 @@ early_param("nokgdbroundup", opt_nokgdbroundup);
166 * Weak aliases for breakpoint management, 168 * Weak aliases for breakpoint management,
167 * can be overridden by architectures when needed: 169 * can be overridden by architectures when needed:
168 */ 170 */
169int __weak kgdb_validate_break_address(unsigned long addr)
170{
171 char tmp_variable[BREAK_INSTR_SIZE];
172
173 return probe_kernel_read(tmp_variable, (char *)addr, BREAK_INSTR_SIZE);
174}
175
176int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr) 171int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr)
177{ 172{
178 int err; 173 int err;
@@ -191,6 +186,25 @@ int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle)
191 (char *)bundle, BREAK_INSTR_SIZE); 186 (char *)bundle, BREAK_INSTR_SIZE);
192} 187}
193 188
189int __weak kgdb_validate_break_address(unsigned long addr)
190{
191 char tmp_variable[BREAK_INSTR_SIZE];
192 int err;
193 /* Validate setting the breakpoint and then removing it. If the
194 * remove fails, the kernel needs to emit a bad message because we
195 * are in deep trouble not being able to put things back the way we
196 * found them.
197 */
198 err = kgdb_arch_set_breakpoint(addr, tmp_variable);
199 if (err)
200 return err;
201 err = kgdb_arch_remove_breakpoint(addr, tmp_variable);
202 if (err)
203 printk(KERN_ERR "KGDB: Critical breakpoint error, kernel "
204 "memory destroyed at: %lx", addr);
205 return err;
206}
207
194unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs) 208unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs)
195{ 209{
196 return instruction_pointer(regs); 210 return instruction_pointer(regs);
@@ -433,9 +447,14 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val)
433{ 447{
434 int hex_val; 448 int hex_val;
435 int num = 0; 449 int num = 0;
450 int negate = 0;
436 451
437 *long_val = 0; 452 *long_val = 0;
438 453
454 if (**ptr == '-') {
455 negate = 1;
456 (*ptr)++;
457 }
439 while (**ptr) { 458 while (**ptr) {
440 hex_val = hex(**ptr); 459 hex_val = hex(**ptr);
441 if (hex_val < 0) 460 if (hex_val < 0)
@@ -446,6 +465,9 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val)
446 (*ptr)++; 465 (*ptr)++;
447 } 466 }
448 467
468 if (negate)
469 *long_val = -*long_val;
470
449 return num; 471 return num;
450} 472}
451 473
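
kgdb_hex2long() now accepts a leading '-', which the thread packets below need once CPU shadow threads are reported with negative ids. A small usage illustration, written as it would read inside kernel/kgdb.c where the parser is defined (buffer contents are made up):

/* Illustration only, not part of the patch. */
static void hex2long_example(void)
{
        char buf[] = "-3";
        char *ptr = buf;
        unsigned long long_val;

        /* The parser sets the negate flag, reads the hex digit, and
         * negates the accumulated value. */
        kgdb_hex2long(&ptr, &long_val);
        /* (long)long_val == -3: the shadow id of CPU 1 under the
         * "-cpu - 2" mapping introduced in shadow_pid() below. */
}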
@@ -466,7 +488,7 @@ static int write_mem_msg(int binary)
466 if (err) 488 if (err)
467 return err; 489 return err;
468 if (CACHE_FLUSH_IS_SAFE) 490 if (CACHE_FLUSH_IS_SAFE)
469 flush_icache_range(addr, addr + length + 1); 491 flush_icache_range(addr, addr + length);
470 return 0; 492 return 0;
471 } 493 }
472 494
@@ -515,10 +537,16 @@ static void int_to_threadref(unsigned char *id, int value)
515static struct task_struct *getthread(struct pt_regs *regs, int tid) 537static struct task_struct *getthread(struct pt_regs *regs, int tid)
516{ 538{
517 /* 539 /*
518 * Non-positive TIDs are remapped to idle tasks: 540 * Non-positive TIDs are remapped to the cpu shadow information
519 */ 541 */
520 if (tid <= 0) 542 if (tid == 0 || tid == -1)
521 return idle_task(-tid); 543 tid = -atomic_read(&kgdb_active) - 2;
544 if (tid < 0) {
545 if (kgdb_info[-tid - 2].task)
546 return kgdb_info[-tid - 2].task;
547 else
548 return idle_task(-tid - 2);
549 }
522 550
523 /* 551 /*
524 * find_task_by_pid_ns() does not take the tasklist lock anymore 552 * find_task_by_pid_ns() does not take the tasklist lock anymore
@@ -562,6 +590,7 @@ static void kgdb_wait(struct pt_regs *regs)
562 590
563 /* Signal the primary CPU that we are done: */ 591 /* Signal the primary CPU that we are done: */
564 atomic_set(&cpu_in_kgdb[cpu], 0); 592 atomic_set(&cpu_in_kgdb[cpu], 0);
593 touch_softlockup_watchdog();
565 clocksource_touch_watchdog(); 594 clocksource_touch_watchdog();
566 local_irq_restore(flags); 595 local_irq_restore(flags);
567} 596}
@@ -725,14 +754,15 @@ setundefined:
725} 754}
726 755
727/* 756/*
728 * Remap normal tasks to their real PID, idle tasks to -1 ... -NR_CPUs: 757 * Remap normal tasks to their real PID,
758 * CPU shadow threads are mapped to -CPU - 2
729 */ 759 */
730static inline int shadow_pid(int realpid) 760static inline int shadow_pid(int realpid)
731{ 761{
732 if (realpid) 762 if (realpid)
733 return realpid; 763 return realpid;
734 764
735 return -1-raw_smp_processor_id(); 765 return -raw_smp_processor_id() - 2;
736} 766}
737 767
738static char gdbmsgbuf[BUFMAX + 1]; 768static char gdbmsgbuf[BUFMAX + 1];
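getthread() and shadow_pid() above now agree on one convention: a real task keeps its PID, while each CPU's shadow context is exposed to gdb as -cpu - 2, leaving 0 and -1 free for their usual "any thread"/"all threads" meanings in the remote protocol. A small sketch of encoding and decoding that mapping (plain helpers, not the kernel ones):

#include <stdio.h>

/* Real PIDs pass through; PID 0 becomes this CPU's shadow thread id. */
static int shadow_pid(int realpid, int cpu)
{
    if (realpid)
        return realpid;
    return -cpu - 2;
}

/* Recover the CPU number from a shadow thread id (tid < -1). */
static int shadow_cpu(int tid)
{
    return -tid - 2;
}

int main(void)
{
    for (int cpu = 0; cpu < 4; cpu++) {
        int tid = shadow_pid(0, cpu);
        printf("cpu %d -> tid %d -> cpu %d\n", cpu, tid, shadow_cpu(tid));
    }
    printf("real pid 1234 stays %d\n", shadow_pid(1234, 0));
    return 0;
}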
@@ -826,7 +856,7 @@ static void gdb_cmd_getregs(struct kgdb_state *ks)
826 local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo; 856 local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo;
827 } else { 857 } else {
828 local_debuggerinfo = NULL; 858 local_debuggerinfo = NULL;
829 for (i = 0; i < NR_CPUS; i++) { 859 for_each_online_cpu(i) {
830 /* 860 /*
831 * Try to find the task on some other 861 * Try to find the task on some other
832 * or possibly this node if we do not 862 * or possibly this node if we do not
@@ -960,10 +990,13 @@ static int gdb_cmd_reboot(struct kgdb_state *ks)
960/* Handle the 'q' query packets */ 990/* Handle the 'q' query packets */
961static void gdb_cmd_query(struct kgdb_state *ks) 991static void gdb_cmd_query(struct kgdb_state *ks)
962{ 992{
963 struct task_struct *thread; 993 struct task_struct *g;
994 struct task_struct *p;
964 unsigned char thref[8]; 995 unsigned char thref[8];
965 char *ptr; 996 char *ptr;
966 int i; 997 int i;
998 int cpu;
999 int finished = 0;
967 1000
968 switch (remcom_in_buffer[1]) { 1001 switch (remcom_in_buffer[1]) {
969 case 's': 1002 case 's':
@@ -973,22 +1006,34 @@ static void gdb_cmd_query(struct kgdb_state *ks)
973 break; 1006 break;
974 } 1007 }
975 1008
976 if (remcom_in_buffer[1] == 'f') 1009 i = 0;
977 ks->threadid = 1;
978
979 remcom_out_buffer[0] = 'm'; 1010 remcom_out_buffer[0] = 'm';
980 ptr = remcom_out_buffer + 1; 1011 ptr = remcom_out_buffer + 1;
981 1012 if (remcom_in_buffer[1] == 'f') {
982 for (i = 0; i < 17; ks->threadid++) { 1013 /* Each cpu is a shadow thread */
983 thread = getthread(ks->linux_regs, ks->threadid); 1014 for_each_online_cpu(cpu) {
984 if (thread) { 1015 ks->thr_query = 0;
985 int_to_threadref(thref, ks->threadid); 1016 int_to_threadref(thref, -cpu - 2);
986 pack_threadid(ptr, thref); 1017 pack_threadid(ptr, thref);
987 ptr += BUF_THREAD_ID_SIZE; 1018 ptr += BUF_THREAD_ID_SIZE;
988 *(ptr++) = ','; 1019 *(ptr++) = ',';
989 i++; 1020 i++;
990 } 1021 }
991 } 1022 }
1023
1024 do_each_thread(g, p) {
1025 if (i >= ks->thr_query && !finished) {
1026 int_to_threadref(thref, p->pid);
1027 pack_threadid(ptr, thref);
1028 ptr += BUF_THREAD_ID_SIZE;
1029 *(ptr++) = ',';
1030 ks->thr_query++;
1031 if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0)
1032 finished = 1;
1033 }
1034 i++;
1035 } while_each_thread(g, p);
1036
992 *(--ptr) = '\0'; 1037 *(--ptr) = '\0';
993 break; 1038 break;
994 1039
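The qfThreadInfo/qsThreadInfo handling above first reports one shadow thread per online CPU and then reports the task list in chunks of at most KGDB_MAX_THREAD_QUERY entries, using ks->thr_query as a cursor so the next 'qs' packet resumes where the previous reply stopped. A rough userspace sketch of that paging idea over a plain PID array (the reply format is simplified; the real code packs 8-byte thread references in hex):

#include <stdio.h>

#define MAX_PER_QUERY 17

/* Emit up to MAX_PER_QUERY ids starting at *cursor; returns count emitted. */
static int query_threads(const int *pids, int npids, int *cursor,
                         char *out, size_t len)
{
    size_t used = 0;
    int emitted = 0;

    for (int i = *cursor; i < npids && emitted < MAX_PER_QUERY; i++) {
        int n = snprintf(out + used, len - used, "%d,", pids[i]);

        if (n < 0 || (size_t)n >= len - used)
            break;
        used += n;
        (*cursor)++;
        emitted++;
    }
    if (used)
        out[used - 1] = '\0';    /* drop the trailing comma */
    return emitted;
}

int main(void)
{
    int pids[40], cursor = 0;
    char buf[256];

    for (int i = 0; i < 40; i++)
        pids[i] = 100 + i;
    while (query_threads(pids, 40, &cursor, buf, sizeof(buf)))
        printf("reply: %s\n", buf);    /* three replies: 17 + 17 + 6 */
    return 0;
}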
@@ -1011,15 +1056,15 @@ static void gdb_cmd_query(struct kgdb_state *ks)
1011 error_packet(remcom_out_buffer, -EINVAL); 1056 error_packet(remcom_out_buffer, -EINVAL);
1012 break; 1057 break;
1013 } 1058 }
1014 if (ks->threadid > 0) { 1059 if ((int)ks->threadid > 0) {
1015 kgdb_mem2hex(getthread(ks->linux_regs, 1060 kgdb_mem2hex(getthread(ks->linux_regs,
1016 ks->threadid)->comm, 1061 ks->threadid)->comm,
1017 remcom_out_buffer, 16); 1062 remcom_out_buffer, 16);
1018 } else { 1063 } else {
1019 static char tmpstr[23 + BUF_THREAD_ID_SIZE]; 1064 static char tmpstr[23 + BUF_THREAD_ID_SIZE];
1020 1065
1021 sprintf(tmpstr, "Shadow task %d for pid 0", 1066 sprintf(tmpstr, "shadowCPU%d",
1022 (int)(-ks->threadid-1)); 1067 (int)(-ks->threadid - 2));
1023 kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr)); 1068 kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr));
1024 } 1069 }
1025 break; 1070 break;
@@ -1388,6 +1433,7 @@ acquirelock:
1388 atomic_read(&kgdb_cpu_doing_single_step) != cpu) { 1433 atomic_read(&kgdb_cpu_doing_single_step) != cpu) {
1389 1434
1390 atomic_set(&kgdb_active, -1); 1435 atomic_set(&kgdb_active, -1);
1436 touch_softlockup_watchdog();
1391 clocksource_touch_watchdog(); 1437 clocksource_touch_watchdog();
1392 local_irq_restore(flags); 1438 local_irq_restore(flags);
1393 1439
@@ -1418,7 +1464,7 @@ acquirelock:
1418 * Get the passive CPU lock which will hold all the non-primary 1464 * Get the passive CPU lock which will hold all the non-primary
1419 * CPUs in a spin state while the debugger is active 1465 * CPUs in a spin state while the debugger is active
1420 */ 1466 */
1421 if (!kgdb_single_step || !kgdb_contthread) { 1467 if (!kgdb_single_step) {
1422 for (i = 0; i < NR_CPUS; i++) 1468 for (i = 0; i < NR_CPUS; i++)
1423 atomic_set(&passive_cpu_wait[i], 1); 1469 atomic_set(&passive_cpu_wait[i], 1);
1424 } 1470 }
@@ -1431,7 +1477,7 @@ acquirelock:
1431 1477
1432#ifdef CONFIG_SMP 1478#ifdef CONFIG_SMP
1433 /* Signal the other CPUs to enter kgdb_wait() */ 1479 /* Signal the other CPUs to enter kgdb_wait() */
1434 if ((!kgdb_single_step || !kgdb_contthread) && kgdb_do_roundup) 1480 if ((!kgdb_single_step) && kgdb_do_roundup)
1435 kgdb_roundup_cpus(flags); 1481 kgdb_roundup_cpus(flags);
1436#endif 1482#endif
1437 1483
@@ -1450,7 +1496,7 @@ acquirelock:
1450 kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code); 1496 kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code);
1451 kgdb_deactivate_sw_breakpoints(); 1497 kgdb_deactivate_sw_breakpoints();
1452 kgdb_single_step = 0; 1498 kgdb_single_step = 0;
1453 kgdb_contthread = NULL; 1499 kgdb_contthread = current;
1454 exception_level = 0; 1500 exception_level = 0;
1455 1501
1456 /* Talk to debugger with gdbserial protocol */ 1502 /* Talk to debugger with gdbserial protocol */
@@ -1464,7 +1510,7 @@ acquirelock:
1464 kgdb_info[ks->cpu].task = NULL; 1510 kgdb_info[ks->cpu].task = NULL;
1465 atomic_set(&cpu_in_kgdb[ks->cpu], 0); 1511 atomic_set(&cpu_in_kgdb[ks->cpu], 0);
1466 1512
1467 if (!kgdb_single_step || !kgdb_contthread) { 1513 if (!kgdb_single_step) {
1468 for (i = NR_CPUS-1; i >= 0; i--) 1514 for (i = NR_CPUS-1; i >= 0; i--)
1469 atomic_set(&passive_cpu_wait[i], 0); 1515 atomic_set(&passive_cpu_wait[i], 0);
1470 /* 1516 /*
@@ -1480,6 +1526,7 @@ acquirelock:
1480kgdb_restore: 1526kgdb_restore:
1481 /* Free kgdb_active */ 1527 /* Free kgdb_active */
1482 atomic_set(&kgdb_active, -1); 1528 atomic_set(&kgdb_active, -1);
1529 touch_softlockup_watchdog();
1483 clocksource_touch_watchdog(); 1530 clocksource_touch_watchdog();
1484 local_irq_restore(flags); 1531 local_irq_restore(flags);
1485 1532
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 6111c27491b1..96cff2f8710b 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -176,7 +176,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu)
176 return; 176 return;
177 } 177 }
178 /* Must have done schedule() in kthread() before we set_task_cpu */ 178 /* Must have done schedule() in kthread() before we set_task_cpu */
179 wait_task_inactive(k); 179 wait_task_inactive(k, 0);
180 set_task_cpu(k, cpu); 180 set_task_cpu(k, cpu);
181 k->cpus_allowed = cpumask_of_cpu(cpu); 181 k->cpus_allowed = cpumask_of_cpu(cpu);
182 k->rt.nr_cpus_allowed = 1; 182 k->rt.nr_cpus_allowed = 1;
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index d38a64362973..dbda475b13bd 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -124,6 +124,15 @@ static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
124unsigned long nr_lock_classes; 124unsigned long nr_lock_classes;
125static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; 125static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
126 126
127static inline struct lock_class *hlock_class(struct held_lock *hlock)
128{
129 if (!hlock->class_idx) {
130 DEBUG_LOCKS_WARN_ON(1);
131 return NULL;
132 }
133 return lock_classes + hlock->class_idx - 1;
134}
135
127#ifdef CONFIG_LOCK_STAT 136#ifdef CONFIG_LOCK_STAT
128static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); 137static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
129 138
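This lockdep series stops storing a struct lock_class pointer in each held_lock; instead it keeps class_idx, an index into lock_classes[] biased by one so that 0 can mean "no class recorded", and hlock_class() above is the single place that decodes it (warning if the index is unset). A standalone sketch of the index-plus-one accessor pattern with toy types:

#include <stdio.h>
#include <stddef.h>

struct lock_class { const char *name; };
struct held_lock { unsigned int class_idx; };    /* 0 == unset, else index + 1 */

static struct lock_class lock_classes[8] = {
    { "rq->lock" }, { "dentry->d_lock" }, { "inode->i_mutex" },
};

static struct lock_class *hlock_class(const struct held_lock *hlock)
{
    if (!hlock->class_idx) {
        fprintf(stderr, "warning: held lock with no class\n");
        return NULL;
    }
    return lock_classes + hlock->class_idx - 1;
}

int main(void)
{
    struct held_lock good = { .class_idx = 2 };    /* -> lock_classes[1] */
    struct held_lock bad  = { .class_idx = 0 };

    printf("good: %s\n", hlock_class(&good)->name);
    printf("bad:  %p\n", (void *)hlock_class(&bad));
    return 0;
}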
@@ -222,7 +231,7 @@ static void lock_release_holdtime(struct held_lock *hlock)
222 231
223 holdtime = sched_clock() - hlock->holdtime_stamp; 232 holdtime = sched_clock() - hlock->holdtime_stamp;
224 233
225 stats = get_lock_stats(hlock->class); 234 stats = get_lock_stats(hlock_class(hlock));
226 if (hlock->read) 235 if (hlock->read)
227 lock_time_inc(&stats->read_holdtime, holdtime); 236 lock_time_inc(&stats->read_holdtime, holdtime);
228 else 237 else
@@ -372,6 +381,19 @@ unsigned int nr_process_chains;
372unsigned int max_lockdep_depth; 381unsigned int max_lockdep_depth;
373unsigned int max_recursion_depth; 382unsigned int max_recursion_depth;
374 383
384static unsigned int lockdep_dependency_gen_id;
385
386static bool lockdep_dependency_visit(struct lock_class *source,
387 unsigned int depth)
388{
389 if (!depth)
390 lockdep_dependency_gen_id++;
391 if (source->dep_gen_id == lockdep_dependency_gen_id)
392 return true;
393 source->dep_gen_id = lockdep_dependency_gen_id;
394 return false;
395}
396
375#ifdef CONFIG_DEBUG_LOCKDEP 397#ifdef CONFIG_DEBUG_LOCKDEP
376/* 398/*
377 * We cannot printk in early bootup code. Not even early_printk() 399 * We cannot printk in early bootup code. Not even early_printk()
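lockdep_dependency_visit() gives the dependency walks an O(1) "already seen in this traversal?" test without ever clearing per-class state: each walk bumps a global generation id at depth 0, and a class counts as visited once its dep_gen_id matches the current generation. The counting and checking functions added further down reuse it so shared sub-graphs are visited only once. A self-contained sketch of the idea on a tiny diamond-shaped graph (the structures are illustrative, not lock_class):

#include <stdio.h>

struct node {
    unsigned int gen_id;        /* last generation this node was visited in */
    int nchild;
    struct node *child[4];
};

static unsigned int current_gen;

/* Bump the generation at the start of a walk; later calls test-and-mark. */
static int visited(struct node *n, unsigned int depth)
{
    if (!depth)
        current_gen++;
    if (n->gen_id == current_gen)
        return 1;
    n->gen_id = current_gen;
    return 0;
}

/* Count nodes reachable from n (including n), visiting each node once. */
static unsigned long count_deps(struct node *n, unsigned int depth)
{
    unsigned long ret = 1;

    if (visited(n, depth))
        return 0;
    for (int i = 0; i < n->nchild; i++)
        ret += count_deps(n->child[i], depth + 1);
    return ret;
}

int main(void)
{
    struct node a = {0}, b = {0}, c = {0};

    a.child[a.nchild++] = &b;
    a.child[a.nchild++] = &c;
    b.child[b.nchild++] = &c;    /* diamond: c reachable twice, counted once */

    printf("first walk:  %lu nodes\n", count_deps(&a, 0));    /* 3 */
    printf("second walk: %lu nodes\n", count_deps(&a, 0));    /* still 3 */
    return 0;
}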
@@ -505,7 +527,7 @@ static void print_lockdep_cache(struct lockdep_map *lock)
505 527
506static void print_lock(struct held_lock *hlock) 528static void print_lock(struct held_lock *hlock)
507{ 529{
508 print_lock_name(hlock->class); 530 print_lock_name(hlock_class(hlock));
509 printk(", at: "); 531 printk(", at: ");
510 print_ip_sym(hlock->acquire_ip); 532 print_ip_sym(hlock->acquire_ip);
511} 533}
@@ -558,6 +580,9 @@ static void print_lock_dependencies(struct lock_class *class, int depth)
558{ 580{
559 struct lock_list *entry; 581 struct lock_list *entry;
560 582
583 if (lockdep_dependency_visit(class, depth))
584 return;
585
561 if (DEBUG_LOCKS_WARN_ON(depth >= 20)) 586 if (DEBUG_LOCKS_WARN_ON(depth >= 20))
562 return; 587 return;
563 588
@@ -850,11 +875,11 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
850 if (!entry) 875 if (!entry)
851 return 0; 876 return 0;
852 877
853 entry->class = this;
854 entry->distance = distance;
855 if (!save_trace(&entry->trace)) 878 if (!save_trace(&entry->trace))
856 return 0; 879 return 0;
857 880
881 entry->class = this;
882 entry->distance = distance;
858 /* 883 /*
859 * Since we never remove from the dependency list, the list can 884 * Since we never remove from the dependency list, the list can
860 * be walked lockless by other CPUs, it's only allocation 885 * be walked lockless by other CPUs, it's only allocation
@@ -932,7 +957,7 @@ static noinline int print_circular_bug_tail(void)
932 if (debug_locks_silent) 957 if (debug_locks_silent)
933 return 0; 958 return 0;
934 959
935 this.class = check_source->class; 960 this.class = hlock_class(check_source);
936 if (!save_trace(&this.trace)) 961 if (!save_trace(&this.trace))
937 return 0; 962 return 0;
938 963
@@ -959,6 +984,67 @@ static int noinline print_infinite_recursion_bug(void)
959 return 0; 984 return 0;
960} 985}
961 986
987unsigned long __lockdep_count_forward_deps(struct lock_class *class,
988 unsigned int depth)
989{
990 struct lock_list *entry;
991 unsigned long ret = 1;
992
993 if (lockdep_dependency_visit(class, depth))
994 return 0;
995
996 /*
997 * Recurse this class's dependency list:
998 */
999 list_for_each_entry(entry, &class->locks_after, entry)
1000 ret += __lockdep_count_forward_deps(entry->class, depth + 1);
1001
1002 return ret;
1003}
1004
1005unsigned long lockdep_count_forward_deps(struct lock_class *class)
1006{
1007 unsigned long ret, flags;
1008
1009 local_irq_save(flags);
1010 __raw_spin_lock(&lockdep_lock);
1011 ret = __lockdep_count_forward_deps(class, 0);
1012 __raw_spin_unlock(&lockdep_lock);
1013 local_irq_restore(flags);
1014
1015 return ret;
1016}
1017
1018unsigned long __lockdep_count_backward_deps(struct lock_class *class,
1019 unsigned int depth)
1020{
1021 struct lock_list *entry;
1022 unsigned long ret = 1;
1023
1024 if (lockdep_dependency_visit(class, depth))
1025 return 0;
1026 /*
1027 * Recurse this class's dependency list:
1028 */
1029 list_for_each_entry(entry, &class->locks_before, entry)
1030 ret += __lockdep_count_backward_deps(entry->class, depth + 1);
1031
1032 return ret;
1033}
1034
1035unsigned long lockdep_count_backward_deps(struct lock_class *class)
1036{
1037 unsigned long ret, flags;
1038
1039 local_irq_save(flags);
1040 __raw_spin_lock(&lockdep_lock);
1041 ret = __lockdep_count_backward_deps(class, 0);
1042 __raw_spin_unlock(&lockdep_lock);
1043 local_irq_restore(flags);
1044
1045 return ret;
1046}
1047
962/* 1048/*
963 * Prove that the dependency graph starting at <entry> can not 1049 * Prove that the dependency graph starting at <entry> can not
964 * lead to <target>. Print an error and return 0 if it does. 1050 * lead to <target>. Print an error and return 0 if it does.
@@ -968,6 +1054,9 @@ check_noncircular(struct lock_class *source, unsigned int depth)
968{ 1054{
969 struct lock_list *entry; 1055 struct lock_list *entry;
970 1056
1057 if (lockdep_dependency_visit(source, depth))
1058 return 1;
1059
971 debug_atomic_inc(&nr_cyclic_check_recursions); 1060 debug_atomic_inc(&nr_cyclic_check_recursions);
972 if (depth > max_recursion_depth) 1061 if (depth > max_recursion_depth)
973 max_recursion_depth = depth; 1062 max_recursion_depth = depth;
@@ -977,7 +1066,7 @@ check_noncircular(struct lock_class *source, unsigned int depth)
977 * Check this lock's dependency list: 1066 * Check this lock's dependency list:
978 */ 1067 */
979 list_for_each_entry(entry, &source->locks_after, entry) { 1068 list_for_each_entry(entry, &source->locks_after, entry) {
980 if (entry->class == check_target->class) 1069 if (entry->class == hlock_class(check_target))
981 return print_circular_bug_header(entry, depth+1); 1070 return print_circular_bug_header(entry, depth+1);
982 debug_atomic_inc(&nr_cyclic_checks); 1071 debug_atomic_inc(&nr_cyclic_checks);
983 if (!check_noncircular(entry->class, depth+1)) 1072 if (!check_noncircular(entry->class, depth+1))
@@ -1011,6 +1100,9 @@ find_usage_forwards(struct lock_class *source, unsigned int depth)
1011 struct lock_list *entry; 1100 struct lock_list *entry;
1012 int ret; 1101 int ret;
1013 1102
1103 if (lockdep_dependency_visit(source, depth))
1104 return 1;
1105
1014 if (depth > max_recursion_depth) 1106 if (depth > max_recursion_depth)
1015 max_recursion_depth = depth; 1107 max_recursion_depth = depth;
1016 if (depth >= RECURSION_LIMIT) 1108 if (depth >= RECURSION_LIMIT)
@@ -1050,6 +1142,9 @@ find_usage_backwards(struct lock_class *source, unsigned int depth)
1050 struct lock_list *entry; 1142 struct lock_list *entry;
1051 int ret; 1143 int ret;
1052 1144
1145 if (lockdep_dependency_visit(source, depth))
1146 return 1;
1147
1053 if (!__raw_spin_is_locked(&lockdep_lock)) 1148 if (!__raw_spin_is_locked(&lockdep_lock))
1054 return DEBUG_LOCKS_WARN_ON(1); 1149 return DEBUG_LOCKS_WARN_ON(1);
1055 1150
@@ -1064,6 +1159,11 @@ find_usage_backwards(struct lock_class *source, unsigned int depth)
1064 return 2; 1159 return 2;
1065 } 1160 }
1066 1161
1162 if (!source && debug_locks_off_graph_unlock()) {
1163 WARN_ON(1);
1164 return 0;
1165 }
1166
1067 /* 1167 /*
1068 * Check this lock's dependency list: 1168 * Check this lock's dependency list:
1069 */ 1169 */
@@ -1103,9 +1203,9 @@ print_bad_irq_dependency(struct task_struct *curr,
1103 printk("\nand this task is already holding:\n"); 1203 printk("\nand this task is already holding:\n");
1104 print_lock(prev); 1204 print_lock(prev);
1105 printk("which would create a new lock dependency:\n"); 1205 printk("which would create a new lock dependency:\n");
1106 print_lock_name(prev->class); 1206 print_lock_name(hlock_class(prev));
1107 printk(" ->"); 1207 printk(" ->");
1108 print_lock_name(next->class); 1208 print_lock_name(hlock_class(next));
1109 printk("\n"); 1209 printk("\n");
1110 1210
1111 printk("\nbut this new dependency connects a %s-irq-safe lock:\n", 1211 printk("\nbut this new dependency connects a %s-irq-safe lock:\n",
@@ -1146,12 +1246,12 @@ check_usage(struct task_struct *curr, struct held_lock *prev,
1146 1246
1147 find_usage_bit = bit_backwards; 1247 find_usage_bit = bit_backwards;
1148 /* fills in <backwards_match> */ 1248 /* fills in <backwards_match> */
1149 ret = find_usage_backwards(prev->class, 0); 1249 ret = find_usage_backwards(hlock_class(prev), 0);
1150 if (!ret || ret == 1) 1250 if (!ret || ret == 1)
1151 return ret; 1251 return ret;
1152 1252
1153 find_usage_bit = bit_forwards; 1253 find_usage_bit = bit_forwards;
1154 ret = find_usage_forwards(next->class, 0); 1254 ret = find_usage_forwards(hlock_class(next), 0);
1155 if (!ret || ret == 1) 1255 if (!ret || ret == 1)
1156 return ret; 1256 return ret;
1157 /* ret == 2 */ 1257 /* ret == 2 */
@@ -1272,18 +1372,32 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
1272 struct lockdep_map *next_instance, int read) 1372 struct lockdep_map *next_instance, int read)
1273{ 1373{
1274 struct held_lock *prev; 1374 struct held_lock *prev;
1375 struct held_lock *nest = NULL;
1275 int i; 1376 int i;
1276 1377
1277 for (i = 0; i < curr->lockdep_depth; i++) { 1378 for (i = 0; i < curr->lockdep_depth; i++) {
1278 prev = curr->held_locks + i; 1379 prev = curr->held_locks + i;
1279 if (prev->class != next->class) 1380
1381 if (prev->instance == next->nest_lock)
1382 nest = prev;
1383
1384 if (hlock_class(prev) != hlock_class(next))
1280 continue; 1385 continue;
1386
1281 /* 1387 /*
1282 * Allow read-after-read recursion of the same 1388 * Allow read-after-read recursion of the same
1283 * lock class (i.e. read_lock(lock)+read_lock(lock)): 1389 * lock class (i.e. read_lock(lock)+read_lock(lock)):
1284 */ 1390 */
1285 if ((read == 2) && prev->read) 1391 if ((read == 2) && prev->read)
1286 return 2; 1392 return 2;
1393
1394 /*
1395 * We're holding the nest_lock, which serializes this lock's
1396 * nesting behaviour.
1397 */
1398 if (nest)
1399 return 2;
1400
1287 return print_deadlock_bug(curr, prev, next); 1401 return print_deadlock_bug(curr, prev, next);
1288 } 1402 }
1289 return 1; 1403 return 1;
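check_deadlock() above gains an escape hatch for nested locks: taking two locks of the same class is normally flagged as a possible recursive deadlock, but it is now allowed when the caller already holds the lock it named as nest_lock (passed down via the new __lock_acquire() argument), since that outer lock serializes the nested acquisitions. A simplified, standalone sketch of just that decision (toy structures, return codes mirroring the 0/1/2 convention above):

#include <stdio.h>

struct lockmap { int class; };
struct held { struct lockmap *instance; int class; };

/*
 * Return 0 for "looks like a deadlock", 2 for "explicitly allowed",
 * 1 for "no same-class lock held".
 */
static int check_deadlock(struct held *held, int depth,
                          int next_class, struct lockmap *nest_lock)
{
    struct held *nest = NULL;

    for (int i = 0; i < depth; i++) {
        if (held[i].instance == nest_lock)
            nest = &held[i];
        if (held[i].class != next_class)
            continue;
        if (nest)
            return 2;    /* nesting serialized by an outer lock */
        return 0;        /* same class held, nothing serializing it */
    }
    return 1;
}

int main(void)
{
    struct lockmap parent = { .class = 1 }, child_a = { .class = 2 };
    struct held held[2] = {
        { &parent,  parent.class },
        { &child_a, child_a.class },
    };

    /* Taking another class-2 lock: flagged without, allowed with nest_lock. */
    printf("without nest_lock: %d\n", check_deadlock(held, 2, 2, NULL));
    printf("with nest_lock:    %d\n", check_deadlock(held, 2, 2, &parent));
    return 0;
}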
@@ -1329,7 +1443,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1329 */ 1443 */
1330 check_source = next; 1444 check_source = next;
1331 check_target = prev; 1445 check_target = prev;
1332 if (!(check_noncircular(next->class, 0))) 1446 if (!(check_noncircular(hlock_class(next), 0)))
1333 return print_circular_bug_tail(); 1447 return print_circular_bug_tail();
1334 1448
1335 if (!check_prev_add_irq(curr, prev, next)) 1449 if (!check_prev_add_irq(curr, prev, next))
@@ -1353,8 +1467,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1353 * chains - the second one will be new, but L1 already has 1467 * chains - the second one will be new, but L1 already has
1354 * L2 added to its dependency list, due to the first chain.) 1468 * L2 added to its dependency list, due to the first chain.)
1355 */ 1469 */
1356 list_for_each_entry(entry, &prev->class->locks_after, entry) { 1470 list_for_each_entry(entry, &hlock_class(prev)->locks_after, entry) {
1357 if (entry->class == next->class) { 1471 if (entry->class == hlock_class(next)) {
1358 if (distance == 1) 1472 if (distance == 1)
1359 entry->distance = 1; 1473 entry->distance = 1;
1360 return 2; 1474 return 2;
@@ -1365,26 +1479,28 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1365 * Ok, all validations passed, add the new lock 1479 * Ok, all validations passed, add the new lock
1366 * to the previous lock's dependency list: 1480 * to the previous lock's dependency list:
1367 */ 1481 */
1368 ret = add_lock_to_list(prev->class, next->class, 1482 ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
1369 &prev->class->locks_after, next->acquire_ip, distance); 1483 &hlock_class(prev)->locks_after,
1484 next->acquire_ip, distance);
1370 1485
1371 if (!ret) 1486 if (!ret)
1372 return 0; 1487 return 0;
1373 1488
1374 ret = add_lock_to_list(next->class, prev->class, 1489 ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
1375 &next->class->locks_before, next->acquire_ip, distance); 1490 &hlock_class(next)->locks_before,
1491 next->acquire_ip, distance);
1376 if (!ret) 1492 if (!ret)
1377 return 0; 1493 return 0;
1378 1494
1379 /* 1495 /*
1380 * Debugging printouts: 1496 * Debugging printouts:
1381 */ 1497 */
1382 if (verbose(prev->class) || verbose(next->class)) { 1498 if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) {
1383 graph_unlock(); 1499 graph_unlock();
1384 printk("\n new dependency: "); 1500 printk("\n new dependency: ");
1385 print_lock_name(prev->class); 1501 print_lock_name(hlock_class(prev));
1386 printk(" => "); 1502 printk(" => ");
1387 print_lock_name(next->class); 1503 print_lock_name(hlock_class(next));
1388 printk("\n"); 1504 printk("\n");
1389 dump_stack(); 1505 dump_stack();
1390 return graph_lock(); 1506 return graph_lock();
@@ -1481,7 +1597,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
1481 struct held_lock *hlock, 1597 struct held_lock *hlock,
1482 u64 chain_key) 1598 u64 chain_key)
1483{ 1599{
1484 struct lock_class *class = hlock->class; 1600 struct lock_class *class = hlock_class(hlock);
1485 struct list_head *hash_head = chainhashentry(chain_key); 1601 struct list_head *hash_head = chainhashentry(chain_key);
1486 struct lock_chain *chain; 1602 struct lock_chain *chain;
1487 struct held_lock *hlock_curr, *hlock_next; 1603 struct held_lock *hlock_curr, *hlock_next;
@@ -1554,7 +1670,7 @@ cache_hit:
1554 if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { 1670 if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
1555 chain->base = cn; 1671 chain->base = cn;
1556 for (j = 0; j < chain->depth - 1; j++, i++) { 1672 for (j = 0; j < chain->depth - 1; j++, i++) {
1557 int lock_id = curr->held_locks[i].class - lock_classes; 1673 int lock_id = curr->held_locks[i].class_idx - 1;
1558 chain_hlocks[chain->base + j] = lock_id; 1674 chain_hlocks[chain->base + j] = lock_id;
1559 } 1675 }
1560 chain_hlocks[chain->base + j] = class - lock_classes; 1676 chain_hlocks[chain->base + j] = class - lock_classes;
@@ -1643,14 +1759,13 @@ static void check_chain_key(struct task_struct *curr)
1643 hlock = curr->held_locks + i; 1759 hlock = curr->held_locks + i;
1644 if (chain_key != hlock->prev_chain_key) { 1760 if (chain_key != hlock->prev_chain_key) {
1645 debug_locks_off(); 1761 debug_locks_off();
1646 printk("hm#1, depth: %u [%u], %016Lx != %016Lx\n", 1762 WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n",
1647 curr->lockdep_depth, i, 1763 curr->lockdep_depth, i,
1648 (unsigned long long)chain_key, 1764 (unsigned long long)chain_key,
1649 (unsigned long long)hlock->prev_chain_key); 1765 (unsigned long long)hlock->prev_chain_key);
1650 WARN_ON(1);
1651 return; 1766 return;
1652 } 1767 }
1653 id = hlock->class - lock_classes; 1768 id = hlock->class_idx - 1;
1654 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) 1769 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
1655 return; 1770 return;
1656 1771
@@ -1662,11 +1777,10 @@ static void check_chain_key(struct task_struct *curr)
1662 } 1777 }
1663 if (chain_key != curr->curr_chain_key) { 1778 if (chain_key != curr->curr_chain_key) {
1664 debug_locks_off(); 1779 debug_locks_off();
1665 printk("hm#2, depth: %u [%u], %016Lx != %016Lx\n", 1780 WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n",
1666 curr->lockdep_depth, i, 1781 curr->lockdep_depth, i,
1667 (unsigned long long)chain_key, 1782 (unsigned long long)chain_key,
1668 (unsigned long long)curr->curr_chain_key); 1783 (unsigned long long)curr->curr_chain_key);
1669 WARN_ON(1);
1670 } 1784 }
1671#endif 1785#endif
1672} 1786}
@@ -1695,7 +1809,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
1695 print_lock(this); 1809 print_lock(this);
1696 1810
1697 printk("{%s} state was registered at:\n", usage_str[prev_bit]); 1811 printk("{%s} state was registered at:\n", usage_str[prev_bit]);
1698 print_stack_trace(this->class->usage_traces + prev_bit, 1); 1812 print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1);
1699 1813
1700 print_irqtrace_events(curr); 1814 print_irqtrace_events(curr);
1701 printk("\nother info that might help us debug this:\n"); 1815 printk("\nother info that might help us debug this:\n");
@@ -1714,7 +1828,7 @@ static inline int
1714valid_state(struct task_struct *curr, struct held_lock *this, 1828valid_state(struct task_struct *curr, struct held_lock *this,
1715 enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) 1829 enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
1716{ 1830{
1717 if (unlikely(this->class->usage_mask & (1 << bad_bit))) 1831 if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit)))
1718 return print_usage_bug(curr, this, bad_bit, new_bit); 1832 return print_usage_bug(curr, this, bad_bit, new_bit);
1719 return 1; 1833 return 1;
1720} 1834}
@@ -1753,7 +1867,7 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
1753 lockdep_print_held_locks(curr); 1867 lockdep_print_held_locks(curr);
1754 1868
1755 printk("\nthe first lock's dependencies:\n"); 1869 printk("\nthe first lock's dependencies:\n");
1756 print_lock_dependencies(this->class, 0); 1870 print_lock_dependencies(hlock_class(this), 0);
1757 1871
1758 printk("\nthe second lock's dependencies:\n"); 1872 printk("\nthe second lock's dependencies:\n");
1759 print_lock_dependencies(other, 0); 1873 print_lock_dependencies(other, 0);
@@ -1776,7 +1890,7 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
1776 1890
1777 find_usage_bit = bit; 1891 find_usage_bit = bit;
1778 /* fills in <forwards_match> */ 1892 /* fills in <forwards_match> */
1779 ret = find_usage_forwards(this->class, 0); 1893 ret = find_usage_forwards(hlock_class(this), 0);
1780 if (!ret || ret == 1) 1894 if (!ret || ret == 1)
1781 return ret; 1895 return ret;
1782 1896
@@ -1795,7 +1909,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
1795 1909
1796 find_usage_bit = bit; 1910 find_usage_bit = bit;
1797 /* fills in <backwards_match> */ 1911 /* fills in <backwards_match> */
1798 ret = find_usage_backwards(this->class, 0); 1912 ret = find_usage_backwards(hlock_class(this), 0);
1799 if (!ret || ret == 1) 1913 if (!ret || ret == 1)
1800 return ret; 1914 return ret;
1801 1915
@@ -1861,7 +1975,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
1861 LOCK_ENABLED_HARDIRQS_READ, "hard-read")) 1975 LOCK_ENABLED_HARDIRQS_READ, "hard-read"))
1862 return 0; 1976 return 0;
1863#endif 1977#endif
1864 if (hardirq_verbose(this->class)) 1978 if (hardirq_verbose(hlock_class(this)))
1865 ret = 2; 1979 ret = 2;
1866 break; 1980 break;
1867 case LOCK_USED_IN_SOFTIRQ: 1981 case LOCK_USED_IN_SOFTIRQ:
@@ -1886,7 +2000,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
1886 LOCK_ENABLED_SOFTIRQS_READ, "soft-read")) 2000 LOCK_ENABLED_SOFTIRQS_READ, "soft-read"))
1887 return 0; 2001 return 0;
1888#endif 2002#endif
1889 if (softirq_verbose(this->class)) 2003 if (softirq_verbose(hlock_class(this)))
1890 ret = 2; 2004 ret = 2;
1891 break; 2005 break;
1892 case LOCK_USED_IN_HARDIRQ_READ: 2006 case LOCK_USED_IN_HARDIRQ_READ:
@@ -1899,7 +2013,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
1899 if (!check_usage_forwards(curr, this, 2013 if (!check_usage_forwards(curr, this,
1900 LOCK_ENABLED_HARDIRQS, "hard")) 2014 LOCK_ENABLED_HARDIRQS, "hard"))
1901 return 0; 2015 return 0;
1902 if (hardirq_verbose(this->class)) 2016 if (hardirq_verbose(hlock_class(this)))
1903 ret = 2; 2017 ret = 2;
1904 break; 2018 break;
1905 case LOCK_USED_IN_SOFTIRQ_READ: 2019 case LOCK_USED_IN_SOFTIRQ_READ:
@@ -1912,7 +2026,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
1912 if (!check_usage_forwards(curr, this, 2026 if (!check_usage_forwards(curr, this,
1913 LOCK_ENABLED_SOFTIRQS, "soft")) 2027 LOCK_ENABLED_SOFTIRQS, "soft"))
1914 return 0; 2028 return 0;
1915 if (softirq_verbose(this->class)) 2029 if (softirq_verbose(hlock_class(this)))
1916 ret = 2; 2030 ret = 2;
1917 break; 2031 break;
1918 case LOCK_ENABLED_HARDIRQS: 2032 case LOCK_ENABLED_HARDIRQS:
@@ -1938,7 +2052,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
1938 LOCK_USED_IN_HARDIRQ_READ, "hard-read")) 2052 LOCK_USED_IN_HARDIRQ_READ, "hard-read"))
1939 return 0; 2053 return 0;
1940#endif 2054#endif
1941 if (hardirq_verbose(this->class)) 2055 if (hardirq_verbose(hlock_class(this)))
1942 ret = 2; 2056 ret = 2;
1943 break; 2057 break;
1944 case LOCK_ENABLED_SOFTIRQS: 2058 case LOCK_ENABLED_SOFTIRQS:
@@ -1964,7 +2078,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
1964 LOCK_USED_IN_SOFTIRQ_READ, "soft-read")) 2078 LOCK_USED_IN_SOFTIRQ_READ, "soft-read"))
1965 return 0; 2079 return 0;
1966#endif 2080#endif
1967 if (softirq_verbose(this->class)) 2081 if (softirq_verbose(hlock_class(this)))
1968 ret = 2; 2082 ret = 2;
1969 break; 2083 break;
1970 case LOCK_ENABLED_HARDIRQS_READ: 2084 case LOCK_ENABLED_HARDIRQS_READ:
@@ -1979,7 +2093,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
1979 LOCK_USED_IN_HARDIRQ, "hard")) 2093 LOCK_USED_IN_HARDIRQ, "hard"))
1980 return 0; 2094 return 0;
1981#endif 2095#endif
1982 if (hardirq_verbose(this->class)) 2096 if (hardirq_verbose(hlock_class(this)))
1983 ret = 2; 2097 ret = 2;
1984 break; 2098 break;
1985 case LOCK_ENABLED_SOFTIRQS_READ: 2099 case LOCK_ENABLED_SOFTIRQS_READ:
@@ -1994,7 +2108,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
1994 LOCK_USED_IN_SOFTIRQ, "soft")) 2108 LOCK_USED_IN_SOFTIRQ, "soft"))
1995 return 0; 2109 return 0;
1996#endif 2110#endif
1997 if (softirq_verbose(this->class)) 2111 if (softirq_verbose(hlock_class(this)))
1998 ret = 2; 2112 ret = 2;
1999 break; 2113 break;
2000 default: 2114 default:
@@ -2310,7 +2424,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2310 * If already set then do not dirty the cacheline, 2424 * If already set then do not dirty the cacheline,
2311 * nor do any checks: 2425 * nor do any checks:
2312 */ 2426 */
2313 if (likely(this->class->usage_mask & new_mask)) 2427 if (likely(hlock_class(this)->usage_mask & new_mask))
2314 return 1; 2428 return 1;
2315 2429
2316 if (!graph_lock()) 2430 if (!graph_lock())
@@ -2318,14 +2432,14 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2318 /* 2432 /*
2319 * Make sure we didn't race: 2433 * Make sure we didn't race:
2320 */ 2434 */
2321 if (unlikely(this->class->usage_mask & new_mask)) { 2435 if (unlikely(hlock_class(this)->usage_mask & new_mask)) {
2322 graph_unlock(); 2436 graph_unlock();
2323 return 1; 2437 return 1;
2324 } 2438 }
2325 2439
2326 this->class->usage_mask |= new_mask; 2440 hlock_class(this)->usage_mask |= new_mask;
2327 2441
2328 if (!save_trace(this->class->usage_traces + new_bit)) 2442 if (!save_trace(hlock_class(this)->usage_traces + new_bit))
2329 return 0; 2443 return 0;
2330 2444
2331 switch (new_bit) { 2445 switch (new_bit) {
@@ -2405,7 +2519,7 @@ EXPORT_SYMBOL_GPL(lockdep_init_map);
2405 */ 2519 */
2406static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, 2520static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2407 int trylock, int read, int check, int hardirqs_off, 2521 int trylock, int read, int check, int hardirqs_off,
2408 unsigned long ip) 2522 struct lockdep_map *nest_lock, unsigned long ip)
2409{ 2523{
2410 struct task_struct *curr = current; 2524 struct task_struct *curr = current;
2411 struct lock_class *class = NULL; 2525 struct lock_class *class = NULL;
@@ -2459,14 +2573,16 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2459 return 0; 2573 return 0;
2460 2574
2461 hlock = curr->held_locks + depth; 2575 hlock = curr->held_locks + depth;
2462 2576 if (DEBUG_LOCKS_WARN_ON(!class))
2463 hlock->class = class; 2577 return 0;
2578 hlock->class_idx = class - lock_classes + 1;
2464 hlock->acquire_ip = ip; 2579 hlock->acquire_ip = ip;
2465 hlock->instance = lock; 2580 hlock->instance = lock;
2581 hlock->nest_lock = nest_lock;
2466 hlock->trylock = trylock; 2582 hlock->trylock = trylock;
2467 hlock->read = read; 2583 hlock->read = read;
2468 hlock->check = check; 2584 hlock->check = check;
2469 hlock->hardirqs_off = hardirqs_off; 2585 hlock->hardirqs_off = !!hardirqs_off;
2470#ifdef CONFIG_LOCK_STAT 2586#ifdef CONFIG_LOCK_STAT
2471 hlock->waittime_stamp = 0; 2587 hlock->waittime_stamp = 0;
2472 hlock->holdtime_stamp = sched_clock(); 2588 hlock->holdtime_stamp = sched_clock();
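One small detail in the hunk above: hardirqs_off is stored as !!hardirqs_off. The caller passes irqs_disabled_flags(flags), which can be any non-zero value, and the double negation folds it to exactly 0 or 1 so it survives being packed into a narrow field (this series shrinks held_lock into bitfields; that header change is not shown here, so treat the one-bit field below as an assumption). A short illustration of why the fold matters:

#include <stdio.h>

struct held { unsigned int hardirqs_off:1; };    /* a single-bit field */

int main(void)
{
    struct held h;
    unsigned long flags = 0x200;    /* a non-zero flag word, low bit clear */

    h.hardirqs_off = flags;         /* truncates to the low bit: stores 0 */
    printf("raw assign: %u\n", h.hardirqs_off);

    h.hardirqs_off = !!flags;       /* normalize to 0/1 first: stores 1 */
    printf("with !!:    %u\n", h.hardirqs_off);
    return 0;
}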
@@ -2574,6 +2690,55 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
2574 return 1; 2690 return 1;
2575} 2691}
2576 2692
2693static int
2694__lock_set_subclass(struct lockdep_map *lock,
2695 unsigned int subclass, unsigned long ip)
2696{
2697 struct task_struct *curr = current;
2698 struct held_lock *hlock, *prev_hlock;
2699 struct lock_class *class;
2700 unsigned int depth;
2701 int i;
2702
2703 depth = curr->lockdep_depth;
2704 if (DEBUG_LOCKS_WARN_ON(!depth))
2705 return 0;
2706
2707 prev_hlock = NULL;
2708 for (i = depth-1; i >= 0; i--) {
2709 hlock = curr->held_locks + i;
2710 /*
2711 * We must not cross into another context:
2712 */
2713 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
2714 break;
2715 if (hlock->instance == lock)
2716 goto found_it;
2717 prev_hlock = hlock;
2718 }
2719 return print_unlock_inbalance_bug(curr, lock, ip);
2720
2721found_it:
2722 class = register_lock_class(lock, subclass, 0);
2723 hlock->class_idx = class - lock_classes + 1;
2724
2725 curr->lockdep_depth = i;
2726 curr->curr_chain_key = hlock->prev_chain_key;
2727
2728 for (; i < depth; i++) {
2729 hlock = curr->held_locks + i;
2730 if (!__lock_acquire(hlock->instance,
2731 hlock_class(hlock)->subclass, hlock->trylock,
2732 hlock->read, hlock->check, hlock->hardirqs_off,
2733 hlock->nest_lock, hlock->acquire_ip))
2734 return 0;
2735 }
2736
2737 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
2738 return 0;
2739 return 1;
2740}
2741
2577/* 2742/*
2578 * Remove the lock from the list of currently held locks in a 2743 * Remove the lock from the list of currently held locks in a
2579 * potentially non-nested (out of order) manner. This is a 2744 * potentially non-nested (out of order) manner. This is a
@@ -2624,9 +2789,9 @@ found_it:
2624 for (i++; i < depth; i++) { 2789 for (i++; i < depth; i++) {
2625 hlock = curr->held_locks + i; 2790 hlock = curr->held_locks + i;
2626 if (!__lock_acquire(hlock->instance, 2791 if (!__lock_acquire(hlock->instance,
2627 hlock->class->subclass, hlock->trylock, 2792 hlock_class(hlock)->subclass, hlock->trylock,
2628 hlock->read, hlock->check, hlock->hardirqs_off, 2793 hlock->read, hlock->check, hlock->hardirqs_off,
2629 hlock->acquire_ip)) 2794 hlock->nest_lock, hlock->acquire_ip))
2630 return 0; 2795 return 0;
2631 } 2796 }
2632 2797
@@ -2669,7 +2834,7 @@ static int lock_release_nested(struct task_struct *curr,
2669 2834
2670#ifdef CONFIG_DEBUG_LOCKDEP 2835#ifdef CONFIG_DEBUG_LOCKDEP
2671 hlock->prev_chain_key = 0; 2836 hlock->prev_chain_key = 0;
2672 hlock->class = NULL; 2837 hlock->class_idx = 0;
2673 hlock->acquire_ip = 0; 2838 hlock->acquire_ip = 0;
2674 hlock->irq_context = 0; 2839 hlock->irq_context = 0;
2675#endif 2840#endif
@@ -2738,18 +2903,36 @@ static void check_flags(unsigned long flags)
2738#endif 2903#endif
2739} 2904}
2740 2905
2906void
2907lock_set_subclass(struct lockdep_map *lock,
2908 unsigned int subclass, unsigned long ip)
2909{
2910 unsigned long flags;
2911
2912 if (unlikely(current->lockdep_recursion))
2913 return;
2914
2915 raw_local_irq_save(flags);
2916 current->lockdep_recursion = 1;
2917 check_flags(flags);
2918 if (__lock_set_subclass(lock, subclass, ip))
2919 check_chain_key(current);
2920 current->lockdep_recursion = 0;
2921 raw_local_irq_restore(flags);
2922}
2923
2924EXPORT_SYMBOL_GPL(lock_set_subclass);
2925
2741/* 2926/*
2742 * We are not always called with irqs disabled - do that here, 2927 * We are not always called with irqs disabled - do that here,
2743 * and also avoid lockdep recursion: 2928 * and also avoid lockdep recursion:
2744 */ 2929 */
2745void lock_acquire(struct lockdep_map *lock, unsigned int subclass, 2930void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2746 int trylock, int read, int check, unsigned long ip) 2931 int trylock, int read, int check,
2932 struct lockdep_map *nest_lock, unsigned long ip)
2747{ 2933{
2748 unsigned long flags; 2934 unsigned long flags;
2749 2935
2750 if (unlikely(!lock_stat && !prove_locking))
2751 return;
2752
2753 if (unlikely(current->lockdep_recursion)) 2936 if (unlikely(current->lockdep_recursion))
2754 return; 2937 return;
2755 2938
@@ -2758,7 +2941,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2758 2941
2759 current->lockdep_recursion = 1; 2942 current->lockdep_recursion = 1;
2760 __lock_acquire(lock, subclass, trylock, read, check, 2943 __lock_acquire(lock, subclass, trylock, read, check,
2761 irqs_disabled_flags(flags), ip); 2944 irqs_disabled_flags(flags), nest_lock, ip);
2762 current->lockdep_recursion = 0; 2945 current->lockdep_recursion = 0;
2763 raw_local_irq_restore(flags); 2946 raw_local_irq_restore(flags);
2764} 2947}
@@ -2770,9 +2953,6 @@ void lock_release(struct lockdep_map *lock, int nested,
2770{ 2953{
2771 unsigned long flags; 2954 unsigned long flags;
2772 2955
2773 if (unlikely(!lock_stat && !prove_locking))
2774 return;
2775
2776 if (unlikely(current->lockdep_recursion)) 2956 if (unlikely(current->lockdep_recursion))
2777 return; 2957 return;
2778 2958
@@ -2845,11 +3025,11 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
2845found_it: 3025found_it:
2846 hlock->waittime_stamp = sched_clock(); 3026 hlock->waittime_stamp = sched_clock();
2847 3027
2848 point = lock_contention_point(hlock->class, ip); 3028 point = lock_contention_point(hlock_class(hlock), ip);
2849 3029
2850 stats = get_lock_stats(hlock->class); 3030 stats = get_lock_stats(hlock_class(hlock));
2851 if (point < ARRAY_SIZE(stats->contention_point)) 3031 if (point < ARRAY_SIZE(stats->contention_point))
2852 stats->contention_point[i]++; 3032 stats->contention_point[point]++;
2853 if (lock->cpu != smp_processor_id()) 3033 if (lock->cpu != smp_processor_id())
2854 stats->bounces[bounce_contended + !!hlock->read]++; 3034 stats->bounces[bounce_contended + !!hlock->read]++;
2855 put_lock_stats(stats); 3035 put_lock_stats(stats);
@@ -2893,7 +3073,7 @@ found_it:
2893 hlock->holdtime_stamp = now; 3073 hlock->holdtime_stamp = now;
2894 } 3074 }
2895 3075
2896 stats = get_lock_stats(hlock->class); 3076 stats = get_lock_stats(hlock_class(hlock));
2897 if (waittime) { 3077 if (waittime) {
2898 if (hlock->read) 3078 if (hlock->read)
2899 lock_time_inc(&stats->read_waittime, waittime); 3079 lock_time_inc(&stats->read_waittime, waittime);
@@ -2988,6 +3168,7 @@ static void zap_class(struct lock_class *class)
2988 list_del_rcu(&class->hash_entry); 3168 list_del_rcu(&class->hash_entry);
2989 list_del_rcu(&class->lock_entry); 3169 list_del_rcu(&class->lock_entry);
2990 3170
3171 class->key = NULL;
2991} 3172}
2992 3173
2993static inline int within(const void *addr, void *start, unsigned long size) 3174static inline int within(const void *addr, void *start, unsigned long size)
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index c3600a091a28..56b196932c08 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -17,9 +17,6 @@
17 */ 17 */
18#define MAX_LOCKDEP_ENTRIES 8192UL 18#define MAX_LOCKDEP_ENTRIES 8192UL
19 19
20#define MAX_LOCKDEP_KEYS_BITS 11
21#define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS)
22
23#define MAX_LOCKDEP_CHAINS_BITS 14 20#define MAX_LOCKDEP_CHAINS_BITS 14
24#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) 21#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
25 22
@@ -53,6 +50,22 @@ extern unsigned int nr_process_chains;
53extern unsigned int max_lockdep_depth; 50extern unsigned int max_lockdep_depth;
54extern unsigned int max_recursion_depth; 51extern unsigned int max_recursion_depth;
55 52
53#ifdef CONFIG_PROVE_LOCKING
54extern unsigned long lockdep_count_forward_deps(struct lock_class *);
55extern unsigned long lockdep_count_backward_deps(struct lock_class *);
56#else
57static inline unsigned long
58lockdep_count_forward_deps(struct lock_class *class)
59{
60 return 0;
61}
62static inline unsigned long
63lockdep_count_backward_deps(struct lock_class *class)
64{
65 return 0;
66}
67#endif
68
56#ifdef CONFIG_DEBUG_LOCKDEP 69#ifdef CONFIG_DEBUG_LOCKDEP
57/* 70/*
58 * Various lockdep statistics: 71 * Various lockdep statistics:
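The header change above is the usual kernel pattern for optional features: with CONFIG_PROVE_LOCKING the dependency counters are real externs, otherwise they become static inline stubs returning 0, so callers build in either configuration (lockdep_proc.c still #ifdefs the FD:/BD: output because the numbers are only meaningful when proving is on). A generic, compilable sketch of the pattern, with CONFIG_FEATURE_X and the function name as placeholders:

#include <stdio.h>

/* Build with -DCONFIG_FEATURE_X to get the real implementation. */
#ifdef CONFIG_FEATURE_X
static unsigned long feature_count_things(void)
{
    return 42;    /* stand-in for the real (expensive) computation */
}
#else
static inline unsigned long feature_count_things(void)
{
    return 0;     /* feature compiled out: cheap stub keeps callers unchanged */
}
#endif

int main(void)
{
    printf("count = %lu\n", feature_count_things());
    return 0;
}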
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 9b0e940e2545..20dbcbf9c7dd 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -63,34 +63,6 @@ static void l_stop(struct seq_file *m, void *v)
63{ 63{
64} 64}
65 65
66static unsigned long count_forward_deps(struct lock_class *class)
67{
68 struct lock_list *entry;
69 unsigned long ret = 1;
70
71 /*
72 * Recurse this class's dependency list:
73 */
74 list_for_each_entry(entry, &class->locks_after, entry)
75 ret += count_forward_deps(entry->class);
76
77 return ret;
78}
79
80static unsigned long count_backward_deps(struct lock_class *class)
81{
82 struct lock_list *entry;
83 unsigned long ret = 1;
84
85 /*
86 * Recurse this class's dependency list:
87 */
88 list_for_each_entry(entry, &class->locks_before, entry)
89 ret += count_backward_deps(entry->class);
90
91 return ret;
92}
93
94static void print_name(struct seq_file *m, struct lock_class *class) 66static void print_name(struct seq_file *m, struct lock_class *class)
95{ 67{
96 char str[128]; 68 char str[128];
@@ -110,7 +82,6 @@ static void print_name(struct seq_file *m, struct lock_class *class)
110 82
111static int l_show(struct seq_file *m, void *v) 83static int l_show(struct seq_file *m, void *v)
112{ 84{
113 unsigned long nr_forward_deps, nr_backward_deps;
114 struct lock_class *class = v; 85 struct lock_class *class = v;
115 struct lock_list *entry; 86 struct lock_list *entry;
116 char c1, c2, c3, c4; 87 char c1, c2, c3, c4;
@@ -124,11 +95,10 @@ static int l_show(struct seq_file *m, void *v)
124#ifdef CONFIG_DEBUG_LOCKDEP 95#ifdef CONFIG_DEBUG_LOCKDEP
125 seq_printf(m, " OPS:%8ld", class->ops); 96 seq_printf(m, " OPS:%8ld", class->ops);
126#endif 97#endif
127 nr_forward_deps = count_forward_deps(class); 98#ifdef CONFIG_PROVE_LOCKING
128 seq_printf(m, " FD:%5ld", nr_forward_deps); 99 seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class));
129 100 seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class));
130 nr_backward_deps = count_backward_deps(class); 101#endif
131 seq_printf(m, " BD:%5ld", nr_backward_deps);
132 102
133 get_usage_chars(class, &c1, &c2, &c3, &c4); 103 get_usage_chars(class, &c1, &c2, &c3, &c4);
134 seq_printf(m, " %c%c%c%c", c1, c2, c3, c4); 104 seq_printf(m, " %c%c%c%c", c1, c2, c3, c4);
@@ -229,6 +199,9 @@ static int lc_show(struct seq_file *m, void *v)
229 199
230 for (i = 0; i < chain->depth; i++) { 200 for (i = 0; i < chain->depth; i++) {
231 class = lock_chain_get_class(chain, i); 201 class = lock_chain_get_class(chain, i);
202 if (!class->key)
203 continue;
204
232 seq_printf(m, "[%p] ", class->key); 205 seq_printf(m, "[%p] ", class->key);
233 print_name(m, class); 206 print_name(m, class);
234 seq_puts(m, "\n"); 207 seq_puts(m, "\n");
@@ -350,7 +323,9 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
350 if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) 323 if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
351 nr_hardirq_read_unsafe++; 324 nr_hardirq_read_unsafe++;
352 325
353 sum_forward_deps += count_forward_deps(class); 326#ifdef CONFIG_PROVE_LOCKING
327 sum_forward_deps += lockdep_count_forward_deps(class);
328#endif
354 } 329 }
355#ifdef CONFIG_DEBUG_LOCKDEP 330#ifdef CONFIG_DEBUG_LOCKDEP
356 DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused); 331 DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused);
@@ -497,8 +472,9 @@ static void snprint_time(char *buf, size_t bufsiz, s64 nr)
497{ 472{
498 unsigned long rem; 473 unsigned long rem;
499 474
475 nr += 5; /* for display rounding */
500 rem = do_div(nr, 1000); /* XXX: do_div_signed */ 476 rem = do_div(nr, 1000); /* XXX: do_div_signed */
501 snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, ((int)rem+5)/10); 477 snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, (int)rem/10);
502} 478}
503 479
504static void seq_time(struct seq_file *m, s64 time) 480static void seq_time(struct seq_file *m, s64 time)
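The snprint_time() fix above moves the rounding before the division: adding 5 to the raw value lets the carry propagate into the integer part, whereas the old code rounded only the remainder and could emit a three-digit ".100" fraction instead of carrying into the next whole unit. A quick userspace check of both variants (plain % and / instead of do_div, since this is only a sketch):

#include <stdio.h>

static void old_fmt(char *buf, size_t n, long long nr)
{
    int rem = (int)(nr % 1000);

    snprintf(buf, n, "%lld.%02d", nr / 1000, (rem + 5) / 10);
}

static void new_fmt(char *buf, size_t n, long long nr)
{
    int rem;

    nr += 5;                     /* round before dividing */
    rem = (int)(nr % 1000);
    snprintf(buf, n, "%lld.%02d", nr / 1000, rem / 10);
}

int main(void)
{
    char a[32], b[32];
    long long nr = 1995;         /* 1.995 units, should round to 2.00 */

    old_fmt(a, sizeof(a), nr);
    new_fmt(b, sizeof(b), nr);
    printf("old: %s   new: %s\n", a, b);    /* old: 1.100   new: 2.00 */
    return 0;
}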
diff --git a/kernel/marker.c b/kernel/marker.c
index 971da5317903..7d1faecd7a51 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -126,6 +126,11 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
126 struct marker_probe_closure *multi; 126 struct marker_probe_closure *multi;
127 int i; 127 int i;
128 /* 128 /*
129 * Read mdata->ptype before mdata->multi.
130 */
131 smp_rmb();
132 multi = mdata->multi;
133 /*
129 * multi points to an array, therefore accessing the array 134 * multi points to an array, therefore accessing the array
130 * depends on reading multi. However, even in this case, 135 * depends on reading multi. However, even in this case,
131 * we must ensure that the pointer is read _before_ the array 136 * we must ensure that the pointer is read _before_ the array
@@ -133,7 +138,6 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
133 * in the fast path, so put the explicit barrier here. 138 * in the fast path, so put the explicit barrier here.
134 */ 139 */
135 smp_read_barrier_depends(); 140 smp_read_barrier_depends();
136 multi = mdata->multi;
137 for (i = 0; multi[i].func; i++) { 141 for (i = 0; multi[i].func; i++) {
138 va_start(args, call_private); 142 va_start(args, call_private);
139 multi[i].func(multi[i].probe_private, call_private, 143 multi[i].func(multi[i].probe_private, call_private,
@@ -175,6 +179,11 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
175 struct marker_probe_closure *multi; 179 struct marker_probe_closure *multi;
176 int i; 180 int i;
177 /* 181 /*
182 * Read mdata->ptype before mdata->multi.
183 */
184 smp_rmb();
185 multi = mdata->multi;
186 /*
178 * multi points to an array, therefore accessing the array 187 * multi points to an array, therefore accessing the array
179 * depends on reading multi. However, even in this case, 188 * depends on reading multi. However, even in this case,
180 * we must ensure that the pointer is read _before_ the array 189 * we must ensure that the pointer is read _before_ the array
@@ -182,7 +191,6 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
182 * in the fast path, so put the explicit barrier here. 191 * in the fast path, so put the explicit barrier here.
183 */ 192 */
184 smp_read_barrier_depends(); 193 smp_read_barrier_depends();
185 multi = mdata->multi;
186 for (i = 0; multi[i].func; i++) 194 for (i = 0; multi[i].func; i++)
187 multi[i].func(multi[i].probe_private, call_private, 195 multi[i].func(multi[i].probe_private, call_private,
188 mdata->format, &args); 196 mdata->format, &args);
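Both marker hunks above add an smp_rmb() so the probe array pointer is read only after mdata->ptype (tested just before this code, outside the visible context) has been observed, pairing with the update side that fills in the array before flipping ptype. The closest portable analogy is release/acquire ordering with C11 atomics; a hedged userspace sketch, paraphrasing rather than copying the marker internals:

#include <stdatomic.h>
#include <stdio.h>

struct probe { const char *name; };

static struct probe multi_array[3];
static struct probe *multi;        /* published array */
static atomic_int ptype;           /* 0 = single probe, 1 = multi */

static void writer(void)
{
    multi_array[0].name = "probe-a";
    multi_array[1].name = "probe-b";
    multi = multi_array;
    /* Publish: everything written above must be visible before ptype. */
    atomic_store_explicit(&ptype, 1, memory_order_release);
}

static void reader(void)
{
    /* Read ptype before multi: acquire pairs with the release above. */
    if (atomic_load_explicit(&ptype, memory_order_acquire) == 1) {
        struct probe *p = multi;

        for (int i = 0; p[i].name; i++)
            printf("calling %s\n", p[i].name);
    }
}

int main(void)
{
    writer();
    reader();
    return 0;
}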
diff --git a/kernel/module.c b/kernel/module.c
index d8b5605132a0..9db11911e04b 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -325,18 +325,6 @@ static unsigned long find_symbol(const char *name,
325 return -ENOENT; 325 return -ENOENT;
326} 326}
327 327
328/* lookup symbol in given range of kernel_symbols */
329static const struct kernel_symbol *lookup_symbol(const char *name,
330 const struct kernel_symbol *start,
331 const struct kernel_symbol *stop)
332{
333 const struct kernel_symbol *ks = start;
334 for (; ks < stop; ks++)
335 if (strcmp(ks->name, name) == 0)
336 return ks;
337 return NULL;
338}
339
340/* Search for module by name: must hold module_mutex. */ 328/* Search for module by name: must hold module_mutex. */
341static struct module *find_module(const char *name) 329static struct module *find_module(const char *name)
342{ 330{
@@ -690,7 +678,7 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
690 if (flags & O_NONBLOCK) { 678 if (flags & O_NONBLOCK) {
691 struct stopref sref = { mod, flags, forced }; 679 struct stopref sref = { mod, flags, forced };
692 680
693 return stop_machine_run(__try_stop_module, &sref, NR_CPUS); 681 return stop_machine(__try_stop_module, &sref, NULL);
694 } else { 682 } else {
695 /* We don't need to stop the machine for this. */ 683 /* We don't need to stop the machine for this. */
696 mod->state = MODULE_STATE_GOING; 684 mod->state = MODULE_STATE_GOING;
@@ -1428,7 +1416,7 @@ static int __unlink_module(void *_mod)
1428static void free_module(struct module *mod) 1416static void free_module(struct module *mod)
1429{ 1417{
1430 /* Delete from various lists */ 1418 /* Delete from various lists */
1431 stop_machine_run(__unlink_module, mod, NR_CPUS); 1419 stop_machine(__unlink_module, mod, NULL);
1432 remove_notes_attrs(mod); 1420 remove_notes_attrs(mod);
1433 remove_sect_attrs(mod); 1421 remove_sect_attrs(mod);
1434 mod_kobject_remove(mod); 1422 mod_kobject_remove(mod);
@@ -1703,6 +1691,19 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
1703} 1691}
1704 1692
1705#ifdef CONFIG_KALLSYMS 1693#ifdef CONFIG_KALLSYMS
1694
1695/* lookup symbol in given range of kernel_symbols */
1696static const struct kernel_symbol *lookup_symbol(const char *name,
1697 const struct kernel_symbol *start,
1698 const struct kernel_symbol *stop)
1699{
1700 const struct kernel_symbol *ks = start;
1701 for (; ks < stop; ks++)
1702 if (strcmp(ks->name, name) == 0)
1703 return ks;
1704 return NULL;
1705}
1706
1706static int is_exported(const char *name, const struct module *mod) 1707static int is_exported(const char *name, const struct module *mod)
1707{ 1708{
1708 if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) 1709 if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab))
@@ -1798,7 +1799,7 @@ static void *module_alloc_update_bounds(unsigned long size)
1798 1799
1799/* Allocate and load the module: note that size of section 0 is always 1800/* Allocate and load the module: note that size of section 0 is always
1800 zero, and we rely on this for optional sections. */ 1801 zero, and we rely on this for optional sections. */
1801static struct module *load_module(void __user *umod, 1802static noinline struct module *load_module(void __user *umod,
1802 unsigned long len, 1803 unsigned long len,
1803 const char __user *uargs) 1804 const char __user *uargs)
1804{ 1805{
@@ -2196,7 +2197,7 @@ static struct module *load_module(void __user *umod,
2196 /* Now sew it into the lists so we can get lockdep and oops 2197 /* Now sew it into the lists so we can get lockdep and oops
2197 * info during argument parsing. No one should access us, since 2198 * info during argument parsing. No one should access us, since
2198 * strong_try_module_get() will fail. */ 2199 * strong_try_module_get() will fail. */
2199 stop_machine_run(__link_module, mod, NR_CPUS); 2200 stop_machine(__link_module, mod, NULL);
2200 2201
2201 /* Size of section 0 is 0, so this works well if no params */ 2202 /* Size of section 0 is 0, so this works well if no params */
2202 err = parse_args(mod->name, mod->args, 2203 err = parse_args(mod->name, mod->args,
@@ -2230,7 +2231,7 @@ static struct module *load_module(void __user *umod,
2230 return mod; 2231 return mod;
2231 2232
2232 unlink: 2233 unlink:
2233 stop_machine_run(__unlink_module, mod, NR_CPUS); 2234 stop_machine(__unlink_module, mod, NULL);
2234 module_arch_cleanup(mod); 2235 module_arch_cleanup(mod);
2235 cleanup: 2236 cleanup:
2236 kobject_del(&mod->mkobj.kobj); 2237 kobject_del(&mod->mkobj.kobj);
@@ -2287,7 +2288,7 @@ sys_init_module(void __user *umod,
2287 2288
2288 /* Start the module */ 2289 /* Start the module */
2289 if (mod->init != NULL) 2290 if (mod->init != NULL)
2290 ret = mod->init(); 2291 ret = do_one_initcall(mod->init);
2291 if (ret < 0) { 2292 if (ret < 0) {
2292 /* Init routine failed: abort. Try to protect us from 2293 /* Init routine failed: abort. Try to protect us from
2293 buggy refcounters. */ 2294 buggy refcounters. */
diff --git a/kernel/mutex.c b/kernel/mutex.c
index bcdc9ac8ef60..12c779dc65d4 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -34,6 +34,7 @@
34/*** 34/***
35 * mutex_init - initialize the mutex 35 * mutex_init - initialize the mutex
36 * @lock: the mutex to be initialized 36 * @lock: the mutex to be initialized
37 * @key: the lock_class_key for the class; used by mutex lock debugging
37 * 38 *
38 * Initialize the mutex to unlocked state. 39 * Initialize the mutex to unlocked state.
39 * 40 *
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 21575fc46d05..1d3ef29a2583 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -14,7 +14,6 @@
14 */ 14 */
15 15
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/version.h>
18#include <linux/nsproxy.h> 17#include <linux/nsproxy.h>
19#include <linux/init_task.h> 18#include <linux/init_task.h>
20#include <linux/mnt_namespace.h> 19#include <linux/mnt_namespace.h>
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index ea567b78d1aa..fab8ea86fac3 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -179,9 +179,6 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
179 rc = sys_wait4(-1, NULL, __WALL, NULL); 179 rc = sys_wait4(-1, NULL, __WALL, NULL);
180 } while (rc != -ECHILD); 180 } while (rc != -ECHILD);
181 181
182
183 /* Child reaper for the pid namespace is going away */
184 pid_ns->child_reaper = NULL;
185 acct_exit_ns(pid_ns); 182 acct_exit_ns(pid_ns);
186 return; 183 return;
187} 184}
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 8cb757026386..dfdec524d1b7 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -24,7 +24,7 @@
24 * requirement that the application has is cleaned up when it closes the file 24 * requirement that the application has is cleaned up when it closes the file
25 * pointer or exits the pm_qos_object will get an opportunity to clean up. 25 * pointer or exits the pm_qos_object will get an opportunity to clean up.
26 * 26 *
27 * mark gross mgross@linux.intel.com 27 * Mark Gross <mgross@linux.intel.com>
28 */ 28 */
29 29
30#include <linux/pm_qos_params.h> 30#include <linux/pm_qos_params.h>
@@ -43,7 +43,7 @@
43#include <linux/uaccess.h> 43#include <linux/uaccess.h>
44 44
45/* 45/*
46 * locking rule: all changes to target_value or requirements or notifiers lists 46 * locking rule: all changes to requirements or notifiers lists
47 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock 47 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
48 * held, taken with _irqsave. One lock to rule them all 48 * held, taken with _irqsave. One lock to rule them all
49 */ 49 */
@@ -66,7 +66,7 @@ struct pm_qos_object {
66 struct miscdevice pm_qos_power_miscdev; 66 struct miscdevice pm_qos_power_miscdev;
67 char *name; 67 char *name;
68 s32 default_value; 68 s32 default_value;
69 s32 target_value; 69 atomic_t target_value;
70 s32 (*comparitor)(s32, s32); 70 s32 (*comparitor)(s32, s32);
71}; 71};
72 72
@@ -77,7 +77,7 @@ static struct pm_qos_object cpu_dma_pm_qos = {
77 .notifiers = &cpu_dma_lat_notifier, 77 .notifiers = &cpu_dma_lat_notifier,
78 .name = "cpu_dma_latency", 78 .name = "cpu_dma_latency",
79 .default_value = 2000 * USEC_PER_SEC, 79 .default_value = 2000 * USEC_PER_SEC,
80 .target_value = 2000 * USEC_PER_SEC, 80 .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC),
81 .comparitor = min_compare 81 .comparitor = min_compare
82}; 82};
83 83
@@ -87,7 +87,7 @@ static struct pm_qos_object network_lat_pm_qos = {
87 .notifiers = &network_lat_notifier, 87 .notifiers = &network_lat_notifier,
88 .name = "network_latency", 88 .name = "network_latency",
89 .default_value = 2000 * USEC_PER_SEC, 89 .default_value = 2000 * USEC_PER_SEC,
90 .target_value = 2000 * USEC_PER_SEC, 90 .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC),
91 .comparitor = min_compare 91 .comparitor = min_compare
92}; 92};
93 93
@@ -99,7 +99,7 @@ static struct pm_qos_object network_throughput_pm_qos = {
99 .notifiers = &network_throughput_notifier, 99 .notifiers = &network_throughput_notifier,
100 .name = "network_throughput", 100 .name = "network_throughput",
101 .default_value = 0, 101 .default_value = 0,
102 .target_value = 0, 102 .target_value = ATOMIC_INIT(0),
103 .comparitor = max_compare 103 .comparitor = max_compare
104}; 104};
105 105
@@ -150,11 +150,11 @@ static void update_target(int target)
150 extreme_value = pm_qos_array[target]->comparitor( 150 extreme_value = pm_qos_array[target]->comparitor(
151 extreme_value, node->value); 151 extreme_value, node->value);
152 } 152 }
153 if (pm_qos_array[target]->target_value != extreme_value) { 153 if (atomic_read(&pm_qos_array[target]->target_value) != extreme_value) {
154 call_notifier = 1; 154 call_notifier = 1;
155 pm_qos_array[target]->target_value = extreme_value; 155 atomic_set(&pm_qos_array[target]->target_value, extreme_value);
156 pr_debug(KERN_ERR "new target for qos %d is %d\n", target, 156 pr_debug(KERN_ERR "new target for qos %d is %d\n", target,
157 pm_qos_array[target]->target_value); 157 atomic_read(&pm_qos_array[target]->target_value));
158 } 158 }
159 spin_unlock_irqrestore(&pm_qos_lock, flags); 159 spin_unlock_irqrestore(&pm_qos_lock, flags);
160 160
@@ -193,14 +193,7 @@ static int find_pm_qos_object_by_minor(int minor)
193 */ 193 */
194int pm_qos_requirement(int pm_qos_class) 194int pm_qos_requirement(int pm_qos_class)
195{ 195{
196 int ret_val; 196 return atomic_read(&pm_qos_array[pm_qos_class]->target_value);
197 unsigned long flags;
198
199 spin_lock_irqsave(&pm_qos_lock, flags);
200 ret_val = pm_qos_array[pm_qos_class]->target_value;
201 spin_unlock_irqrestore(&pm_qos_lock, flags);
202
203 return ret_val;
204} 197}
205EXPORT_SYMBOL_GPL(pm_qos_requirement); 198EXPORT_SYMBOL_GPL(pm_qos_requirement);
206 199
@@ -211,8 +204,8 @@ EXPORT_SYMBOL_GPL(pm_qos_requirement);
211 * @value: defines the qos request 204 * @value: defines the qos request
212 * 205 *
213 * This function inserts a new entry in the pm_qos_class list of requested qos 206 * This function inserts a new entry in the pm_qos_class list of requested qos
214 * performance charactoistics. It recomputes the agregate QoS expectations for 207 * performance characteristics. It recomputes the aggregate QoS expectations
215 * the pm_qos_class of parrameters. 208 * for the pm_qos_class of parameters.
216 */ 209 */
217int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value) 210int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value)
218{ 211{
@@ -250,10 +243,10 @@ EXPORT_SYMBOL_GPL(pm_qos_add_requirement);
250 * @name: identifies the request 243 * @name: identifies the request
251 * @value: defines the qos request 244 * @value: defines the qos request
252 * 245 *
253 * Updates an existing qos requierement for the pm_qos_class of parameters along 246 * Updates an existing qos requirement for the pm_qos_class of parameters along
254 * with updating the target pm_qos_class value. 247 * with updating the target pm_qos_class value.
255 * 248 *
256 * If the named request isn't in the lest then no change is made. 249 * If the named request isn't in the list then no change is made.
257 */ 250 */
258int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value) 251int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value)
259{ 252{
@@ -287,7 +280,7 @@ EXPORT_SYMBOL_GPL(pm_qos_update_requirement);
287 * @pm_qos_class: identifies which list of qos request to us 280 * @pm_qos_class: identifies which list of qos request to us
288 * @name: identifies the request 281 * @name: identifies the request
289 * 282 *
290 * Will remove named qos request from pm_qos_class list of parrameters and 283 * Will remove named qos request from pm_qos_class list of parameters and
291 * recompute the current target value for the pm_qos_class. 284 * recompute the current target value for the pm_qos_class.
292 */ 285 */
293void pm_qos_remove_requirement(int pm_qos_class, char *name) 286void pm_qos_remove_requirement(int pm_qos_class, char *name)
@@ -319,7 +312,7 @@ EXPORT_SYMBOL_GPL(pm_qos_remove_requirement);
319 * @notifier: notifier block managed by caller. 312 * @notifier: notifier block managed by caller.
320 * 313 *
321 * will register the notifier into a notification chain that gets called 314 * will register the notifier into a notification chain that gets called
322 * uppon changes to the pm_qos_class target value. 315 * upon changes to the pm_qos_class target value.
323 */ 316 */
324 int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) 317 int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
325{ 318{
@@ -338,7 +331,7 @@ EXPORT_SYMBOL_GPL(pm_qos_add_notifier);
338 * @notifier: notifier block to be removed. 331 * @notifier: notifier block to be removed.
339 * 332 *
340 * will remove the notifier from the notification chain that gets called 333 * will remove the notifier from the notification chain that gets called
341 * uppon changes to the pm_qos_class target value. 334 * upon changes to the pm_qos_class target value.
342 */ 335 */
343int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) 336int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
344{ 337{
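Converting target_value from a plain s32 to atomic_t lets pm_qos_requirement() return the current target with a single atomic_read() instead of taking pm_qos_lock with interrupts disabled; the writers in update_target() still serialize on the lock. The same read-mostly shape, shown with C11 atomics purely for illustration rather than the kernel's atomic_t API:

        #include <stdatomic.h>
        #include <pthread.h>

        static pthread_mutex_t qos_lock = PTHREAD_MUTEX_INITIALIZER;
        static atomic_int target_value;   /* stands in for pm_qos target_value */

        /* writer: still serialized by the lock, publishes with an atomic store */
        static void update_target(int extreme_value)
        {
                pthread_mutex_lock(&qos_lock);
                if (atomic_load(&target_value) != extreme_value)
                        atomic_store(&target_value, extreme_value);
                pthread_mutex_unlock(&qos_lock);
        }

        /* reader: no lock needed to read a single coherent word */
        static int read_requirement(void)
        {
                return atomic_load(&target_value);
        }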
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 9a21681aa80f..5131e5471169 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -289,21 +289,29 @@ void do_schedule_next_timer(struct siginfo *info)
289 else 289 else
290 schedule_next_timer(timr); 290 schedule_next_timer(timr);
291 291
292 info->si_overrun = timr->it_overrun_last; 292 info->si_overrun += timr->it_overrun_last;
293 } 293 }
294 294
295 if (timr) 295 if (timr)
296 unlock_timer(timr, flags); 296 unlock_timer(timr, flags);
297} 297}
298 298
299int posix_timer_event(struct k_itimer *timr,int si_private) 299int posix_timer_event(struct k_itimer *timr, int si_private)
300{ 300{
301 memset(&timr->sigq->info, 0, sizeof(siginfo_t)); 301 /*
302 * FIXME: if ->sigq is queued we can race with
303 * dequeue_signal()->do_schedule_next_timer().
304 *
305 * If dequeue_signal() sees the "right" value of
306 * si_sys_private it calls do_schedule_next_timer().
307 * We re-queue ->sigq and drop ->it_lock().
308 * do_schedule_next_timer() locks the timer
309 * and re-schedules it while ->sigq is pending.
310 * Not really bad, but not that we want.
311 */
302 timr->sigq->info.si_sys_private = si_private; 312 timr->sigq->info.si_sys_private = si_private;
303 /* Send signal to the process that owns this timer.*/
304 313
305 timr->sigq->info.si_signo = timr->it_sigev_signo; 314 timr->sigq->info.si_signo = timr->it_sigev_signo;
306 timr->sigq->info.si_errno = 0;
307 timr->sigq->info.si_code = SI_TIMER; 315 timr->sigq->info.si_code = SI_TIMER;
308 timr->sigq->info.si_tid = timr->it_id; 316 timr->sigq->info.si_tid = timr->it_id;
309 timr->sigq->info.si_value = timr->it_sigev_value; 317 timr->sigq->info.si_value = timr->it_sigev_value;
@@ -433,8 +441,9 @@ static struct k_itimer * alloc_posix_timer(void)
433 return tmr; 441 return tmr;
434 if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { 442 if (unlikely(!(tmr->sigq = sigqueue_alloc()))) {
435 kmem_cache_free(posix_timers_cache, tmr); 443 kmem_cache_free(posix_timers_cache, tmr);
436 tmr = NULL; 444 return NULL;
437 } 445 }
446 memset(&tmr->sigq->info, 0, sizeof(siginfo_t));
438 return tmr; 447 return tmr;
439} 448}
440 449
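Two small but deliberate changes above: allocation failure now returns NULL immediately instead of falling through with tmr set to NULL, and the siginfo embedded in the sigqueue is zeroed once at allocation time rather than on every posix_timer_event() (which in turn lets si_overrun accumulate with += across re-queues). The allocation path, restated as a standalone sketch:

        /* Sketch of the post-patch allocation path: unwind the partial object
         * on failure and zero the embedded siginfo exactly once. */
        static struct k_itimer *alloc_timer_sketch(void)
        {
                struct k_itimer *tmr;

                tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL);
                if (!tmr)
                        return NULL;
                tmr->sigq = sigqueue_alloc();
                if (!tmr->sigq) {
                        kmem_cache_free(posix_timers_cache, tmr);
                        return NULL;
                }
                memset(&tmr->sigq->info, 0, sizeof(siginfo_t));
                return tmr;
        }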
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index f011e0870b52..bbd85c60f741 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -21,6 +21,7 @@
21#include <linux/console.h> 21#include <linux/console.h>
22#include <linux/cpu.h> 22#include <linux/cpu.h>
23#include <linux/freezer.h> 23#include <linux/freezer.h>
24#include <linux/ftrace.h>
24 25
25#include "power.h" 26#include "power.h"
26 27
@@ -255,7 +256,7 @@ static int create_image(int platform_mode)
255 256
256int hibernation_snapshot(int platform_mode) 257int hibernation_snapshot(int platform_mode)
257{ 258{
258 int error; 259 int error, ftrace_save;
259 260
260 /* Free memory before shutting down devices. */ 261 /* Free memory before shutting down devices. */
261 error = swsusp_shrink_memory(); 262 error = swsusp_shrink_memory();
@@ -267,6 +268,7 @@ int hibernation_snapshot(int platform_mode)
267 goto Close; 268 goto Close;
268 269
269 suspend_console(); 270 suspend_console();
271 ftrace_save = __ftrace_enabled_save();
270 error = device_suspend(PMSG_FREEZE); 272 error = device_suspend(PMSG_FREEZE);
271 if (error) 273 if (error)
272 goto Recover_platform; 274 goto Recover_platform;
@@ -296,6 +298,7 @@ int hibernation_snapshot(int platform_mode)
296 Resume_devices: 298 Resume_devices:
297 device_resume(in_suspend ? 299 device_resume(in_suspend ?
298 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 300 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
301 __ftrace_enabled_restore(ftrace_save);
299 resume_console(); 302 resume_console();
300 Close: 303 Close:
301 platform_end(platform_mode); 304 platform_end(platform_mode);
@@ -366,10 +369,11 @@ static int resume_target_kernel(void)
366 369
367int hibernation_restore(int platform_mode) 370int hibernation_restore(int platform_mode)
368{ 371{
369 int error; 372 int error, ftrace_save;
370 373
371 pm_prepare_console(); 374 pm_prepare_console();
372 suspend_console(); 375 suspend_console();
376 ftrace_save = __ftrace_enabled_save();
373 error = device_suspend(PMSG_QUIESCE); 377 error = device_suspend(PMSG_QUIESCE);
374 if (error) 378 if (error)
375 goto Finish; 379 goto Finish;
@@ -384,6 +388,7 @@ int hibernation_restore(int platform_mode)
384 platform_restore_cleanup(platform_mode); 388 platform_restore_cleanup(platform_mode);
385 device_resume(PMSG_RECOVER); 389 device_resume(PMSG_RECOVER);
386 Finish: 390 Finish:
391 __ftrace_enabled_restore(ftrace_save);
387 resume_console(); 392 resume_console();
388 pm_restore_console(); 393 pm_restore_console();
389 return error; 394 return error;
@@ -396,7 +401,7 @@ int hibernation_restore(int platform_mode)
396 401
397int hibernation_platform_enter(void) 402int hibernation_platform_enter(void)
398{ 403{
399 int error; 404 int error, ftrace_save;
400 405
401 if (!hibernation_ops) 406 if (!hibernation_ops)
402 return -ENOSYS; 407 return -ENOSYS;
@@ -411,6 +416,7 @@ int hibernation_platform_enter(void)
411 goto Close; 416 goto Close;
412 417
413 suspend_console(); 418 suspend_console();
419 ftrace_save = __ftrace_enabled_save();
414 error = device_suspend(PMSG_HIBERNATE); 420 error = device_suspend(PMSG_HIBERNATE);
415 if (error) { 421 if (error) {
416 if (hibernation_ops->recover) 422 if (hibernation_ops->recover)
@@ -445,6 +451,7 @@ int hibernation_platform_enter(void)
445 hibernation_ops->finish(); 451 hibernation_ops->finish();
446 Resume_devices: 452 Resume_devices:
447 device_resume(PMSG_RESTORE); 453 device_resume(PMSG_RESTORE);
454 __ftrace_enabled_restore(ftrace_save);
448 resume_console(); 455 resume_console();
449 Close: 456 Close:
450 hibernation_ops->end(); 457 hibernation_ops->end();
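All three hibernation paths gain the same bracketing: save the ftrace state right after suspending the console and restore it symmetrically on every exit path, so function tracing cannot fire while devices are quiesced. Distilled into one function (a sketch of the pattern, not any single routine above; enter_low_power_state() is a hypothetical placeholder):

        extern int enter_low_power_state(void);   /* hypothetical step */

        /* Sketch: the save/restore pair must bracket device_suspend() /
         * device_resume() on every path, including the error path. */
        static int pm_transition_sketch(void)
        {
                int error, ftrace_save;

                suspend_console();
                ftrace_save = __ftrace_enabled_save();

                error = device_suspend(PMSG_FREEZE);
                if (!error) {
                        error = enter_low_power_state();
                        device_resume(error ? PMSG_RECOVER : PMSG_THAW);
                }

                __ftrace_enabled_restore(ftrace_save);
                resume_console();
                return error;
        }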
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 95bff23ecdaa..540b16b68565 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -21,6 +21,7 @@
21#include <linux/freezer.h> 21#include <linux/freezer.h>
22#include <linux/vmstat.h> 22#include <linux/vmstat.h>
23#include <linux/syscalls.h> 23#include <linux/syscalls.h>
24#include <linux/ftrace.h>
24 25
25#include "power.h" 26#include "power.h"
26 27
@@ -310,7 +311,7 @@ static int suspend_enter(suspend_state_t state)
310 */ 311 */
311int suspend_devices_and_enter(suspend_state_t state) 312int suspend_devices_and_enter(suspend_state_t state)
312{ 313{
313 int error; 314 int error, ftrace_save;
314 315
315 if (!suspend_ops) 316 if (!suspend_ops)
316 return -ENOSYS; 317 return -ENOSYS;
@@ -321,6 +322,7 @@ int suspend_devices_and_enter(suspend_state_t state)
321 goto Close; 322 goto Close;
322 } 323 }
323 suspend_console(); 324 suspend_console();
325 ftrace_save = __ftrace_enabled_save();
324 suspend_test_start(); 326 suspend_test_start();
325 error = device_suspend(PMSG_SUSPEND); 327 error = device_suspend(PMSG_SUSPEND);
326 if (error) { 328 if (error) {
@@ -352,6 +354,7 @@ int suspend_devices_and_enter(suspend_state_t state)
352 suspend_test_start(); 354 suspend_test_start();
353 device_resume(PMSG_RESUME); 355 device_resume(PMSG_RESUME);
354 suspend_test_finish("resume devices"); 356 suspend_test_finish("resume devices");
357 __ftrace_enabled_restore(ftrace_save);
355 resume_console(); 358 resume_console();
356 Close: 359 Close:
357 if (suspend_ops->end) 360 if (suspend_ops->end)
@@ -635,6 +638,13 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
635 } 638 }
636 if (status < 0) 639 if (status < 0)
637 printk(err_suspend, status); 640 printk(err_suspend, status);
641
642 /* Some platforms can't detect that the alarm triggered the
643 * wakeup, or (accordingly) disable it after it afterwards.
644 * It's supposed to give oneshot behavior; cope.
645 */
646 alm.enabled = false;
647 rtc_set_alarm(rtc, &alm);
638} 648}
639 649
640static int __init has_wakealarm(struct device *dev, void *name_ptr) 650static int __init has_wakealarm(struct device *dev, void *name_ptr)
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 700f44ec8406..acc0c101dbd5 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -53,8 +53,6 @@ extern int hibernation_platform_enter(void);
53 53
54extern int pfn_is_nosave(unsigned long); 54extern int pfn_is_nosave(unsigned long);
55 55
56extern struct mutex pm_mutex;
57
58#define power_attr(_name) \ 56#define power_attr(_name) \
59static struct kobj_attribute _name##_attr = { \ 57static struct kobj_attribute _name##_attr = { \
60 .attr = { \ 58 .attr = { \
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index a0abf9a463f9..80ccac849e46 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -14,7 +14,6 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/file.h> 15#include <linux/file.h>
16#include <linux/utsname.h> 16#include <linux/utsname.h>
17#include <linux/version.h>
18#include <linux/delay.h> 17#include <linux/delay.h>
19#include <linux/bitops.h> 18#include <linux/bitops.h>
20#include <linux/genhd.h> 19#include <linux/genhd.h>
diff --git a/kernel/printk.c b/kernel/printk.c
index a7f7559c5f6c..b51b1567bb55 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1309,14 +1309,14 @@ void tty_write_message(struct tty_struct *tty, char *msg)
1309 1309
1310#if defined CONFIG_PRINTK 1310#if defined CONFIG_PRINTK
1311 1311
1312DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10);
1313/* 1312/*
1314 * printk rate limiting, lifted from the networking subsystem. 1313 * printk rate limiting, lifted from the networking subsystem.
1315 * 1314 *
1316 * This enforces a rate limit: not more than one kernel message 1315 * This enforces a rate limit: not more than 10 kernel messages
1317 * every printk_ratelimit_jiffies to make a denial-of-service 1316 * every 5s to make a denial-of-service attack impossible.
1318 * attack impossible.
1319 */ 1317 */
1318DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10);
1319
1320int printk_ratelimit(void) 1320int printk_ratelimit(void)
1321{ 1321{
1322 return __ratelimit(&printk_ratelimit_state); 1322 return __ratelimit(&printk_ratelimit_state);
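With the state declared through DEFINE_RATELIMIT_STATE and printk_ratelimit() reduced to a call into the generic __ratelimit() helper, callers keep the familiar pattern: gate a potentially noisy message and let the shared state enforce the 10-messages-per-5-seconds budget. A typical caller-side sketch (driver name and message invented):

        /* Caller-side sketch: a nonzero return means "you may print now";
         * messages beyond the configured burst are silently dropped. */
        static void warn_queue_full(void)
        {
                if (printk_ratelimit())
                        printk(KERN_WARNING "exampledrv: dropping packet, queue full\n");
        }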
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 8392a9da6450..356699a96d56 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -107,7 +107,7 @@ int ptrace_check_attach(struct task_struct *child, int kill)
107 read_unlock(&tasklist_lock); 107 read_unlock(&tasklist_lock);
108 108
109 if (!ret && !kill) 109 if (!ret && !kill)
110 wait_task_inactive(child); 110 ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH;
111 111
112 /* All systems go.. */ 112 /* All systems go.. */
113 return ret; 113 return ret;
@@ -140,7 +140,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
140 if (!dumpable && !capable(CAP_SYS_PTRACE)) 140 if (!dumpable && !capable(CAP_SYS_PTRACE))
141 return -EPERM; 141 return -EPERM;
142 142
143 return security_ptrace(current, task, mode); 143 return security_ptrace_may_access(task, mode);
144} 144}
145 145
146bool ptrace_may_access(struct task_struct *task, unsigned int mode) 146bool ptrace_may_access(struct task_struct *task, unsigned int mode)
@@ -499,8 +499,7 @@ repeat:
499 goto repeat; 499 goto repeat;
500 } 500 }
501 501
502 ret = security_ptrace(current->parent, current, 502 ret = security_ptrace_traceme(current->parent);
503 PTRACE_MODE_ATTACH);
504 503
505 /* 504 /*
506 * Set the ptrace bit in the process ptrace flags. 505 * Set the ptrace bit in the process ptrace flags.
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 6f8696c502f4..37f72e551542 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -47,6 +47,7 @@
47#include <linux/notifier.h> 47#include <linux/notifier.h>
48#include <linux/cpu.h> 48#include <linux/cpu.h>
49#include <linux/mutex.h> 49#include <linux/mutex.h>
50#include <linux/time.h>
50 51
51#ifdef CONFIG_DEBUG_LOCK_ALLOC 52#ifdef CONFIG_DEBUG_LOCK_ALLOC
52static struct lock_class_key rcu_lock_key; 53static struct lock_class_key rcu_lock_key;
@@ -60,12 +61,14 @@ EXPORT_SYMBOL_GPL(rcu_lock_map);
60static struct rcu_ctrlblk rcu_ctrlblk = { 61static struct rcu_ctrlblk rcu_ctrlblk = {
61 .cur = -300, 62 .cur = -300,
62 .completed = -300, 63 .completed = -300,
64 .pending = -300,
63 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), 65 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
64 .cpumask = CPU_MASK_NONE, 66 .cpumask = CPU_MASK_NONE,
65}; 67};
66static struct rcu_ctrlblk rcu_bh_ctrlblk = { 68static struct rcu_ctrlblk rcu_bh_ctrlblk = {
67 .cur = -300, 69 .cur = -300,
68 .completed = -300, 70 .completed = -300,
71 .pending = -300,
69 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), 72 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
70 .cpumask = CPU_MASK_NONE, 73 .cpumask = CPU_MASK_NONE,
71}; 74};
@@ -83,7 +86,10 @@ static void force_quiescent_state(struct rcu_data *rdp,
83{ 86{
84 int cpu; 87 int cpu;
85 cpumask_t cpumask; 88 cpumask_t cpumask;
89 unsigned long flags;
90
86 set_need_resched(); 91 set_need_resched();
92 spin_lock_irqsave(&rcp->lock, flags);
87 if (unlikely(!rcp->signaled)) { 93 if (unlikely(!rcp->signaled)) {
88 rcp->signaled = 1; 94 rcp->signaled = 1;
89 /* 95 /*
@@ -91,8 +97,8 @@ static void force_quiescent_state(struct rcu_data *rdp,
91 * rdp->cpu is the current cpu. 97 * rdp->cpu is the current cpu.
92 * 98 *
93 * cpu_online_map is updated by the _cpu_down() 99 * cpu_online_map is updated by the _cpu_down()
94 * using stop_machine_run(). Since we're in irqs disabled 100 * using __stop_machine(). Since we're in irqs disabled
95 * section, stop_machine_run() is not exectuting, hence 101 * section, __stop_machine() is not exectuting, hence
96 * the cpu_online_map is stable. 102 * the cpu_online_map is stable.
97 * 103 *
98 * However, a cpu might have been offlined _just_ before 104 * However, a cpu might have been offlined _just_ before
@@ -109,6 +115,7 @@ static void force_quiescent_state(struct rcu_data *rdp,
109 for_each_cpu_mask_nr(cpu, cpumask) 115 for_each_cpu_mask_nr(cpu, cpumask)
110 smp_send_reschedule(cpu); 116 smp_send_reschedule(cpu);
111 } 117 }
118 spin_unlock_irqrestore(&rcp->lock, flags);
112} 119}
113#else 120#else
114static inline void force_quiescent_state(struct rcu_data *rdp, 121static inline void force_quiescent_state(struct rcu_data *rdp,
@@ -118,6 +125,126 @@ static inline void force_quiescent_state(struct rcu_data *rdp,
118} 125}
119#endif 126#endif
120 127
128static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
129 struct rcu_data *rdp)
130{
131 long batch;
132
133 head->next = NULL;
134 smp_mb(); /* Read of rcu->cur must happen after any change by caller. */
135
136 /*
137 * Determine the batch number of this callback.
138 *
139 * Using ACCESS_ONCE to avoid the following error when gcc eliminates
140 * local variable "batch" and emits codes like this:
141 * 1) rdp->batch = rcp->cur + 1 # gets old value
142 * ......
143 * 2)rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value
144 * then [*nxttail[0], *nxttail[1]) may contain callbacks
145 * that batch# = rdp->batch, see the comment of struct rcu_data.
146 */
147 batch = ACCESS_ONCE(rcp->cur) + 1;
148
149 if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) {
150 /* process callbacks */
151 rdp->nxttail[0] = rdp->nxttail[1];
152 rdp->nxttail[1] = rdp->nxttail[2];
153 if (rcu_batch_after(batch - 1, rdp->batch))
154 rdp->nxttail[0] = rdp->nxttail[2];
155 }
156
157 rdp->batch = batch;
158 *rdp->nxttail[2] = head;
159 rdp->nxttail[2] = &head->next;
160
161 if (unlikely(++rdp->qlen > qhimark)) {
162 rdp->blimit = INT_MAX;
163 force_quiescent_state(rdp, &rcu_ctrlblk);
164 }
165}
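The comment in __call_rcu() above describes a compiler hazard, not a CPU one: without ACCESS_ONCE the compiler may read rcp->cur more than once, so the stored rdp->batch and the later rcu_batch_after() test could be computed from different values. ACCESS_ONCE is just a volatile cast, the same definition the rcupreempt.c hunk further down removes in favour of a shared header:

        /* Force a single read of a location another CPU may be updating,
         * then do all arithmetic on the local snapshot. */
        #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

        extern long shared_batch_counter;   /* hypothetical, updated concurrently */

        static long next_batch(void)
        {
                long snap = ACCESS_ONCE(shared_batch_counter);  /* read exactly once */

                return snap + 1;        /* every use works on the snapshot */
        }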
166
167#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
168
169static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
170{
171 rcp->gp_start = jiffies;
172 rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
173}
174
175static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
176{
177 int cpu;
178 long delta;
179 unsigned long flags;
180
181 /* Only let one CPU complain about others per time interval. */
182
183 spin_lock_irqsave(&rcp->lock, flags);
184 delta = jiffies - rcp->jiffies_stall;
185 if (delta < 2 || rcp->cur != rcp->completed) {
186 spin_unlock_irqrestore(&rcp->lock, flags);
187 return;
188 }
189 rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
190 spin_unlock_irqrestore(&rcp->lock, flags);
191
192 /* OK, time to rat on our buddy... */
193
194 printk(KERN_ERR "RCU detected CPU stalls:");
195 for_each_possible_cpu(cpu) {
196 if (cpu_isset(cpu, rcp->cpumask))
197 printk(" %d", cpu);
198 }
199 printk(" (detected by %d, t=%ld jiffies)\n",
200 smp_processor_id(), (long)(jiffies - rcp->gp_start));
201}
202
203static void print_cpu_stall(struct rcu_ctrlblk *rcp)
204{
205 unsigned long flags;
206
207 printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
208 smp_processor_id(), jiffies,
209 jiffies - rcp->gp_start);
210 dump_stack();
211 spin_lock_irqsave(&rcp->lock, flags);
212 if ((long)(jiffies - rcp->jiffies_stall) >= 0)
213 rcp->jiffies_stall =
214 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
215 spin_unlock_irqrestore(&rcp->lock, flags);
216 set_need_resched(); /* kick ourselves to get things going. */
217}
218
219static void check_cpu_stall(struct rcu_ctrlblk *rcp)
220{
221 long delta;
222
223 delta = jiffies - rcp->jiffies_stall;
224 if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) {
225
226 /* We haven't checked in, so go dump stack. */
227 print_cpu_stall(rcp);
228
229 } else if (rcp->cur != rcp->completed && delta >= 2) {
230
231 /* They had two seconds to dump stack, so complain. */
232 print_other_cpu_stall(rcp);
233 }
234}
235
236#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
237
238static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
239{
240}
241
242static inline void check_cpu_stall(struct rcu_ctrlblk *rcp)
243{
244}
245
246#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
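The stall detector above leans on one idiom worth calling out: deadlines are stored as absolute jiffies values and tested with a signed subtraction, e.g. (long)(jiffies - rcp->jiffies_stall) >= 0, which stays correct across jiffies wraparound where a direct comparison of the raw values would not. In isolation:

        /* Wrap-safe "has this deadline passed?" test, as used by the stall code. */
        static int deadline_passed(unsigned long deadline)
        {
                return (long)(jiffies - deadline) >= 0;
        }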
247
121/** 248/**
122 * call_rcu - Queue an RCU callback for invocation after a grace period. 249 * call_rcu - Queue an RCU callback for invocation after a grace period.
123 * @head: structure to be used for queueing the RCU updates. 250 * @head: structure to be used for queueing the RCU updates.
@@ -133,18 +260,10 @@ void call_rcu(struct rcu_head *head,
133 void (*func)(struct rcu_head *rcu)) 260 void (*func)(struct rcu_head *rcu))
134{ 261{
135 unsigned long flags; 262 unsigned long flags;
136 struct rcu_data *rdp;
137 263
138 head->func = func; 264 head->func = func;
139 head->next = NULL;
140 local_irq_save(flags); 265 local_irq_save(flags);
141 rdp = &__get_cpu_var(rcu_data); 266 __call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data));
142 *rdp->nxttail = head;
143 rdp->nxttail = &head->next;
144 if (unlikely(++rdp->qlen > qhimark)) {
145 rdp->blimit = INT_MAX;
146 force_quiescent_state(rdp, &rcu_ctrlblk);
147 }
148 local_irq_restore(flags); 267 local_irq_restore(flags);
149} 268}
150EXPORT_SYMBOL_GPL(call_rcu); 269EXPORT_SYMBOL_GPL(call_rcu);
@@ -169,20 +288,10 @@ void call_rcu_bh(struct rcu_head *head,
169 void (*func)(struct rcu_head *rcu)) 288 void (*func)(struct rcu_head *rcu))
170{ 289{
171 unsigned long flags; 290 unsigned long flags;
172 struct rcu_data *rdp;
173 291
174 head->func = func; 292 head->func = func;
175 head->next = NULL;
176 local_irq_save(flags); 293 local_irq_save(flags);
177 rdp = &__get_cpu_var(rcu_bh_data); 294 __call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
178 *rdp->nxttail = head;
179 rdp->nxttail = &head->next;
180
181 if (unlikely(++rdp->qlen > qhimark)) {
182 rdp->blimit = INT_MAX;
183 force_quiescent_state(rdp, &rcu_bh_ctrlblk);
184 }
185
186 local_irq_restore(flags); 295 local_irq_restore(flags);
187} 296}
188EXPORT_SYMBOL_GPL(call_rcu_bh); 297EXPORT_SYMBOL_GPL(call_rcu_bh);
@@ -211,12 +320,6 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
211static inline void raise_rcu_softirq(void) 320static inline void raise_rcu_softirq(void)
212{ 321{
213 raise_softirq(RCU_SOFTIRQ); 322 raise_softirq(RCU_SOFTIRQ);
214 /*
215 * The smp_mb() here is required to ensure that this cpu's
216 * __rcu_process_callbacks() reads the most recently updated
217 * value of rcu->cur.
218 */
219 smp_mb();
220} 323}
221 324
222/* 325/*
@@ -225,6 +328,7 @@ static inline void raise_rcu_softirq(void)
225 */ 328 */
226static void rcu_do_batch(struct rcu_data *rdp) 329static void rcu_do_batch(struct rcu_data *rdp)
227{ 330{
331 unsigned long flags;
228 struct rcu_head *next, *list; 332 struct rcu_head *next, *list;
229 int count = 0; 333 int count = 0;
230 334
@@ -239,9 +343,9 @@ static void rcu_do_batch(struct rcu_data *rdp)
239 } 343 }
240 rdp->donelist = list; 344 rdp->donelist = list;
241 345
242 local_irq_disable(); 346 local_irq_save(flags);
243 rdp->qlen -= count; 347 rdp->qlen -= count;
244 local_irq_enable(); 348 local_irq_restore(flags);
245 if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) 349 if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
246 rdp->blimit = blimit; 350 rdp->blimit = blimit;
247 351
@@ -269,6 +373,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
269 * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace 373 * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
270 * period (if necessary). 374 * period (if necessary).
271 */ 375 */
376
272/* 377/*
273 * Register a new batch of callbacks, and start it up if there is currently no 378 * Register a new batch of callbacks, and start it up if there is currently no
274 * active batch and the batch to be registered has not already occurred. 379 * active batch and the batch to be registered has not already occurred.
@@ -276,15 +381,10 @@ static void rcu_do_batch(struct rcu_data *rdp)
276 */ 381 */
277static void rcu_start_batch(struct rcu_ctrlblk *rcp) 382static void rcu_start_batch(struct rcu_ctrlblk *rcp)
278{ 383{
279 if (rcp->next_pending && 384 if (rcp->cur != rcp->pending &&
280 rcp->completed == rcp->cur) { 385 rcp->completed == rcp->cur) {
281 rcp->next_pending = 0;
282 /*
283 * next_pending == 0 must be visible in
284 * __rcu_process_callbacks() before it can see new value of cur.
285 */
286 smp_wmb();
287 rcp->cur++; 386 rcp->cur++;
387 record_gp_stall_check_time(rcp);
288 388
289 /* 389 /*
290 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a 390 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
@@ -322,6 +422,8 @@ static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
322static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, 422static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
323 struct rcu_data *rdp) 423 struct rcu_data *rdp)
324{ 424{
425 unsigned long flags;
426
325 if (rdp->quiescbatch != rcp->cur) { 427 if (rdp->quiescbatch != rcp->cur) {
326 /* start new grace period: */ 428 /* start new grace period: */
327 rdp->qs_pending = 1; 429 rdp->qs_pending = 1;
@@ -345,7 +447,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
345 return; 447 return;
346 rdp->qs_pending = 0; 448 rdp->qs_pending = 0;
347 449
348 spin_lock(&rcp->lock); 450 spin_lock_irqsave(&rcp->lock, flags);
349 /* 451 /*
350 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync 452 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
351 * during cpu startup. Ignore the quiescent state. 453 * during cpu startup. Ignore the quiescent state.
@@ -353,7 +455,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
353 if (likely(rdp->quiescbatch == rcp->cur)) 455 if (likely(rdp->quiescbatch == rcp->cur))
354 cpu_quiet(rdp->cpu, rcp); 456 cpu_quiet(rdp->cpu, rcp);
355 457
356 spin_unlock(&rcp->lock); 458 spin_unlock_irqrestore(&rcp->lock, flags);
357} 459}
358 460
359 461
@@ -364,33 +466,38 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
364 * which is dead and hence not processing interrupts. 466 * which is dead and hence not processing interrupts.
365 */ 467 */
366static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, 468static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
367 struct rcu_head **tail) 469 struct rcu_head **tail, long batch)
368{ 470{
369 local_irq_disable(); 471 unsigned long flags;
370 *this_rdp->nxttail = list; 472
371 if (list) 473 if (list) {
372 this_rdp->nxttail = tail; 474 local_irq_save(flags);
373 local_irq_enable(); 475 this_rdp->batch = batch;
476 *this_rdp->nxttail[2] = list;
477 this_rdp->nxttail[2] = tail;
478 local_irq_restore(flags);
479 }
374} 480}
375 481
376static void __rcu_offline_cpu(struct rcu_data *this_rdp, 482static void __rcu_offline_cpu(struct rcu_data *this_rdp,
377 struct rcu_ctrlblk *rcp, struct rcu_data *rdp) 483 struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
378{ 484{
379 /* if the cpu going offline owns the grace period 485 unsigned long flags;
486
487 /*
488 * if the cpu going offline owns the grace period
380 * we can block indefinitely waiting for it, so flush 489 * we can block indefinitely waiting for it, so flush
381 * it here 490 * it here
382 */ 491 */
383 spin_lock_bh(&rcp->lock); 492 spin_lock_irqsave(&rcp->lock, flags);
384 if (rcp->cur != rcp->completed) 493 if (rcp->cur != rcp->completed)
385 cpu_quiet(rdp->cpu, rcp); 494 cpu_quiet(rdp->cpu, rcp);
386 spin_unlock_bh(&rcp->lock); 495 rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1);
387 rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); 496 rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1);
388 rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); 497 spin_unlock(&rcp->lock);
389 rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
390 498
391 local_irq_disable();
392 this_rdp->qlen += rdp->qlen; 499 this_rdp->qlen += rdp->qlen;
393 local_irq_enable(); 500 local_irq_restore(flags);
394} 501}
395 502
396static void rcu_offline_cpu(int cpu) 503static void rcu_offline_cpu(int cpu)
@@ -420,38 +527,52 @@ static void rcu_offline_cpu(int cpu)
420static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, 527static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
421 struct rcu_data *rdp) 528 struct rcu_data *rdp)
422{ 529{
423 if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { 530 unsigned long flags;
424 *rdp->donetail = rdp->curlist; 531 long completed_snap;
425 rdp->donetail = rdp->curtail;
426 rdp->curlist = NULL;
427 rdp->curtail = &rdp->curlist;
428 }
429 532
430 if (rdp->nxtlist && !rdp->curlist) { 533 if (rdp->nxtlist) {
431 local_irq_disable(); 534 local_irq_save(flags);
432 rdp->curlist = rdp->nxtlist; 535 completed_snap = ACCESS_ONCE(rcp->completed);
433 rdp->curtail = rdp->nxttail;
434 rdp->nxtlist = NULL;
435 rdp->nxttail = &rdp->nxtlist;
436 local_irq_enable();
437 536
438 /* 537 /*
439 * start the next batch of callbacks 538 * move the other grace-period-completed entries to
539 * [rdp->nxtlist, *rdp->nxttail[0]) temporarily
440 */ 540 */
541 if (!rcu_batch_before(completed_snap, rdp->batch))
542 rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2];
543 else if (!rcu_batch_before(completed_snap, rdp->batch - 1))
544 rdp->nxttail[0] = rdp->nxttail[1];
441 545
442 /* determine batch number */ 546 /*
443 rdp->batch = rcp->cur + 1; 547 * the grace period for entries in
444 /* see the comment and corresponding wmb() in 548 * [rdp->nxtlist, *rdp->nxttail[0]) has completed and
445 * the rcu_start_batch() 549 * move these entries to donelist
446 */ 550 */
447 smp_rmb(); 551 if (rdp->nxttail[0] != &rdp->nxtlist) {
552 *rdp->donetail = rdp->nxtlist;
553 rdp->donetail = rdp->nxttail[0];
554 rdp->nxtlist = *rdp->nxttail[0];
555 *rdp->donetail = NULL;
556
557 if (rdp->nxttail[1] == rdp->nxttail[0])
558 rdp->nxttail[1] = &rdp->nxtlist;
559 if (rdp->nxttail[2] == rdp->nxttail[0])
560 rdp->nxttail[2] = &rdp->nxtlist;
561 rdp->nxttail[0] = &rdp->nxtlist;
562 }
563
564 local_irq_restore(flags);
565
566 if (rcu_batch_after(rdp->batch, rcp->pending)) {
567 unsigned long flags2;
448 568
449 if (!rcp->next_pending) {
450 /* and start it/schedule start if it's a new batch */ 569 /* and start it/schedule start if it's a new batch */
451 spin_lock(&rcp->lock); 570 spin_lock_irqsave(&rcp->lock, flags2);
452 rcp->next_pending = 1; 571 if (rcu_batch_after(rdp->batch, rcp->pending)) {
453 rcu_start_batch(rcp); 572 rcp->pending = rdp->batch;
454 spin_unlock(&rcp->lock); 573 rcu_start_batch(rcp);
574 }
575 spin_unlock_irqrestore(&rcp->lock, flags2);
455 } 576 }
456 } 577 }
457 578
@@ -462,21 +583,53 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
462 583
463static void rcu_process_callbacks(struct softirq_action *unused) 584static void rcu_process_callbacks(struct softirq_action *unused)
464{ 585{
586 /*
587 * Memory references from any prior RCU read-side critical sections
588 * executed by the interrupted code must be see before any RCU
589 * grace-period manupulations below.
590 */
591
592 smp_mb(); /* See above block comment. */
593
465 __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); 594 __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
466 __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); 595 __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
596
597 /*
598 * Memory references from any later RCU read-side critical sections
599 * executed by the interrupted code must be see after any RCU
600 * grace-period manupulations above.
601 */
602
603 smp_mb(); /* See above block comment. */
467} 604}
468 605
469static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) 606static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
470{ 607{
471 /* This cpu has pending rcu entries and the grace period 608 /* Check for CPU stalls, if enabled. */
472 * for them has completed. 609 check_cpu_stall(rcp);
473 */
474 if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch))
475 return 1;
476 610
477 /* This cpu has no pending entries, but there are new entries */ 611 if (rdp->nxtlist) {
478 if (!rdp->curlist && rdp->nxtlist) 612 long completed_snap = ACCESS_ONCE(rcp->completed);
479 return 1; 613
614 /*
615 * This cpu has pending rcu entries and the grace period
616 * for them has completed.
617 */
618 if (!rcu_batch_before(completed_snap, rdp->batch))
619 return 1;
620 if (!rcu_batch_before(completed_snap, rdp->batch - 1) &&
621 rdp->nxttail[0] != rdp->nxttail[1])
622 return 1;
623 if (rdp->nxttail[0] != &rdp->nxtlist)
624 return 1;
625
626 /*
627 * This cpu has pending rcu entries and the new batch
628 * for then hasn't been started nor scheduled start
629 */
630 if (rcu_batch_after(rdp->batch, rcp->pending))
631 return 1;
632 }
480 633
481 /* This cpu has finished callbacks to invoke */ 634 /* This cpu has finished callbacks to invoke */
482 if (rdp->donelist) 635 if (rdp->donelist)
@@ -512,9 +665,15 @@ int rcu_needs_cpu(int cpu)
512 struct rcu_data *rdp = &per_cpu(rcu_data, cpu); 665 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
513 struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); 666 struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
514 667
515 return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); 668 return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
516} 669}
517 670
671/*
672 * Top-level function driving RCU grace-period detection, normally
673 * invoked from the scheduler-clock interrupt. This function simply
674 * increments counters that are read only from softirq by this same
675 * CPU, so there are no memory barriers required.
676 */
518void rcu_check_callbacks(int cpu, int user) 677void rcu_check_callbacks(int cpu, int user)
519{ 678{
520 if (user || 679 if (user ||
@@ -558,14 +717,17 @@ void rcu_check_callbacks(int cpu, int user)
558static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, 717static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
559 struct rcu_data *rdp) 718 struct rcu_data *rdp)
560{ 719{
720 unsigned long flags;
721
722 spin_lock_irqsave(&rcp->lock, flags);
561 memset(rdp, 0, sizeof(*rdp)); 723 memset(rdp, 0, sizeof(*rdp));
562 rdp->curtail = &rdp->curlist; 724 rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist;
563 rdp->nxttail = &rdp->nxtlist;
564 rdp->donetail = &rdp->donelist; 725 rdp->donetail = &rdp->donelist;
565 rdp->quiescbatch = rcp->completed; 726 rdp->quiescbatch = rcp->completed;
566 rdp->qs_pending = 0; 727 rdp->qs_pending = 0;
567 rdp->cpu = cpu; 728 rdp->cpu = cpu;
568 rdp->blimit = blimit; 729 rdp->blimit = blimit;
730 spin_unlock_irqrestore(&rcp->lock, flags);
569} 731}
570 732
571static void __cpuinit rcu_online_cpu(int cpu) 733static void __cpuinit rcu_online_cpu(int cpu)
@@ -610,6 +772,9 @@ static struct notifier_block __cpuinitdata rcu_nb = {
610 */ 772 */
611void __init __rcu_init(void) 773void __init __rcu_init(void)
612{ 774{
775#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
776 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
777#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
613 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, 778 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
614 (void *)(long)smp_processor_id()); 779 (void *)(long)smp_processor_id());
615 /* Register notifier for non-boot CPUs */ 780 /* Register notifier for non-boot CPUs */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index f14f372cf6f5..467d5940f624 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -77,6 +77,7 @@ void wakeme_after_rcu(struct rcu_head *head)
77 * sections are delimited by rcu_read_lock() and rcu_read_unlock(), 77 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
78 * and may be nested. 78 * and may be nested.
79 */ 79 */
80void synchronize_rcu(void); /* Makes kernel-doc tools happy */
80synchronize_rcu_xxx(synchronize_rcu, call_rcu) 81synchronize_rcu_xxx(synchronize_rcu, call_rcu)
81EXPORT_SYMBOL_GPL(synchronize_rcu); 82EXPORT_SYMBOL_GPL(synchronize_rcu);
82 83
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 27827931ca0d..ca4bbbe04aa4 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -59,14 +59,6 @@
59#include <linux/rcupreempt_trace.h> 59#include <linux/rcupreempt_trace.h>
60 60
61/* 61/*
62 * Macro that prevents the compiler from reordering accesses, but does
63 * absolutely -nothing- to prevent CPUs from reordering. This is used
64 * only to mediate communication between mainline code and hardware
65 * interrupt and NMI handlers.
66 */
67#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
68
69/*
70 * PREEMPT_RCU data structures. 62 * PREEMPT_RCU data structures.
71 */ 63 */
72 64
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
index 5edf82c34bbc..35c2d3360ecf 100644
--- a/kernel/rcupreempt_trace.c
+++ b/kernel/rcupreempt_trace.c
@@ -308,11 +308,16 @@ out:
308 308
309static int __init rcupreempt_trace_init(void) 309static int __init rcupreempt_trace_init(void)
310{ 310{
311 int ret;
312
311 mutex_init(&rcupreempt_trace_mutex); 313 mutex_init(&rcupreempt_trace_mutex);
312 rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL); 314 rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
313 if (!rcupreempt_trace_buf) 315 if (!rcupreempt_trace_buf)
314 return 1; 316 return 1;
315 return rcupreempt_debugfs_init(); 317 ret = rcupreempt_debugfs_init();
318 if (ret)
319 kfree(rcupreempt_trace_buf);
320 return ret;
316} 321}
317 322
318static void __exit rcupreempt_trace_cleanup(void) 323static void __exit rcupreempt_trace_cleanup(void)
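The fix above is the standard init-time unwind rule: once the trace buffer allocation has succeeded, any later failure (here, the debugfs setup) must free it before the error is propagated, or the buffer leaks on every failed load. The generic shape of the pattern, with hypothetical names:

        extern int setup_debugfs_files(void);   /* hypothetical second step */

        /* Sketch: each successful step gains an unwind action on the
         * failure paths of every later step. */
        static char *trace_buf;

        static int init_with_unwind(void)
        {
                int ret;

                trace_buf = kmalloc(4096, GFP_KERNEL);
                if (!trace_buf)
                        return -ENOMEM;

                ret = setup_debugfs_files();
                if (ret) {
                        kfree(trace_buf);       /* undo step one */
                        trace_buf = NULL;
                }
                return ret;
        }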
diff --git a/kernel/relay.c b/kernel/relay.c
index 7de644cdec43..8d13a7855c08 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -407,6 +407,35 @@ void relay_reset(struct rchan *chan)
407} 407}
408EXPORT_SYMBOL_GPL(relay_reset); 408EXPORT_SYMBOL_GPL(relay_reset);
409 409
410static inline void relay_set_buf_dentry(struct rchan_buf *buf,
411 struct dentry *dentry)
412{
413 buf->dentry = dentry;
414 buf->dentry->d_inode->i_size = buf->early_bytes;
415}
416
417static struct dentry *relay_create_buf_file(struct rchan *chan,
418 struct rchan_buf *buf,
419 unsigned int cpu)
420{
421 struct dentry *dentry;
422 char *tmpname;
423
424 tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL);
425 if (!tmpname)
426 return NULL;
427 snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu);
428
429 /* Create file in fs */
430 dentry = chan->cb->create_buf_file(tmpname, chan->parent,
431 S_IRUSR, buf,
432 &chan->is_global);
433
434 kfree(tmpname);
435
436 return dentry;
437}
438
410/* 439/*
411 * relay_open_buf - create a new relay channel buffer 440 * relay_open_buf - create a new relay channel buffer
412 * 441 *
@@ -416,45 +445,34 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
416{ 445{
417 struct rchan_buf *buf = NULL; 446 struct rchan_buf *buf = NULL;
418 struct dentry *dentry; 447 struct dentry *dentry;
419 char *tmpname;
420 448
421 if (chan->is_global) 449 if (chan->is_global)
422 return chan->buf[0]; 450 return chan->buf[0];
423 451
424 tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL);
425 if (!tmpname)
426 goto end;
427 snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu);
428
429 buf = relay_create_buf(chan); 452 buf = relay_create_buf(chan);
430 if (!buf) 453 if (!buf)
431 goto free_name; 454 return NULL;
455
456 if (chan->has_base_filename) {
457 dentry = relay_create_buf_file(chan, buf, cpu);
458 if (!dentry)
459 goto free_buf;
460 relay_set_buf_dentry(buf, dentry);
461 }
432 462
433 buf->cpu = cpu; 463 buf->cpu = cpu;
434 __relay_reset(buf, 1); 464 __relay_reset(buf, 1);
435 465
436 /* Create file in fs */
437 dentry = chan->cb->create_buf_file(tmpname, chan->parent, S_IRUSR,
438 buf, &chan->is_global);
439 if (!dentry)
440 goto free_buf;
441
442 buf->dentry = dentry;
443
444 if(chan->is_global) { 466 if(chan->is_global) {
445 chan->buf[0] = buf; 467 chan->buf[0] = buf;
446 buf->cpu = 0; 468 buf->cpu = 0;
447 } 469 }
448 470
449 goto free_name; 471 return buf;
450 472
451free_buf: 473free_buf:
452 relay_destroy_buf(buf); 474 relay_destroy_buf(buf);
453 buf = NULL; 475 return NULL;
454free_name:
455 kfree(tmpname);
456end:
457 return buf;
458} 476}
459 477
460/** 478/**
@@ -537,8 +555,8 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
537 555
538/** 556/**
539 * relay_open - create a new relay channel 557 * relay_open - create a new relay channel
540 * @base_filename: base name of files to create 558 * @base_filename: base name of files to create, %NULL for buffering only
541 * @parent: dentry of parent directory, %NULL for root directory 559 * @parent: dentry of parent directory, %NULL for root directory or buffer
542 * @subbuf_size: size of sub-buffers 560 * @subbuf_size: size of sub-buffers
543 * @n_subbufs: number of sub-buffers 561 * @n_subbufs: number of sub-buffers
544 * @cb: client callback functions 562 * @cb: client callback functions
@@ -560,8 +578,6 @@ struct rchan *relay_open(const char *base_filename,
560{ 578{
561 unsigned int i; 579 unsigned int i;
562 struct rchan *chan; 580 struct rchan *chan;
563 if (!base_filename)
564 return NULL;
565 581
566 if (!(subbuf_size && n_subbufs)) 582 if (!(subbuf_size && n_subbufs))
567 return NULL; 583 return NULL;
@@ -576,7 +592,10 @@ struct rchan *relay_open(const char *base_filename,
576 chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs); 592 chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs);
577 chan->parent = parent; 593 chan->parent = parent;
578 chan->private_data = private_data; 594 chan->private_data = private_data;
579 strlcpy(chan->base_filename, base_filename, NAME_MAX); 595 if (base_filename) {
596 chan->has_base_filename = 1;
597 strlcpy(chan->base_filename, base_filename, NAME_MAX);
598 }
580 setup_callbacks(chan, cb); 599 setup_callbacks(chan, cb);
581 kref_init(&chan->kref); 600 kref_init(&chan->kref);
582 601
@@ -604,6 +623,94 @@ free_bufs:
604} 623}
605EXPORT_SYMBOL_GPL(relay_open); 624EXPORT_SYMBOL_GPL(relay_open);
606 625
626struct rchan_percpu_buf_dispatcher {
627 struct rchan_buf *buf;
628 struct dentry *dentry;
629};
630
631/* Called in atomic context. */
632static void __relay_set_buf_dentry(void *info)
633{
634 struct rchan_percpu_buf_dispatcher *p = info;
635
636 relay_set_buf_dentry(p->buf, p->dentry);
637}
638
639/**
640 * relay_late_setup_files - triggers file creation
641 * @chan: channel to operate on
642 * @base_filename: base name of files to create
643 * @parent: dentry of parent directory, %NULL for root directory
644 *
645 * Returns 0 if successful, non-zero otherwise.
646 *
647 * Use to setup files for a previously buffer-only channel.
648 * Useful to do early tracing in kernel, before VFS is up, for example.
649 */
650int relay_late_setup_files(struct rchan *chan,
651 const char *base_filename,
652 struct dentry *parent)
653{
654 int err = 0;
655 unsigned int i, curr_cpu;
656 unsigned long flags;
657 struct dentry *dentry;
658 struct rchan_percpu_buf_dispatcher disp;
659
660 if (!chan || !base_filename)
661 return -EINVAL;
662
663 strlcpy(chan->base_filename, base_filename, NAME_MAX);
664
665 mutex_lock(&relay_channels_mutex);
666 /* Is chan already set up? */
667 if (unlikely(chan->has_base_filename))
668 return -EEXIST;
669 chan->has_base_filename = 1;
670 chan->parent = parent;
671 curr_cpu = get_cpu();
672 /*
673 * The CPU hotplug notifier ran before us and created buffers with
674 * no files associated. So it's safe to call relay_setup_buf_file()
675 * on all currently online CPUs.
676 */
677 for_each_online_cpu(i) {
678 if (unlikely(!chan->buf[i])) {
679 printk(KERN_ERR "relay_late_setup_files: CPU %u "
680 "has no buffer, it must have!\n", i);
681 BUG();
682 err = -EINVAL;
683 break;
684 }
685
686 dentry = relay_create_buf_file(chan, chan->buf[i], i);
687 if (unlikely(!dentry)) {
688 err = -EINVAL;
689 break;
690 }
691
692 if (curr_cpu == i) {
693 local_irq_save(flags);
694 relay_set_buf_dentry(chan->buf[i], dentry);
695 local_irq_restore(flags);
696 } else {
697 disp.buf = chan->buf[i];
698 disp.dentry = dentry;
699 smp_mb();
700 /* relay_channels_mutex must be held, so wait. */
701 err = smp_call_function_single(i,
702 __relay_set_buf_dentry,
703 &disp, 1);
704 }
705 if (unlikely(err))
706 break;
707 }
708 put_cpu();
709 mutex_unlock(&relay_channels_mutex);
710
711 return err;
712}
713
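The point of the new entry is spelled out in its kerneldoc: a channel may now be opened with a NULL base_filename purely for buffering (for example before the VFS is usable), and the files are attached later. A hypothetical usage sketch built from the signatures above; the callback table, debugfs directory and buffer sizes are invented:

        extern struct rchan_callbacks my_relay_callbacks;  /* assumed to exist */
        extern struct dentry *my_debugfs_dir;              /* assumed to exist */

        static struct rchan *early_chan;

        /* very early: buffer only, no files yet */
        static int __init early_trace_start(void)
        {
                early_chan = relay_open(NULL, NULL, 4096, 8,
                                        &my_relay_callbacks, NULL);
                return early_chan ? 0 : -ENOMEM;
        }

        /* later, once the filesystem is available */
        static int __init early_trace_expose(void)
        {
                return relay_late_setup_files(early_chan, "early_trace",
                                              my_debugfs_dir);
        }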
607/** 714/**
608 * relay_switch_subbuf - switch to a new sub-buffer 715 * relay_switch_subbuf - switch to a new sub-buffer
609 * @buf: channel buffer 716 * @buf: channel buffer
@@ -627,8 +734,13 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
627 old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; 734 old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
628 buf->padding[old_subbuf] = buf->prev_padding; 735 buf->padding[old_subbuf] = buf->prev_padding;
629 buf->subbufs_produced++; 736 buf->subbufs_produced++;
630 buf->dentry->d_inode->i_size += buf->chan->subbuf_size - 737 if (buf->dentry)
631 buf->padding[old_subbuf]; 738 buf->dentry->d_inode->i_size +=
739 buf->chan->subbuf_size -
740 buf->padding[old_subbuf];
741 else
742 buf->early_bytes += buf->chan->subbuf_size -
743 buf->padding[old_subbuf];
632 smp_mb(); 744 smp_mb();
633 if (waitqueue_active(&buf->read_wait)) 745 if (waitqueue_active(&buf->read_wait))
634 /* 746 /*
@@ -832,6 +944,10 @@ static void relay_file_read_consume(struct rchan_buf *buf,
832 size_t n_subbufs = buf->chan->n_subbufs; 944 size_t n_subbufs = buf->chan->n_subbufs;
833 size_t read_subbuf; 945 size_t read_subbuf;
834 946
947 if (buf->subbufs_produced == buf->subbufs_consumed &&
948 buf->offset == buf->bytes_consumed)
949 return;
950
835 if (buf->bytes_consumed + bytes_consumed > subbuf_size) { 951 if (buf->bytes_consumed + bytes_consumed > subbuf_size) {
836 relay_subbufs_consumed(buf->chan, buf->cpu, 1); 952 relay_subbufs_consumed(buf->chan, buf->cpu, 1);
837 buf->bytes_consumed = 0; 953 buf->bytes_consumed = 0;
@@ -863,6 +979,8 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
863 979
864 relay_file_read_consume(buf, read_pos, 0); 980 relay_file_read_consume(buf, read_pos, 0);
865 981
982 consumed = buf->subbufs_consumed;
983
866 if (unlikely(buf->offset > subbuf_size)) { 984 if (unlikely(buf->offset > subbuf_size)) {
867 if (produced == consumed) 985 if (produced == consumed)
868 return 0; 986 return 0;
@@ -881,8 +999,12 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
881 if (consumed > produced) 999 if (consumed > produced)
882 produced += n_subbufs * subbuf_size; 1000 produced += n_subbufs * subbuf_size;
883 1001
884 if (consumed == produced) 1002 if (consumed == produced) {
1003 if (buf->offset == subbuf_size &&
1004 buf->subbufs_produced > buf->subbufs_consumed)
1005 return 1;
885 return 0; 1006 return 0;
1007 }
886 1008
887 return 1; 1009 return 1;
888} 1010}
@@ -1237,4 +1359,4 @@ static __init int relay_init(void)
1237 return 0; 1359 return 0;
1238} 1360}
1239 1361
1240module_init(relay_init); 1362early_initcall(relay_init);
diff --git a/kernel/resource.c b/kernel/resource.c
index 74af2d7cb5a1..414d6fc9131e 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -362,35 +362,21 @@ int allocate_resource(struct resource *root, struct resource *new,
362 362
363EXPORT_SYMBOL(allocate_resource); 363EXPORT_SYMBOL(allocate_resource);
364 364
365/** 365/*
366 * insert_resource - Inserts a resource in the resource tree 366 * Insert a resource into the resource tree. If successful, return NULL,
367 * @parent: parent of the new resource 367 * otherwise return the conflicting resource (compare to __request_resource())
368 * @new: new resource to insert
369 *
370 * Returns 0 on success, -EBUSY if the resource can't be inserted.
371 *
372 * This function is equivalent to request_resource when no conflict
373 * happens. If a conflict happens, and the conflicting resources
374 * entirely fit within the range of the new resource, then the new
375 * resource is inserted and the conflicting resources become children of
376 * the new resource.
377 */ 368 */
378int insert_resource(struct resource *parent, struct resource *new) 369static struct resource * __insert_resource(struct resource *parent, struct resource *new)
379{ 370{
380 int result;
381 struct resource *first, *next; 371 struct resource *first, *next;
382 372
383 write_lock(&resource_lock);
384
385 for (;; parent = first) { 373 for (;; parent = first) {
386 result = 0;
387 first = __request_resource(parent, new); 374 first = __request_resource(parent, new);
388 if (!first) 375 if (!first)
389 goto out; 376 return first;
390 377
391 result = -EBUSY;
392 if (first == parent) 378 if (first == parent)
393 goto out; 379 return first;
394 380
395 if ((first->start > new->start) || (first->end < new->end)) 381 if ((first->start > new->start) || (first->end < new->end))
396 break; 382 break;
@@ -401,15 +387,13 @@ int insert_resource(struct resource *parent, struct resource *new)
401 for (next = first; ; next = next->sibling) { 387 for (next = first; ; next = next->sibling) {
402 /* Partial overlap? Bad, and unfixable */ 388 /* Partial overlap? Bad, and unfixable */
403 if (next->start < new->start || next->end > new->end) 389 if (next->start < new->start || next->end > new->end)
404 goto out; 390 return next;
405 if (!next->sibling) 391 if (!next->sibling)
406 break; 392 break;
407 if (next->sibling->start > new->end) 393 if (next->sibling->start > new->end)
408 break; 394 break;
409 } 395 }
410 396
411 result = 0;
412
413 new->parent = parent; 397 new->parent = parent;
414 new->sibling = next->sibling; 398 new->sibling = next->sibling;
415 new->child = first; 399 new->child = first;
@@ -426,10 +410,64 @@ int insert_resource(struct resource *parent, struct resource *new)
426 next = next->sibling; 410 next = next->sibling;
427 next->sibling = new; 411 next->sibling = new;
428 } 412 }
413 return NULL;
414}
429 415
430 out: 416/**
417 * insert_resource - Inserts a resource in the resource tree
418 * @parent: parent of the new resource
419 * @new: new resource to insert
420 *
421 * Returns 0 on success, -EBUSY if the resource can't be inserted.
422 *
423 * This function is equivalent to request_resource when no conflict
424 * happens. If a conflict happens, and the conflicting resources
425 * entirely fit within the range of the new resource, then the new
426 * resource is inserted and the conflicting resources become children of
427 * the new resource.
428 */
429int insert_resource(struct resource *parent, struct resource *new)
430{
431 struct resource *conflict;
432
433 write_lock(&resource_lock);
434 conflict = __insert_resource(parent, new);
435 write_unlock(&resource_lock);
436 return conflict ? -EBUSY : 0;
437}
438
439/**
440 * insert_resource_expand_to_fit - Insert a resource into the resource tree
441 * @root: root resource descriptor
442 * @new: new resource to insert
443 *
444 * Insert a resource into the resource tree, possibly expanding it in order
445 * to make it encompass any conflicting resources.
446 */
447void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
448{
449 if (new->parent)
450 return;
451
452 write_lock(&resource_lock);
453 for (;;) {
454 struct resource *conflict;
455
456 conflict = __insert_resource(root, new);
457 if (!conflict)
458 break;
459 if (conflict == root)
460 break;
461
462 /* Ok, expand resource to cover the conflict, then try again .. */
463 if (conflict->start < new->start)
464 new->start = conflict->start;
465 if (conflict->end > new->end)
466 new->end = conflict->end;
467
468 printk("Expanded resource %s due to conflict with %s\n", new->name, conflict->name);
469 }
431 write_unlock(&resource_lock); 470 write_unlock(&resource_lock);
432 return result;
433} 471}
434 472
435/** 473/**
@@ -478,6 +516,74 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
478 return result; 516 return result;
479} 517}
480 518
519static void __init __reserve_region_with_split(struct resource *root,
520 resource_size_t start, resource_size_t end,
521 const char *name)
522{
523 struct resource *parent = root;
524 struct resource *conflict;
525 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
526
527 if (!res)
528 return;
529
530 res->name = name;
531 res->start = start;
532 res->end = end;
533 res->flags = IORESOURCE_BUSY;
534
535 for (;;) {
536 conflict = __request_resource(parent, res);
537 if (!conflict)
538 break;
539 if (conflict != parent) {
540 parent = conflict;
541 if (!(conflict->flags & IORESOURCE_BUSY))
542 continue;
543 }
544
545 /* Uhhuh, that didn't work out.. */
546 kfree(res);
547 res = NULL;
548 break;
549 }
550
551 if (!res) {
552 printk(KERN_DEBUG " __reserve_region_with_split: (%s) [%llx, %llx], res: (%s) [%llx, %llx]\n",
553 conflict->name, conflict->start, conflict->end,
554 name, start, end);
555
556 /* failed, split and try again */
557
558		/* conflict covered whole area */
559 if (conflict->start <= start && conflict->end >= end)
560 return;
561
562 if (conflict->start > start)
563 __reserve_region_with_split(root, start, conflict->start-1, name);
564 if (!(conflict->flags & IORESOURCE_BUSY)) {
565 resource_size_t common_start, common_end;
566
567 common_start = max(conflict->start, start);
568 common_end = min(conflict->end, end);
569 if (common_start < common_end)
570 __reserve_region_with_split(root, common_start, common_end, name);
571 }
572 if (conflict->end < end)
573 __reserve_region_with_split(root, conflict->end+1, end, name);
574 }
575
576}
577
578void reserve_region_with_split(struct resource *root,
579 resource_size_t start, resource_size_t end,
580 const char *name)
581{
582 write_lock(&resource_lock);
583 __reserve_region_with_split(root, start, end, name);
584 write_unlock(&resource_lock);
585}
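A minimal usage sketch, assuming an early-boot caller and an invented address range: the helper marks the whole span busy, recursing around anything already claimed so the remaining gaps still get reserved.

	static void __init example_reserve_legacy(void)
	{
		reserve_region_with_split(&iomem_resource, 0xa0000, 0xfffff,
					  "reserved legacy area");	/* invented name */
	}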
586
481EXPORT_SYMBOL(adjust_resource); 587EXPORT_SYMBOL(adjust_resource);
482 588
483/** 589/**
@@ -490,7 +596,7 @@ resource_size_t resource_alignment(struct resource *res)
490{ 596{
491 switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) { 597 switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) {
492 case IORESOURCE_SIZEALIGN: 598 case IORESOURCE_SIZEALIGN:
493 return res->end - res->start + 1; 599 return resource_size(res);
494 case IORESOURCE_STARTALIGN: 600 case IORESOURCE_STARTALIGN:
495 return res->start; 601 return res->start;
496 default: 602 default:
diff --git a/kernel/sched.c b/kernel/sched.c
index 0047bd9b96aa..6f230596bd0c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -201,14 +201,19 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
201 hrtimer_init(&rt_b->rt_period_timer, 201 hrtimer_init(&rt_b->rt_period_timer,
202 CLOCK_MONOTONIC, HRTIMER_MODE_REL); 202 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
203 rt_b->rt_period_timer.function = sched_rt_period_timer; 203 rt_b->rt_period_timer.function = sched_rt_period_timer;
204 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 204 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
205}
206
207static inline int rt_bandwidth_enabled(void)
208{
209 return sysctl_sched_rt_runtime >= 0;
205} 210}
206 211
207static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 212static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
208{ 213{
209 ktime_t now; 214 ktime_t now;
210 215
211 if (rt_b->rt_runtime == RUNTIME_INF) 216 if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
212 return; 217 return;
213 218
214 if (hrtimer_active(&rt_b->rt_period_timer)) 219 if (hrtimer_active(&rt_b->rt_period_timer))
@@ -298,9 +303,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
298static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 303static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
299static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 304static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
300#endif /* CONFIG_RT_GROUP_SCHED */ 305#endif /* CONFIG_RT_GROUP_SCHED */
301#else /* !CONFIG_FAIR_GROUP_SCHED */ 306#else /* !CONFIG_USER_SCHED */
302#define root_task_group init_task_group 307#define root_task_group init_task_group
303#endif /* CONFIG_FAIR_GROUP_SCHED */ 308#endif /* CONFIG_USER_SCHED */
304 309
305/* task_group_lock serializes add/remove of task groups and also changes to 310/* task_group_lock serializes add/remove of task groups and also changes to
306 * a task group's cpu shares. 311 * a task group's cpu shares.
@@ -600,14 +605,13 @@ struct rq {
600 /* BKL stats */ 605 /* BKL stats */
601 unsigned int bkl_count; 606 unsigned int bkl_count;
602#endif 607#endif
603 struct lock_class_key rq_lock_key;
604}; 608};
605 609
606static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 610static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
607 611
608static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) 612static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
609{ 613{
610 rq->curr->sched_class->check_preempt_curr(rq, p); 614 rq->curr->sched_class->check_preempt_curr(rq, p, sync);
611} 615}
612 616
613static inline int cpu_of(struct rq *rq) 617static inline int cpu_of(struct rq *rq)
@@ -809,9 +813,9 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
809 813
810/* 814/*
811 * ratelimit for updating the group shares. 815 * ratelimit for updating the group shares.
812 * default: 0.5ms 816 * default: 0.25ms
813 */ 817 */
814const_debug unsigned int sysctl_sched_shares_ratelimit = 500000; 818unsigned int sysctl_sched_shares_ratelimit = 250000;
815 819
816/* 820/*
817 * period over which we measure -rt task cpu usage in us. 821 * period over which we measure -rt task cpu usage in us.
@@ -834,7 +838,7 @@ static inline u64 global_rt_period(void)
834 838
835static inline u64 global_rt_runtime(void) 839static inline u64 global_rt_runtime(void)
836{ 840{
837 if (sysctl_sched_rt_period < 0) 841 if (sysctl_sched_rt_runtime < 0)
838 return RUNTIME_INF; 842 return RUNTIME_INF;
839 843
840 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 844 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
@@ -1088,7 +1092,7 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1088 return NOTIFY_DONE; 1092 return NOTIFY_DONE;
1089} 1093}
1090 1094
1091static void init_hrtick(void) 1095static __init void init_hrtick(void)
1092{ 1096{
1093 hotcpu_notifier(hotplug_hrtick, 0); 1097 hotcpu_notifier(hotplug_hrtick, 0);
1094} 1098}
@@ -1103,7 +1107,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
1103 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); 1107 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
1104} 1108}
1105 1109
1106static void init_hrtick(void) 1110static inline void init_hrtick(void)
1107{ 1111{
1108} 1112}
1109#endif /* CONFIG_SMP */ 1113#endif /* CONFIG_SMP */
@@ -1120,9 +1124,9 @@ static void init_rq_hrtick(struct rq *rq)
1120 1124
1121 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1125 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1122 rq->hrtick_timer.function = hrtick; 1126 rq->hrtick_timer.function = hrtick;
1123 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 1127 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
1124} 1128}
1125#else 1129#else /* CONFIG_SCHED_HRTICK */
1126static inline void hrtick_clear(struct rq *rq) 1130static inline void hrtick_clear(struct rq *rq)
1127{ 1131{
1128} 1132}
@@ -1134,7 +1138,7 @@ static inline void init_rq_hrtick(struct rq *rq)
1134static inline void init_hrtick(void) 1138static inline void init_hrtick(void)
1135{ 1139{
1136} 1140}
1137#endif 1141#endif /* CONFIG_SCHED_HRTICK */
1138 1142
1139/* 1143/*
1140 * resched_task - mark a task 'to be rescheduled now'. 1144 * resched_task - mark a task 'to be rescheduled now'.
@@ -1381,38 +1385,24 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1381 update_load_sub(&rq->load, load); 1385 update_load_sub(&rq->load, load);
1382} 1386}
1383 1387
1384#ifdef CONFIG_SMP 1388#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1385static unsigned long source_load(int cpu, int type); 1389typedef int (*tg_visitor)(struct task_group *, void *);
1386static unsigned long target_load(int cpu, int type);
1387static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1388
1389static unsigned long cpu_avg_load_per_task(int cpu)
1390{
1391 struct rq *rq = cpu_rq(cpu);
1392
1393 if (rq->nr_running)
1394 rq->avg_load_per_task = rq->load.weight / rq->nr_running;
1395
1396 return rq->avg_load_per_task;
1397}
1398
1399#ifdef CONFIG_FAIR_GROUP_SCHED
1400
1401typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
1402 1390
1403/* 1391/*
1404 * Iterate the full tree, calling @down when first entering a node and @up when 1392 * Iterate the full tree, calling @down when first entering a node and @up when
1405 * leaving it for the final time. 1393 * leaving it for the final time.
1406 */ 1394 */
1407static void 1395static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1408walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
1409{ 1396{
1410 struct task_group *parent, *child; 1397 struct task_group *parent, *child;
1398 int ret;
1411 1399
1412 rcu_read_lock(); 1400 rcu_read_lock();
1413 parent = &root_task_group; 1401 parent = &root_task_group;
1414down: 1402down:
1415 (*down)(parent, cpu, sd); 1403 ret = (*down)(parent, data);
1404 if (ret)
1405 goto out_unlock;
1416 list_for_each_entry_rcu(child, &parent->children, siblings) { 1406 list_for_each_entry_rcu(child, &parent->children, siblings) {
1417 parent = child; 1407 parent = child;
1418 goto down; 1408 goto down;
@@ -1420,15 +1410,43 @@ down:
1420up: 1410up:
1421 continue; 1411 continue;
1422 } 1412 }
1423 (*up)(parent, cpu, sd); 1413 ret = (*up)(parent, data);
1414 if (ret)
1415 goto out_unlock;
1424 1416
1425 child = parent; 1417 child = parent;
1426 parent = parent->parent; 1418 parent = parent->parent;
1427 if (parent) 1419 if (parent)
1428 goto up; 1420 goto up;
1421out_unlock:
1429 rcu_read_unlock(); 1422 rcu_read_unlock();
1423
1424 return ret;
1430} 1425}
1431 1426
1427static int tg_nop(struct task_group *tg, void *data)
1428{
1429 return 0;
1430}
1431#endif
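A toy visitor (hypothetical, purely to show the calling convention): @down runs on the way into every group, @up on the way back out, and a non-zero return from either aborts the walk and is handed back to the caller.

	static int tg_count(struct task_group *tg, void *data)
	{
		(*(int *)data)++;	/* visit hook: count each group once */
		return 0;		/* non-zero would stop the walk here */
	}

	static int count_task_groups(void)
	{
		int nr = 0;

		walk_tg_tree(tg_count, tg_nop, &nr);	/* no work on the way up */
		return nr;
	}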
1432
1433#ifdef CONFIG_SMP
1434static unsigned long source_load(int cpu, int type);
1435static unsigned long target_load(int cpu, int type);
1436static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1437
1438static unsigned long cpu_avg_load_per_task(int cpu)
1439{
1440 struct rq *rq = cpu_rq(cpu);
1441
1442 if (rq->nr_running)
1443 rq->avg_load_per_task = rq->load.weight / rq->nr_running;
1444
1445 return rq->avg_load_per_task;
1446}
1447
1448#ifdef CONFIG_FAIR_GROUP_SCHED
1449
1432static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1450static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1433 1451
1434/* 1452/*
@@ -1487,11 +1505,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
1487 * This needs to be done in a bottom-up fashion because the rq weight of a 1505 * This needs to be done in a bottom-up fashion because the rq weight of a
1488 * parent group depends on the shares of its child groups. 1506 * parent group depends on the shares of its child groups.
1489 */ 1507 */
1490static void 1508static int tg_shares_up(struct task_group *tg, void *data)
1491tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
1492{ 1509{
1493 unsigned long rq_weight = 0; 1510 unsigned long rq_weight = 0;
1494 unsigned long shares = 0; 1511 unsigned long shares = 0;
1512 struct sched_domain *sd = data;
1495 int i; 1513 int i;
1496 1514
1497 for_each_cpu_mask(i, sd->span) { 1515 for_each_cpu_mask(i, sd->span) {
@@ -1516,6 +1534,8 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
1516 __update_group_shares_cpu(tg, i, shares, rq_weight); 1534 __update_group_shares_cpu(tg, i, shares, rq_weight);
1517 spin_unlock_irqrestore(&rq->lock, flags); 1535 spin_unlock_irqrestore(&rq->lock, flags);
1518 } 1536 }
1537
1538 return 0;
1519} 1539}
1520 1540
1521/* 1541/*
@@ -1523,10 +1543,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
1523 * This needs to be done in a top-down fashion because the load of a child 1543 * This needs to be done in a top-down fashion because the load of a child
1524 * group is a fraction of its parents load. 1544 * group is a fraction of its parents load.
1525 */ 1545 */
1526static void 1546static int tg_load_down(struct task_group *tg, void *data)
1527tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
1528{ 1547{
1529 unsigned long load; 1548 unsigned long load;
1549 long cpu = (long)data;
1530 1550
1531 if (!tg->parent) { 1551 if (!tg->parent) {
1532 load = cpu_rq(cpu)->load.weight; 1552 load = cpu_rq(cpu)->load.weight;
@@ -1537,11 +1557,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
1537 } 1557 }
1538 1558
1539 tg->cfs_rq[cpu]->h_load = load; 1559 tg->cfs_rq[cpu]->h_load = load;
1540}
1541 1560
1542static void 1561 return 0;
1543tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
1544{
1545} 1562}
1546 1563
1547static void update_shares(struct sched_domain *sd) 1564static void update_shares(struct sched_domain *sd)
@@ -1551,7 +1568,7 @@ static void update_shares(struct sched_domain *sd)
1551 1568
1552 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1569 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1553 sd->last_update = now; 1570 sd->last_update = now;
1554 walk_tg_tree(tg_nop, tg_shares_up, 0, sd); 1571 walk_tg_tree(tg_nop, tg_shares_up, sd);
1555 } 1572 }
1556} 1573}
1557 1574
@@ -1562,9 +1579,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1562 spin_lock(&rq->lock); 1579 spin_lock(&rq->lock);
1563} 1580}
1564 1581
1565static void update_h_load(int cpu) 1582static void update_h_load(long cpu)
1566{ 1583{
1567 walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); 1584 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1568} 1585}
1569 1586
1570#else 1587#else
@@ -1867,16 +1884,24 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1867/* 1884/*
1868 * wait_task_inactive - wait for a thread to unschedule. 1885 * wait_task_inactive - wait for a thread to unschedule.
1869 * 1886 *
1887 * If @match_state is nonzero, it's the @p->state value just checked and
1888 * not expected to change. If it changes, i.e. @p might have woken up,
1889 * then return zero. When we succeed in waiting for @p to be off its CPU,
1890 * we return a positive number (its total switch count). If a second call
1891 * a short while later returns the same number, the caller can be sure that
1892 * @p has remained unscheduled the whole time.
1893 *
1870 * The caller must ensure that the task *will* unschedule sometime soon, 1894 * The caller must ensure that the task *will* unschedule sometime soon,
1871 * else this function might spin for a *long* time. This function can't 1895 * else this function might spin for a *long* time. This function can't
1872 * be called with interrupts off, or it may introduce deadlock with 1896 * be called with interrupts off, or it may introduce deadlock with
1873 * smp_call_function() if an IPI is sent by the same process we are 1897 * smp_call_function() if an IPI is sent by the same process we are
1874 * waiting to become inactive. 1898 * waiting to become inactive.
1875 */ 1899 */
1876void wait_task_inactive(struct task_struct *p) 1900unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1877{ 1901{
1878 unsigned long flags; 1902 unsigned long flags;
1879 int running, on_rq; 1903 int running, on_rq;
1904 unsigned long ncsw;
1880 struct rq *rq; 1905 struct rq *rq;
1881 1906
1882 for (;;) { 1907 for (;;) {
@@ -1899,8 +1924,11 @@ void wait_task_inactive(struct task_struct *p)
1899 * return false if the runqueue has changed and p 1924 * return false if the runqueue has changed and p
1900 * is actually now running somewhere else! 1925 * is actually now running somewhere else!
1901 */ 1926 */
1902 while (task_running(rq, p)) 1927 while (task_running(rq, p)) {
1928 if (match_state && unlikely(p->state != match_state))
1929 return 0;
1903 cpu_relax(); 1930 cpu_relax();
1931 }
1904 1932
1905 /* 1933 /*
1906 * Ok, time to look more closely! We need the rq 1934 * Ok, time to look more closely! We need the rq
@@ -1910,9 +1938,18 @@ void wait_task_inactive(struct task_struct *p)
1910 rq = task_rq_lock(p, &flags); 1938 rq = task_rq_lock(p, &flags);
1911 running = task_running(rq, p); 1939 running = task_running(rq, p);
1912 on_rq = p->se.on_rq; 1940 on_rq = p->se.on_rq;
1941 ncsw = 0;
1942 if (!match_state || p->state == match_state)
1943 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
1913 task_rq_unlock(rq, &flags); 1944 task_rq_unlock(rq, &flags);
1914 1945
1915 /* 1946 /*
1947 * If it changed from the expected state, bail out now.
1948 */
1949 if (unlikely(!ncsw))
1950 break;
1951
1952 /*
1916 * Was it really running after all now that we 1953 * Was it really running after all now that we
1917 * checked with the proper locks actually held? 1954 * checked with the proper locks actually held?
1918 * 1955 *
@@ -1944,6 +1981,8 @@ void wait_task_inactive(struct task_struct *p)
1944 */ 1981 */
1945 break; 1982 break;
1946 } 1983 }
1984
1985 return ncsw;
1947} 1986}
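The double-call pattern described in the comment above, sketched from the caller's side (the state and error codes are illustrative, not taken from this patch):

	static int example_check_stopped(struct task_struct *p)
	{
		unsigned long ncsw;

		ncsw = wait_task_inactive(p, TASK_TRACED);
		if (!ncsw)
			return -ESRCH;		/* state changed, p may have run */

		/* ... inspect p while it is guaranteed off its CPU ... */

		if (wait_task_inactive(p, TASK_TRACED) != ncsw)
			return -EAGAIN;		/* it got scheduled in between */
		return 0;
	}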
1948 1987
1949/*** 1988/***
@@ -2261,7 +2300,7 @@ out_running:
2261 trace_mark(kernel_sched_wakeup, 2300 trace_mark(kernel_sched_wakeup,
2262 "pid %d state %ld ## rq %p task %p rq->curr %p", 2301 "pid %d state %ld ## rq %p task %p rq->curr %p",
2263 p->pid, p->state, rq, p, rq->curr); 2302 p->pid, p->state, rq, p, rq->curr);
2264 check_preempt_curr(rq, p); 2303 check_preempt_curr(rq, p, sync);
2265 2304
2266 p->state = TASK_RUNNING; 2305 p->state = TASK_RUNNING;
2267#ifdef CONFIG_SMP 2306#ifdef CONFIG_SMP
@@ -2396,7 +2435,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2396 trace_mark(kernel_sched_wakeup_new, 2435 trace_mark(kernel_sched_wakeup_new,
2397 "pid %d state %ld ## rq %p task %p rq->curr %p", 2436 "pid %d state %ld ## rq %p task %p rq->curr %p",
2398 p->pid, p->state, rq, p, rq->curr); 2437 p->pid, p->state, rq, p, rq->curr);
2399 check_preempt_curr(rq, p); 2438 check_preempt_curr(rq, p, 0);
2400#ifdef CONFIG_SMP 2439#ifdef CONFIG_SMP
2401 if (p->sched_class->task_wake_up) 2440 if (p->sched_class->task_wake_up)
2402 p->sched_class->task_wake_up(rq, p); 2441 p->sched_class->task_wake_up(rq, p);
@@ -2734,10 +2773,10 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
2734 } else { 2773 } else {
2735 if (rq1 < rq2) { 2774 if (rq1 < rq2) {
2736 spin_lock(&rq1->lock); 2775 spin_lock(&rq1->lock);
2737 spin_lock(&rq2->lock); 2776 spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
2738 } else { 2777 } else {
2739 spin_lock(&rq2->lock); 2778 spin_lock(&rq2->lock);
2740 spin_lock(&rq1->lock); 2779 spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
2741 } 2780 }
2742 } 2781 }
2743 update_rq_clock(rq1); 2782 update_rq_clock(rq1);
@@ -2780,14 +2819,21 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
2780 if (busiest < this_rq) { 2819 if (busiest < this_rq) {
2781 spin_unlock(&this_rq->lock); 2820 spin_unlock(&this_rq->lock);
2782 spin_lock(&busiest->lock); 2821 spin_lock(&busiest->lock);
2783 spin_lock(&this_rq->lock); 2822 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
2784 ret = 1; 2823 ret = 1;
2785 } else 2824 } else
2786 spin_lock(&busiest->lock); 2825 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
2787 } 2826 }
2788 return ret; 2827 return ret;
2789} 2828}
2790 2829
2830static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
2831 __releases(busiest->lock)
2832{
2833 spin_unlock(&busiest->lock);
2834 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
2835}
2836
2791/* 2837/*
2792 * If dest_cpu is allowed for this process, migrate the task to it. 2838 * If dest_cpu is allowed for this process, migrate the task to it.
2793 * This is accomplished by forcing the cpu_allowed mask to only 2839 * This is accomplished by forcing the cpu_allowed mask to only
@@ -2849,7 +2895,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
2849 * Note that idle threads have a prio of MAX_PRIO, for this test 2895 * Note that idle threads have a prio of MAX_PRIO, for this test
2850 * to be always true for them. 2896 * to be always true for them.
2851 */ 2897 */
2852 check_preempt_curr(this_rq, p); 2898 check_preempt_curr(this_rq, p, 0);
2853} 2899}
2854 2900
2855/* 2901/*
@@ -3612,7 +3658,7 @@ redo:
3612 ld_moved = move_tasks(this_rq, this_cpu, busiest, 3658 ld_moved = move_tasks(this_rq, this_cpu, busiest,
3613 imbalance, sd, CPU_NEWLY_IDLE, 3659 imbalance, sd, CPU_NEWLY_IDLE,
3614 &all_pinned); 3660 &all_pinned);
3615 spin_unlock(&busiest->lock); 3661 double_unlock_balance(this_rq, busiest);
3616 3662
3617 if (unlikely(all_pinned)) { 3663 if (unlikely(all_pinned)) {
3618 cpu_clear(cpu_of(busiest), *cpus); 3664 cpu_clear(cpu_of(busiest), *cpus);
@@ -3727,7 +3773,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3727 else 3773 else
3728 schedstat_inc(sd, alb_failed); 3774 schedstat_inc(sd, alb_failed);
3729 } 3775 }
3730 spin_unlock(&target_rq->lock); 3776 double_unlock_balance(busiest_rq, target_rq);
3731} 3777}
3732 3778
3733#ifdef CONFIG_NO_HZ 3779#ifdef CONFIG_NO_HZ
@@ -4148,6 +4194,65 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
4148} 4194}
4149 4195
4150/* 4196/*
4197 * Use precise platform statistics if available:
4198 */
4199#ifdef CONFIG_VIRT_CPU_ACCOUNTING
4200cputime_t task_utime(struct task_struct *p)
4201{
4202 return p->utime;
4203}
4204
4205cputime_t task_stime(struct task_struct *p)
4206{
4207 return p->stime;
4208}
4209#else
4210cputime_t task_utime(struct task_struct *p)
4211{
4212 clock_t utime = cputime_to_clock_t(p->utime),
4213 total = utime + cputime_to_clock_t(p->stime);
4214 u64 temp;
4215
4216 /*
4217 * Use CFS's precise accounting:
4218 */
4219 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
4220
4221 if (total) {
4222 temp *= utime;
4223 do_div(temp, total);
4224 }
4225 utime = (clock_t)temp;
4226
4227 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
4228 return p->prev_utime;
4229}
4230
4231cputime_t task_stime(struct task_struct *p)
4232{
4233 clock_t stime;
4234
4235 /*
4236 * Use CFS's precise accounting. (we subtract utime from
4237 * the total, to make sure the total observed by userspace
4238 * grows monotonically - apps rely on that):
4239 */
4240 stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
4241 cputime_to_clock_t(task_utime(p));
4242
4243 if (stime >= 0)
4244 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
4245
4246 return p->prev_stime;
4247}
4248#endif
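The split performed by task_utime()/task_stime() above is easiest to see with small made-up numbers (clock_t units):

	/*
	 * utime' = sum_exec_runtime * utime / (utime + stime)
	 *
	 * e.g. sum_exec_runtime = 1000, utime = 30, stime = 10
	 *      utime' = 1000 * 30 / 40 = 750
	 *      stime' = 1000 - 750    = 250
	 *
	 * prev_utime/prev_stime only ever move forward (max()), so the
	 * values reported to userspace stay monotonic.
	 */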
4249
4250inline cputime_t task_gtime(struct task_struct *p)
4251{
4252 return p->gtime;
4253}
4254
4255/*
4151 * This function gets called by the timer code, with HZ frequency. 4256 * This function gets called by the timer code, with HZ frequency.
4152 * We call it with interrupts disabled. 4257 * We call it with interrupts disabled.
4153 * 4258 *
@@ -4537,6 +4642,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4537} 4642}
4538EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 4643EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
4539 4644
4645/**
4646 * complete: - signals a single thread waiting on this completion
4647 * @x: holds the state of this particular completion
4648 *
4649 * This will wake up a single thread waiting on this completion. Threads will be
4650 * awakened in the same order in which they were queued.
4651 *
4652 * See also complete_all(), wait_for_completion() and related routines.
4653 */
4540void complete(struct completion *x) 4654void complete(struct completion *x)
4541{ 4655{
4542 unsigned long flags; 4656 unsigned long flags;
@@ -4548,6 +4662,12 @@ void complete(struct completion *x)
4548} 4662}
4549EXPORT_SYMBOL(complete); 4663EXPORT_SYMBOL(complete);
4550 4664
4665/**
4666 * complete_all: - signals all threads waiting on this completion
4667 * @x: holds the state of this particular completion
4668 *
4669 * This will wake up all threads waiting on this particular completion event.
4670 */
4551void complete_all(struct completion *x) 4671void complete_all(struct completion *x)
4552{ 4672{
4553 unsigned long flags; 4673 unsigned long flags;
@@ -4568,10 +4688,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
4568 wait.flags |= WQ_FLAG_EXCLUSIVE; 4688 wait.flags |= WQ_FLAG_EXCLUSIVE;
4569 __add_wait_queue_tail(&x->wait, &wait); 4689 __add_wait_queue_tail(&x->wait, &wait);
4570 do { 4690 do {
4571 if ((state == TASK_INTERRUPTIBLE && 4691 if (signal_pending_state(state, current)) {
4572 signal_pending(current)) ||
4573 (state == TASK_KILLABLE &&
4574 fatal_signal_pending(current))) {
4575 timeout = -ERESTARTSYS; 4692 timeout = -ERESTARTSYS;
4576 break; 4693 break;
4577 } 4694 }
@@ -4599,12 +4716,31 @@ wait_for_common(struct completion *x, long timeout, int state)
4599 return timeout; 4716 return timeout;
4600} 4717}
4601 4718
4719/**
4720 * wait_for_completion: - waits for completion of a task
4721 * @x: holds the state of this particular completion
4722 *
4723 * This waits to be signaled for completion of a specific task. It is NOT
4724 * interruptible and there is no timeout.
4725 *
4726 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
4727 * and interrupt capability. Also see complete().
4728 */
4602void __sched wait_for_completion(struct completion *x) 4729void __sched wait_for_completion(struct completion *x)
4603{ 4730{
4604 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); 4731 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
4605} 4732}
4606EXPORT_SYMBOL(wait_for_completion); 4733EXPORT_SYMBOL(wait_for_completion);
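The intended pairing, as a sketch with invented names: one context signals with complete(), the other blocks in wait_for_completion() until that happens.

	static void example_sync(void)
	{
		DECLARE_COMPLETION_ONSTACK(done);

		queue_example_work(&done);	/* assumed to end with complete(&done) */
		wait_for_completion(&done);	/* uninterruptible sleep until then */
	}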
4607 4734
4735/**
4736 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
4737 * @x: holds the state of this particular completion
4738 * @timeout: timeout value in jiffies
4739 *
4740 * This waits for either a completion of a specific task to be signaled or for a
4741 * specified timeout to expire. The timeout is in jiffies. It is not
4742 * interruptible.
4743 */
4608unsigned long __sched 4744unsigned long __sched
4609wait_for_completion_timeout(struct completion *x, unsigned long timeout) 4745wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4610{ 4746{
@@ -4612,6 +4748,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4612} 4748}
4613EXPORT_SYMBOL(wait_for_completion_timeout); 4749EXPORT_SYMBOL(wait_for_completion_timeout);
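Typical return-value handling for the timeout variant, reusing the hypothetical completion from the previous sketch (a zero return means the timeout expired; otherwise the remaining jiffies are handed back):

	unsigned long left;

	left = wait_for_completion_timeout(&done, msecs_to_jiffies(100));
	if (!left)
		printk(KERN_WARNING "example: wait for 'done' timed out\n");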
4614 4750
4751/**
4752 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
4753 * @x: holds the state of this particular completion
4754 *
4755 * This waits for completion of a specific task to be signaled. It is
4756 * interruptible.
4757 */
4615int __sched wait_for_completion_interruptible(struct completion *x) 4758int __sched wait_for_completion_interruptible(struct completion *x)
4616{ 4759{
4617 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); 4760 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
@@ -4621,6 +4764,14 @@ int __sched wait_for_completion_interruptible(struct completion *x)
4621} 4764}
4622EXPORT_SYMBOL(wait_for_completion_interruptible); 4765EXPORT_SYMBOL(wait_for_completion_interruptible);
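The interruptible flavour bails out with an error when a signal arrives instead of completing the wait; a caller sketch (again with the hypothetical completion):

	int err;

	err = wait_for_completion_interruptible(&done);
	if (err)		/* -ERESTARTSYS: a signal interrupted the wait */
		return err;

wait_for_completion_killable() below is handled the same way by callers, except that only fatal signals break the wait.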
4623 4766
4767/**
4768 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
4769 * @x: holds the state of this particular completion
4770 * @timeout: timeout value in jiffies
4771 *
4772 * This waits for either a completion of a specific task to be signaled or for a
4773 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4774 */
4624unsigned long __sched 4775unsigned long __sched
4625wait_for_completion_interruptible_timeout(struct completion *x, 4776wait_for_completion_interruptible_timeout(struct completion *x,
4626 unsigned long timeout) 4777 unsigned long timeout)
@@ -4629,6 +4780,13 @@ wait_for_completion_interruptible_timeout(struct completion *x,
4629} 4780}
4630EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 4781EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4631 4782
4783/**
4784 * wait_for_completion_killable: - waits for completion of a task (killable)
4785 * @x: holds the state of this particular completion
4786 *
4787 * This waits to be signaled for completion of a specific task. It can be
4788 * interrupted by a kill signal.
4789 */
4632int __sched wait_for_completion_killable(struct completion *x) 4790int __sched wait_for_completion_killable(struct completion *x)
4633{ 4791{
4634 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); 4792 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
@@ -4638,6 +4796,52 @@ int __sched wait_for_completion_killable(struct completion *x)
4638} 4796}
4639EXPORT_SYMBOL(wait_for_completion_killable); 4797EXPORT_SYMBOL(wait_for_completion_killable);
4640 4798
4799/**
4800 * try_wait_for_completion - try to decrement a completion without blocking
4801 * @x: completion structure
4802 *
4803 * Returns: 0 if a decrement cannot be done without blocking
4804 * 1 if a decrement succeeded.
4805 *
4806 * If a completion is being used as a counting completion,
4807 * attempt to decrement the counter without blocking. This
4808 * enables us to avoid waiting if the resource the completion
4809 * is protecting is not available.
4810 */
4811bool try_wait_for_completion(struct completion *x)
4812{
4813 int ret = 1;
4814
4815 spin_lock_irq(&x->wait.lock);
4816 if (!x->done)
4817 ret = 0;
4818 else
4819 x->done--;
4820 spin_unlock_irq(&x->wait.lock);
4821 return ret;
4822}
4823EXPORT_SYMBOL(try_wait_for_completion);
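The counting-completion use described in the comment, sketched with an invented resource and helper:

	if (try_wait_for_completion(&resource_ready)) {
		use_resource();		/* hypothetical: a slot was available */
	} else {
		/*
		 * Would have blocked: defer the work, poll again later, or
		 * fall back to wait_for_completion(&resource_ready).
		 */
	}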
4824
4825/**
4826 * completion_done - Test to see if a completion has any waiters
4827 * @x: completion structure
4828 *
4829 * Returns: 0 if there are waiters (wait_for_completion() in progress)
4830 * 1 if there are no waiters.
4831 *
4832 */
4833bool completion_done(struct completion *x)
4834{
4835 int ret = 1;
4836
4837 spin_lock_irq(&x->wait.lock);
4838 if (!x->done)
4839 ret = 0;
4840 spin_unlock_irq(&x->wait.lock);
4841 return ret;
4842}
4843EXPORT_SYMBOL(completion_done);
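completion_done() is the non-consuming counterpart: it only peeks at the state, so it suits "complete only if someone still needs it" checks. A sketch:

	if (!completion_done(&done))	/* 0: a waiter is (or will be) blocked */
		complete(&done);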
4844
4641static long __sched 4845static long __sched
4642sleep_on_common(wait_queue_head_t *q, int state, long timeout) 4846sleep_on_common(wait_queue_head_t *q, int state, long timeout)
4643{ 4847{
@@ -4979,19 +5183,22 @@ recheck:
4979 return -EPERM; 5183 return -EPERM;
4980 } 5184 }
4981 5185
5186 if (user) {
4982#ifdef CONFIG_RT_GROUP_SCHED 5187#ifdef CONFIG_RT_GROUP_SCHED
4983 /* 5188 /*
4984 * Do not allow realtime tasks into groups that have no runtime 5189 * Do not allow realtime tasks into groups that have no runtime
4985 * assigned. 5190 * assigned.
4986 */ 5191 */
4987 if (user 5192 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4988 && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) 5193 task_group(p)->rt_bandwidth.rt_runtime == 0)
4989 return -EPERM; 5194 return -EPERM;
4990#endif 5195#endif
4991 5196
4992 retval = security_task_setscheduler(p, policy, param); 5197 retval = security_task_setscheduler(p, policy, param);
4993 if (retval) 5198 if (retval)
4994 return retval; 5199 return retval;
5200 }
5201
4995 /* 5202 /*
4996 * make sure no PI-waiters arrive (or leave) while we are 5203 * make sure no PI-waiters arrive (or leave) while we are
4997 * changing the priority of the task: 5204 * changing the priority of the task:
@@ -5707,6 +5914,8 @@ static inline void sched_init_granularity(void)
5707 sysctl_sched_latency = limit; 5914 sysctl_sched_latency = limit;
5708 5915
5709 sysctl_sched_wakeup_granularity *= factor; 5916 sysctl_sched_wakeup_granularity *= factor;
5917
5918 sysctl_sched_shares_ratelimit *= factor;
5710} 5919}
5711 5920
5712#ifdef CONFIG_SMP 5921#ifdef CONFIG_SMP
@@ -5817,7 +6026,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5817 set_task_cpu(p, dest_cpu); 6026 set_task_cpu(p, dest_cpu);
5818 if (on_rq) { 6027 if (on_rq) {
5819 activate_task(rq_dest, p, 0); 6028 activate_task(rq_dest, p, 0);
5820 check_preempt_curr(rq_dest, p); 6029 check_preempt_curr(rq_dest, p, 0);
5821 } 6030 }
5822done: 6031done:
5823 ret = 1; 6032 ret = 1;
@@ -6142,7 +6351,7 @@ set_table_entry(struct ctl_table *entry,
6142static struct ctl_table * 6351static struct ctl_table *
6143sd_alloc_ctl_domain_table(struct sched_domain *sd) 6352sd_alloc_ctl_domain_table(struct sched_domain *sd)
6144{ 6353{
6145 struct ctl_table *table = sd_alloc_ctl_entry(12); 6354 struct ctl_table *table = sd_alloc_ctl_entry(13);
6146 6355
6147 if (table == NULL) 6356 if (table == NULL)
6148 return NULL; 6357 return NULL;
@@ -6170,7 +6379,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
6170 sizeof(int), 0644, proc_dointvec_minmax); 6379 sizeof(int), 0644, proc_dointvec_minmax);
6171 set_table_entry(&table[10], "flags", &sd->flags, 6380 set_table_entry(&table[10], "flags", &sd->flags,
6172 sizeof(int), 0644, proc_dointvec_minmax); 6381 sizeof(int), 0644, proc_dointvec_minmax);
6173 /* &table[11] is terminator */ 6382 set_table_entry(&table[11], "name", sd->name,
6383 CORENAME_MAX_SIZE, 0444, proc_dostring);
6384 /* &table[12] is terminator */
6174 6385
6175 return table; 6386 return table;
6176} 6387}
@@ -6389,7 +6600,7 @@ static struct notifier_block __cpuinitdata migration_notifier = {
6389 .priority = 10 6600 .priority = 10
6390}; 6601};
6391 6602
6392void __init migration_init(void) 6603static int __init migration_init(void)
6393{ 6604{
6394 void *cpu = (void *)(long)smp_processor_id(); 6605 void *cpu = (void *)(long)smp_processor_id();
6395 int err; 6606 int err;
@@ -6399,7 +6610,10 @@ void __init migration_init(void)
6399 BUG_ON(err == NOTIFY_BAD); 6610 BUG_ON(err == NOTIFY_BAD);
6400 migration_call(&migration_notifier, CPU_ONLINE, cpu); 6611 migration_call(&migration_notifier, CPU_ONLINE, cpu);
6401 register_cpu_notifier(&migration_notifier); 6612 register_cpu_notifier(&migration_notifier);
6613
6614 return err;
6402} 6615}
6616early_initcall(migration_init);
6403#endif 6617#endif
6404 6618
6405#ifdef CONFIG_SMP 6619#ifdef CONFIG_SMP
@@ -7051,13 +7265,21 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7051 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 7265 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
7052 */ 7266 */
7053 7267
7268#ifdef CONFIG_SCHED_DEBUG
7269# define SD_INIT_NAME(sd, type) sd->name = #type
7270#else
7271# define SD_INIT_NAME(sd, type) do { } while (0)
7272#endif
7273
7054#define SD_INIT(sd, type) sd_init_##type(sd) 7274#define SD_INIT(sd, type) sd_init_##type(sd)
7275
7055#define SD_INIT_FUNC(type) \ 7276#define SD_INIT_FUNC(type) \
7056static noinline void sd_init_##type(struct sched_domain *sd) \ 7277static noinline void sd_init_##type(struct sched_domain *sd) \
7057{ \ 7278{ \
7058 memset(sd, 0, sizeof(*sd)); \ 7279 memset(sd, 0, sizeof(*sd)); \
7059 *sd = SD_##type##_INIT; \ 7280 *sd = SD_##type##_INIT; \
7060 sd->level = SD_LV_##type; \ 7281 sd->level = SD_LV_##type; \
7282 SD_INIT_NAME(sd, type); \
7061} 7283}
7062 7284
7063SD_INIT_FUNC(CPU) 7285SD_INIT_FUNC(CPU)
@@ -7553,24 +7775,27 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7553 * and partition_sched_domains() will fallback to the single partition 7775 * and partition_sched_domains() will fallback to the single partition
7554 * 'fallback_doms', it also forces the domains to be rebuilt. 7776 * 'fallback_doms', it also forces the domains to be rebuilt.
7555 * 7777 *
7778 * If doms_new==NULL it will be replaced with cpu_online_map.
7779 * ndoms_new==0 is a special case for destroying existing domains.
7780 * It will not create the default domain.
7781 *
7556 * Call with hotplug lock held 7782 * Call with hotplug lock held
7557 */ 7783 */
7558void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, 7784void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
7559 struct sched_domain_attr *dattr_new) 7785 struct sched_domain_attr *dattr_new)
7560{ 7786{
7561 int i, j; 7787 int i, j, n;
7562 7788
7563 mutex_lock(&sched_domains_mutex); 7789 mutex_lock(&sched_domains_mutex);
7564 7790
7565 /* always unregister in case we don't destroy any domains */ 7791 /* always unregister in case we don't destroy any domains */
7566 unregister_sched_domain_sysctl(); 7792 unregister_sched_domain_sysctl();
7567 7793
7568 if (doms_new == NULL) 7794 n = doms_new ? ndoms_new : 0;
7569 ndoms_new = 0;
7570 7795
7571 /* Destroy deleted domains */ 7796 /* Destroy deleted domains */
7572 for (i = 0; i < ndoms_cur; i++) { 7797 for (i = 0; i < ndoms_cur; i++) {
7573 for (j = 0; j < ndoms_new; j++) { 7798 for (j = 0; j < n; j++) {
7574 if (cpus_equal(doms_cur[i], doms_new[j]) 7799 if (cpus_equal(doms_cur[i], doms_new[j])
7575 && dattrs_equal(dattr_cur, i, dattr_new, j)) 7800 && dattrs_equal(dattr_cur, i, dattr_new, j))
7576 goto match1; 7801 goto match1;
@@ -7583,7 +7808,6 @@ match1:
7583 7808
7584 if (doms_new == NULL) { 7809 if (doms_new == NULL) {
7585 ndoms_cur = 0; 7810 ndoms_cur = 0;
7586 ndoms_new = 1;
7587 doms_new = &fallback_doms; 7811 doms_new = &fallback_doms;
7588 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); 7812 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
7589 dattr_new = NULL; 7813 dattr_new = NULL;
@@ -7620,8 +7844,13 @@ match2:
7620int arch_reinit_sched_domains(void) 7844int arch_reinit_sched_domains(void)
7621{ 7845{
7622 get_online_cpus(); 7846 get_online_cpus();
7847
7848 /* Destroy domains first to force the rebuild */
7849 partition_sched_domains(0, NULL, NULL);
7850
7623 rebuild_sched_domains(); 7851 rebuild_sched_domains();
7624 put_online_cpus(); 7852 put_online_cpus();
7853
7625 return 0; 7854 return 0;
7626} 7855}
7627 7856
@@ -7643,34 +7872,34 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7643} 7872}
7644 7873
7645#ifdef CONFIG_SCHED_MC 7874#ifdef CONFIG_SCHED_MC
7646static ssize_t sched_mc_power_savings_show(struct sys_device *dev, 7875static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
7647 struct sysdev_attribute *attr, char *page) 7876 char *page)
7648{ 7877{
7649 return sprintf(page, "%u\n", sched_mc_power_savings); 7878 return sprintf(page, "%u\n", sched_mc_power_savings);
7650} 7879}
7651static ssize_t sched_mc_power_savings_store(struct sys_device *dev, 7880static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
7652 struct sysdev_attribute *attr,
7653 const char *buf, size_t count) 7881 const char *buf, size_t count)
7654{ 7882{
7655 return sched_power_savings_store(buf, count, 0); 7883 return sched_power_savings_store(buf, count, 0);
7656} 7884}
7657static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, 7885static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
7658 sched_mc_power_savings_store); 7886 sched_mc_power_savings_show,
7887 sched_mc_power_savings_store);
7659#endif 7888#endif
7660 7889
7661#ifdef CONFIG_SCHED_SMT 7890#ifdef CONFIG_SCHED_SMT
7662static ssize_t sched_smt_power_savings_show(struct sys_device *dev, 7891static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
7663 struct sysdev_attribute *attr, char *page) 7892 char *page)
7664{ 7893{
7665 return sprintf(page, "%u\n", sched_smt_power_savings); 7894 return sprintf(page, "%u\n", sched_smt_power_savings);
7666} 7895}
7667static ssize_t sched_smt_power_savings_store(struct sys_device *dev, 7896static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
7668 struct sysdev_attribute *attr,
7669 const char *buf, size_t count) 7897 const char *buf, size_t count)
7670{ 7898{
7671 return sched_power_savings_store(buf, count, 1); 7899 return sched_power_savings_store(buf, count, 1);
7672} 7900}
7673static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, 7901static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
7902 sched_smt_power_savings_show,
7674 sched_smt_power_savings_store); 7903 sched_smt_power_savings_store);
7675#endif 7904#endif
7676 7905
@@ -7705,7 +7934,7 @@ static int update_sched_domains(struct notifier_block *nfb,
7705 case CPU_ONLINE_FROZEN: 7934 case CPU_ONLINE_FROZEN:
7706 case CPU_DEAD: 7935 case CPU_DEAD:
7707 case CPU_DEAD_FROZEN: 7936 case CPU_DEAD_FROZEN:
7708 partition_sched_domains(0, NULL, NULL); 7937 partition_sched_domains(1, NULL, NULL);
7709 return NOTIFY_OK; 7938 return NOTIFY_OK;
7710 7939
7711 default: 7940 default:
@@ -7970,7 +8199,6 @@ void __init sched_init(void)
7970 8199
7971 rq = cpu_rq(i); 8200 rq = cpu_rq(i);
7972 spin_lock_init(&rq->lock); 8201 spin_lock_init(&rq->lock);
7973 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
7974 rq->nr_running = 0; 8202 rq->nr_running = 0;
7975 init_cfs_rq(&rq->cfs, rq); 8203 init_cfs_rq(&rq->cfs, rq);
7976 init_rt_rq(&rq->rt, rq); 8204 init_rt_rq(&rq->rt, rq);
@@ -8093,20 +8321,25 @@ void __might_sleep(char *file, int line)
8093#ifdef in_atomic 8321#ifdef in_atomic
8094 static unsigned long prev_jiffy; /* ratelimiting */ 8322 static unsigned long prev_jiffy; /* ratelimiting */
8095 8323
8096 if ((in_atomic() || irqs_disabled()) && 8324 if ((!in_atomic() && !irqs_disabled()) ||
8097 system_state == SYSTEM_RUNNING && !oops_in_progress) { 8325 system_state != SYSTEM_RUNNING || oops_in_progress)
8098 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 8326 return;
8099 return; 8327 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8100 prev_jiffy = jiffies; 8328 return;
8101 printk(KERN_ERR "BUG: sleeping function called from invalid" 8329 prev_jiffy = jiffies;
8102 " context at %s:%d\n", file, line); 8330
8103 printk("in_atomic():%d, irqs_disabled():%d\n", 8331 printk(KERN_ERR
8104 in_atomic(), irqs_disabled()); 8332 "BUG: sleeping function called from invalid context at %s:%d\n",
8105 debug_show_held_locks(current); 8333 file, line);
8106 if (irqs_disabled()) 8334 printk(KERN_ERR
8107 print_irqtrace_events(current); 8335 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
8108 dump_stack(); 8336 in_atomic(), irqs_disabled(),
8109 } 8337 current->pid, current->comm);
8338
8339 debug_show_held_locks(current);
8340 if (irqs_disabled())
8341 print_irqtrace_events(current);
8342 dump_stack();
8110#endif 8343#endif
8111} 8344}
8112EXPORT_SYMBOL(__might_sleep); 8345EXPORT_SYMBOL(__might_sleep);
@@ -8427,8 +8660,8 @@ struct task_group *sched_create_group(struct task_group *parent)
8427 WARN_ON(!parent); /* root should already exist */ 8660 WARN_ON(!parent); /* root should already exist */
8428 8661
8429 tg->parent = parent; 8662 tg->parent = parent;
8430 list_add_rcu(&tg->siblings, &parent->children);
8431 INIT_LIST_HEAD(&tg->children); 8663 INIT_LIST_HEAD(&tg->children);
8664 list_add_rcu(&tg->siblings, &parent->children);
8432 spin_unlock_irqrestore(&task_group_lock, flags); 8665 spin_unlock_irqrestore(&task_group_lock, flags);
8433 8666
8434 return tg; 8667 return tg;
@@ -8604,73 +8837,95 @@ static DEFINE_MUTEX(rt_constraints_mutex);
8604static unsigned long to_ratio(u64 period, u64 runtime) 8837static unsigned long to_ratio(u64 period, u64 runtime)
8605{ 8838{
8606 if (runtime == RUNTIME_INF) 8839 if (runtime == RUNTIME_INF)
8607 return 1ULL << 16; 8840 return 1ULL << 20;
8608 8841
8609 return div64_u64(runtime << 16, period); 8842 return div64_u64(runtime << 20, period);
8610} 8843}
8611 8844
8612#ifdef CONFIG_CGROUP_SCHED 8845/* Must be called with tasklist_lock held */
8613static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8846static inline int tg_has_rt_tasks(struct task_group *tg)
8614{ 8847{
8615 struct task_group *tgi, *parent = tg->parent; 8848 struct task_struct *g, *p;
8616 unsigned long total = 0;
8617 8849
8618 if (!parent) { 8850 do_each_thread(g, p) {
8619 if (global_rt_period() < period) 8851 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
8620 return 0; 8852 return 1;
8853 } while_each_thread(g, p);
8621 8854
8622 return to_ratio(period, runtime) < 8855 return 0;
8623 to_ratio(global_rt_period(), global_rt_runtime()); 8856}
8624 }
8625 8857
8626 if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) 8858struct rt_schedulable_data {
8627 return 0; 8859 struct task_group *tg;
8860 u64 rt_period;
8861 u64 rt_runtime;
8862};
8628 8863
8629 rcu_read_lock(); 8864static int tg_schedulable(struct task_group *tg, void *data)
8630 list_for_each_entry_rcu(tgi, &parent->children, siblings) { 8865{
8631 if (tgi == tg) 8866 struct rt_schedulable_data *d = data;
8632 continue; 8867 struct task_group *child;
8868 unsigned long total, sum = 0;
8869 u64 period, runtime;
8870
8871 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8872 runtime = tg->rt_bandwidth.rt_runtime;
8633 8873
8634 total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), 8874 if (tg == d->tg) {
8635 tgi->rt_bandwidth.rt_runtime); 8875 period = d->rt_period;
8876 runtime = d->rt_runtime;
8636 } 8877 }
8637 rcu_read_unlock();
8638 8878
8639 return total + to_ratio(period, runtime) <= 8879 /*
8640 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), 8880 * Cannot have more runtime than the period.
8641 parent->rt_bandwidth.rt_runtime); 8881 */
8642} 8882 if (runtime > period && runtime != RUNTIME_INF)
8643#elif defined CONFIG_USER_SCHED 8883 return -EINVAL;
8644static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8645{
8646 struct task_group *tgi;
8647 unsigned long total = 0;
8648 unsigned long global_ratio =
8649 to_ratio(global_rt_period(), global_rt_runtime());
8650 8884
8651 rcu_read_lock(); 8885 /*
8652 list_for_each_entry_rcu(tgi, &task_groups, list) { 8886 * Ensure we don't starve existing RT tasks.
8653 if (tgi == tg) 8887 */
8654 continue; 8888 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
8889 return -EBUSY;
8890
8891 total = to_ratio(period, runtime);
8892
8893 /*
8894 * Nobody can have more than the global setting allows.
8895 */
8896 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
8897 return -EINVAL;
8898
8899 /*
8900 * The sum of our children's runtime should not exceed our own.
8901 */
8902 list_for_each_entry_rcu(child, &tg->children, siblings) {
8903 period = ktime_to_ns(child->rt_bandwidth.rt_period);
8904 runtime = child->rt_bandwidth.rt_runtime;
8655 8905
8656 total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), 8906 if (child == d->tg) {
8657 tgi->rt_bandwidth.rt_runtime); 8907 period = d->rt_period;
8908 runtime = d->rt_runtime;
8909 }
8910
8911 sum += to_ratio(period, runtime);
8658 } 8912 }
8659 rcu_read_unlock();
8660 8913
8661 return total + to_ratio(period, runtime) < global_ratio; 8914 if (sum > total)
8915 return -EINVAL;
8916
8917 return 0;
8662} 8918}
8663#endif
8664 8919
8665/* Must be called with tasklist_lock held */ 8920static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8666static inline int tg_has_rt_tasks(struct task_group *tg)
8667{ 8921{
8668 struct task_struct *g, *p; 8922 struct rt_schedulable_data data = {
8669 do_each_thread(g, p) { 8923 .tg = tg,
8670 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) 8924 .rt_period = period,
8671 return 1; 8925 .rt_runtime = runtime,
8672 } while_each_thread(g, p); 8926 };
8673 return 0; 8927
8928 return walk_tg_tree(tg_schedulable, tg_nop, &data);
8674} 8929}
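To make the fixed-point ratio concrete (made-up numbers, using the 1<<20 scale introduced above):

	/*
	 * to_ratio(1s, 950ms) = (950ms << 20) / 1s ~= 0.95 * 2^20 ~= 996147
	 *
	 * A child group asking for 600ms/1s (~629146) next to a sibling
	 * asking for 500ms/1s (~524288) sums to ~1153434 > 996147, so
	 * tg_schedulable() rejects the second request with -EINVAL.
	 */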
8675 8930
8676static int tg_set_bandwidth(struct task_group *tg, 8931static int tg_set_bandwidth(struct task_group *tg,
@@ -8680,14 +8935,9 @@ static int tg_set_bandwidth(struct task_group *tg,
8680 8935
8681 mutex_lock(&rt_constraints_mutex); 8936 mutex_lock(&rt_constraints_mutex);
8682 read_lock(&tasklist_lock); 8937 read_lock(&tasklist_lock);
8683 if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { 8938 err = __rt_schedulable(tg, rt_period, rt_runtime);
8684 err = -EBUSY; 8939 if (err)
8685 goto unlock;
8686 }
8687 if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
8688 err = -EINVAL;
8689 goto unlock; 8940 goto unlock;
8690 }
8691 8941
8692 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8942 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8693 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 8943 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
@@ -8756,16 +9006,25 @@ long sched_group_rt_period(struct task_group *tg)
8756 9006
8757static int sched_rt_global_constraints(void) 9007static int sched_rt_global_constraints(void)
8758{ 9008{
8759 struct task_group *tg = &root_task_group; 9009 u64 runtime, period;
8760 u64 rt_runtime, rt_period;
8761 int ret = 0; 9010 int ret = 0;
8762 9011
8763 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 9012 if (sysctl_sched_rt_period <= 0)
8764 rt_runtime = tg->rt_bandwidth.rt_runtime; 9013 return -EINVAL;
9014
9015 runtime = global_rt_runtime();
9016 period = global_rt_period();
9017
9018 /*
9019 * Sanity check on the sysctl variables.
9020 */
9021 if (runtime > period && runtime != RUNTIME_INF)
9022 return -EINVAL;
8765 9023
8766 mutex_lock(&rt_constraints_mutex); 9024 mutex_lock(&rt_constraints_mutex);
8767 if (!__rt_schedulable(tg, rt_period, rt_runtime)) 9025 read_lock(&tasklist_lock);
8768 ret = -EINVAL; 9026 ret = __rt_schedulable(NULL, 0, 0);
9027 read_unlock(&tasklist_lock);
8769 mutex_unlock(&rt_constraints_mutex); 9028 mutex_unlock(&rt_constraints_mutex);
8770 9029
8771 return ret; 9030 return ret;
@@ -8776,6 +9035,9 @@ static int sched_rt_global_constraints(void)
8776 unsigned long flags; 9035 unsigned long flags;
8777 int i; 9036 int i;
8778 9037
9038 if (sysctl_sched_rt_period <= 0)
9039 return -EINVAL;
9040
8779 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 9041 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
8780 for_each_possible_cpu(i) { 9042 for_each_possible_cpu(i) {
8781 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 9043 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
@@ -8836,7 +9098,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8836 9098
8837 if (!cgrp->parent) { 9099 if (!cgrp->parent) {
8838 /* This is early initialization for the top cgroup */ 9100 /* This is early initialization for the top cgroup */
8839 init_task_group.css.cgroup = cgrp;
8840 return &init_task_group.css; 9101 return &init_task_group.css;
8841 } 9102 }
8842 9103
@@ -8845,9 +9106,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8845 if (IS_ERR(tg)) 9106 if (IS_ERR(tg))
8846 return ERR_PTR(-ENOMEM); 9107 return ERR_PTR(-ENOMEM);
8847 9108
8848 /* Bind the cgroup to task_group object we just created */
8849 tg->css.cgroup = cgrp;
8850
8851 return &tg->css; 9109 return &tg->css;
8852} 9110}
8853 9111
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 22ed55d1167f..e8ab096ddfe3 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -12,19 +12,17 @@
12 * 12 *
13 * Create a semi stable clock from a mixture of other events, including: 13 * Create a semi stable clock from a mixture of other events, including:
14 * - gtod 14 * - gtod
15 * - jiffies
16 * - sched_clock() 15 * - sched_clock()
17 * - explicit idle events 16 * - explicit idle events
18 * 17 *
19 * We use gtod as base and the unstable clock deltas. The deltas are filtered, 18 * We use gtod as base and the unstable clock deltas. The deltas are filtered,
20 * making it monotonic and keeping it within an expected window. This window 19 * making it monotonic and keeping it within an expected window.
21 * is set up using jiffies.
22 * 20 *
23 * Furthermore, explicit sleep and wakeup hooks allow us to account for time 21 * Furthermore, explicit sleep and wakeup hooks allow us to account for time
24 * that is otherwise invisible (TSC gets stopped). 22 * that is otherwise invisible (TSC gets stopped).
25 * 23 *
26 * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat 24 * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
27 * consistent between cpus (never more than 1 jiffies difference). 25 * consistent between cpus (never more than 2 jiffies difference).
28 */ 26 */
29#include <linux/sched.h> 27#include <linux/sched.h>
30#include <linux/percpu.h> 28#include <linux/percpu.h>
@@ -32,13 +30,19 @@
32#include <linux/ktime.h> 30#include <linux/ktime.h>
33#include <linux/module.h> 31#include <linux/module.h>
34 32
33/*
34 * Scheduler clock - returns current time in nanosec units.
35 * This is default implementation.
36 * Architectures and sub-architectures can override this.
37 */
38unsigned long long __attribute__((weak)) sched_clock(void)
39{
40 return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
41}
35 42
36#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 43static __read_mostly int sched_clock_running;
37 44
38#define MULTI_SHIFT 15 45#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
39/* Max is double, Min is 1/2 */
40#define MAX_MULTI (2LL << MULTI_SHIFT)
41#define MIN_MULTI (1LL << (MULTI_SHIFT-1))
42 46
43struct sched_clock_data { 47struct sched_clock_data {
44 /* 48 /*
@@ -48,15 +52,9 @@ struct sched_clock_data {
48 */ 52 */
49 raw_spinlock_t lock; 53 raw_spinlock_t lock;
50 54
51 unsigned long tick_jiffies;
52 u64 prev_raw;
53 u64 tick_raw; 55 u64 tick_raw;
54 u64 tick_gtod; 56 u64 tick_gtod;
55 u64 clock; 57 u64 clock;
56 s64 multi;
57#ifdef CONFIG_NO_HZ
58 int check_max;
59#endif
60}; 58};
61 59
62static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); 60static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
@@ -71,121 +69,69 @@ static inline struct sched_clock_data *cpu_sdc(int cpu)
71 return &per_cpu(sched_clock_data, cpu); 69 return &per_cpu(sched_clock_data, cpu);
72} 70}
73 71
74static __read_mostly int sched_clock_running;
75
76void sched_clock_init(void) 72void sched_clock_init(void)
77{ 73{
78 u64 ktime_now = ktime_to_ns(ktime_get()); 74 u64 ktime_now = ktime_to_ns(ktime_get());
79 unsigned long now_jiffies = jiffies;
80 int cpu; 75 int cpu;
81 76
82 for_each_possible_cpu(cpu) { 77 for_each_possible_cpu(cpu) {
83 struct sched_clock_data *scd = cpu_sdc(cpu); 78 struct sched_clock_data *scd = cpu_sdc(cpu);
84 79
85 scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 80 scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
86 scd->tick_jiffies = now_jiffies;
87 scd->prev_raw = 0;
88 scd->tick_raw = 0; 81 scd->tick_raw = 0;
89 scd->tick_gtod = ktime_now; 82 scd->tick_gtod = ktime_now;
90 scd->clock = ktime_now; 83 scd->clock = ktime_now;
91 scd->multi = 1 << MULTI_SHIFT;
92#ifdef CONFIG_NO_HZ
93 scd->check_max = 1;
94#endif
95 } 84 }
96 85
97 sched_clock_running = 1; 86 sched_clock_running = 1;
98} 87}
99 88
100#ifdef CONFIG_NO_HZ
101/* 89/*
102 * The dynamic ticks makes the delta jiffies inaccurate. This 90 * min,max except they take wrapping into account
103 * prevents us from checking the maximum time update.
104 * Disable the maximum check during stopped ticks.
105 */ 91 */
106void sched_clock_tick_stop(int cpu)
107{
108 struct sched_clock_data *scd = cpu_sdc(cpu);
109
110 scd->check_max = 0;
111}
112 92
113void sched_clock_tick_start(int cpu) 93static inline u64 wrap_min(u64 x, u64 y)
114{ 94{
115 struct sched_clock_data *scd = cpu_sdc(cpu); 95 return (s64)(x - y) < 0 ? x : y;
116
117 scd->check_max = 1;
118} 96}
119 97
120static int check_max(struct sched_clock_data *scd) 98static inline u64 wrap_max(u64 x, u64 y)
121{ 99{
122 return scd->check_max; 100 return (s64)(x - y) > 0 ? x : y;
123} 101}
124#else
125static int check_max(struct sched_clock_data *scd)
126{
127 return 1;
128}
129#endif /* CONFIG_NO_HZ */
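Why the casts in the new helpers matter, with hypothetical values: wrap_max() compares via signed subtraction, so a counter that has just wrapped still looks "later".

	/*
	 * x = 10 (just after a u64 wrap), y = ULLONG_MAX - 5 (just before it):
	 * x - y == 16, so (s64)(x - y) > 0 and wrap_max(x, y) == x,
	 * whereas a plain max(x, y) would pick the stale y.
	 */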
130 102
131/* 103/*
132 * update the percpu scd from the raw @now value 104 * update the percpu scd from the raw @now value
133 * 105 *
134 * - filter out backward motion 106 * - filter out backward motion
135 * - use jiffies to generate a min,max window to clip the raw values 107 * - use the GTOD tick value to create a window to filter crazy TSC values
136 */ 108 */
137static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *time) 109static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
138{ 110{
139 unsigned long now_jiffies = jiffies; 111 s64 delta = now - scd->tick_raw;
140 long delta_jiffies = now_jiffies - scd->tick_jiffies; 112 u64 clock, min_clock, max_clock;
141 u64 clock = scd->clock;
142 u64 min_clock, max_clock;
143 s64 delta = now - scd->prev_raw;
144 113
145 WARN_ON_ONCE(!irqs_disabled()); 114 WARN_ON_ONCE(!irqs_disabled());
146 115
147 /* 116 if (unlikely(delta < 0))
148 * At schedule tick the clock can be just under the gtod. We don't 117 delta = 0;
149 * want to push it too prematurely.
150 */
151 min_clock = scd->tick_gtod + (delta_jiffies * TICK_NSEC);
152 if (min_clock > TICK_NSEC)
153 min_clock -= TICK_NSEC / 2;
154
155 if (unlikely(delta < 0)) {
156 clock++;
157 goto out;
158 }
159 118
160 /* 119 /*
161 * The clock must stay within a jiffie of the gtod. 120 * scd->clock = clamp(scd->tick_gtod + delta,
162 * But since we may be at the start of a jiffy or the end of one 121 * max(scd->tick_gtod, scd->clock),
163 * we add another jiffy buffer. 122 * scd->tick_gtod + TICK_NSEC);
164 */ 123 */
165 max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC;
166 124
167 delta *= scd->multi; 125 clock = scd->tick_gtod + delta;
168 delta >>= MULTI_SHIFT; 126 min_clock = wrap_max(scd->tick_gtod, scd->clock);
127 max_clock = scd->tick_gtod + TICK_NSEC;
169 128
170 if (unlikely(clock + delta > max_clock) && check_max(scd)) { 129 clock = wrap_max(clock, min_clock);
171 if (clock < max_clock) 130 clock = wrap_min(clock, max_clock);
172 clock = max_clock;
173 else
174 clock++;
175 } else {
176 clock += delta;
177 }
178 131
179 out: 132 scd->clock = clock;
180 if (unlikely(clock < min_clock))
181 clock = min_clock;
182 133
183 if (time) 134 return scd->clock;
184 *time = clock;
185 else {
186 scd->prev_raw = now;
187 scd->clock = clock;
188 }
189} 135}
190 136
191static void lock_double_clock(struct sched_clock_data *data1, 137static void lock_double_clock(struct sched_clock_data *data1,
@@ -203,7 +149,7 @@ static void lock_double_clock(struct sched_clock_data *data1,
203u64 sched_clock_cpu(int cpu) 149u64 sched_clock_cpu(int cpu)
204{ 150{
205 struct sched_clock_data *scd = cpu_sdc(cpu); 151 struct sched_clock_data *scd = cpu_sdc(cpu);
206 u64 now, clock; 152 u64 now, clock, this_clock, remote_clock;
207 153
208 if (unlikely(!sched_clock_running)) 154 if (unlikely(!sched_clock_running))
209 return 0ull; 155 return 0ull;
@@ -212,43 +158,44 @@ u64 sched_clock_cpu(int cpu)
212 now = sched_clock(); 158 now = sched_clock();
213 159
214 if (cpu != raw_smp_processor_id()) { 160 if (cpu != raw_smp_processor_id()) {
215 /*
216 * in order to update a remote cpu's clock based on our
217 * unstable raw time rebase it against:
218 * tick_raw (offset between raw counters)
219 * tick_gotd (tick offset between cpus)
220 */
221 struct sched_clock_data *my_scd = this_scd(); 161 struct sched_clock_data *my_scd = this_scd();
222 162
223 lock_double_clock(scd, my_scd); 163 lock_double_clock(scd, my_scd);
224 164
225 now -= my_scd->tick_raw; 165 this_clock = __update_sched_clock(my_scd, now);
226 now += scd->tick_raw; 166 remote_clock = scd->clock;
227 167
228 now += my_scd->tick_gtod; 168 /*
229 now -= scd->tick_gtod; 169 * Use the opportunity that we have both locks
170 * taken to couple the two clocks: we take the
171 * larger time as the latest time for both
172 * runqueues. (this creates monotonic movement)
173 */
174 if (likely((s64)(remote_clock - this_clock) < 0)) {
175 clock = this_clock;
176 scd->clock = clock;
177 } else {
178 /*
179 * Should be rare, but possible:
180 */
181 clock = remote_clock;
182 my_scd->clock = remote_clock;
183 }
230 184
231 __raw_spin_unlock(&my_scd->lock); 185 __raw_spin_unlock(&my_scd->lock);
232
233 __update_sched_clock(scd, now, &clock);
234
235 __raw_spin_unlock(&scd->lock);
236
237 } else { 186 } else {
238 __raw_spin_lock(&scd->lock); 187 __raw_spin_lock(&scd->lock);
239 __update_sched_clock(scd, now, NULL); 188 clock = __update_sched_clock(scd, now);
240 clock = scd->clock;
241 __raw_spin_unlock(&scd->lock);
242 } 189 }
243 190
191 __raw_spin_unlock(&scd->lock);
192
244 return clock; 193 return clock;
245} 194}
246 195
247void sched_clock_tick(void) 196void sched_clock_tick(void)
248{ 197{
249 struct sched_clock_data *scd = this_scd(); 198 struct sched_clock_data *scd = this_scd();
250 unsigned long now_jiffies = jiffies;
251 s64 mult, delta_gtod, delta_raw;
252 u64 now, now_gtod; 199 u64 now, now_gtod;
253 200
254 if (unlikely(!sched_clock_running)) 201 if (unlikely(!sched_clock_running))
@@ -260,29 +207,9 @@ void sched_clock_tick(void)
260 now = sched_clock(); 207 now = sched_clock();
261 208
262 __raw_spin_lock(&scd->lock); 209 __raw_spin_lock(&scd->lock);
263 __update_sched_clock(scd, now, NULL);
264 /*
265 * update tick_gtod after __update_sched_clock() because that will
266 * already observe 1 new jiffy; adding a new tick_gtod to that would
267 * increase the clock 2 jiffies.
268 */
269 delta_gtod = now_gtod - scd->tick_gtod;
270 delta_raw = now - scd->tick_raw;
271
272 if ((long)delta_raw > 0) {
273 mult = delta_gtod << MULTI_SHIFT;
274 do_div(mult, delta_raw);
275 scd->multi = mult;
276 if (scd->multi > MAX_MULTI)
277 scd->multi = MAX_MULTI;
278 else if (scd->multi < MIN_MULTI)
279 scd->multi = MIN_MULTI;
280 } else
281 scd->multi = 1 << MULTI_SHIFT;
282
283 scd->tick_raw = now; 210 scd->tick_raw = now;
284 scd->tick_gtod = now_gtod; 211 scd->tick_gtod = now_gtod;
285 scd->tick_jiffies = now_jiffies; 212 __update_sched_clock(scd, now);
286 __raw_spin_unlock(&scd->lock); 213 __raw_spin_unlock(&scd->lock);
287} 214}
288 215
@@ -300,37 +227,28 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
300 */ 227 */
301void sched_clock_idle_wakeup_event(u64 delta_ns) 228void sched_clock_idle_wakeup_event(u64 delta_ns)
302{ 229{
303 struct sched_clock_data *scd = this_scd(); 230 sched_clock_tick();
304 u64 now = sched_clock();
305
306 /*
307 * Override the previous timestamp and ignore all
308 * sched_clock() deltas that occured while we idled,
309 * and use the PM-provided delta_ns to advance the
310 * rq clock:
311 */
312 __raw_spin_lock(&scd->lock);
313 scd->prev_raw = now;
314 scd->clock += delta_ns;
315 scd->multi = 1 << MULTI_SHIFT;
316 __raw_spin_unlock(&scd->lock);
317
318 touch_softlockup_watchdog(); 231 touch_softlockup_watchdog();
319} 232}
320EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); 233EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
321 234
322#endif 235#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
323 236
324/* 237void sched_clock_init(void)
325 * Scheduler clock - returns current time in nanosec units.
326 * This is default implementation.
327 * Architectures and sub-architectures can override this.
328 */
329unsigned long long __attribute__((weak)) sched_clock(void)
330{ 238{
331 return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); 239 sched_clock_running = 1;
332} 240}
333 241
242u64 sched_clock_cpu(int cpu)
243{
244 if (unlikely(!sched_clock_running))
245 return 0;
246
247 return sched_clock();
248}
249
250#endif
251
334unsigned long long cpu_clock(int cpu) 252unsigned long long cpu_clock(int cpu)
335{ 253{
336 unsigned long long clock; 254 unsigned long long clock;
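A note on the sched_clock.c change above: wrap_min()/wrap_max() compare u64 values through a signed difference, so the ordering stays correct even if the counters wrap, and __update_sched_clock() uses them to clamp the raw delta into [max(tick_gtod, clock), tick_gtod + TICK_NSEC]. A minimal userspace sketch of that clamp, with hypothetical numbers (not part of the patch):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;
typedef int64_t  s64;

/* wrap-safe min/max: compare via signed difference so values that
 * straddle the 64-bit wrap point are still ordered correctly */
static inline u64 wrap_min(u64 x, u64 y) { return (s64)(x - y) < 0 ? x : y; }
static inline u64 wrap_max(u64 x, u64 y) { return (s64)(x - y) > 0 ? x : y; }

int main(void)
{
        const u64 TICK_NSEC = 1000000;          /* 1 ms tick, i.e. HZ=1000 */

        /* hypothetical per-cpu snapshot taken at the last tick */
        u64 tick_gtod = 5000000000ull;          /* GTOD timestamp at the tick */
        u64 clock     = 5000300000ull;          /* last value handed out */
        u64 delta     = 2500000;                /* raw delta since the tick (too large) */

        u64 now = tick_gtod + delta;
        now = wrap_max(now, wrap_max(tick_gtod, clock));  /* filter backward motion */
        now = wrap_min(now, tick_gtod + TICK_NSEC);       /* cap at one tick ahead */

        printf("clamped clock: %llu\n", (unsigned long long)now);  /* prints 5001000000 */
        return 0;
}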
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index cf2cd6ce4cb2..18fd17172eb6 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -409,64 +409,6 @@ static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
409} 409}
410 410
411/* 411/*
412 * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
413 * that it favours >=0 over <0.
414 *
415 * -20 |
416 * |
417 * 0 --------+-------
418 * .'
419 * 19 .'
420 *
421 */
422static unsigned long
423calc_delta_asym(unsigned long delta, struct sched_entity *se)
424{
425 struct load_weight lw = {
426 .weight = NICE_0_LOAD,
427 .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
428 };
429
430 for_each_sched_entity(se) {
431 struct load_weight *se_lw = &se->load;
432 unsigned long rw = cfs_rq_of(se)->load.weight;
433
434#ifdef CONFIG_FAIR_SCHED_GROUP
435 struct cfs_rq *cfs_rq = se->my_q;
436 struct task_group *tg = NULL
437
438 if (cfs_rq)
439 tg = cfs_rq->tg;
440
441 if (tg && tg->shares < NICE_0_LOAD) {
442 /*
443 * scale shares to what it would have been had
444 * tg->weight been NICE_0_LOAD:
445 *
446 * weight = 1024 * shares / tg->weight
447 */
448 lw.weight *= se->load.weight;
449 lw.weight /= tg->shares;
450
451 lw.inv_weight = 0;
452
453 se_lw = &lw;
454 rw += lw.weight - se->load.weight;
455 } else
456#endif
457
458 if (se->load.weight < NICE_0_LOAD) {
459 se_lw = &lw;
460 rw += NICE_0_LOAD - se->load.weight;
461 }
462
463 delta = calc_delta_mine(delta, rw, se_lw);
464 }
465
466 return delta;
467}
468
469/*
470 * Update the current task's runtime statistics. Skip current tasks that 412 * Update the current task's runtime statistics. Skip current tasks that
471 * are not in our scheduling class. 413 * are not in our scheduling class.
472 */ 414 */
@@ -586,11 +528,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
586 update_load_add(&cfs_rq->load, se->load.weight); 528 update_load_add(&cfs_rq->load, se->load.weight);
587 if (!parent_entity(se)) 529 if (!parent_entity(se))
588 inc_cpu_load(rq_of(cfs_rq), se->load.weight); 530 inc_cpu_load(rq_of(cfs_rq), se->load.weight);
589 if (entity_is_task(se)) 531 if (entity_is_task(se)) {
590 add_cfs_task_weight(cfs_rq, se->load.weight); 532 add_cfs_task_weight(cfs_rq, se->load.weight);
533 list_add(&se->group_node, &cfs_rq->tasks);
534 }
591 cfs_rq->nr_running++; 535 cfs_rq->nr_running++;
592 se->on_rq = 1; 536 se->on_rq = 1;
593 list_add(&se->group_node, &cfs_rq->tasks);
594} 537}
595 538
596static void 539static void
@@ -599,11 +542,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
599 update_load_sub(&cfs_rq->load, se->load.weight); 542 update_load_sub(&cfs_rq->load, se->load.weight);
600 if (!parent_entity(se)) 543 if (!parent_entity(se))
601 dec_cpu_load(rq_of(cfs_rq), se->load.weight); 544 dec_cpu_load(rq_of(cfs_rq), se->load.weight);
602 if (entity_is_task(se)) 545 if (entity_is_task(se)) {
603 add_cfs_task_weight(cfs_rq, -se->load.weight); 546 add_cfs_task_weight(cfs_rq, -se->load.weight);
547 list_del_init(&se->group_node);
548 }
604 cfs_rq->nr_running--; 549 cfs_rq->nr_running--;
605 se->on_rq = 0; 550 se->on_rq = 0;
606 list_del_init(&se->group_node);
607} 551}
608 552
609static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 553static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -899,7 +843,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
899 * doesn't make sense. Rely on vruntime for fairness. 843 * doesn't make sense. Rely on vruntime for fairness.
900 */ 844 */
901 if (rq->curr != p) 845 if (rq->curr != p)
902 delta = max(10000LL, delta); 846 delta = max_t(s64, 10000LL, delta);
903 847
904 hrtick_start(rq, delta); 848 hrtick_start(rq, delta);
905 } 849 }
@@ -1085,7 +1029,6 @@ static long effective_load(struct task_group *tg, int cpu,
1085 long wl, long wg) 1029 long wl, long wg)
1086{ 1030{
1087 struct sched_entity *se = tg->se[cpu]; 1031 struct sched_entity *se = tg->se[cpu];
1088 long more_w;
1089 1032
1090 if (!tg->parent) 1033 if (!tg->parent)
1091 return wl; 1034 return wl;
@@ -1097,18 +1040,17 @@ static long effective_load(struct task_group *tg, int cpu,
1097 if (!wl && sched_feat(ASYM_EFF_LOAD)) 1040 if (!wl && sched_feat(ASYM_EFF_LOAD))
1098 return wl; 1041 return wl;
1099 1042
1100 /*
1101 * Instead of using this increment, also add the difference
1102 * between when the shares were last updated and now.
1103 */
1104 more_w = se->my_q->load.weight - se->my_q->rq_weight;
1105 wl += more_w;
1106 wg += more_w;
1107
1108 for_each_sched_entity(se) { 1043 for_each_sched_entity(se) {
1109#define D(n) (likely(n) ? (n) : 1)
1110
1111 long S, rw, s, a, b; 1044 long S, rw, s, a, b;
1045 long more_w;
1046
1047 /*
1048 * Instead of using this increment, also add the difference
1049 * between when the shares were last updated and now.
1050 */
1051 more_w = se->my_q->load.weight - se->my_q->rq_weight;
1052 wl += more_w;
1053 wg += more_w;
1112 1054
1113 S = se->my_q->tg->shares; 1055 S = se->my_q->tg->shares;
1114 s = se->my_q->shares; 1056 s = se->my_q->shares;
@@ -1117,7 +1059,11 @@ static long effective_load(struct task_group *tg, int cpu,
1117 a = S*(rw + wl); 1059 a = S*(rw + wl);
1118 b = S*rw + s*wg; 1060 b = S*rw + s*wg;
1119 1061
1120 wl = s*(a-b)/D(b); 1062 wl = s*(a-b);
1063
1064 if (likely(b))
1065 wl /= b;
1066
1121 /* 1067 /*
1122 * Assume the group is already running and will 1068 * Assume the group is already running and will
1123 * thus already be accounted for in the weight. 1069 * thus already be accounted for in the weight.
@@ -1126,7 +1072,6 @@ static long effective_load(struct task_group *tg, int cpu,
1126 * alter the group weight. 1072 * alter the group weight.
1127 */ 1073 */
1128 wg = 0; 1074 wg = 0;
1129#undef D
1130 } 1075 }
1131 1076
1132 return wl; 1077 return wl;
@@ -1143,7 +1088,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
1143#endif 1088#endif
1144 1089
1145static int 1090static int
1146wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, 1091wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1147 struct task_struct *p, int prev_cpu, int this_cpu, int sync, 1092 struct task_struct *p, int prev_cpu, int this_cpu, int sync,
1148 int idx, unsigned long load, unsigned long this_load, 1093 int idx, unsigned long load, unsigned long this_load,
1149 unsigned int imbalance) 1094 unsigned int imbalance)
@@ -1158,6 +1103,11 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
1158 if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) 1103 if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
1159 return 0; 1104 return 0;
1160 1105
1106 if (!sync && sched_feat(SYNC_WAKEUPS) &&
1107 curr->se.avg_overlap < sysctl_sched_migration_cost &&
1108 p->se.avg_overlap < sysctl_sched_migration_cost)
1109 sync = 1;
1110
1161 /* 1111 /*
1162 * If sync wakeup then subtract the (maximum possible) 1112 * If sync wakeup then subtract the (maximum possible)
1163 * effect of the currently running task from the load 1113 * effect of the currently running task from the load
@@ -1182,17 +1132,14 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
1182 * a reasonable amount of time then attract this newly 1132 * a reasonable amount of time then attract this newly
1183 * woken task: 1133 * woken task:
1184 */ 1134 */
1185 if (sync && balanced) { 1135 if (sync && balanced)
1186 if (curr->se.avg_overlap < sysctl_sched_migration_cost && 1136 return 1;
1187 p->se.avg_overlap < sysctl_sched_migration_cost)
1188 return 1;
1189 }
1190 1137
1191 schedstat_inc(p, se.nr_wakeups_affine_attempts); 1138 schedstat_inc(p, se.nr_wakeups_affine_attempts);
1192 tl_per_task = cpu_avg_load_per_task(this_cpu); 1139 tl_per_task = cpu_avg_load_per_task(this_cpu);
1193 1140
1194 if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || 1141 if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <=
1195 balanced) { 1142 tl_per_task)) {
1196 /* 1143 /*
1197 * This domain has SD_WAKE_AFFINE and 1144 * This domain has SD_WAKE_AFFINE and
1198 * p is cache cold in this domain, and 1145 * p is cache cold in this domain, and
@@ -1211,16 +1158,17 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
1211 struct sched_domain *sd, *this_sd = NULL; 1158 struct sched_domain *sd, *this_sd = NULL;
1212 int prev_cpu, this_cpu, new_cpu; 1159 int prev_cpu, this_cpu, new_cpu;
1213 unsigned long load, this_load; 1160 unsigned long load, this_load;
1214 struct rq *rq, *this_rq; 1161 struct rq *this_rq;
1215 unsigned int imbalance; 1162 unsigned int imbalance;
1216 int idx; 1163 int idx;
1217 1164
1218 prev_cpu = task_cpu(p); 1165 prev_cpu = task_cpu(p);
1219 rq = task_rq(p);
1220 this_cpu = smp_processor_id(); 1166 this_cpu = smp_processor_id();
1221 this_rq = cpu_rq(this_cpu); 1167 this_rq = cpu_rq(this_cpu);
1222 new_cpu = prev_cpu; 1168 new_cpu = prev_cpu;
1223 1169
1170 if (prev_cpu == this_cpu)
1171 goto out;
1224 /* 1172 /*
1225 * 'this_sd' is the first domain that both 1173 * 'this_sd' is the first domain that both
1226 * this_cpu and prev_cpu are present in: 1174 * this_cpu and prev_cpu are present in:
@@ -1248,13 +1196,10 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
1248 load = source_load(prev_cpu, idx); 1196 load = source_load(prev_cpu, idx);
1249 this_load = target_load(this_cpu, idx); 1197 this_load = target_load(this_cpu, idx);
1250 1198
1251 if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, 1199 if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
1252 load, this_load, imbalance)) 1200 load, this_load, imbalance))
1253 return this_cpu; 1201 return this_cpu;
1254 1202
1255 if (prev_cpu == this_cpu)
1256 goto out;
1257
1258 /* 1203 /*
1259 * Start passive balancing when half the imbalance_pct 1204 * Start passive balancing when half the imbalance_pct
1260 * limit is reached. 1205 * limit is reached.
@@ -1281,62 +1226,20 @@ static unsigned long wakeup_gran(struct sched_entity *se)
1281 * + nice tasks. 1226 * + nice tasks.
1282 */ 1227 */
1283 if (sched_feat(ASYM_GRAN)) 1228 if (sched_feat(ASYM_GRAN))
1284 gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); 1229 gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load);
1285 else
1286 gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
1287 1230
1288 return gran; 1231 return gran;
1289} 1232}
1290 1233
1291/* 1234/*
1292 * Should 'se' preempt 'curr'.
1293 *
1294 * |s1
1295 * |s2
1296 * |s3
1297 * g
1298 * |<--->|c
1299 *
1300 * w(c, s1) = -1
1301 * w(c, s2) = 0
1302 * w(c, s3) = 1
1303 *
1304 */
1305static int
1306wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
1307{
1308 s64 gran, vdiff = curr->vruntime - se->vruntime;
1309
1310 if (vdiff < 0)
1311 return -1;
1312
1313 gran = wakeup_gran(curr);
1314 if (vdiff > gran)
1315 return 1;
1316
1317 return 0;
1318}
1319
1320/* return depth at which a sched entity is present in the hierarchy */
1321static inline int depth_se(struct sched_entity *se)
1322{
1323 int depth = 0;
1324
1325 for_each_sched_entity(se)
1326 depth++;
1327
1328 return depth;
1329}
1330
1331/*
1332 * Preempt the current task with a newly woken task if needed: 1235 * Preempt the current task with a newly woken task if needed:
1333 */ 1236 */
1334static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) 1237static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1335{ 1238{
1336 struct task_struct *curr = rq->curr; 1239 struct task_struct *curr = rq->curr;
1337 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1240 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1338 struct sched_entity *se = &curr->se, *pse = &p->se; 1241 struct sched_entity *se = &curr->se, *pse = &p->se;
1339 int se_depth, pse_depth; 1242 s64 delta_exec;
1340 1243
1341 if (unlikely(rt_prio(p->prio))) { 1244 if (unlikely(rt_prio(p->prio))) {
1342 update_rq_clock(rq); 1245 update_rq_clock(rq);
@@ -1351,6 +1254,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
1351 cfs_rq_of(pse)->next = pse; 1254 cfs_rq_of(pse)->next = pse;
1352 1255
1353 /* 1256 /*
1257 * We can come here with TIF_NEED_RESCHED already set from new task
1258 * wake up path.
1259 */
1260 if (test_tsk_need_resched(curr))
1261 return;
1262
1263 /*
1354 * Batch tasks do not preempt (their preemption is driven by 1264 * Batch tasks do not preempt (their preemption is driven by
1355 * the tick): 1265 * the tick):
1356 */ 1266 */
@@ -1360,33 +1270,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
1360 if (!sched_feat(WAKEUP_PREEMPT)) 1270 if (!sched_feat(WAKEUP_PREEMPT))
1361 return; 1271 return;
1362 1272
1363 /* 1273 if (sched_feat(WAKEUP_OVERLAP) && (sync ||
1364 * preemption test can be made between sibling entities who are in the 1274 (se->avg_overlap < sysctl_sched_migration_cost &&
1365 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of 1275 pse->avg_overlap < sysctl_sched_migration_cost))) {
1366 * both tasks until we find their ancestors who are siblings of common 1276 resched_task(curr);
1367 * parent. 1277 return;
1368 */
1369
1370 /* First walk up until both entities are at same depth */
1371 se_depth = depth_se(se);
1372 pse_depth = depth_se(pse);
1373
1374 while (se_depth > pse_depth) {
1375 se_depth--;
1376 se = parent_entity(se);
1377 }
1378
1379 while (pse_depth > se_depth) {
1380 pse_depth--;
1381 pse = parent_entity(pse);
1382 }
1383
1384 while (!is_same_group(se, pse)) {
1385 se = parent_entity(se);
1386 pse = parent_entity(pse);
1387 } 1278 }
1388 1279
1389 if (wakeup_preempt_entity(se, pse) == 1) 1280 delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime;
1281 if (delta_exec > wakeup_gran(pse))
1390 resched_task(curr); 1282 resched_task(curr);
1391} 1283}
1392 1284
@@ -1442,18 +1334,13 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
1442 struct task_struct *p = NULL; 1334 struct task_struct *p = NULL;
1443 struct sched_entity *se; 1335 struct sched_entity *se;
1444 1336
1445 while (next != &cfs_rq->tasks) { 1337 if (next == &cfs_rq->tasks)
1446 se = list_entry(next, struct sched_entity, group_node); 1338 return NULL;
1447 next = next->next;
1448 1339
1449 /* Skip over entities that are not tasks */ 1340 se = list_entry(next, struct sched_entity, group_node);
1450 if (entity_is_task(se)) { 1341 p = task_of(se);
1451 p = task_of(se); 1342 cfs_rq->balance_iterator = next->next;
1452 break;
1453 }
1454 }
1455 1343
1456 cfs_rq->balance_iterator = next;
1457 return p; 1344 return p;
1458} 1345}
1459 1346
@@ -1502,7 +1389,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1502 rcu_read_lock(); 1389 rcu_read_lock();
1503 update_h_load(busiest_cpu); 1390 update_h_load(busiest_cpu);
1504 1391
1505 list_for_each_entry(tg, &task_groups, list) { 1392 list_for_each_entry_rcu(tg, &task_groups, list) {
1506 struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; 1393 struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
1507 unsigned long busiest_h_load = busiest_cfs_rq->h_load; 1394 unsigned long busiest_h_load = busiest_cfs_rq->h_load;
1508 unsigned long busiest_weight = busiest_cfs_rq->load.weight; 1395 unsigned long busiest_weight = busiest_cfs_rq->load.weight;
@@ -1615,10 +1502,10 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1615 * 'current' within the tree based on its new key value. 1502 * 'current' within the tree based on its new key value.
1616 */ 1503 */
1617 swap(curr->vruntime, se->vruntime); 1504 swap(curr->vruntime, se->vruntime);
1505 resched_task(rq->curr);
1618 } 1506 }
1619 1507
1620 enqueue_task_fair(rq, p, 0); 1508 enqueue_task_fair(rq, p, 0);
1621 resched_task(rq->curr);
1622} 1509}
1623 1510
1624/* 1511/*
@@ -1637,7 +1524,7 @@ static void prio_changed_fair(struct rq *rq, struct task_struct *p,
1637 if (p->prio > oldprio) 1524 if (p->prio > oldprio)
1638 resched_task(rq->curr); 1525 resched_task(rq->curr);
1639 } else 1526 } else
1640 check_preempt_curr(rq, p); 1527 check_preempt_curr(rq, p, 0);
1641} 1528}
1642 1529
1643/* 1530/*
@@ -1654,7 +1541,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p,
1654 if (running) 1541 if (running)
1655 resched_task(rq->curr); 1542 resched_task(rq->curr);
1656 else 1543 else
1657 check_preempt_curr(rq, p); 1544 check_preempt_curr(rq, p, 0);
1658} 1545}
1659 1546
1660/* Account for a task changing its policy or group. 1547/* Account for a task changing its policy or group.
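On the wakeup_gran() change above: with calc_delta_asym() removed, ASYM_GRAN now scales the granularity via calc_delta_mine(gran, NICE_0_LOAD, &se->load), which is roughly gran * NICE_0_LOAD / se->load.weight, so a heavier waking task sees a smaller threshold and preempts sooner. A rough userspace illustration (the non-1024 weights below are illustrative stand-ins, not quoted from the kernel's weight table):

#include <stdint.h>
#include <stdio.h>

#define NICE_0_LOAD 1024ull

/* simplified calc_delta_mine(): delta * weight / lw->weight, without
 * the fixed-point inverse weight the kernel uses for speed */
static uint64_t scale_gran(uint64_t gran_ns, uint64_t se_weight)
{
        return gran_ns * NICE_0_LOAD / se_weight;
}

int main(void)
{
        uint64_t gran = 5000000;        /* hypothetical 5 ms wakeup granularity */

        printf("nice-0 weight 1024  -> gran %8llu ns\n",
               (unsigned long long)scale_gran(gran, 1024));
        printf("heavier weight 3121 -> gran %8llu ns (preempts sooner)\n",
               (unsigned long long)scale_gran(gran, 3121));
        printf("lighter weight 335  -> gran %8llu ns (preempts later)\n",
               (unsigned long long)scale_gran(gran, 335));
        return 0;
}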
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 862b06bd560a..7c9e8f4a049f 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -8,6 +8,7 @@ SCHED_FEAT(SYNC_WAKEUPS, 1)
8SCHED_FEAT(HRTICK, 1) 8SCHED_FEAT(HRTICK, 1)
9SCHED_FEAT(DOUBLE_TICK, 0) 9SCHED_FEAT(DOUBLE_TICK, 0)
10SCHED_FEAT(ASYM_GRAN, 1) 10SCHED_FEAT(ASYM_GRAN, 1)
11SCHED_FEAT(LB_BIAS, 0) 11SCHED_FEAT(LB_BIAS, 1)
12SCHED_FEAT(LB_WAKEUP_UPDATE, 1) 12SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
13SCHED_FEAT(ASYM_EFF_LOAD, 1) 13SCHED_FEAT(ASYM_EFF_LOAD, 1)
14SCHED_FEAT(WAKEUP_OVERLAP, 0)
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 3a4f92dbbe66..dec4ccabe2f5 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync)
14/* 14/*
15 * Idle tasks are unconditionally rescheduled: 15 * Idle tasks are unconditionally rescheduled:
16 */ 16 */
17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p) 17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync)
18{ 18{
19 resched_task(rq->idle); 19 resched_task(rq->idle);
20} 20}
@@ -76,7 +76,7 @@ static void switched_to_idle(struct rq *rq, struct task_struct *p,
76 if (running) 76 if (running)
77 resched_task(rq->curr); 77 resched_task(rq->curr);
78 else 78 else
79 check_preempt_curr(rq, p); 79 check_preempt_curr(rq, p, 0);
80} 80}
81 81
82static void prio_changed_idle(struct rq *rq, struct task_struct *p, 82static void prio_changed_idle(struct rq *rq, struct task_struct *p,
@@ -93,7 +93,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
93 if (p->prio > oldprio) 93 if (p->prio > oldprio)
94 resched_task(rq->curr); 94 resched_task(rq->curr);
95 } else 95 } else
96 check_preempt_curr(rq, p); 96 check_preempt_curr(rq, p, 0);
97} 97}
98 98
99/* 99/*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 908c04f9dad0..cdf5740ab03e 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -102,12 +102,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
102 102
103static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 103static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
104{ 104{
105 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
105 struct sched_rt_entity *rt_se = rt_rq->rt_se; 106 struct sched_rt_entity *rt_se = rt_rq->rt_se;
106 107
107 if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) { 108 if (rt_rq->rt_nr_running) {
108 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 109 if (rt_se && !on_rt_rq(rt_se))
109 110 enqueue_rt_entity(rt_se);
110 enqueue_rt_entity(rt_se);
111 if (rt_rq->highest_prio < curr->prio) 111 if (rt_rq->highest_prio < curr->prio)
112 resched_task(curr); 112 resched_task(curr);
113 } 113 }
@@ -199,6 +199,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
199 199
200static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 200static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
201{ 201{
202 if (rt_rq->rt_nr_running)
203 resched_task(rq_of_rt_rq(rt_rq)->curr);
202} 204}
203 205
204static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 206static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
@@ -229,6 +231,9 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
229#endif /* CONFIG_RT_GROUP_SCHED */ 231#endif /* CONFIG_RT_GROUP_SCHED */
230 232
231#ifdef CONFIG_SMP 233#ifdef CONFIG_SMP
234/*
235 * We ran out of runtime, see if we can borrow some from our neighbours.
236 */
232static int do_balance_runtime(struct rt_rq *rt_rq) 237static int do_balance_runtime(struct rt_rq *rt_rq)
233{ 238{
234 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 239 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
@@ -248,9 +253,18 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
248 continue; 253 continue;
249 254
250 spin_lock(&iter->rt_runtime_lock); 255 spin_lock(&iter->rt_runtime_lock);
256 /*
257 * Either all rqs have inf runtime and there's nothing to steal
258 * or __disable_runtime() below sets a specific rq to inf to
 259 * indicate it's been disabled and disallow stealing.
260 */
251 if (iter->rt_runtime == RUNTIME_INF) 261 if (iter->rt_runtime == RUNTIME_INF)
252 goto next; 262 goto next;
253 263
264 /*
265 * From runqueues with spare time, take 1/n part of their
266 * spare time, but no more than our period.
267 */
254 diff = iter->rt_runtime - iter->rt_time; 268 diff = iter->rt_runtime - iter->rt_time;
255 if (diff > 0) { 269 if (diff > 0) {
256 diff = div_u64((u64)diff, weight); 270 diff = div_u64((u64)diff, weight);
@@ -272,6 +286,9 @@ next:
272 return more; 286 return more;
273} 287}
274 288
289/*
 290 * Ensure this RQ takes back all the runtime it lent to its neighbours.
291 */
275static void __disable_runtime(struct rq *rq) 292static void __disable_runtime(struct rq *rq)
276{ 293{
277 struct root_domain *rd = rq->rd; 294 struct root_domain *rd = rq->rd;
@@ -287,18 +304,34 @@ static void __disable_runtime(struct rq *rq)
287 304
288 spin_lock(&rt_b->rt_runtime_lock); 305 spin_lock(&rt_b->rt_runtime_lock);
289 spin_lock(&rt_rq->rt_runtime_lock); 306 spin_lock(&rt_rq->rt_runtime_lock);
307 /*
308 * Either we're all inf and nobody needs to borrow, or we're
309 * already disabled and thus have nothing to do, or we have
310 * exactly the right amount of runtime to take out.
311 */
290 if (rt_rq->rt_runtime == RUNTIME_INF || 312 if (rt_rq->rt_runtime == RUNTIME_INF ||
291 rt_rq->rt_runtime == rt_b->rt_runtime) 313 rt_rq->rt_runtime == rt_b->rt_runtime)
292 goto balanced; 314 goto balanced;
293 spin_unlock(&rt_rq->rt_runtime_lock); 315 spin_unlock(&rt_rq->rt_runtime_lock);
294 316
317 /*
318 * Calculate the difference between what we started out with
319 * and what we current have, that's the amount of runtime
320 * we lend and now have to reclaim.
321 */
295 want = rt_b->rt_runtime - rt_rq->rt_runtime; 322 want = rt_b->rt_runtime - rt_rq->rt_runtime;
296 323
324 /*
325 * Greedy reclaim, take back as much as we can.
326 */
297 for_each_cpu_mask(i, rd->span) { 327 for_each_cpu_mask(i, rd->span) {
298 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 328 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
299 s64 diff; 329 s64 diff;
300 330
301 if (iter == rt_rq) 331 /*
332 * Can't reclaim from ourselves or disabled runqueues.
333 */
334 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
302 continue; 335 continue;
303 336
304 spin_lock(&iter->rt_runtime_lock); 337 spin_lock(&iter->rt_runtime_lock);
@@ -317,8 +350,16 @@ static void __disable_runtime(struct rq *rq)
317 } 350 }
318 351
319 spin_lock(&rt_rq->rt_runtime_lock); 352 spin_lock(&rt_rq->rt_runtime_lock);
353 /*
354 * We cannot be left wanting - that would mean some runtime
355 * leaked out of the system.
356 */
320 BUG_ON(want); 357 BUG_ON(want);
321balanced: 358balanced:
359 /*
360 * Disable all the borrow logic by pretending we have inf
361 * runtime - in which case borrowing doesn't make sense.
362 */
322 rt_rq->rt_runtime = RUNTIME_INF; 363 rt_rq->rt_runtime = RUNTIME_INF;
323 spin_unlock(&rt_rq->rt_runtime_lock); 364 spin_unlock(&rt_rq->rt_runtime_lock);
324 spin_unlock(&rt_b->rt_runtime_lock); 365 spin_unlock(&rt_b->rt_runtime_lock);
@@ -341,6 +382,9 @@ static void __enable_runtime(struct rq *rq)
341 if (unlikely(!scheduler_running)) 382 if (unlikely(!scheduler_running))
342 return; 383 return;
343 384
385 /*
386 * Reset each runqueue's bandwidth settings
387 */
344 for_each_leaf_rt_rq(rt_rq, rq) { 388 for_each_leaf_rt_rq(rt_rq, rq) {
345 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 389 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
346 390
@@ -348,6 +392,7 @@ static void __enable_runtime(struct rq *rq)
348 spin_lock(&rt_rq->rt_runtime_lock); 392 spin_lock(&rt_rq->rt_runtime_lock);
349 rt_rq->rt_runtime = rt_b->rt_runtime; 393 rt_rq->rt_runtime = rt_b->rt_runtime;
350 rt_rq->rt_time = 0; 394 rt_rq->rt_time = 0;
395 rt_rq->rt_throttled = 0;
351 spin_unlock(&rt_rq->rt_runtime_lock); 396 spin_unlock(&rt_rq->rt_runtime_lock);
352 spin_unlock(&rt_b->rt_runtime_lock); 397 spin_unlock(&rt_b->rt_runtime_lock);
353 } 398 }
@@ -386,7 +431,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
386 int i, idle = 1; 431 int i, idle = 1;
387 cpumask_t span; 432 cpumask_t span;
388 433
389 if (rt_b->rt_runtime == RUNTIME_INF) 434 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
390 return 1; 435 return 1;
391 436
392 span = sched_rt_period_mask(); 437 span = sched_rt_period_mask();
@@ -438,9 +483,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
438{ 483{
439 u64 runtime = sched_rt_runtime(rt_rq); 484 u64 runtime = sched_rt_runtime(rt_rq);
440 485
441 if (runtime == RUNTIME_INF)
442 return 0;
443
444 if (rt_rq->rt_throttled) 486 if (rt_rq->rt_throttled)
445 return rt_rq_throttled(rt_rq); 487 return rt_rq_throttled(rt_rq);
446 488
@@ -487,13 +529,18 @@ static void update_curr_rt(struct rq *rq)
487 curr->se.exec_start = rq->clock; 529 curr->se.exec_start = rq->clock;
488 cpuacct_charge(curr, delta_exec); 530 cpuacct_charge(curr, delta_exec);
489 531
532 if (!rt_bandwidth_enabled())
533 return;
534
490 for_each_sched_rt_entity(rt_se) { 535 for_each_sched_rt_entity(rt_se) {
491 rt_rq = rt_rq_of_se(rt_se); 536 rt_rq = rt_rq_of_se(rt_se);
492 537
493 spin_lock(&rt_rq->rt_runtime_lock); 538 spin_lock(&rt_rq->rt_runtime_lock);
494 rt_rq->rt_time += delta_exec; 539 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
495 if (sched_rt_runtime_exceeded(rt_rq)) 540 rt_rq->rt_time += delta_exec;
496 resched_task(curr); 541 if (sched_rt_runtime_exceeded(rt_rq))
542 resched_task(curr);
543 }
497 spin_unlock(&rt_rq->rt_runtime_lock); 544 spin_unlock(&rt_rq->rt_runtime_lock);
498 } 545 }
499} 546}
@@ -782,7 +829,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
782/* 829/*
783 * Preempt the current task with a newly woken task if needed: 830 * Preempt the current task with a newly woken task if needed:
784 */ 831 */
785static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) 832static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync)
786{ 833{
787 if (p->prio < rq->curr->prio) { 834 if (p->prio < rq->curr->prio) {
788 resched_task(rq->curr); 835 resched_task(rq->curr);
@@ -861,6 +908,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
861#define RT_MAX_TRIES 3 908#define RT_MAX_TRIES 3
862 909
863static int double_lock_balance(struct rq *this_rq, struct rq *busiest); 910static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
911static void double_unlock_balance(struct rq *this_rq, struct rq *busiest);
912
864static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); 913static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
865 914
866static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 915static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
@@ -1022,7 +1071,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1022 break; 1071 break;
1023 1072
1024 /* try again */ 1073 /* try again */
1025 spin_unlock(&lowest_rq->lock); 1074 double_unlock_balance(rq, lowest_rq);
1026 lowest_rq = NULL; 1075 lowest_rq = NULL;
1027 } 1076 }
1028 1077
@@ -1091,7 +1140,7 @@ static int push_rt_task(struct rq *rq)
1091 1140
1092 resched_task(lowest_rq->curr); 1141 resched_task(lowest_rq->curr);
1093 1142
1094 spin_unlock(&lowest_rq->lock); 1143 double_unlock_balance(rq, lowest_rq);
1095 1144
1096 ret = 1; 1145 ret = 1;
1097out: 1146out:
@@ -1197,7 +1246,7 @@ static int pull_rt_task(struct rq *this_rq)
1197 1246
1198 } 1247 }
1199 skip: 1248 skip:
1200 spin_unlock(&src_rq->lock); 1249 double_unlock_balance(this_rq, src_rq);
1201 } 1250 }
1202 1251
1203 return ret; 1252 return ret;
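The comments added to do_balance_runtime() above spell out the borrowing rule: take a 1/n share of each neighbour's spare runtime, but never grow your own runtime past the period. A toy standalone model of that loop, with all values hypothetical:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t rt_period  = 1000000000ull;    /* 1 s period */
        uint64_t my_runtime =  900000000ull;    /* current budget, nearly exhausted */
        int weight = 4;                         /* CPUs in the root-domain span */

        /* spare runtime (rt_runtime - rt_time) on the neighbours we visit */
        uint64_t spare[3] = { 200000000ull, 600000000ull, 0ull };

        for (int i = 0; i < 3 && my_runtime < rt_period; i++) {
                uint64_t diff = spare[i] / weight;      /* take a 1/n share */
                if (my_runtime + diff > rt_period)
                        diff = rt_period - my_runtime;  /* but no more than the period */
                my_runtime += diff;
                printf("borrowed %9llu ns, runtime now %10llu ns\n",
                       (unsigned long long)diff, (unsigned long long)my_runtime);
        }
        return 0;
}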
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index aaaeae8244e7..94a62c0d4ade 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -212,9 +212,7 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
212 waiter.up = 0; 212 waiter.up = 0;
213 213
214 for (;;) { 214 for (;;) {
215 if (state == TASK_INTERRUPTIBLE && signal_pending(task)) 215 if (signal_pending_state(state, task))
216 goto interrupted;
217 if (state == TASK_KILLABLE && fatal_signal_pending(task))
218 goto interrupted; 216 goto interrupted;
219 if (timeout <= 0) 217 if (timeout <= 0)
220 goto timed_out; 218 goto timed_out;
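The semaphore change above relies on signal_pending_state() to fold the TASK_INTERRUPTIBLE and TASK_KILLABLE checks into one call. For reference, the helper behaves roughly like the sketch below (a paraphrase, not the verbatim <linux/sched.h> definition):

#include <linux/sched.h>

/* roughly what signal_pending_state(state, task) evaluates */
static inline int signal_pending_state_sketch(long state, struct task_struct *p)
{
        if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))
                return 0;       /* e.g. plain TASK_UNINTERRUPTIBLE never aborts */
        if (!signal_pending(p))
                return 0;
        /* interruptible sleeps abort on any signal, killable ones only on fatal */
        return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
}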
diff --git a/kernel/signal.c b/kernel/signal.c
index 82c3545596c5..e661b01d340f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -22,6 +22,7 @@
22#include <linux/ptrace.h> 22#include <linux/ptrace.h>
23#include <linux/signal.h> 23#include <linux/signal.h>
24#include <linux/signalfd.h> 24#include <linux/signalfd.h>
25#include <linux/tracehook.h>
25#include <linux/capability.h> 26#include <linux/capability.h>
26#include <linux/freezer.h> 27#include <linux/freezer.h>
27#include <linux/pid_namespace.h> 28#include <linux/pid_namespace.h>
@@ -39,24 +40,21 @@
39 40
40static struct kmem_cache *sigqueue_cachep; 41static struct kmem_cache *sigqueue_cachep;
41 42
42static int __sig_ignored(struct task_struct *t, int sig) 43static void __user *sig_handler(struct task_struct *t, int sig)
43{ 44{
44 void __user *handler; 45 return t->sighand->action[sig - 1].sa.sa_handler;
46}
45 47
48static int sig_handler_ignored(void __user *handler, int sig)
49{
46 /* Is it explicitly or implicitly ignored? */ 50 /* Is it explicitly or implicitly ignored? */
47
48 handler = t->sighand->action[sig - 1].sa.sa_handler;
49 return handler == SIG_IGN || 51 return handler == SIG_IGN ||
50 (handler == SIG_DFL && sig_kernel_ignore(sig)); 52 (handler == SIG_DFL && sig_kernel_ignore(sig));
51} 53}
52 54
53static int sig_ignored(struct task_struct *t, int sig) 55static int sig_ignored(struct task_struct *t, int sig)
54{ 56{
55 /* 57 void __user *handler;
56 * Tracers always want to know about signals..
57 */
58 if (t->ptrace & PT_PTRACED)
59 return 0;
60 58
61 /* 59 /*
62 * Blocked signals are never ignored, since the 60 * Blocked signals are never ignored, since the
@@ -66,7 +64,14 @@ static int sig_ignored(struct task_struct *t, int sig)
66 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) 64 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
67 return 0; 65 return 0;
68 66
69 return __sig_ignored(t, sig); 67 handler = sig_handler(t, sig);
68 if (!sig_handler_ignored(handler, sig))
69 return 0;
70
71 /*
72 * Tracers may want to know about even ignored signals.
73 */
74 return !tracehook_consider_ignored_signal(t, sig, handler);
70} 75}
71 76
72/* 77/*
@@ -129,7 +134,9 @@ void recalc_sigpending_and_wake(struct task_struct *t)
129 134
130void recalc_sigpending(void) 135void recalc_sigpending(void)
131{ 136{
132 if (!recalc_sigpending_tsk(current) && !freezing(current)) 137 if (unlikely(tracehook_force_sigpending()))
138 set_thread_flag(TIF_SIGPENDING);
139 else if (!recalc_sigpending_tsk(current) && !freezing(current))
133 clear_thread_flag(TIF_SIGPENDING); 140 clear_thread_flag(TIF_SIGPENDING);
134 141
135} 142}
@@ -295,12 +302,12 @@ flush_signal_handlers(struct task_struct *t, int force_default)
295 302
296int unhandled_signal(struct task_struct *tsk, int sig) 303int unhandled_signal(struct task_struct *tsk, int sig)
297{ 304{
305 void __user *handler = tsk->sighand->action[sig-1].sa.sa_handler;
298 if (is_global_init(tsk)) 306 if (is_global_init(tsk))
299 return 1; 307 return 1;
300 if (tsk->ptrace & PT_PTRACED) 308 if (handler != SIG_IGN && handler != SIG_DFL)
301 return 0; 309 return 0;
302 return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) || 310 return !tracehook_consider_fatal_signal(tsk, sig, handler);
303 (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
304} 311}
305 312
306 313
@@ -591,9 +598,6 @@ static int check_kill_permission(int sig, struct siginfo *info,
591 return security_task_kill(t, info, sig, 0); 598 return security_task_kill(t, info, sig, 0);
592} 599}
593 600
594/* forward decl */
595static void do_notify_parent_cldstop(struct task_struct *tsk, int why);
596
597/* 601/*
598 * Handle magic process-wide effects of stop/continue signals. Unlike 602 * Handle magic process-wide effects of stop/continue signals. Unlike
599 * the signal actions, these happen immediately at signal-generation 603 * the signal actions, these happen immediately at signal-generation
@@ -756,7 +760,8 @@ static void complete_signal(int sig, struct task_struct *p, int group)
756 if (sig_fatal(p, sig) && 760 if (sig_fatal(p, sig) &&
757 !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && 761 !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
758 !sigismember(&t->real_blocked, sig) && 762 !sigismember(&t->real_blocked, sig) &&
759 (sig == SIGKILL || !(t->ptrace & PT_PTRACED))) { 763 (sig == SIGKILL ||
764 !tracehook_consider_fatal_signal(t, sig, SIG_DFL))) {
760 /* 765 /*
761 * This signal will be fatal to the whole group. 766 * This signal will be fatal to the whole group.
762 */ 767 */
@@ -1299,6 +1304,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1299 q->info.si_overrun++; 1304 q->info.si_overrun++;
1300 goto out; 1305 goto out;
1301 } 1306 }
1307 q->info.si_overrun = 0;
1302 1308
1303 signalfd_notify(t, sig); 1309 signalfd_notify(t, sig);
1304 pending = group ? &t->signal->shared_pending : &t->pending; 1310 pending = group ? &t->signal->shared_pending : &t->pending;
@@ -1323,13 +1329,16 @@ static inline void __wake_up_parent(struct task_struct *p,
1323/* 1329/*
1324 * Let a parent know about the death of a child. 1330 * Let a parent know about the death of a child.
1325 * For a stopped/continued status change, use do_notify_parent_cldstop instead. 1331 * For a stopped/continued status change, use do_notify_parent_cldstop instead.
1332 *
1333 * Returns -1 if our parent ignored us and so we've switched to
1334 * self-reaping, or else @sig.
1326 */ 1335 */
1327 1336int do_notify_parent(struct task_struct *tsk, int sig)
1328void do_notify_parent(struct task_struct *tsk, int sig)
1329{ 1337{
1330 struct siginfo info; 1338 struct siginfo info;
1331 unsigned long flags; 1339 unsigned long flags;
1332 struct sighand_struct *psig; 1340 struct sighand_struct *psig;
1341 int ret = sig;
1333 1342
1334 BUG_ON(sig == -1); 1343 BUG_ON(sig == -1);
1335 1344
@@ -1394,14 +1403,16 @@ void do_notify_parent(struct task_struct *tsk, int sig)
1394 * is implementation-defined: we do (if you don't want 1403 * is implementation-defined: we do (if you don't want
1395 * it, just use SIG_IGN instead). 1404 * it, just use SIG_IGN instead).
1396 */ 1405 */
1397 tsk->exit_signal = -1; 1406 ret = tsk->exit_signal = -1;
1398 if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) 1407 if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
1399 sig = 0; 1408 sig = -1;
1400 } 1409 }
1401 if (valid_signal(sig) && sig > 0) 1410 if (valid_signal(sig) && sig > 0)
1402 __group_send_sig_info(sig, &info, tsk->parent); 1411 __group_send_sig_info(sig, &info, tsk->parent);
1403 __wake_up_parent(tsk, tsk->parent); 1412 __wake_up_parent(tsk, tsk->parent);
1404 spin_unlock_irqrestore(&psig->siglock, flags); 1413 spin_unlock_irqrestore(&psig->siglock, flags);
1414
1415 return ret;
1405} 1416}
1406 1417
1407static void do_notify_parent_cldstop(struct task_struct *tsk, int why) 1418static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
@@ -1599,7 +1610,7 @@ finish_stop(int stop_count)
1599 * a group stop in progress and we are the last to stop, 1610 * a group stop in progress and we are the last to stop,
1600 * report to the parent. When ptraced, every thread reports itself. 1611 * report to the parent. When ptraced, every thread reports itself.
1601 */ 1612 */
1602 if (stop_count == 0 || (current->ptrace & PT_PTRACED)) { 1613 if (tracehook_notify_jctl(stop_count == 0, CLD_STOPPED)) {
1603 read_lock(&tasklist_lock); 1614 read_lock(&tasklist_lock);
1604 do_notify_parent_cldstop(current, CLD_STOPPED); 1615 do_notify_parent_cldstop(current, CLD_STOPPED);
1605 read_unlock(&tasklist_lock); 1616 read_unlock(&tasklist_lock);
@@ -1735,6 +1746,9 @@ relock:
1735 signal->flags &= ~SIGNAL_CLD_MASK; 1746 signal->flags &= ~SIGNAL_CLD_MASK;
1736 spin_unlock_irq(&sighand->siglock); 1747 spin_unlock_irq(&sighand->siglock);
1737 1748
1749 if (unlikely(!tracehook_notify_jctl(1, why)))
1750 goto relock;
1751
1738 read_lock(&tasklist_lock); 1752 read_lock(&tasklist_lock);
1739 do_notify_parent_cldstop(current->group_leader, why); 1753 do_notify_parent_cldstop(current->group_leader, why);
1740 read_unlock(&tasklist_lock); 1754 read_unlock(&tasklist_lock);
@@ -1748,17 +1762,33 @@ relock:
1748 do_signal_stop(0)) 1762 do_signal_stop(0))
1749 goto relock; 1763 goto relock;
1750 1764
1751 signr = dequeue_signal(current, &current->blocked, info); 1765 /*
 1752 if (!signr) 1766 * Tracing can induce an artificial signal and choose sigaction.
1753 break; /* will return 0 */ 1767 * The return value in @signr determines the default action,
1768 * but @info->si_signo is the signal number we will report.
1769 */
1770 signr = tracehook_get_signal(current, regs, info, return_ka);
1771 if (unlikely(signr < 0))
1772 goto relock;
1773 if (unlikely(signr != 0))
1774 ka = return_ka;
1775 else {
1776 signr = dequeue_signal(current, &current->blocked,
1777 info);
1754 1778
1755 if (signr != SIGKILL) {
1756 signr = ptrace_signal(signr, info, regs, cookie);
1757 if (!signr) 1779 if (!signr)
1758 continue; 1780 break; /* will return 0 */
1781
1782 if (signr != SIGKILL) {
1783 signr = ptrace_signal(signr, info,
1784 regs, cookie);
1785 if (!signr)
1786 continue;
1787 }
1788
1789 ka = &sighand->action[signr-1];
1759 } 1790 }
1760 1791
1761 ka = &sighand->action[signr-1];
1762 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ 1792 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */
1763 continue; 1793 continue;
1764 if (ka->sa.sa_handler != SIG_DFL) { 1794 if (ka->sa.sa_handler != SIG_DFL) {
@@ -1806,7 +1836,7 @@ relock:
1806 spin_lock_irq(&sighand->siglock); 1836 spin_lock_irq(&sighand->siglock);
1807 } 1837 }
1808 1838
1809 if (likely(do_signal_stop(signr))) { 1839 if (likely(do_signal_stop(info->si_signo))) {
1810 /* It released the siglock. */ 1840 /* It released the siglock. */
1811 goto relock; 1841 goto relock;
1812 } 1842 }
@@ -1827,7 +1857,7 @@ relock:
1827 1857
1828 if (sig_kernel_coredump(signr)) { 1858 if (sig_kernel_coredump(signr)) {
1829 if (print_fatal_signals) 1859 if (print_fatal_signals)
1830 print_fatal_signal(regs, signr); 1860 print_fatal_signal(regs, info->si_signo);
1831 /* 1861 /*
1832 * If it was able to dump core, this kills all 1862 * If it was able to dump core, this kills all
1833 * other threads in the group and synchronizes with 1863 * other threads in the group and synchronizes with
@@ -1836,13 +1866,13 @@ relock:
1836 * first and our do_group_exit call below will use 1866 * first and our do_group_exit call below will use
1837 * that value and ignore the one we pass it. 1867 * that value and ignore the one we pass it.
1838 */ 1868 */
1839 do_coredump((long)signr, signr, regs); 1869 do_coredump(info->si_signo, info->si_signo, regs);
1840 } 1870 }
1841 1871
1842 /* 1872 /*
1843 * Death signals, no core dump. 1873 * Death signals, no core dump.
1844 */ 1874 */
1845 do_group_exit(signr); 1875 do_group_exit(info->si_signo);
1846 /* NOTREACHED */ 1876 /* NOTREACHED */
1847 } 1877 }
1848 spin_unlock_irq(&sighand->siglock); 1878 spin_unlock_irq(&sighand->siglock);
@@ -1884,7 +1914,7 @@ void exit_signals(struct task_struct *tsk)
1884out: 1914out:
1885 spin_unlock_irq(&tsk->sighand->siglock); 1915 spin_unlock_irq(&tsk->sighand->siglock);
1886 1916
1887 if (unlikely(group_stop)) { 1917 if (unlikely(group_stop) && tracehook_notify_jctl(1, CLD_STOPPED)) {
1888 read_lock(&tasklist_lock); 1918 read_lock(&tasklist_lock);
1889 do_notify_parent_cldstop(tsk, CLD_STOPPED); 1919 do_notify_parent_cldstop(tsk, CLD_STOPPED);
1890 read_unlock(&tasklist_lock); 1920 read_unlock(&tasklist_lock);
@@ -1895,7 +1925,6 @@ EXPORT_SYMBOL(recalc_sigpending);
1895EXPORT_SYMBOL_GPL(dequeue_signal); 1925EXPORT_SYMBOL_GPL(dequeue_signal);
1896EXPORT_SYMBOL(flush_signals); 1926EXPORT_SYMBOL(flush_signals);
1897EXPORT_SYMBOL(force_sig); 1927EXPORT_SYMBOL(force_sig);
1898EXPORT_SYMBOL(ptrace_notify);
1899EXPORT_SYMBOL(send_sig); 1928EXPORT_SYMBOL(send_sig);
1900EXPORT_SYMBOL(send_sig_info); 1929EXPORT_SYMBOL(send_sig_info);
1901EXPORT_SYMBOL(sigprocmask); 1930EXPORT_SYMBOL(sigprocmask);
@@ -2299,7 +2328,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2299 * (for example, SIGCHLD), shall cause the pending signal to 2328 * (for example, SIGCHLD), shall cause the pending signal to
2300 * be discarded, whether or not it is blocked" 2329 * be discarded, whether or not it is blocked"
2301 */ 2330 */
2302 if (__sig_ignored(t, sig)) { 2331 if (sig_handler_ignored(sig_handler(t, sig), sig)) {
2303 sigemptyset(&mask); 2332 sigemptyset(&mask);
2304 sigaddset(&mask, sig); 2333 sigaddset(&mask, sig);
2305 rm_from_queue_full(&mask, &t->signal->shared_pending); 2334 rm_from_queue_full(&mask, &t->signal->shared_pending);
diff --git a/kernel/smp.c b/kernel/smp.c
index 462c785ca1ee..f362a8553777 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -33,7 +33,7 @@ struct call_single_queue {
33 spinlock_t lock; 33 spinlock_t lock;
34}; 34};
35 35
36void __cpuinit init_call_single_data(void) 36static int __cpuinit init_call_single_data(void)
37{ 37{
38 int i; 38 int i;
39 39
@@ -43,7 +43,9 @@ void __cpuinit init_call_single_data(void)
43 spin_lock_init(&q->lock); 43 spin_lock_init(&q->lock);
44 INIT_LIST_HEAD(&q->list); 44 INIT_LIST_HEAD(&q->list);
45 } 45 }
46 return 0;
46} 47}
48early_initcall(init_call_single_data);
47 49
48static void csd_flag_wait(struct call_single_data *data) 50static void csd_flag_wait(struct call_single_data *data)
49{ 51{
@@ -133,7 +135,8 @@ void generic_smp_call_function_interrupt(void)
133 */ 135 */
134 smp_wmb(); 136 smp_wmb();
135 data->csd.flags &= ~CSD_FLAG_WAIT; 137 data->csd.flags &= ~CSD_FLAG_WAIT;
136 } else 138 }
139 if (data->csd.flags & CSD_FLAG_ALLOC)
137 call_rcu(&data->rcu_head, rcu_free_call_data); 140 call_rcu(&data->rcu_head, rcu_free_call_data);
138 } 141 }
139 rcu_read_unlock(); 142 rcu_read_unlock();
@@ -207,8 +210,10 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
207{ 210{
208 struct call_single_data d; 211 struct call_single_data d;
209 unsigned long flags; 212 unsigned long flags;
210 /* prevent preemption and reschedule on another processor */ 213 /* prevent preemption and reschedule on another processor,
214 as well as CPU removal */
211 int me = get_cpu(); 215 int me = get_cpu();
216 int err = 0;
212 217
213 /* Can deadlock when called with interrupts disabled */ 218 /* Can deadlock when called with interrupts disabled */
214 WARN_ON(irqs_disabled()); 219 WARN_ON(irqs_disabled());
@@ -217,7 +222,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
217 local_irq_save(flags); 222 local_irq_save(flags);
218 func(info); 223 func(info);
219 local_irq_restore(flags); 224 local_irq_restore(flags);
220 } else { 225 } else if ((unsigned)cpu < NR_CPUS && cpu_online(cpu)) {
221 struct call_single_data *data = NULL; 226 struct call_single_data *data = NULL;
222 227
223 if (!wait) { 228 if (!wait) {
@@ -233,10 +238,12 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
233 data->func = func; 238 data->func = func;
234 data->info = info; 239 data->info = info;
235 generic_exec_single(cpu, data); 240 generic_exec_single(cpu, data);
241 } else {
242 err = -ENXIO; /* CPU not online */
236 } 243 }
237 244
238 put_cpu(); 245 put_cpu();
239 return 0; 246 return err;
240} 247}
241EXPORT_SYMBOL(smp_call_function_single); 248EXPORT_SYMBOL(smp_call_function_single);
242 249
@@ -258,6 +265,42 @@ void __smp_call_function_single(int cpu, struct call_single_data *data)
258 generic_exec_single(cpu, data); 265 generic_exec_single(cpu, data);
259} 266}
260 267
268/* Dummy function */
269static void quiesce_dummy(void *unused)
270{
271}
272
273/*
274 * Ensure stack based data used in call function mask is safe to free.
275 *
276 * This is needed by smp_call_function_mask when using on-stack data, because
277 * a single call function queue is shared by all CPUs, and any CPU may pick up
278 * the data item on the queue at any time before it is deleted. So we need to
279 * ensure that all CPUs have transitioned through a quiescent state after
280 * this call.
281 *
282 * This is a very slow function, implemented by sending synchronous IPIs to
283 * all possible CPUs. For this reason, we have to alloc data rather than use
284 * stack based data even in the case of synchronous calls. The stack based
285 * data is then just used for deadlock/oom fallback which will be very rare.
286 *
287 * If a faster scheme can be made, we could go back to preferring stack based
288 * data -- the data allocation/free is non-zero cost.
289 */
290static void smp_call_function_mask_quiesce_stack(cpumask_t mask)
291{
292 struct call_single_data data;
293 int cpu;
294
295 data.func = quiesce_dummy;
296 data.info = NULL;
297
298 for_each_cpu_mask(cpu, mask) {
299 data.flags = CSD_FLAG_WAIT;
300 generic_exec_single(cpu, &data);
301 }
302}
303
261/** 304/**
262 * smp_call_function_mask(): Run a function on a set of other CPUs. 305 * smp_call_function_mask(): Run a function on a set of other CPUs.
263 * @mask: The set of cpus to run on. 306 * @mask: The set of cpus to run on.
@@ -283,6 +326,7 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
283 cpumask_t allbutself; 326 cpumask_t allbutself;
284 unsigned long flags; 327 unsigned long flags;
285 int cpu, num_cpus; 328 int cpu, num_cpus;
329 int slowpath = 0;
286 330
287 /* Can deadlock when called with interrupts disabled */ 331 /* Can deadlock when called with interrupts disabled */
288 WARN_ON(irqs_disabled()); 332 WARN_ON(irqs_disabled());
@@ -304,15 +348,16 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
304 return smp_call_function_single(cpu, func, info, wait); 348 return smp_call_function_single(cpu, func, info, wait);
305 } 349 }
306 350
307 if (!wait) { 351 data = kmalloc(sizeof(*data), GFP_ATOMIC);
308 data = kmalloc(sizeof(*data), GFP_ATOMIC); 352 if (data) {
309 if (data) 353 data->csd.flags = CSD_FLAG_ALLOC;
310 data->csd.flags = CSD_FLAG_ALLOC; 354 if (wait)
311 } 355 data->csd.flags |= CSD_FLAG_WAIT;
312 if (!data) { 356 } else {
313 data = &d; 357 data = &d;
314 data->csd.flags = CSD_FLAG_WAIT; 358 data->csd.flags = CSD_FLAG_WAIT;
315 wait = 1; 359 wait = 1;
360 slowpath = 1;
316 } 361 }
317 362
318 spin_lock_init(&data->lock); 363 spin_lock_init(&data->lock);
@@ -329,8 +374,11 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
329 arch_send_call_function_ipi(mask); 374 arch_send_call_function_ipi(mask);
330 375
331 /* optionally wait for the CPUs to complete */ 376 /* optionally wait for the CPUs to complete */
332 if (wait) 377 if (wait) {
333 csd_flag_wait(&data->csd); 378 csd_flag_wait(&data->csd);
379 if (unlikely(slowpath))
380 smp_call_function_mask_quiesce_stack(mask);
381 }
334 382
335 return 0; 383 return 0;
336} 384}
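One caller-visible effect of the smp.c change above: smp_call_function_single() now returns -ENXIO when the target CPU is offline instead of silently doing nothing. A hedged usage sketch (my_ipi_func() and poke_cpu() are made-up names, not from this patch):

#include <linux/smp.h>
#include <linux/kernel.h>

/* hypothetical helper: runs on the target CPU, in IPI context */
static void my_ipi_func(void *info)
{
}

/* hypothetical caller checking the new error return */
static int poke_cpu(int cpu)
{
        /* wait=1: don't return until my_ipi_func() has run remotely */
        int err = smp_call_function_single(cpu, my_ipi_func, NULL, 1);

        if (err == -ENXIO)
                printk(KERN_WARNING "cpu %d is offline, skipping\n", cpu);
        return err;
}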
diff --git a/kernel/softirq.c b/kernel/softirq.c
index f6b03d56c2bf..c506f266a6b9 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -630,7 +630,7 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
630 .notifier_call = cpu_callback 630 .notifier_call = cpu_callback
631}; 631};
632 632
633__init int spawn_ksoftirqd(void) 633static __init int spawn_ksoftirqd(void)
634{ 634{
635 void *cpu = (void *)(long)smp_processor_id(); 635 void *cpu = (void *)(long)smp_processor_id();
636 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 636 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
@@ -640,6 +640,7 @@ __init int spawn_ksoftirqd(void)
640 register_cpu_notifier(&cpu_nfb); 640 register_cpu_notifier(&cpu_nfb);
641 return 0; 641 return 0;
642} 642}
643early_initcall(spawn_ksoftirqd);
643 644
644#ifdef CONFIG_SMP 645#ifdef CONFIG_SMP
645/* 646/*
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 7bd8d1aadd5d..cb838ee93a82 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -233,7 +233,8 @@ static void check_hung_uninterruptible_tasks(int this_cpu)
233 do_each_thread(g, t) { 233 do_each_thread(g, t) {
234 if (!--max_count) 234 if (!--max_count)
235 goto unlock; 235 goto unlock;
236 if (t->state & TASK_UNINTERRUPTIBLE) 236 /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
237 if (t->state == TASK_UNINTERRUPTIBLE)
237 check_hung_task(t, now); 238 check_hung_task(t, now);
238 } while_each_thread(g, t); 239 } while_each_thread(g, t);
239 unlock: 240 unlock:
@@ -338,14 +339,33 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
338 .notifier_call = cpu_callback 339 .notifier_call = cpu_callback
339}; 340};
340 341
341__init void spawn_softlockup_task(void) 342static int __initdata nosoftlockup;
343
344static int __init nosoftlockup_setup(char *str)
345{
346 nosoftlockup = 1;
347 return 1;
348}
349__setup("nosoftlockup", nosoftlockup_setup);
350
351static int __init spawn_softlockup_task(void)
342{ 352{
343 void *cpu = (void *)(long)smp_processor_id(); 353 void *cpu = (void *)(long)smp_processor_id();
344 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 354 int err;
345 355
346 BUG_ON(err == NOTIFY_BAD); 356 if (nosoftlockup)
357 return 0;
358
359 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
360 if (err == NOTIFY_BAD) {
361 BUG();
362 return 1;
363 }
347 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 364 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
348 register_cpu_notifier(&cpu_nfb); 365 register_cpu_notifier(&cpu_nfb);
349 366
350 atomic_notifier_chain_register(&panic_notifier_list, &panic_block); 367 atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
368
369 return 0;
351} 370}
371early_initcall(spawn_softlockup_task);
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index a1fb54c93cdd..29ab20749dd3 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -290,8 +290,8 @@ void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass)
290 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); 290 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
291 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); 291 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
292} 292}
293
294EXPORT_SYMBOL(_spin_lock_nested); 293EXPORT_SYMBOL(_spin_lock_nested);
294
295unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) 295unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass)
296{ 296{
297 unsigned long flags; 297 unsigned long flags;
@@ -311,9 +311,17 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas
311#endif 311#endif
312 return flags; 312 return flags;
313} 313}
314
315EXPORT_SYMBOL(_spin_lock_irqsave_nested); 314EXPORT_SYMBOL(_spin_lock_irqsave_nested);
316 315
316void __lockfunc _spin_lock_nest_lock(spinlock_t *lock,
317 struct lockdep_map *nest_lock)
318{
319 preempt_disable();
320 spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
321 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
322}
323EXPORT_SYMBOL(_spin_lock_nest_lock);
324
317#endif 325#endif
318 326
319void __lockfunc _spin_unlock(spinlock_t *lock) 327void __lockfunc _spin_unlock(spinlock_t *lock)
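
_spin_lock_nest_lock() lets a caller take several spinlocks of the same lock class while telling lockdep that an outer lock already serializes them, so no false deadlock report is produced. A hedged usage sketch, assuming the spin_lock_nest_lock() wrapper from the matching header change (not shown in this diff); the data structures are illustrative:

    #include <linux/spinlock.h>
    #include <linux/mutex.h>

    struct item {
            spinlock_t lock;        /* all items share one lock class */
    };

    static DEFINE_MUTEX(items_mutex);   /* serializes multi-item locking */

    static void freeze_pair(struct item *a, struct item *b)
    {
            mutex_lock(&items_mutex);
            /* The nest_lock annotation tells lockdep that items_mutex
             * already prevents these same-class locks from deadlocking. */
            spin_lock_nest_lock(&a->lock, &items_mutex);
            spin_lock_nest_lock(&b->lock, &items_mutex);
            /* ... operate on both items atomically ... */
            spin_unlock(&b->lock);
            spin_unlock(&a->lock);
            mutex_unlock(&items_mutex);
    }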
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 738b411ff2d3..af3c7cea258b 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,4 +1,4 @@
1/* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. 1/* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation.
2 * GPL v2 and any later version. 2 * GPL v2 and any later version.
3 */ 3 */
4#include <linux/cpu.h> 4#include <linux/cpu.h>
@@ -13,204 +13,177 @@
13#include <asm/atomic.h> 13#include <asm/atomic.h>
14#include <asm/uaccess.h> 14#include <asm/uaccess.h>
15 15
16/* Since we effect priority and affinity (both of which are visible 16/* This controls the threads on each CPU. */
17 * to, and settable by outside processes) we do indirection via a
18 * kthread. */
19
20/* Thread to stop each CPU in user context. */
21enum stopmachine_state { 17enum stopmachine_state {
22 STOPMACHINE_WAIT, 18 /* Dummy starting state for thread. */
19 STOPMACHINE_NONE,
20 /* Awaiting everyone to be scheduled. */
23 STOPMACHINE_PREPARE, 21 STOPMACHINE_PREPARE,
22 /* Disable interrupts. */
24 STOPMACHINE_DISABLE_IRQ, 23 STOPMACHINE_DISABLE_IRQ,
24 /* Run the function */
25 STOPMACHINE_RUN,
26 /* Exit */
25 STOPMACHINE_EXIT, 27 STOPMACHINE_EXIT,
26}; 28};
29static enum stopmachine_state state;
27 30
28static enum stopmachine_state stopmachine_state; 31struct stop_machine_data {
29static unsigned int stopmachine_num_threads; 32 int (*fn)(void *);
30static atomic_t stopmachine_thread_ack; 33 void *data;
31 34 int fnret;
32static int stopmachine(void *cpu) 35};
33{
34 int irqs_disabled = 0;
35 int prepared = 0;
36 cpumask_of_cpu_ptr(cpumask, (int)(long)cpu);
37
38 set_cpus_allowed_ptr(current, cpumask);
39
40 /* Ack: we are alive */
41 smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */
42 atomic_inc(&stopmachine_thread_ack);
43
44 /* Simple state machine */
45 while (stopmachine_state != STOPMACHINE_EXIT) {
46 if (stopmachine_state == STOPMACHINE_DISABLE_IRQ
47 && !irqs_disabled) {
48 local_irq_disable();
49 hard_irq_disable();
50 irqs_disabled = 1;
51 /* Ack: irqs disabled. */
52 smp_mb(); /* Must read state first. */
53 atomic_inc(&stopmachine_thread_ack);
54 } else if (stopmachine_state == STOPMACHINE_PREPARE
55 && !prepared) {
56 /* Everyone is in place, hold CPU. */
57 preempt_disable();
58 prepared = 1;
59 smp_mb(); /* Must read state first. */
60 atomic_inc(&stopmachine_thread_ack);
61 }
62 /* Yield in first stage: migration threads need to
63 * help our sisters onto their CPUs. */
64 if (!prepared && !irqs_disabled)
65 yield();
66 cpu_relax();
67 }
68
69 /* Ack: we are exiting. */
70 smp_mb(); /* Must read state first. */
71 atomic_inc(&stopmachine_thread_ack);
72
73 if (irqs_disabled)
74 local_irq_enable();
75 if (prepared)
76 preempt_enable();
77 36
78 return 0; 37/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
79} 38static unsigned int num_threads;
39static atomic_t thread_ack;
40static struct completion finished;
41static DEFINE_MUTEX(lock);
80 42
81/* Change the thread state */ 43static void set_state(enum stopmachine_state newstate)
82static void stopmachine_set_state(enum stopmachine_state state)
83{ 44{
84 atomic_set(&stopmachine_thread_ack, 0); 45 /* Reset ack counter. */
46 atomic_set(&thread_ack, num_threads);
85 smp_wmb(); 47 smp_wmb();
86 stopmachine_state = state; 48 state = newstate;
87 while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads)
88 cpu_relax();
89} 49}
90 50
91static int stop_machine(void) 51/* Last one to ack a state moves to the next state. */
52static void ack_state(void)
92{ 53{
93 int i, ret = 0; 54 if (atomic_dec_and_test(&thread_ack)) {
94 55 /* If we're the last one to ack the EXIT, we're finished. */
95 atomic_set(&stopmachine_thread_ack, 0); 56 if (state == STOPMACHINE_EXIT)
96 stopmachine_num_threads = 0; 57 complete(&finished);
97 stopmachine_state = STOPMACHINE_WAIT; 58 else
98 59 set_state(state + 1);
99 for_each_online_cpu(i) {
100 if (i == raw_smp_processor_id())
101 continue;
102 ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL);
103 if (ret < 0)
104 break;
105 stopmachine_num_threads++;
106 }
107
108 /* Wait for them all to come to life. */
109 while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) {
110 yield();
111 cpu_relax();
112 } 60 }
61}
113 62
114 /* If some failed, kill them all. */ 63/* This is the actual thread which stops the CPU. It exits by itself rather
115 if (ret < 0) { 64 * than waiting for kthread_stop(), because it's easier for hotplug CPU. */
116 stopmachine_set_state(STOPMACHINE_EXIT); 65static int stop_cpu(struct stop_machine_data *smdata)
117 return ret; 66{
118 } 67 enum stopmachine_state curstate = STOPMACHINE_NONE;
119 68
120 /* Now they are all started, make them hold the CPUs, ready. */ 69 /* Simple state machine */
121 preempt_disable(); 70 do {
122 stopmachine_set_state(STOPMACHINE_PREPARE); 71 /* Chill out and ensure we re-read stopmachine_state. */
72 cpu_relax();
73 if (state != curstate) {
74 curstate = state;
75 switch (curstate) {
76 case STOPMACHINE_DISABLE_IRQ:
77 local_irq_disable();
78 hard_irq_disable();
79 break;
80 case STOPMACHINE_RUN:
81 /* |= allows error detection if functions on
 81 /* |= allows error detection if the function runs on
82 * multiple CPUs. */
83 smdata->fnret |= smdata->fn(smdata->data);
84 break;
85 default:
86 break;
87 }
88 ack_state();
89 }
90 } while (curstate != STOPMACHINE_EXIT);
123 91
124 /* Make them disable irqs. */ 92 local_irq_enable();
125 local_irq_disable(); 93 do_exit(0);
126 hard_irq_disable(); 94}
127 stopmachine_set_state(STOPMACHINE_DISABLE_IRQ);
128 95
96/* Callback for CPUs which aren't supposed to do anything. */
97static int chill(void *unused)
98{
129 return 0; 99 return 0;
130} 100}
131 101
132static void restart_machine(void) 102int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
133{ 103{
134 stopmachine_set_state(STOPMACHINE_EXIT); 104 int i, err;
135 local_irq_enable(); 105 struct stop_machine_data active, idle;
136 preempt_enable_no_resched(); 106 struct task_struct **threads;
137} 107
108 active.fn = fn;
109 active.data = data;
110 active.fnret = 0;
111 idle.fn = chill;
112 idle.data = NULL;
113
114 /* This could be too big for stack on large machines. */
115 threads = kcalloc(NR_CPUS, sizeof(threads[0]), GFP_KERNEL);
116 if (!threads)
117 return -ENOMEM;
118
119 /* Set up initial state. */
120 mutex_lock(&lock);
121 init_completion(&finished);
122 num_threads = num_online_cpus();
123 set_state(STOPMACHINE_PREPARE);
138 124
139struct stop_machine_data { 125 for_each_online_cpu(i) {
140 int (*fn)(void *); 126 struct stop_machine_data *smdata = &idle;
141 void *data; 127 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
142 struct completion done;
143};
144 128
145static int do_stop(void *_smdata) 129 if (!cpus) {
146{ 130 if (i == first_cpu(cpu_online_map))
147 struct stop_machine_data *smdata = _smdata; 131 smdata = &active;
148 int ret; 132 } else {
133 if (cpu_isset(i, *cpus))
134 smdata = &active;
135 }
149 136
150 ret = stop_machine(); 137 threads[i] = kthread_create((void *)stop_cpu, smdata, "kstop%u",
151 if (ret == 0) { 138 i);
152 ret = smdata->fn(smdata->data); 139 if (IS_ERR(threads[i])) {
153 restart_machine(); 140 err = PTR_ERR(threads[i]);
154 } 141 threads[i] = NULL;
142 goto kill_threads;
143 }
155 144
156 /* We're done: you can kthread_stop us now */ 145 /* Place it onto correct cpu. */
157 complete(&smdata->done); 146 kthread_bind(threads[i], i);
158 147
159 /* Wait for kthread_stop */ 148 /* Make it highest prio. */
160 set_current_state(TASK_INTERRUPTIBLE); 149 if (sched_setscheduler_nocheck(threads[i], SCHED_FIFO, &param))
161 while (!kthread_should_stop()) { 150 BUG();
162 schedule();
163 set_current_state(TASK_INTERRUPTIBLE);
164 } 151 }
165 __set_current_state(TASK_RUNNING);
166 return ret;
167}
168 152
169struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, 153 /* We've created all the threads. Wake them all: hold this CPU so one
170 unsigned int cpu) 154 * doesn't hit this CPU until we're ready. */
171{ 155 get_cpu();
172 static DEFINE_MUTEX(stopmachine_mutex); 156 for_each_online_cpu(i)
173 struct stop_machine_data smdata; 157 wake_up_process(threads[i]);
174 struct task_struct *p;
175 158
176 smdata.fn = fn; 159 /* This will release the thread on our CPU. */
177 smdata.data = data; 160 put_cpu();
178 init_completion(&smdata.done); 161 wait_for_completion(&finished);
162 mutex_unlock(&lock);
179 163
180 mutex_lock(&stopmachine_mutex); 164 kfree(threads);
181 165
182 /* If they don't care which CPU fn runs on, bind to any online one. */ 166 return active.fnret;
183 if (cpu == NR_CPUS)
184 cpu = raw_smp_processor_id();
185 167
186 p = kthread_create(do_stop, &smdata, "kstopmachine"); 168kill_threads:
187 if (!IS_ERR(p)) { 169 for_each_online_cpu(i)
188 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 170 if (threads[i])
171 kthread_stop(threads[i]);
172 mutex_unlock(&lock);
189 173
190 /* One high-prio thread per cpu. We'll do this one. */ 174 kfree(threads);
191 sched_setscheduler_nocheck(p, SCHED_FIFO, &param); 175 return err;
192 kthread_bind(p, cpu);
193 wake_up_process(p);
194 wait_for_completion(&smdata.done);
195 }
196 mutex_unlock(&stopmachine_mutex);
197 return p;
198} 176}
199 177
200int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) 178int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
201{ 179{
202 struct task_struct *p;
203 int ret; 180 int ret;
204 181
205 /* No CPUs can come up or down during this. */ 182 /* No CPUs can come up or down during this. */
206 get_online_cpus(); 183 get_online_cpus();
207 p = __stop_machine_run(fn, data, cpu); 184 ret = __stop_machine(fn, data, cpus);
208 if (!IS_ERR(p))
209 ret = kthread_stop(p);
210 else
211 ret = PTR_ERR(p);
212 put_online_cpus(); 185 put_online_cpus();
213 186
214 return ret; 187 return ret;
215} 188}
216EXPORT_SYMBOL_GPL(stop_machine_run); 189EXPORT_SYMBOL_GPL(stop_machine);
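
The rewrite replaces the per-CPU kstopmachine thread plus do_stop() trampoline with one "kstop%u" thread per online CPU driven by a shared state machine, and the exported interface changes from stop_machine_run(fn, data, cpu) to stop_machine(fn, data, cpus). A minimal caller sketch against the new interface; do_patch() is a hypothetical callback:

    #include <linux/stop_machine.h>

    static int do_patch(void *arg)
    {
            /*
             * Runs while every online CPU spins with interrupts disabled,
             * so no other code can execute concurrently.
             */
            return 0;
    }

    static int apply_patch(void)
    {
            /*
             * A NULL cpumask means the callback runs on the first online
             * CPU only; the remaining CPUs run the idle "chill" callback.
             */
            return stop_machine(do_patch, NULL, NULL);
    }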
diff --git a/kernel/sys.c b/kernel/sys.c
index 0c9d3fa1f5ff..038a7bc0901d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -169,9 +169,9 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
169 pgrp = find_vpid(who); 169 pgrp = find_vpid(who);
170 else 170 else
171 pgrp = task_pgrp(current); 171 pgrp = task_pgrp(current);
172 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 172 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
173 error = set_one_prio(p, niceval, error); 173 error = set_one_prio(p, niceval, error);
174 } while_each_pid_task(pgrp, PIDTYPE_PGID, p); 174 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
175 break; 175 break;
176 case PRIO_USER: 176 case PRIO_USER:
177 user = current->user; 177 user = current->user;
@@ -229,11 +229,11 @@ asmlinkage long sys_getpriority(int which, int who)
229 pgrp = find_vpid(who); 229 pgrp = find_vpid(who);
230 else 230 else
231 pgrp = task_pgrp(current); 231 pgrp = task_pgrp(current);
232 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 232 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
233 niceval = 20 - task_nice(p); 233 niceval = 20 - task_nice(p);
234 if (niceval > retval) 234 if (niceval > retval)
235 retval = niceval; 235 retval = niceval;
236 } while_each_pid_task(pgrp, PIDTYPE_PGID, p); 236 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
237 break; 237 break;
238 case PRIO_USER: 238 case PRIO_USER:
239 user = current->user; 239 user = current->user;
@@ -274,7 +274,7 @@ void emergency_restart(void)
274} 274}
275EXPORT_SYMBOL_GPL(emergency_restart); 275EXPORT_SYMBOL_GPL(emergency_restart);
276 276
277static void kernel_restart_prepare(char *cmd) 277void kernel_restart_prepare(char *cmd)
278{ 278{
279 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); 279 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
280 system_state = SYSTEM_RESTART; 280 system_state = SYSTEM_RESTART;
@@ -301,26 +301,6 @@ void kernel_restart(char *cmd)
301} 301}
302EXPORT_SYMBOL_GPL(kernel_restart); 302EXPORT_SYMBOL_GPL(kernel_restart);
303 303
304/**
305 * kernel_kexec - reboot the system
306 *
307 * Move into place and start executing a preloaded standalone
308 * executable. If nothing was preloaded return an error.
309 */
310static void kernel_kexec(void)
311{
312#ifdef CONFIG_KEXEC
313 struct kimage *image;
314 image = xchg(&kexec_image, NULL);
315 if (!image)
316 return;
317 kernel_restart_prepare(NULL);
318 printk(KERN_EMERG "Starting new kernel\n");
319 machine_shutdown();
320 machine_kexec(image);
321#endif
322}
323
324static void kernel_shutdown_prepare(enum system_states state) 304static void kernel_shutdown_prepare(enum system_states state)
325{ 305{
326 blocking_notifier_call_chain(&reboot_notifier_list, 306 blocking_notifier_call_chain(&reboot_notifier_list,
@@ -425,10 +405,15 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
425 kernel_restart(buffer); 405 kernel_restart(buffer);
426 break; 406 break;
427 407
408#ifdef CONFIG_KEXEC
428 case LINUX_REBOOT_CMD_KEXEC: 409 case LINUX_REBOOT_CMD_KEXEC:
429 kernel_kexec(); 410 {
430 unlock_kernel(); 411 int ret;
431 return -EINVAL; 412 ret = kernel_kexec();
413 unlock_kernel();
414 return ret;
415 }
416#endif
432 417
433#ifdef CONFIG_HIBERNATION 418#ifdef CONFIG_HIBERNATION
434 case LINUX_REBOOT_CMD_SW_SUSPEND: 419 case LINUX_REBOOT_CMD_SW_SUSPEND:
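
With kernel_kexec() moved out of sys.c and given a return value, LINUX_REBOOT_CMD_KEXEC now reports failures (for example, no preloaded image) to userspace instead of always returning -EINVAL. A userspace sketch of exercising that path through the raw reboot syscall; it needs CAP_SYS_BOOT and a previously loaded kexec image:

    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/reboot.h>

    int main(void)
    {
            /* Jump into the preloaded kexec image, if any. */
            long ret = syscall(SYS_reboot, LINUX_REBOOT_MAGIC1,
                               LINUX_REBOOT_MAGIC2,
                               LINUX_REBOOT_CMD_KEXEC, NULL);
            if (ret < 0)
                    perror("reboot(LINUX_REBOOT_CMD_KEXEC)");
            return ret < 0 ? 1 : 0;
    }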
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 35a50db9b6ce..1bf369bd4423 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -118,10 +118,8 @@ extern char modprobe_path[];
118extern int sg_big_buff; 118extern int sg_big_buff;
119#endif 119#endif
120 120
121#ifdef __sparc__ 121#ifdef CONFIG_SPARC
122extern char reboot_command []; 122#include <asm/system.h>
123extern int stop_a_enabled;
124extern int scons_pwroff;
125#endif 123#endif
126 124
127#ifdef __hppa__ 125#ifdef __hppa__
@@ -159,13 +157,15 @@ static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *
159static struct ctl_table root_table[]; 157static struct ctl_table root_table[];
160static struct ctl_table_root sysctl_table_root; 158static struct ctl_table_root sysctl_table_root;
161static struct ctl_table_header root_table_header = { 159static struct ctl_table_header root_table_header = {
160 .count = 1,
162 .ctl_table = root_table, 161 .ctl_table = root_table,
163 .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.header_list), 162 .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),
164 .root = &sysctl_table_root, 163 .root = &sysctl_table_root,
164 .set = &sysctl_table_root.default_set,
165}; 165};
166static struct ctl_table_root sysctl_table_root = { 166static struct ctl_table_root sysctl_table_root = {
167 .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list), 167 .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list),
168 .header_list = LIST_HEAD_INIT(root_table_header.ctl_entry), 168 .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry),
169}; 169};
170 170
171static struct ctl_table kern_table[]; 171static struct ctl_table kern_table[];
@@ -413,7 +413,7 @@ static struct ctl_table kern_table[] = {
413 .mode = 0644, 413 .mode = 0644,
414 .proc_handler = &proc_dointvec, 414 .proc_handler = &proc_dointvec,
415 }, 415 },
416#ifdef __sparc__ 416#ifdef CONFIG_SPARC
417 { 417 {
418 .ctl_name = KERN_SPARC_REBOOT, 418 .ctl_name = KERN_SPARC_REBOOT,
419 .procname = "reboot-cmd", 419 .procname = "reboot-cmd",
@@ -1386,6 +1386,9 @@ static void start_unregistering(struct ctl_table_header *p)
1386 spin_unlock(&sysctl_lock); 1386 spin_unlock(&sysctl_lock);
1387 wait_for_completion(&wait); 1387 wait_for_completion(&wait);
1388 spin_lock(&sysctl_lock); 1388 spin_lock(&sysctl_lock);
1389 } else {
1390 /* anything non-NULL; we'll never dereference it */
1391 p->unregistering = ERR_PTR(-EINVAL);
1389 } 1392 }
1390 /* 1393 /*
1391 * do not remove from the list until nobody holds it; walking the 1394 * do not remove from the list until nobody holds it; walking the
@@ -1394,6 +1397,32 @@ static void start_unregistering(struct ctl_table_header *p)
1394 list_del_init(&p->ctl_entry); 1397 list_del_init(&p->ctl_entry);
1395} 1398}
1396 1399
1400void sysctl_head_get(struct ctl_table_header *head)
1401{
1402 spin_lock(&sysctl_lock);
1403 head->count++;
1404 spin_unlock(&sysctl_lock);
1405}
1406
1407void sysctl_head_put(struct ctl_table_header *head)
1408{
1409 spin_lock(&sysctl_lock);
1410 if (!--head->count)
1411 kfree(head);
1412 spin_unlock(&sysctl_lock);
1413}
1414
1415struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
1416{
1417 if (!head)
1418 BUG();
1419 spin_lock(&sysctl_lock);
1420 if (!use_table(head))
1421 head = ERR_PTR(-ENOENT);
1422 spin_unlock(&sysctl_lock);
1423 return head;
1424}
1425
1397void sysctl_head_finish(struct ctl_table_header *head) 1426void sysctl_head_finish(struct ctl_table_header *head)
1398{ 1427{
1399 if (!head) 1428 if (!head)
@@ -1403,14 +1432,20 @@ void sysctl_head_finish(struct ctl_table_header *head)
1403 spin_unlock(&sysctl_lock); 1432 spin_unlock(&sysctl_lock);
1404} 1433}
1405 1434
1435static struct ctl_table_set *
1436lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
1437{
1438 struct ctl_table_set *set = &root->default_set;
1439 if (root->lookup)
1440 set = root->lookup(root, namespaces);
1441 return set;
1442}
1443
1406static struct list_head * 1444static struct list_head *
1407lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces) 1445lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces)
1408{ 1446{
1409 struct list_head *header_list; 1447 struct ctl_table_set *set = lookup_header_set(root, namespaces);
1410 header_list = &root->header_list; 1448 return &set->list;
1411 if (root->lookup)
1412 header_list = root->lookup(root, namespaces);
1413 return header_list;
1414} 1449}
1415 1450
1416struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, 1451struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces,
@@ -1480,9 +1515,9 @@ static int do_sysctl_strategy(struct ctl_table_root *root,
1480 int op = 0, rc; 1515 int op = 0, rc;
1481 1516
1482 if (oldval) 1517 if (oldval)
1483 op |= 004; 1518 op |= MAY_READ;
1484 if (newval) 1519 if (newval)
1485 op |= 002; 1520 op |= MAY_WRITE;
1486 if (sysctl_perm(root, table, op)) 1521 if (sysctl_perm(root, table, op))
1487 return -EPERM; 1522 return -EPERM;
1488 1523
@@ -1524,7 +1559,7 @@ repeat:
1524 if (n == table->ctl_name) { 1559 if (n == table->ctl_name) {
1525 int error; 1560 int error;
1526 if (table->child) { 1561 if (table->child) {
1527 if (sysctl_perm(root, table, 001)) 1562 if (sysctl_perm(root, table, MAY_EXEC))
1528 return -EPERM; 1563 return -EPERM;
1529 name++; 1564 name++;
1530 nlen--; 1565 nlen--;
@@ -1599,7 +1634,7 @@ static int test_perm(int mode, int op)
1599 mode >>= 6; 1634 mode >>= 6;
1600 else if (in_egroup_p(0)) 1635 else if (in_egroup_p(0))
1601 mode >>= 3; 1636 mode >>= 3;
1602 if ((mode & op & 0007) == op) 1637 if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0)
1603 return 0; 1638 return 0;
1604 return -EACCES; 1639 return -EACCES;
1605} 1640}
@@ -1609,7 +1644,7 @@ int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
1609 int error; 1644 int error;
1610 int mode; 1645 int mode;
1611 1646
1612 error = security_sysctl(table, op); 1647 error = security_sysctl(table, op & (MAY_READ | MAY_WRITE | MAY_EXEC));
1613 if (error) 1648 if (error)
1614 return error; 1649 return error;
1615 1650
@@ -1644,6 +1679,54 @@ static __init int sysctl_init(void)
1644 1679
1645core_initcall(sysctl_init); 1680core_initcall(sysctl_init);
1646 1681
1682static struct ctl_table *is_branch_in(struct ctl_table *branch,
1683 struct ctl_table *table)
1684{
1685 struct ctl_table *p;
1686 const char *s = branch->procname;
1687
1688 /* branch should have named subdirectory as its first element */
1689 if (!s || !branch->child)
1690 return NULL;
1691
1692 /* ... and nothing else */
1693 if (branch[1].procname || branch[1].ctl_name)
1694 return NULL;
1695
1696 /* table should contain subdirectory with the same name */
1697 for (p = table; p->procname || p->ctl_name; p++) {
1698 if (!p->child)
1699 continue;
1700 if (p->procname && strcmp(p->procname, s) == 0)
1701 return p;
1702 }
1703 return NULL;
1704}
1705
1706/* see if attaching q to p would be an improvement */
1707static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
1708{
1709 struct ctl_table *to = p->ctl_table, *by = q->ctl_table;
1710 struct ctl_table *next;
1711 int is_better = 0;
1712 int not_in_parent = !p->attached_by;
1713
1714 while ((next = is_branch_in(by, to)) != NULL) {
1715 if (by == q->attached_by)
1716 is_better = 1;
1717 if (to == p->attached_by)
1718 not_in_parent = 1;
1719 by = by->child;
1720 to = next->child;
1721 }
1722
1723 if (is_better && not_in_parent) {
1724 q->attached_by = by;
1725 q->attached_to = to;
1726 q->parent = p;
1727 }
1728}
1729
1647/** 1730/**
1648 * __register_sysctl_paths - register a sysctl hierarchy 1731 * __register_sysctl_paths - register a sysctl hierarchy
1649 * @root: List of sysctl headers to register on 1732 * @root: List of sysctl headers to register on
@@ -1720,10 +1803,10 @@ struct ctl_table_header *__register_sysctl_paths(
1720 struct nsproxy *namespaces, 1803 struct nsproxy *namespaces,
1721 const struct ctl_path *path, struct ctl_table *table) 1804 const struct ctl_path *path, struct ctl_table *table)
1722{ 1805{
1723 struct list_head *header_list;
1724 struct ctl_table_header *header; 1806 struct ctl_table_header *header;
1725 struct ctl_table *new, **prevp; 1807 struct ctl_table *new, **prevp;
1726 unsigned int n, npath; 1808 unsigned int n, npath;
1809 struct ctl_table_set *set;
1727 1810
1728 /* Count the path components */ 1811 /* Count the path components */
1729 for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath) 1812 for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath)
@@ -1765,6 +1848,7 @@ struct ctl_table_header *__register_sysctl_paths(
1765 header->unregistering = NULL; 1848 header->unregistering = NULL;
1766 header->root = root; 1849 header->root = root;
1767 sysctl_set_parent(NULL, header->ctl_table); 1850 sysctl_set_parent(NULL, header->ctl_table);
1851 header->count = 1;
1768#ifdef CONFIG_SYSCTL_SYSCALL_CHECK 1852#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
1769 if (sysctl_check_table(namespaces, header->ctl_table)) { 1853 if (sysctl_check_table(namespaces, header->ctl_table)) {
1770 kfree(header); 1854 kfree(header);
@@ -1772,8 +1856,20 @@ struct ctl_table_header *__register_sysctl_paths(
1772 } 1856 }
1773#endif 1857#endif
1774 spin_lock(&sysctl_lock); 1858 spin_lock(&sysctl_lock);
1775 header_list = lookup_header_list(root, namespaces); 1859 header->set = lookup_header_set(root, namespaces);
1776 list_add_tail(&header->ctl_entry, header_list); 1860 header->attached_by = header->ctl_table;
1861 header->attached_to = root_table;
1862 header->parent = &root_table_header;
1863 for (set = header->set; set; set = set->parent) {
1864 struct ctl_table_header *p;
1865 list_for_each_entry(p, &set->list, ctl_entry) {
1866 if (p->unregistering)
1867 continue;
1868 try_attach(p, header);
1869 }
1870 }
1871 header->parent->count++;
1872 list_add_tail(&header->ctl_entry, &header->set->list);
1777 spin_unlock(&sysctl_lock); 1873 spin_unlock(&sysctl_lock);
1778 1874
1779 return header; 1875 return header;
@@ -1828,8 +1924,37 @@ void unregister_sysctl_table(struct ctl_table_header * header)
1828 1924
1829 spin_lock(&sysctl_lock); 1925 spin_lock(&sysctl_lock);
1830 start_unregistering(header); 1926 start_unregistering(header);
1927 if (!--header->parent->count) {
1928 WARN_ON(1);
1929 kfree(header->parent);
1930 }
1931 if (!--header->count)
1932 kfree(header);
1831 spin_unlock(&sysctl_lock); 1933 spin_unlock(&sysctl_lock);
1832 kfree(header); 1934}
1935
1936int sysctl_is_seen(struct ctl_table_header *p)
1937{
1938 struct ctl_table_set *set = p->set;
1939 int res;
1940 spin_lock(&sysctl_lock);
1941 if (p->unregistering)
1942 res = 0;
1943 else if (!set->is_seen)
1944 res = 1;
1945 else
1946 res = set->is_seen(set);
1947 spin_unlock(&sysctl_lock);
1948 return res;
1949}
1950
1951void setup_sysctl_set(struct ctl_table_set *p,
1952 struct ctl_table_set *parent,
1953 int (*is_seen)(struct ctl_table_set *))
1954{
1955 INIT_LIST_HEAD(&p->list);
1956 p->parent = parent ? parent : &sysctl_table_root.default_set;
1957 p->is_seen = is_seen;
1833} 1958}
1834 1959
1835#else /* !CONFIG_SYSCTL */ 1960#else /* !CONFIG_SYSCTL */
@@ -1848,6 +1973,16 @@ void unregister_sysctl_table(struct ctl_table_header * table)
1848{ 1973{
1849} 1974}
1850 1975
1976void setup_sysctl_set(struct ctl_table_set *p,
1977 struct ctl_table_set *parent,
1978 int (*is_seen)(struct ctl_table_set *))
1979{
1980}
1981
1982void sysctl_head_put(struct ctl_table_header *head)
1983{
1984}
1985
1851#endif /* CONFIG_SYSCTL */ 1986#endif /* CONFIG_SYSCTL */
1852 1987
1853/* 1988/*
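
The sysctl changes replace the single per-root header list with per-namespace ctl_table_set lists, add reference counting on headers (count, sysctl_head_get/put/grab), and attach new headers under a matching parent via try_attach(). A hedged sketch of how a namespace-aware user would wire up its own set through the new setup_sysctl_set() hook, assuming the companion <linux/sysctl.h> changes that declare struct ctl_table_set; all names below are illustrative:

    #include <linux/sysctl.h>

    static struct ctl_table_set my_ns_set;

    /* Return non-zero when the current task should see this set's entries. */
    static int my_ns_is_seen(struct ctl_table_set *set)
    {
            return set == &my_ns_set;   /* real code would check the namespace */
    }

    static void my_ns_sysctl_init(void)
    {
            /* NULL parent falls back to sysctl_table_root.default_set. */
            setup_sysctl_set(&my_ns_set, NULL, my_ns_is_seen);
    }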
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 3d1e3e1a1971..f8d968063cea 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -72,6 +72,16 @@ void clockevents_set_mode(struct clock_event_device *dev,
72} 72}
73 73
74/** 74/**
75 * clockevents_shutdown - shutdown the device and clear next_event
76 * @dev: device to shutdown
77 */
78void clockevents_shutdown(struct clock_event_device *dev)
79{
80 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
81 dev->next_event.tv64 = KTIME_MAX;
82}
83
84/**
75 * clockevents_program_event - Reprogram the clock event device. 85 * clockevents_program_event - Reprogram the clock event device.
76 * @expires: absolute expiry time (monotonic clock) 86 * @expires: absolute expiry time (monotonic clock)
77 * 87 *
@@ -177,7 +187,7 @@ void clockevents_register_device(struct clock_event_device *dev)
177/* 187/*
178 * Noop handler when we shut down an event device 188 * Noop handler when we shut down an event device
179 */ 189 */
180static void clockevents_handle_noop(struct clock_event_device *dev) 190void clockevents_handle_noop(struct clock_event_device *dev)
181{ 191{
182} 192}
183 193
@@ -199,7 +209,6 @@ void clockevents_exchange_device(struct clock_event_device *old,
199 * released list and do a notify add later. 209 * released list and do a notify add later.
200 */ 210 */
201 if (old) { 211 if (old) {
202 old->event_handler = clockevents_handle_noop;
203 clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); 212 clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);
204 list_del(&old->list); 213 list_del(&old->list);
205 list_add(&old->list, &clockevents_released); 214 list_add(&old->list, &clockevents_released);
@@ -207,7 +216,7 @@ void clockevents_exchange_device(struct clock_event_device *old,
207 216
208 if (new) { 217 if (new) {
209 BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED); 218 BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED);
210 clockevents_set_mode(new, CLOCK_EVT_MODE_SHUTDOWN); 219 clockevents_shutdown(new);
211 } 220 }
212 local_irq_restore(flags); 221 local_irq_restore(flags);
213} 222}
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 5125ddd8196b..1ad46f3df6e7 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -245,7 +245,7 @@ static void sync_cmos_clock(unsigned long dummy)
245 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) 245 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2)
246 fail = update_persistent_clock(now); 246 fail = update_persistent_clock(now);
247 247
248 next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec; 248 next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2);
249 if (next.tv_nsec <= 0) 249 if (next.tv_nsec <= 0)
250 next.tv_nsec += NSEC_PER_SEC; 250 next.tv_nsec += NSEC_PER_SEC;
251 251
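
The ntp.c change arms the RTC sync timer half a tick early. mod_timer() rounds the expiry up to a jiffy boundary, so aiming exactly at the half-second can land the handler just outside the abs(now.tv_nsec - NSEC_PER_SEC/2) <= tick_nsec/2 window and skip update_persistent_clock(). A small worked example of the arithmetic, assuming HZ=100 (TICK_NSEC = 10 ms):

    #include <stdio.h>

    int main(void)
    {
            const long NSEC_PER_SEC = 1000000000L;
            const long TICK_NSEC    = 10000000L;    /* assumes HZ = 100 */
            long now_nsec = 0;              /* current position in the second */

            long old_delay = NSEC_PER_SEC / 2 - now_nsec;
            long new_delay = NSEC_PER_SEC / 2 - now_nsec - TICK_NSEC / 2;

            /* Jiffy rounding can add up to one tick to either value; only
             * the new target keeps the firing time centred on the 500 ms
             * mark, inside the +/- TICK_NSEC/2 acceptance window. */
            printf("old target: +%ld ns, new target: +%ld ns\n",
                   old_delay, new_delay);
            return 0;
    }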
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 31463d370b94..cb01cd8f919b 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -175,6 +175,8 @@ static void tick_do_periodic_broadcast(void)
175 */ 175 */
176static void tick_handle_periodic_broadcast(struct clock_event_device *dev) 176static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
177{ 177{
178 ktime_t next;
179
178 tick_do_periodic_broadcast(); 180 tick_do_periodic_broadcast();
179 181
180 /* 182 /*
@@ -185,10 +187,13 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
185 187
186 /* 188 /*
187 * Setup the next period for devices, which do not have 189 * Setup the next period for devices, which do not have
188 * periodic mode: 190 * periodic mode. We read dev->next_event first and add to it
191 * when the event alrady expired. clockevents_program_event()
 191 * when the event already expired. clockevents_program_event()
192 * sets dev->next_event only when the event is really
193 * programmed to the device.
189 */ 194 */
190 for (;;) { 195 for (next = dev->next_event; ;) {
191 ktime_t next = ktime_add(dev->next_event, tick_period); 196 next = ktime_add(next, tick_period);
192 197
193 if (!clockevents_program_event(dev, next, ktime_get())) 198 if (!clockevents_program_event(dev, next, ktime_get()))
194 return; 199 return;
@@ -205,7 +210,7 @@ static void tick_do_broadcast_on_off(void *why)
205 struct clock_event_device *bc, *dev; 210 struct clock_event_device *bc, *dev;
206 struct tick_device *td; 211 struct tick_device *td;
207 unsigned long flags, *reason = why; 212 unsigned long flags, *reason = why;
208 int cpu; 213 int cpu, bc_stopped;
209 214
210 spin_lock_irqsave(&tick_broadcast_lock, flags); 215 spin_lock_irqsave(&tick_broadcast_lock, flags);
211 216
@@ -223,14 +228,16 @@ static void tick_do_broadcast_on_off(void *why)
223 if (!tick_device_is_functional(dev)) 228 if (!tick_device_is_functional(dev))
224 goto out; 229 goto out;
225 230
231 bc_stopped = cpus_empty(tick_broadcast_mask);
232
226 switch (*reason) { 233 switch (*reason) {
227 case CLOCK_EVT_NOTIFY_BROADCAST_ON: 234 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
228 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: 235 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
229 if (!cpu_isset(cpu, tick_broadcast_mask)) { 236 if (!cpu_isset(cpu, tick_broadcast_mask)) {
230 cpu_set(cpu, tick_broadcast_mask); 237 cpu_set(cpu, tick_broadcast_mask);
231 if (td->mode == TICKDEV_MODE_PERIODIC) 238 if (tick_broadcast_device.mode ==
232 clockevents_set_mode(dev, 239 TICKDEV_MODE_PERIODIC)
233 CLOCK_EVT_MODE_SHUTDOWN); 240 clockevents_shutdown(dev);
234 } 241 }
235 if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) 242 if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
236 tick_broadcast_force = 1; 243 tick_broadcast_force = 1;
@@ -239,15 +246,17 @@ static void tick_do_broadcast_on_off(void *why)
239 if (!tick_broadcast_force && 246 if (!tick_broadcast_force &&
240 cpu_isset(cpu, tick_broadcast_mask)) { 247 cpu_isset(cpu, tick_broadcast_mask)) {
241 cpu_clear(cpu, tick_broadcast_mask); 248 cpu_clear(cpu, tick_broadcast_mask);
242 if (td->mode == TICKDEV_MODE_PERIODIC) 249 if (tick_broadcast_device.mode ==
250 TICKDEV_MODE_PERIODIC)
243 tick_setup_periodic(dev, 0); 251 tick_setup_periodic(dev, 0);
244 } 252 }
245 break; 253 break;
246 } 254 }
247 255
248 if (cpus_empty(tick_broadcast_mask)) 256 if (cpus_empty(tick_broadcast_mask)) {
249 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); 257 if (!bc_stopped)
250 else { 258 clockevents_shutdown(bc);
259 } else if (bc_stopped) {
251 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) 260 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
252 tick_broadcast_start_periodic(bc); 261 tick_broadcast_start_periodic(bc);
253 else 262 else
@@ -298,7 +307,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
298 307
299 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { 308 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
300 if (bc && cpus_empty(tick_broadcast_mask)) 309 if (bc && cpus_empty(tick_broadcast_mask))
301 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); 310 clockevents_shutdown(bc);
302 } 311 }
303 312
304 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 313 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
@@ -313,7 +322,7 @@ void tick_suspend_broadcast(void)
313 322
314 bc = tick_broadcast_device.evtdev; 323 bc = tick_broadcast_device.evtdev;
315 if (bc) 324 if (bc)
316 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); 325 clockevents_shutdown(bc);
317 326
318 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 327 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
319} 328}
@@ -364,16 +373,8 @@ cpumask_t *tick_get_broadcast_oneshot_mask(void)
364static int tick_broadcast_set_event(ktime_t expires, int force) 373static int tick_broadcast_set_event(ktime_t expires, int force)
365{ 374{
366 struct clock_event_device *bc = tick_broadcast_device.evtdev; 375 struct clock_event_device *bc = tick_broadcast_device.evtdev;
367 ktime_t now = ktime_get(); 376
368 int res; 377 return tick_dev_program_event(bc, expires, force);
369
370 for(;;) {
371 res = clockevents_program_event(bc, expires, now);
372 if (!res || !force)
373 return res;
374 now = ktime_get();
375 expires = ktime_add(now, ktime_set(0, bc->min_delta_ns));
376 }
377} 378}
378 379
379int tick_resume_broadcast_oneshot(struct clock_event_device *bc) 380int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
@@ -491,14 +492,52 @@ static void tick_broadcast_clear_oneshot(int cpu)
491 cpu_clear(cpu, tick_broadcast_oneshot_mask); 492 cpu_clear(cpu, tick_broadcast_oneshot_mask);
492} 493}
493 494
495static void tick_broadcast_init_next_event(cpumask_t *mask, ktime_t expires)
496{
497 struct tick_device *td;
498 int cpu;
499
500 for_each_cpu_mask_nr(cpu, *mask) {
501 td = &per_cpu(tick_cpu_device, cpu);
502 if (td->evtdev)
503 td->evtdev->next_event = expires;
504 }
505}
506
494/** 507/**
495 * tick_broadcast_setup_oneshot - setup the broadcast device 508 * tick_broadcast_setup_oneshot - setup the broadcast device
496 */ 509 */
497void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 510void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
498{ 511{
499 bc->event_handler = tick_handle_oneshot_broadcast; 512 /* Set it up only once ! */
500 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 513 if (bc->event_handler != tick_handle_oneshot_broadcast) {
501 bc->next_event.tv64 = KTIME_MAX; 514 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
515 int cpu = smp_processor_id();
516 cpumask_t mask;
517
518 bc->event_handler = tick_handle_oneshot_broadcast;
519 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
520
521 /* Take the do_timer update */
522 tick_do_timer_cpu = cpu;
523
524 /*
525 * We must be careful here. There might be other CPUs
526 * waiting for periodic broadcast. We need to set the
527 * oneshot_mask bits for those and program the
528 * broadcast device to fire.
529 */
530 mask = tick_broadcast_mask;
531 cpu_clear(cpu, mask);
532 cpus_or(tick_broadcast_oneshot_mask,
533 tick_broadcast_oneshot_mask, mask);
534
535 if (was_periodic && !cpus_empty(mask)) {
536 tick_broadcast_init_next_event(&mask, tick_next_period);
537 tick_broadcast_set_event(tick_next_period, 1);
538 } else
539 bc->next_event.tv64 = KTIME_MAX;
540 }
502} 541}
503 542
504/* 543/*
@@ -538,4 +577,12 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
538 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 577 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
539} 578}
540 579
580/*
581 * Check, whether the broadcast device is in one shot mode
582 */
583int tick_broadcast_oneshot_active(void)
584{
585 return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT;
586}
587
541#endif 588#endif
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index bf43284d6855..df12434b43ca 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -33,7 +33,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
33 */ 33 */
34ktime_t tick_next_period; 34ktime_t tick_next_period;
35ktime_t tick_period; 35ktime_t tick_period;
36int tick_do_timer_cpu __read_mostly = -1; 36int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
37DEFINE_SPINLOCK(tick_device_lock); 37DEFINE_SPINLOCK(tick_device_lock);
38 38
39/* 39/*
@@ -109,7 +109,8 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
109 if (!tick_device_is_functional(dev)) 109 if (!tick_device_is_functional(dev))
110 return; 110 return;
111 111
112 if (dev->features & CLOCK_EVT_FEAT_PERIODIC) { 112 if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
113 !tick_broadcast_oneshot_active()) {
113 clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); 114 clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC);
114 } else { 115 } else {
115 unsigned long seq; 116 unsigned long seq;
@@ -148,7 +149,7 @@ static void tick_setup_device(struct tick_device *td,
148 * If no cpu took the do_timer update, assign it to 149 * If no cpu took the do_timer update, assign it to
149 * this cpu: 150 * this cpu:
150 */ 151 */
151 if (tick_do_timer_cpu == -1) { 152 if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
152 tick_do_timer_cpu = cpu; 153 tick_do_timer_cpu = cpu;
153 tick_next_period = ktime_get(); 154 tick_next_period = ktime_get();
154 tick_period = ktime_set(0, NSEC_PER_SEC / HZ); 155 tick_period = ktime_set(0, NSEC_PER_SEC / HZ);
@@ -161,6 +162,7 @@ static void tick_setup_device(struct tick_device *td,
161 } else { 162 } else {
162 handler = td->evtdev->event_handler; 163 handler = td->evtdev->event_handler;
163 next_event = td->evtdev->next_event; 164 next_event = td->evtdev->next_event;
165 td->evtdev->event_handler = clockevents_handle_noop;
164 } 166 }
165 167
166 td->evtdev = newdev; 168 td->evtdev = newdev;
@@ -196,12 +198,10 @@ static int tick_check_new_device(struct clock_event_device *newdev)
196 struct tick_device *td; 198 struct tick_device *td;
197 int cpu, ret = NOTIFY_OK; 199 int cpu, ret = NOTIFY_OK;
198 unsigned long flags; 200 unsigned long flags;
199 cpumask_of_cpu_ptr_declare(cpumask);
200 201
201 spin_lock_irqsave(&tick_device_lock, flags); 202 spin_lock_irqsave(&tick_device_lock, flags);
202 203
203 cpu = smp_processor_id(); 204 cpu = smp_processor_id();
204 cpumask_of_cpu_ptr_next(cpumask, cpu);
205 if (!cpu_isset(cpu, newdev->cpumask)) 205 if (!cpu_isset(cpu, newdev->cpumask))
206 goto out_bc; 206 goto out_bc;
207 207
@@ -209,7 +209,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
209 curdev = td->evtdev; 209 curdev = td->evtdev;
210 210
211 /* cpu local device ? */ 211 /* cpu local device ? */
212 if (!cpus_equal(newdev->cpumask, *cpumask)) { 212 if (!cpus_equal(newdev->cpumask, cpumask_of_cpu(cpu))) {
213 213
214 /* 214 /*
215 * If the cpu affinity of the device interrupt can not 215 * If the cpu affinity of the device interrupt can not
@@ -222,7 +222,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
222 * If we have a cpu local device already, do not replace it 222 * If we have a cpu local device already, do not replace it
223 * by a non cpu local device 223 * by a non cpu local device
224 */ 224 */
225 if (curdev && cpus_equal(curdev->cpumask, *cpumask)) 225 if (curdev && cpus_equal(curdev->cpumask, cpumask_of_cpu(cpu)))
226 goto out_bc; 226 goto out_bc;
227 } 227 }
228 228
@@ -250,11 +250,11 @@ static int tick_check_new_device(struct clock_event_device *newdev)
250 * not give it back to the clockevents layer ! 250 * not give it back to the clockevents layer !
251 */ 251 */
252 if (tick_is_broadcast_device(curdev)) { 252 if (tick_is_broadcast_device(curdev)) {
253 clockevents_set_mode(curdev, CLOCK_EVT_MODE_SHUTDOWN); 253 clockevents_shutdown(curdev);
254 curdev = NULL; 254 curdev = NULL;
255 } 255 }
256 clockevents_exchange_device(curdev, newdev); 256 clockevents_exchange_device(curdev, newdev);
257 tick_setup_device(td, newdev, cpu, cpumask); 257 tick_setup_device(td, newdev, cpu, &cpumask_of_cpu(cpu));
258 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) 258 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
259 tick_oneshot_notify(); 259 tick_oneshot_notify();
260 260
@@ -301,7 +301,8 @@ static void tick_shutdown(unsigned int *cpup)
301 if (*cpup == tick_do_timer_cpu) { 301 if (*cpup == tick_do_timer_cpu) {
302 int cpu = first_cpu(cpu_online_map); 302 int cpu = first_cpu(cpu_online_map);
303 303
304 tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : -1; 304 tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu :
305 TICK_DO_TIMER_NONE;
305 } 306 }
306 spin_unlock_irqrestore(&tick_device_lock, flags); 307 spin_unlock_irqrestore(&tick_device_lock, flags);
307} 308}
@@ -312,7 +313,7 @@ static void tick_suspend(void)
312 unsigned long flags; 313 unsigned long flags;
313 314
314 spin_lock_irqsave(&tick_device_lock, flags); 315 spin_lock_irqsave(&tick_device_lock, flags);
315 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_SHUTDOWN); 316 clockevents_shutdown(td->evtdev);
316 spin_unlock_irqrestore(&tick_device_lock, flags); 317 spin_unlock_irqrestore(&tick_device_lock, flags);
317} 318}
318 319
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index f13f2b7f4fd4..469248782c23 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -1,6 +1,10 @@
1/* 1/*
2 * tick internal variable and functions used by low/high res code 2 * tick internal variable and functions used by low/high res code
3 */ 3 */
4
5#define TICK_DO_TIMER_NONE -1
6#define TICK_DO_TIMER_BOOT -2
7
4DECLARE_PER_CPU(struct tick_device, tick_cpu_device); 8DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
5extern spinlock_t tick_device_lock; 9extern spinlock_t tick_device_lock;
6extern ktime_t tick_next_period; 10extern ktime_t tick_next_period;
@@ -10,6 +14,8 @@ extern int tick_do_timer_cpu __read_mostly;
10extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); 14extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
11extern void tick_handle_periodic(struct clock_event_device *dev); 15extern void tick_handle_periodic(struct clock_event_device *dev);
12 16
17extern void clockevents_shutdown(struct clock_event_device *dev);
18
13/* 19/*
14 * NO_HZ / high resolution timer shared code 20 * NO_HZ / high resolution timer shared code
15 */ 21 */
@@ -17,6 +23,8 @@ extern void tick_handle_periodic(struct clock_event_device *dev);
17extern void tick_setup_oneshot(struct clock_event_device *newdev, 23extern void tick_setup_oneshot(struct clock_event_device *newdev,
18 void (*handler)(struct clock_event_device *), 24 void (*handler)(struct clock_event_device *),
19 ktime_t nextevt); 25 ktime_t nextevt);
26extern int tick_dev_program_event(struct clock_event_device *dev,
27 ktime_t expires, int force);
20extern int tick_program_event(ktime_t expires, int force); 28extern int tick_program_event(ktime_t expires, int force);
21extern void tick_oneshot_notify(void); 29extern void tick_oneshot_notify(void);
22extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); 30extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
@@ -27,6 +35,7 @@ extern void tick_broadcast_oneshot_control(unsigned long reason);
27extern void tick_broadcast_switch_to_oneshot(void); 35extern void tick_broadcast_switch_to_oneshot(void);
28extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); 36extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
29extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); 37extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
38extern int tick_broadcast_oneshot_active(void);
30# else /* BROADCAST */ 39# else /* BROADCAST */
31static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 40static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
32{ 41{
@@ -35,6 +44,7 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
35static inline void tick_broadcast_oneshot_control(unsigned long reason) { } 44static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
36static inline void tick_broadcast_switch_to_oneshot(void) { } 45static inline void tick_broadcast_switch_to_oneshot(void) { }
37static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } 46static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
47static inline int tick_broadcast_oneshot_active(void) { return 0; }
38# endif /* !BROADCAST */ 48# endif /* !BROADCAST */
39 49
40#else /* !ONESHOT */ 50#else /* !ONESHOT */
@@ -64,6 +74,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
64{ 74{
65 return 0; 75 return 0;
66} 76}
77static inline int tick_broadcast_oneshot_active(void) { return 0; }
67#endif /* !TICK_ONESHOT */ 78#endif /* !TICK_ONESHOT */
68 79
69/* 80/*
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 450c04935b66..2e8de678e767 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -23,24 +23,56 @@
23#include "tick-internal.h" 23#include "tick-internal.h"
24 24
25/** 25/**
26 * tick_program_event 26 * tick_program_event internal worker function
27 */ 27 */
28int tick_program_event(ktime_t expires, int force) 28int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
29 int force)
29{ 30{
30 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
31 ktime_t now = ktime_get(); 31 ktime_t now = ktime_get();
32 int i;
32 33
33 while (1) { 34 for (i = 0;;) {
34 int ret = clockevents_program_event(dev, expires, now); 35 int ret = clockevents_program_event(dev, expires, now);
35 36
36 if (!ret || !force) 37 if (!ret || !force)
37 return ret; 38 return ret;
39
40 /*
41 * We tried 2 times to program the device with the given
42 * min_delta_ns. If that's not working then we double it
43 * and emit a warning.
44 */
45 if (++i > 2) {
46 /* Increase the min. delta and try again */
47 if (!dev->min_delta_ns)
48 dev->min_delta_ns = 5000;
49 else
50 dev->min_delta_ns += dev->min_delta_ns >> 1;
51
52 printk(KERN_WARNING
53 "CE: %s increasing min_delta_ns to %lu nsec\n",
54 dev->name ? dev->name : "?",
55 dev->min_delta_ns << 1);
56
57 i = 0;
58 }
59
38 now = ktime_get(); 60 now = ktime_get();
39 expires = ktime_add(now, ktime_set(0, dev->min_delta_ns)); 61 expires = ktime_add_ns(now, dev->min_delta_ns);
40 } 62 }
41} 63}
42 64
43/** 65/**
66 * tick_program_event
67 */
68int tick_program_event(ktime_t expires, int force)
69{
70 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
71
72 return tick_dev_program_event(dev, expires, force);
73}
74
75/**
44 * tick_resume_onshot - resume oneshot mode 76 * tick_resume_onshot - resume oneshot mode
45 */ 77 */
46void tick_resume_oneshot(void) 78void tick_resume_oneshot(void)
@@ -61,7 +93,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
61{ 93{
62 newdev->event_handler = handler; 94 newdev->event_handler = handler;
63 clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); 95 clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT);
64 clockevents_program_event(newdev, next_event, ktime_get()); 96 tick_dev_program_event(newdev, next_event, 1);
65} 97}
66 98
67/** 99/**
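
tick_dev_program_event() now retries failed programming and, after every three consecutive failures, raises the device's min_delta_ns by 50% (starting from 5000 ns if it was zero), warning on each bump. A standalone sketch of just that back-off arithmetic:

    #include <stdio.h>

    int main(void)
    {
            unsigned long min_delta_ns = 0;
            int failures;

            for (failures = 1; failures <= 12; failures++) {
                    if (failures % 3 == 0) {        /* mirrors "if (++i > 2)" */
                            if (!min_delta_ns)
                                    min_delta_ns = 5000;
                            else
                                    min_delta_ns += min_delta_ns >> 1;
                            printf("after %2d failures: min_delta_ns = %lu\n",
                                   failures, min_delta_ns);
                    }
            }
            return 0;       /* prints 5000, 7500, 11250, 16875 */
    }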
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 825b4c00fe44..a4d219398167 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -20,6 +20,7 @@
20#include <linux/profile.h> 20#include <linux/profile.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/tick.h> 22#include <linux/tick.h>
23#include <linux/module.h>
23 24
24#include <asm/irq_regs.h> 25#include <asm/irq_regs.h>
25 26
@@ -75,6 +76,9 @@ static void tick_do_update_jiffies64(ktime_t now)
75 incr * ticks); 76 incr * ticks);
76 } 77 }
77 do_timer(++ticks); 78 do_timer(++ticks);
79
80 /* Keep the tick_next_period variable up to date */
81 tick_next_period = ktime_add(last_jiffies_update, tick_period);
78 } 82 }
79 write_sequnlock(&xtime_lock); 83 write_sequnlock(&xtime_lock);
80} 84}
@@ -162,6 +166,8 @@ void tick_nohz_stop_idle(int cpu)
162 ts->idle_lastupdate = now; 166 ts->idle_lastupdate = now;
163 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); 167 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
164 ts->idle_active = 0; 168 ts->idle_active = 0;
169
170 sched_clock_idle_wakeup_event(0);
165 } 171 }
166} 172}
167 173
@@ -177,6 +183,7 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
177 } 183 }
178 ts->idle_entrytime = now; 184 ts->idle_entrytime = now;
179 ts->idle_active = 1; 185 ts->idle_active = 1;
186 sched_clock_idle_sleep_event();
180 return now; 187 return now;
181} 188}
182 189
@@ -184,9 +191,17 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
184{ 191{
185 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 192 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
186 193
187 *last_update_time = ktime_to_us(ts->idle_lastupdate); 194 if (!tick_nohz_enabled)
195 return -1;
196
197 if (ts->idle_active)
198 *last_update_time = ktime_to_us(ts->idle_lastupdate);
199 else
200 *last_update_time = ktime_to_us(ktime_get());
201
188 return ktime_to_us(ts->idle_sleeptime); 202 return ktime_to_us(ts->idle_sleeptime);
189} 203}
204EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
190 205
191/** 206/**
192 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task 207 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
@@ -218,7 +233,7 @@ void tick_nohz_stop_sched_tick(int inidle)
218 */ 233 */
219 if (unlikely(!cpu_online(cpu))) { 234 if (unlikely(!cpu_online(cpu))) {
220 if (cpu == tick_do_timer_cpu) 235 if (cpu == tick_do_timer_cpu)
221 tick_do_timer_cpu = -1; 236 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
222 } 237 }
223 238
224 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 239 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
@@ -289,7 +304,6 @@ void tick_nohz_stop_sched_tick(int inidle)
289 ts->tick_stopped = 1; 304 ts->tick_stopped = 1;
290 ts->idle_jiffies = last_jiffies; 305 ts->idle_jiffies = last_jiffies;
291 rcu_enter_nohz(); 306 rcu_enter_nohz();
292 sched_clock_tick_stop(cpu);
293 } 307 }
294 308
295 /* 309 /*
@@ -301,7 +315,7 @@ void tick_nohz_stop_sched_tick(int inidle)
301 * invoked. 315 * invoked.
302 */ 316 */
303 if (cpu == tick_do_timer_cpu) 317 if (cpu == tick_do_timer_cpu)
304 tick_do_timer_cpu = -1; 318 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
305 319
306 ts->idle_sleeps++; 320 ts->idle_sleeps++;
307 321
@@ -392,7 +406,6 @@ void tick_nohz_restart_sched_tick(void)
392 select_nohz_load_balancer(0); 406 select_nohz_load_balancer(0);
393 now = ktime_get(); 407 now = ktime_get();
394 tick_do_update_jiffies64(now); 408 tick_do_update_jiffies64(now);
395 sched_clock_tick_start(cpu);
396 cpu_clear(cpu, nohz_cpu_mask); 409 cpu_clear(cpu, nohz_cpu_mask);
397 410
398 /* 411 /*
@@ -467,7 +480,7 @@ static void tick_nohz_handler(struct clock_event_device *dev)
467 * this duty, then the jiffies update is still serialized by 480 * this duty, then the jiffies update is still serialized by
468 * xtime_lock. 481 * xtime_lock.
469 */ 482 */
470 if (unlikely(tick_do_timer_cpu == -1)) 483 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
471 tick_do_timer_cpu = cpu; 484 tick_do_timer_cpu = cpu;
472 485
473 /* Check, if the jiffies need an update */ 486 /* Check, if the jiffies need an update */
@@ -569,7 +582,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
569 * this duty, then the jiffies update is still serialized by 582 * this duty, then the jiffies update is still serialized by
570 * xtime_lock. 583 * xtime_lock.
571 */ 584 */
572 if (unlikely(tick_do_timer_cpu == -1)) 585 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
573 tick_do_timer_cpu = cpu; 586 tick_do_timer_cpu = cpu;
574#endif 587#endif
575 588
@@ -621,7 +634,7 @@ void tick_setup_sched_timer(void)
621 */ 634 */
622 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 635 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
623 ts->sched_timer.function = tick_sched_timer; 636 ts->sched_timer.function = tick_sched_timer;
624 ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 637 ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
625 638
626 /* Get the next period (per cpu) */ 639 /* Get the next period (per cpu) */
627 ts->sched_timer.expires = tick_init_jiffy_update(); 640 ts->sched_timer.expires = tick_init_jiffy_update();
@@ -645,17 +658,21 @@ void tick_setup_sched_timer(void)
645 ts->nohz_mode = NOHZ_MODE_HIGHRES; 658 ts->nohz_mode = NOHZ_MODE_HIGHRES;
646#endif 659#endif
647} 660}
661#endif /* HIGH_RES_TIMERS */
648 662
663#if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS
649void tick_cancel_sched_timer(int cpu) 664void tick_cancel_sched_timer(int cpu)
650{ 665{
651 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 666 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
652 667
668# ifdef CONFIG_HIGH_RES_TIMERS
653 if (ts->sched_timer.base) 669 if (ts->sched_timer.base)
654 hrtimer_cancel(&ts->sched_timer); 670 hrtimer_cancel(&ts->sched_timer);
671# endif
655 672
656 ts->nohz_mode = NOHZ_MODE_INACTIVE; 673 ts->nohz_mode = NOHZ_MODE_INACTIVE;
657} 674}
658#endif /* HIGH_RES_TIMERS */ 675#endif
659 676
660/** 677/**
661 * Async notification about clocksource changes 678 * Async notification about clocksource changes
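
get_cpu_idle_time_us() is now exported GPL-only and returns (u64)-1 when NO_HZ accounting is inactive, with *last_update_time filled from the last idle timestamp or the current time. A sketch of a hypothetical GPL-module consumer:

    #include <linux/tick.h>
    #include <linux/kernel.h>

    static void report_idle(int cpu)
    {
            u64 wall_us;
            u64 idle_us = get_cpu_idle_time_us(cpu, &wall_us);

            if (idle_us == (u64)-1)
                    return;         /* NO_HZ idle accounting not enabled */

            printk(KERN_INFO "cpu%d: %llu us idle (sampled at %llu us)\n",
                   cpu, (unsigned long long)idle_us,
                   (unsigned long long)wall_us);
    }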
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4231a3dc224a..f6e3af31b403 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -587,7 +587,7 @@ static int __ftrace_modify_code(void *data)
587 587
588static void ftrace_run_update_code(int command) 588static void ftrace_run_update_code(int command)
589{ 589{
590 stop_machine_run(__ftrace_modify_code, &command, NR_CPUS); 590 stop_machine(__ftrace_modify_code, &command, NULL);
591} 591}
592 592
593void ftrace_disable_daemon(void) 593void ftrace_disable_daemon(void)
@@ -787,7 +787,7 @@ static int ftrace_update_code(void)
787 !ftrace_enabled || !ftraced_trigger) 787 !ftrace_enabled || !ftraced_trigger)
788 return 0; 788 return 0;
789 789
790 stop_machine_run(__ftrace_update_code, NULL, NR_CPUS); 790 stop_machine(__ftrace_update_code, NULL, NULL);
791 791
792 return 1; 792 return 1;
793} 793}
@@ -1564,7 +1564,7 @@ static int __init ftrace_dynamic_init(void)
1564 1564
1565 addr = (unsigned long)ftrace_record_ip; 1565 addr = (unsigned long)ftrace_record_ip;
1566 1566
1567 stop_machine_run(ftrace_dyn_arch_init, &addr, NR_CPUS); 1567 stop_machine(ftrace_dyn_arch_init, &addr, NULL);
1568 1568
1569 /* ftrace_dyn_arch_init places the return code in addr */ 1569 /* ftrace_dyn_arch_init places the return code in addr */
1570 if (addr) { 1570 if (addr) {
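
The ftrace call sites show the mechanical conversion rule for the new stop_machine(): the old "which CPU" argument becomes a cpumask, so NR_CPUS ("any CPU") turns into NULL and a specific CPU would become &cpumask_of_cpu(cpu), as the tick-common.c hunk above already does. Summarized as a hedged sketch with a hypothetical callback:

    #include <linux/stop_machine.h>
    #include <linux/cpumask.h>

    static int __do_update(void *arg)
    {
            return 0;
    }

    static int update_anywhere(void)
    {
            /* Old: stop_machine_run(__do_update, NULL, NR_CPUS); */
            return stop_machine(__do_update, NULL, NULL);
    }

    static int update_on_cpu(int cpu)
    {
            /* Old: stop_machine_run(__do_update, NULL, cpu); */
            return stop_machine(__do_update, NULL, &cpumask_of_cpu(cpu));
    }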
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 868e121c8e38..8f3fb3db61c3 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1183,7 +1183,6 @@ static void *find_next_entry_inc(struct trace_iterator *iter)
1183static void *s_next(struct seq_file *m, void *v, loff_t *pos) 1183static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1184{ 1184{
1185 struct trace_iterator *iter = m->private; 1185 struct trace_iterator *iter = m->private;
1186 void *last_ent = iter->ent;
1187 int i = (int)*pos; 1186 int i = (int)*pos;
1188 void *ent; 1187 void *ent;
1189 1188
@@ -1203,9 +1202,6 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1203 1202
1204 iter->pos = *pos; 1203 iter->pos = *pos;
1205 1204
1206 if (last_ent && !ent)
1207 seq_puts(m, "\n\nvim:ft=help\n");
1208
1209 return ent; 1205 return ent;
1210} 1206}
1211 1207
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 421d6fe3650e..ece6cfb649fa 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -253,12 +253,14 @@ void start_critical_timings(void)
253 if (preempt_trace() || irq_trace()) 253 if (preempt_trace() || irq_trace())
254 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); 254 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
255} 255}
256EXPORT_SYMBOL_GPL(start_critical_timings);
256 257
257void stop_critical_timings(void) 258void stop_critical_timings(void)
258{ 259{
259 if (preempt_trace() || irq_trace()) 260 if (preempt_trace() || irq_trace())
260 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); 261 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
261} 262}
263EXPORT_SYMBOL_GPL(stop_critical_timings);
262 264
263#ifdef CONFIG_IRQSOFF_TRACER 265#ifdef CONFIG_IRQSOFF_TRACER
264#ifdef CONFIG_PROVE_LOCKING 266#ifdef CONFIG_PROVE_LOCKING
@@ -337,12 +339,14 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller);
337#ifdef CONFIG_PREEMPT_TRACER 339#ifdef CONFIG_PREEMPT_TRACER
338void trace_preempt_on(unsigned long a0, unsigned long a1) 340void trace_preempt_on(unsigned long a0, unsigned long a1)
339{ 341{
340 stop_critical_timing(a0, a1); 342 if (preempt_trace())
343 stop_critical_timing(a0, a1);
341} 344}
342 345
343void trace_preempt_off(unsigned long a0, unsigned long a1) 346void trace_preempt_off(unsigned long a0, unsigned long a1)
344{ 347{
345 start_critical_timing(a0, a1); 348 if (preempt_trace())
349 start_critical_timing(a0, a1);
346} 350}
347#endif /* CONFIG_PREEMPT_TRACER */ 351#endif /* CONFIG_PREEMPT_TRACER */
348 352
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 3c8d61df4474..e303ccb62cdf 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -26,7 +26,8 @@ static struct task_struct *wakeup_task;
26static int wakeup_cpu; 26static int wakeup_cpu;
27static unsigned wakeup_prio = -1; 27static unsigned wakeup_prio = -1;
28 28
29static DEFINE_SPINLOCK(wakeup_lock); 29static raw_spinlock_t wakeup_lock =
30 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
30 31
31static void __wakeup_reset(struct trace_array *tr); 32static void __wakeup_reset(struct trace_array *tr);
32 33
@@ -56,7 +57,8 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
56 if (unlikely(disabled != 1)) 57 if (unlikely(disabled != 1))
57 goto out; 58 goto out;
58 59
59 spin_lock_irqsave(&wakeup_lock, flags); 60 local_irq_save(flags);
61 __raw_spin_lock(&wakeup_lock);
60 62
61 if (unlikely(!wakeup_task)) 63 if (unlikely(!wakeup_task))
62 goto unlock; 64 goto unlock;
@@ -71,7 +73,8 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
71 trace_function(tr, data, ip, parent_ip, flags); 73 trace_function(tr, data, ip, parent_ip, flags);
72 74
73 unlock: 75 unlock:
74 spin_unlock_irqrestore(&wakeup_lock, flags); 76 __raw_spin_unlock(&wakeup_lock);
77 local_irq_restore(flags);
75 78
76 out: 79 out:
77 atomic_dec(&data->disabled); 80 atomic_dec(&data->disabled);
@@ -145,7 +148,8 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev,
145 if (likely(disabled != 1)) 148 if (likely(disabled != 1))
146 goto out; 149 goto out;
147 150
148 spin_lock_irqsave(&wakeup_lock, flags); 151 local_irq_save(flags);
152 __raw_spin_lock(&wakeup_lock);
149 153
150 /* We could race with grabbing wakeup_lock */ 154 /* We could race with grabbing wakeup_lock */
151 if (unlikely(!tracer_enabled || next != wakeup_task)) 155 if (unlikely(!tracer_enabled || next != wakeup_task))
@@ -174,7 +178,8 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev,
174 178
175out_unlock: 179out_unlock:
176 __wakeup_reset(tr); 180 __wakeup_reset(tr);
177 spin_unlock_irqrestore(&wakeup_lock, flags); 181 __raw_spin_unlock(&wakeup_lock);
182 local_irq_restore(flags);
178out: 183out:
179 atomic_dec(&tr->data[cpu]->disabled); 184 atomic_dec(&tr->data[cpu]->disabled);
180} 185}
@@ -209,8 +214,6 @@ static void __wakeup_reset(struct trace_array *tr)
209 struct trace_array_cpu *data; 214 struct trace_array_cpu *data;
210 int cpu; 215 int cpu;
211 216
212 assert_spin_locked(&wakeup_lock);
213
214 for_each_possible_cpu(cpu) { 217 for_each_possible_cpu(cpu) {
215 data = tr->data[cpu]; 218 data = tr->data[cpu];
216 tracing_reset(data); 219 tracing_reset(data);
@@ -229,9 +232,11 @@ static void wakeup_reset(struct trace_array *tr)
229{ 232{
230 unsigned long flags; 233 unsigned long flags;
231 234
232 spin_lock_irqsave(&wakeup_lock, flags); 235 local_irq_save(flags);
236 __raw_spin_lock(&wakeup_lock);
233 __wakeup_reset(tr); 237 __wakeup_reset(tr);
234 spin_unlock_irqrestore(&wakeup_lock, flags); 238 __raw_spin_unlock(&wakeup_lock);
239 local_irq_restore(flags);
235} 240}
236 241
237static void 242static void
@@ -252,7 +257,7 @@ wakeup_check_start(struct trace_array *tr, struct task_struct *p,
252 goto out; 257 goto out;
253 258
254 /* interrupts should be off from try_to_wake_up */ 259 /* interrupts should be off from try_to_wake_up */
255 spin_lock(&wakeup_lock); 260 __raw_spin_lock(&wakeup_lock);
256 261
257 /* check for races. */ 262 /* check for races. */
258 if (!tracer_enabled || p->prio >= wakeup_prio) 263 if (!tracer_enabled || p->prio >= wakeup_prio)
@@ -274,7 +279,7 @@ wakeup_check_start(struct trace_array *tr, struct task_struct *p,
274 CALLER_ADDR1, CALLER_ADDR2, flags); 279 CALLER_ADDR1, CALLER_ADDR2, flags);
275 280
276out_locked: 281out_locked:
277 spin_unlock(&wakeup_lock); 282 __raw_spin_unlock(&wakeup_lock);
278out: 283out:
279 atomic_dec(&tr->data[cpu]->disabled); 284 atomic_dec(&tr->data[cpu]->disabled);
280} 285}
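
The file-wide change above swaps the lockdep-tracked spinlock for a raw one, with interrupt disabling done by hand, so that neither lockdep nor the tracer can recurse into the tracer's own lock. A minimal sketch of that pattern, with made-up names:

#include <linux/irqflags.h>
#include <linux/spinlock.h>

static raw_spinlock_t example_lock =
	(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;

static void example_touch_tracer_state(void)
{
	unsigned long flags;

	local_irq_save(flags);			/* explicit irq disable ... */
	__raw_spin_lock(&example_lock);		/* ... raw lock, invisible to lockdep */

	/* tracer-private state would be updated here */

	__raw_spin_unlock(&example_lock);
	local_irq_restore(flags);
}
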
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index ce2d723c10e1..db58fb66a135 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -202,7 +202,7 @@ static void start_stack_timer(int cpu)
202 202
203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
204 hrtimer->function = stack_trace_timer_fn; 204 hrtimer->function = stack_trace_timer_fn;
205 hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 205 hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
206 206
207 hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL); 207 hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
208} 208}
@@ -213,9 +213,7 @@ static void start_stack_timers(void)
213 int cpu; 213 int cpu;
214 214
215 for_each_online_cpu(cpu) { 215 for_each_online_cpu(cpu) {
216 cpumask_of_cpu_ptr(new_mask, cpu); 216 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
217
218 set_cpus_allowed_ptr(current, new_mask);
219 start_stack_timer(cpu); 217 start_stack_timer(cpu);
220 } 218 }
221 set_cpus_allowed_ptr(current, &saved_mask); 219 set_cpus_allowed_ptr(current, &saved_mask);
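
The loop above now passes &cpumask_of_cpu(cpu) straight to set_cpus_allowed_ptr(), dropping the cpumask_of_cpu_ptr() temporary. A hedged sketch of the overall bind-to-each-CPU pattern the function uses (the helper and function names are illustrative):

#include <linux/cpumask.h>
#include <linux/sched.h>

static void example_do_on_this_cpu(int cpu)
{
	/* per-cpu work would go here, e.g. arming a per-cpu timer */
}

static void example_run_on_each_online_cpu(void)
{
	cpumask_t saved_mask = current->cpus_allowed;
	int cpu;

	for_each_online_cpu(cpu) {
		/* Bind the current task to exactly this CPU ... */
		set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
		example_do_on_this_cpu(cpu);
	}

	/* ... and restore the original affinity afterwards. */
	set_cpus_allowed_ptr(current, &saved_mask);
}
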
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 3da47ccdc5e5..8ebcd8532dfb 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -94,10 +94,10 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
94 stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB; 94 stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB;
95 mmput(mm); 95 mmput(mm);
96 } 96 }
97 stats->read_char = p->rchar; 97 stats->read_char = p->ioac.rchar;
98 stats->write_char = p->wchar; 98 stats->write_char = p->ioac.wchar;
99 stats->read_syscalls = p->syscr; 99 stats->read_syscalls = p->ioac.syscr;
100 stats->write_syscalls = p->syscw; 100 stats->write_syscalls = p->ioac.syscw;
101#ifdef CONFIG_TASK_IO_ACCOUNTING 101#ifdef CONFIG_TASK_IO_ACCOUNTING
102 stats->read_bytes = p->ioac.read_bytes; 102 stats->read_bytes = p->ioac.read_bytes;
103 stats->write_bytes = p->ioac.write_bytes; 103 stats->write_bytes = p->ioac.write_bytes;
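
The plain character and syscall counters now sit next to the byte counters under p->ioac. A sketch of the copy-out, using only the fields visible in this hunk:

#include <linux/sched.h>
#include <linux/taskstats.h>

static void example_fill_io_counters(struct taskstats *stats,
				     struct task_struct *p)
{
	/* character/syscall counters, now grouped under p->ioac ... */
	stats->read_char      = p->ioac.rchar;
	stats->write_char     = p->ioac.wchar;
	stats->read_syscalls  = p->ioac.syscr;
	stats->write_syscalls = p->ioac.syscw;
#ifdef CONFIG_TASK_IO_ACCOUNTING
	/* ... alongside the block-I/O byte counters that were already there */
	stats->read_bytes     = p->ioac.read_bytes;
	stats->write_bytes    = p->ioac.write_bytes;
#endif
}
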
diff --git a/kernel/user.c b/kernel/user.c
index 865ecf57a096..39d6159fae43 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -169,7 +169,7 @@ static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
169{ 169{
170 struct user_struct *up = container_of(kobj, struct user_struct, kobj); 170 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
171 171
172 return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg)); 172 return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg));
173} 173}
174 174
175static ssize_t cpu_rt_runtime_store(struct kobject *kobj, 175static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
@@ -180,7 +180,7 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
180 unsigned long rt_runtime; 180 unsigned long rt_runtime;
181 int rc; 181 int rc;
182 182
183 sscanf(buf, "%lu", &rt_runtime); 183 sscanf(buf, "%ld", &rt_runtime);
184 184
185 rc = sched_group_set_rt_runtime(up->tg, rt_runtime); 185 rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
186 186
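
sched_group_rt_runtime() returns a long, presumably so that an unlimited runtime can be reported as -1; formatting and parsing it as unsigned mangles that case. A standalone userspace illustration of the difference (not kernel code):

#include <stdio.h>

int main(void)
{
	long rt_runtime = -1;	/* sentinel for "no limit" */

	printf("%lu\n", (unsigned long)rt_runtime);	/* 18446744073709551615 on LP64 */
	printf("%ld\n", rt_runtime);			/* -1 */
	return 0;
}
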
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index a9ab0596de44..532858fa5b88 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -6,7 +6,6 @@
6 */ 6 */
7 7
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/version.h>
10#include <linux/nsproxy.h> 9#include <linux/nsproxy.h>
11#include <linux/slab.h> 10#include <linux/slab.h>
12#include <linux/user_namespace.h> 11#include <linux/user_namespace.h>
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 64d398f12444..815237a55af8 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -12,7 +12,6 @@
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/uts.h> 13#include <linux/uts.h>
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/version.h>
16#include <linux/err.h> 15#include <linux/err.h>
17#include <linux/slab.h> 16#include <linux/slab.h>
18 17
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index fe3a56c2256d..4ab9659d269e 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -12,7 +12,6 @@
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/uts.h> 13#include <linux/uts.h>
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/version.h>
16#include <linux/sysctl.h> 15#include <linux/sysctl.h>
17 16
18static void *get_uts(ctl_table *table, int write) 17static void *get_uts(ctl_table *table, int write)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ec7e4f62aaff..4048e92aa04f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -290,11 +290,11 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
290 290
291 BUG_ON(get_wq_data(work) != cwq); 291 BUG_ON(get_wq_data(work) != cwq);
292 work_clear_pending(work); 292 work_clear_pending(work);
293 lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_); 293 lock_map_acquire(&cwq->wq->lockdep_map);
294 lock_acquire(&lockdep_map, 0, 0, 0, 2, _THIS_IP_); 294 lock_map_acquire(&lockdep_map);
295 f(work); 295 f(work);
296 lock_release(&lockdep_map, 1, _THIS_IP_); 296 lock_map_release(&lockdep_map);
297 lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_); 297 lock_map_release(&cwq->wq->lockdep_map);
298 298
299 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { 299 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
300 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " 300 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
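
lock_map_acquire()/lock_map_release() are thin lockdep wrappers used for pseudo-locks such as this one. The pattern the workqueue code relies on, record the "waits for the work" dependency on the flush side and hold the map across the callback, looks roughly like the sketch below, with illustrative names and lockdep assumed enabled:

#include <linux/lockdep.h>

static struct lock_class_key example_key;
static struct lockdep_map example_map =
	STATIC_LOCKDEP_MAP_INIT("example_map", &example_key);

static void example_flush_side(void)
{
	/* Acquire and immediately release: tells lockdep the caller may
	 * wait for work that runs under example_map. */
	lock_map_acquire(&example_map);
	lock_map_release(&example_map);
}

static void example_work_side(void (*fn)(void))
{
	/* Held across the callback, so locks taken inside fn() are ordered
	 * against anyone doing example_flush_side(). */
	lock_map_acquire(&example_map);
	fn();
	lock_map_release(&example_map);
}
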
@@ -413,8 +413,8 @@ void flush_workqueue(struct workqueue_struct *wq)
413 int cpu; 413 int cpu;
414 414
415 might_sleep(); 415 might_sleep();
416 lock_acquire(&wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_); 416 lock_map_acquire(&wq->lockdep_map);
417 lock_release(&wq->lockdep_map, 1, _THIS_IP_); 417 lock_map_release(&wq->lockdep_map);
418 for_each_cpu_mask_nr(cpu, *cpu_map) 418 for_each_cpu_mask_nr(cpu, *cpu_map)
419 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); 419 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
420} 420}
@@ -441,8 +441,8 @@ int flush_work(struct work_struct *work)
441 if (!cwq) 441 if (!cwq)
442 return 0; 442 return 0;
443 443
444 lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_); 444 lock_map_acquire(&cwq->wq->lockdep_map);
445 lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_); 445 lock_map_release(&cwq->wq->lockdep_map);
446 446
447 prev = NULL; 447 prev = NULL;
448 spin_lock_irq(&cwq->lock); 448 spin_lock_irq(&cwq->lock);
@@ -536,8 +536,8 @@ static void wait_on_work(struct work_struct *work)
536 536
537 might_sleep(); 537 might_sleep();
538 538
539 lock_acquire(&work->lockdep_map, 0, 0, 0, 2, _THIS_IP_); 539 lock_map_acquire(&work->lockdep_map);
540 lock_release(&work->lockdep_map, 1, _THIS_IP_); 540 lock_map_release(&work->lockdep_map);
541 541
542 cwq = get_wq_data(work); 542 cwq = get_wq_data(work);
543 if (!cwq) 543 if (!cwq)
@@ -830,10 +830,21 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
830 start_workqueue_thread(cwq, -1); 830 start_workqueue_thread(cwq, -1);
831 } else { 831 } else {
832 cpu_maps_update_begin(); 832 cpu_maps_update_begin();
833 /*
834 * We must place this wq on list even if the code below fails.
835 * cpu_down(cpu) can remove cpu from cpu_populated_map before
836 * destroy_workqueue() takes the lock, in that case we leak
837 * cwq[cpu]->thread.
838 */
833 spin_lock(&workqueue_lock); 839 spin_lock(&workqueue_lock);
834 list_add(&wq->list, &workqueues); 840 list_add(&wq->list, &workqueues);
835 spin_unlock(&workqueue_lock); 841 spin_unlock(&workqueue_lock);
836 842 /*
843 * We must initialize cwqs for each possible cpu even if we
844 * are going to call destroy_workqueue() finally. Otherwise
845 * cpu_up() can hit the uninitialized cwq once we drop the
846 * lock.
847 */
837 for_each_possible_cpu(cpu) { 848 for_each_possible_cpu(cpu) {
838 cwq = init_cpu_workqueue(wq, cpu); 849 cwq = init_cpu_workqueue(wq, cpu);
839 if (err || !cpu_online(cpu)) 850 if (err || !cpu_online(cpu))
@@ -861,8 +872,8 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
861 if (cwq->thread == NULL) 872 if (cwq->thread == NULL)
862 return; 873 return;
863 874
864 lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_); 875 lock_map_acquire(&cwq->wq->lockdep_map);
865 lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_); 876 lock_map_release(&cwq->wq->lockdep_map);
866 877
867 flush_cpu_workqueue(cwq); 878 flush_cpu_workqueue(cwq);
868 /* 879 /*