74 files changed, 2830 insertions, 1711 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index dd68b9059418..f6006a60df5d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -548,7 +548,7 @@ static void do_acct_process(struct bsd_acct_struct *acct,
 #endif
        spin_lock_irq(&current->sighand->siglock);
-        tty = current->signal->tty;
+        tty = current->signal->tty;     /* Safe as we hold the siglock */
        ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
        ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
        ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 972f8e61d36a..cf5bc2f5f9c3 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -243,10 +243,11 @@ static inline int open_arg(int flags, int mask)
 static int audit_match_perm(struct audit_context *ctx, int mask)
 {
+        unsigned n;
        if (unlikely(!ctx))
                return 0;
+        n = ctx->major;
-        unsigned n = ctx->major;
        switch (audit_classify_syscall(ctx->arch, n)) {
        case 0: /* native */
                if ((mask & AUDIT_PERM_WRITE) &&
@@ -1203,13 +1204,13 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
                                 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
                                 context->return_code);
-        mutex_lock(&tty_mutex);
+        spin_lock_irq(&tsk->sighand->siglock);
-        read_lock(&tasklist_lock);
        if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
                tty = tsk->signal->tty->name;
        else
                tty = "(none)";
-        read_unlock(&tasklist_lock);
+        spin_unlock_irq(&tsk->sighand->siglock);
        audit_log_format(ab,
                  " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
                  " ppid=%d pid=%d auid=%u uid=%u gid=%u"
@@ -1229,7 +1230,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
                  context->egid, context->sgid, context->fsgid, tty,
                  tsk->sessionid);
-        mutex_unlock(&tty_mutex);
        audit_log_task_info(ab, tsk);
        if (context->filterkey) {
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 13932abde159..8c6e1c17e6d3 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2735,21 +2735,24 @@ void cgroup_fork_callbacks(struct task_struct *child)
 * Called on every change to mm->owner. mm_init_owner() does not
 * invoke this routine, since it assigns the mm->owner the first time
 * and does not change it.
+ *
+ * The callbacks are invoked with mmap_sem held in read mode.
 */
 void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
 {
-        struct cgroup *oldcgrp, *newcgrp;
+        struct cgroup *oldcgrp, *newcgrp = NULL;
        if (need_mm_owner_callback) {
                int i;
                for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                        struct cgroup_subsys *ss = subsys[i];
                        oldcgrp = task_cgroup(old, ss->subsys_id);
-                        newcgrp = task_cgroup(new, ss->subsys_id);
+                        if (new)
+                                newcgrp = task_cgroup(new, ss->subsys_id);
                        if (oldcgrp == newcgrp)
                                continue;
                        if (ss->mm_owner_changed)
-                                ss->mm_owner_changed(ss, oldcgrp, newcgrp);
+                                ss->mm_owner_changed(ss, oldcgrp, newcgrp, new);
                }
        }
 }
diff --git a/kernel/compat.c b/kernel/compat.c
index 32c254a8ab9a..8eafe3eb50d9 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -23,9 +23,68 @@
 #include <linux/timex.h>
 #include <linux/migrate.h>
 #include <linux/posix-timers.h>
+#include <linux/times.h>
 #include <asm/uaccess.h>
+/*
+ * Note that the native side is already converted to a timespec, because
+ * that's what we want anyway.
+ */
+static int compat_get_timeval(struct timespec *o,
+                struct compat_timeval __user *i)
+{
+        long usec;
+        if (get_user(o->tv_sec, &i->tv_sec) ||
+            get_user(usec, &i->tv_usec))
+                return -EFAULT;
+        o->tv_nsec = usec * 1000;
+        return 0;
+}
+static int compat_put_timeval(struct compat_timeval __user *o,
+                struct timeval *i)
+{
+        return (put_user(i->tv_sec, &o->tv_sec) ||
+                put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0;
+}
+asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
+                struct timezone __user *tz)
+{
+        if (tv) {
+                struct timeval ktv;
+                do_gettimeofday(&ktv);
+                if (compat_put_timeval(tv, &ktv))
+                        return -EFAULT;
+        }
+        if (tz) {
+                if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
+                        return -EFAULT;
+        }
+        return 0;
+}
+asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
+                struct timezone __user *tz)
+{
+        struct timespec kts;
+        struct timezone ktz;
+        if (tv) {
+                if (compat_get_timeval(&kts, tv))
+                        return -EFAULT;
+        }
+        if (tz) {
+                if (copy_from_user(&ktz, tz, sizeof(ktz)))
+                        return -EFAULT;
+        }
+        return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
+}
 int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
 {
        return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) ||
@@ -150,49 +209,23 @@ asmlinkage long compat_sys_setitimer(int which,
        return 0;
 }
+static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
+{
+        return compat_jiffies_to_clock_t(clock_t_to_jiffies(x));
+}
 asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
 {
-        /*
-         *      In the SMP world we might just be unlucky and have one of
-         *      the times increment as we use it. Since the value is an
-         *      atomically safe type this is just fine. Conceptually its
-         *      as if the syscall took an instant longer to occur.
-         */
        if (tbuf) {
+                struct tms tms;
                struct compat_tms tmp;
-                struct task_struct *tsk = current;
-                struct task_struct *t;
+                do_sys_times(&tms);
-                cputime_t utime, stime, cutime, cstime;
+                /* Convert our struct tms to the compat version. */
+                tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime);
-                read_lock(&tasklist_lock);
+                tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime);
-                utime = tsk->signal->utime;
+                tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime);
-                stime = tsk->signal->stime;
+                tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime);
-                t = tsk;
-                do {
-                        utime = cputime_add(utime, t->utime);
-                        stime = cputime_add(stime, t->stime);
-                        t = next_thread(t);
-                } while (t != tsk);
-                /*
-                 * While we have tasklist_lock read-locked, no dying thread
-                 * can be updating current->signal->[us]time.  Instead,
-                 * we got their counts included in the live thread loop.
-                 * However, another thread can come in right now and
-                 * do a wait call that updates current->signal->c[us]time.
-                 * To make sure we always see that pair updated atomically,
-                 * we take the siglock around fetching them.
-                 */
-                spin_lock_irq(&tsk->sighand->siglock);
-                cutime = tsk->signal->cutime;
-                cstime = tsk->signal->cstime;
-                spin_unlock_irq(&tsk->sighand->siglock);
-                read_unlock(&tasklist_lock);
-                tmp.tms_utime = compat_jiffies_to_clock_t(cputime_to_jiffies(utime));
-                tmp.tms_stime = compat_jiffies_to_clock_t(cputime_to_jiffies(stime));
-                tmp.tms_cutime = compat_jiffies_to_clock_t(cputime_to_jiffies(cutime));
-                tmp.tms_cstime = compat_jiffies_to_clock_t(cputime_to_jiffies(cstime));
                if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
                        return -EFAULT;
        }
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f17e9854c246..86d49045daed 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -199,13 +199,14 @@ static int __ref take_cpu_down(void *_param)
        struct take_cpu_down_param *param = _param;
        int err;
-        raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
-                                param->hcpu);
        /* Ensure this CPU doesn't handle any more interrupts. */
        err = __cpu_disable();
        if (err < 0)
                return err;
+        raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
+                                param->hcpu);
        /* Force idle task to run as soon as we yield: it should
           immediately notice cpu is offline and die quickly. */
        sched_idle_next();
@@ -453,6 +454,25 @@ out:
 }
 #endif /* CONFIG_PM_SLEEP_SMP */
+/**
+ * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers
+ * @cpu: cpu that just started
+ *
+ * This function calls the cpu_chain notifiers with CPU_STARTING.
+ * It must be called by the arch code on the new cpu, before the new cpu
+ * enables interrupts and before the "boot" cpu returns from __cpu_up().
+ */
+void notify_cpu_starting(unsigned int cpu)
+{
+        unsigned long val = CPU_STARTING;
+#ifdef CONFIG_PM_SLEEP_SMP
+        if (cpu_isset(cpu, frozen_cpus))
+                val = CPU_STARTING_FROZEN;
+#endif /* CONFIG_PM_SLEEP_SMP */
+        raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu);
+}
 #endif /* CONFIG_SMP */
 /*
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d5ab79cf516d..eab7bd6628e0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -14,6 +14,8 @@
 *  2003-10-22 Updates by Stephen Hemminger.
 *  2004 May-July Rework by Paul Jackson.
 *  2006 Rework by Paul Menage to use generic cgroups
+ *  2008 Rework of the scheduler domains and CPU hotplug handling
+ *       by Max Krasnyansky
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of the Linux
@@ -236,9 +238,11 @@ static struct cpuset top_cpuset = {
 static DEFINE_MUTEX(callback_mutex);
-/* This is ugly, but preserves the userspace API for existing cpuset
+/*
+ * This is ugly, but preserves the userspace API for existing cpuset
 * users. If someone tries to mount the "cpuset" filesystem, we
- * silently switch it to mount "cgroup" instead */
+ * silently switch it to mount "cgroup" instead
+ */
 static int cpuset_get_sb(struct file_system_type *fs_type,
                         int flags, const char *unused_dev_name,
                         void *data, struct vfsmount *mnt)
@@ -473,10 +477,9 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
 }
 /*
- * Helper routine for rebuild_sched_domains().
+ * Helper routine for generate_sched_domains().
 * Do cpusets a, b have overlapping cpus_allowed masks?
 */
 static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
 {
        return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
@@ -518,26 +521,15 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
 }
 /*
- * rebuild_sched_domains()
+ * generate_sched_domains()
- *
+ *
- * This routine will be called to rebuild the scheduler's dynamic
+ * This function builds a partial partition of the system's CPUs.
- * sched domains:
+ * A 'partial partition' is a set of non-overlapping subsets whose
- * - if the flag 'sched_load_balance' of any cpuset with non-empty
+ * union is a subset of that set.
- *   'cpus' changes,
+ * The output of this function needs to be passed to kernel/sched.c
- * - or if the 'cpus' allowed changes in any cpuset which has that
+ * partition_sched_domains() routine, which will rebuild the scheduler's
- *   flag enabled,
+ * load balancing domains (sched domains) as specified by that partial
- * - or if the 'sched_relax_domain_level' of any cpuset which has
+ * partition.
- *   that flag enabled and with non-empty 'cpus' changes,
- * - or if any cpuset with non-empty 'cpus' is removed,
- * - or if a cpu gets offlined.
- *
- * This routine builds a partial partition of the systems CPUs
- * (the set of non-overlappping cpumask_t's in the array 'part'
- * below), and passes that partial partition to the kernel/sched.c
- * partition_sched_domains() routine, which will rebuild the
- * schedulers load balancing domains (sched domains) as specified
- * by that partial partition.  A 'partial partition' is a set of
- * non-overlapping subsets whose union is a subset of that set.
 *
 * See "What is sched_load_balance" in Documentation/cpusets.txt
 * for a background explanation of this.
@@ -547,13 +539,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
 * domains when operating in the severe memory shortage situations
 * that could cause allocation failures below.
 *
- * Call with cgroup_mutex held.  May take callback_mutex during
+ * Must be called with cgroup_lock held.
- * call due to the kfifo_alloc() and kmalloc() calls.  May nest
- * a call to the get_online_cpus()/put_online_cpus() pair.
- * Must not be called holding callback_mutex, because we must not
- * call get_online_cpus() while holding callback_mutex.  Elsewhere
- * the kernel nests callback_mutex inside get_online_cpus() calls.
- * So the reverse nesting would risk an ABBA deadlock.
 *
 * The three key local variables below are:
 *    q  - a linked-list queue of cpuset pointers, used to implement a
@@ -588,10 +574,10 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
 *      element of the partition (one sched domain) to be passed to
 *      partition_sched_domains().
 */
+static int generate_sched_domains(cpumask_t **domains,
-void rebuild_sched_domains(void)
+                        struct sched_domain_attr **attributes)
 {
-        LIST_HEAD(q);           /* queue of cpusets to be scanned*/
+        LIST_HEAD(q);           /* queue of cpusets to be scanned */
        struct cpuset *cp;      /* scans q */
        struct cpuset **csa;    /* array of all cpuset ptrs */
        int csn;                /* how many cpuset ptrs in csa so far */
@@ -601,23 +587,26 @@ void rebuild_sched_domains(void)
        int ndoms;              /* number of sched domains in result */
        int nslot;              /* next empty doms[] cpumask_t slot */
-        csa = NULL;
+        ndoms = 0;
        doms = NULL;
        dattr = NULL;
+        csa = NULL;
        /* Special case for the 99% of systems with one, full, sched domain */
        if (is_sched_load_balance(&top_cpuset)) {
-                ndoms = 1;
                doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
                if (!doms)
-                        goto rebuild;
+                        goto done;
                dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
                if (dattr) {
                        *dattr = SD_ATTR_INIT;
                        update_domain_attr_tree(dattr, &top_cpuset);
                }
                *doms = top_cpuset.cpus_allowed;
-                goto rebuild;
+                ndoms = 1;
+                goto done;
        }
        csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
@@ -680,61 +669,141 @@ restart:
                }
        }
-        /* Convert <csn, csa> to <ndoms, doms> */
+        /*
+         * Now we know how many domains to create.
+         * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
+         */
        doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
-        if (!doms)
+        if (!doms) {
-                goto rebuild;
+                ndoms = 0;
+                goto done;
+        }
+        /*
+         * The rest of the code, including the scheduler, can deal with
+         * dattr==NULL case. No need to abort if alloc fails.
+         */
        dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
        for (nslot = 0, i = 0; i < csn; i++) {
                struct cpuset *a = csa[i];
+                cpumask_t *dp;
                int apn = a->pn;
-                if (apn >= 0) {
+                if (apn < 0) {
-                        cpumask_t *dp = doms + nslot;
+                        /* Skip completed partitions */
+                        continue;
-                        if (nslot == ndoms) {
+                }
-                                static int warnings = 10;
-                                if (warnings) {
+                dp = doms + nslot;
-                                        printk(KERN_WARNING
-                                         "rebuild_sched_domains confused:"
+                if (nslot == ndoms) {
-                                          " nslot %d, ndoms %d, csn %d, i %d,"
+                        static int warnings = 10;
-                                          " apn %d\n",
+                        if (warnings) {
-                                          nslot, ndoms, csn, i, apn);
+                                printk(KERN_WARNING
-                                        warnings--;
+                                 "rebuild_sched_domains confused:"
-                                }
+                                  " nslot %d, ndoms %d, csn %d, i %d,"
-                                continue;
+                                  " apn %d\n",
+                                  nslot, ndoms, csn, i, apn);
+                                warnings--;
                        }
+                        continue;
+                }
-                        cpus_clear(*dp);
+                cpus_clear(*dp);
-                        if (dattr)
+                if (dattr)
-                                *(dattr + nslot) = SD_ATTR_INIT;
+                        *(dattr + nslot) = SD_ATTR_INIT;
-                        for (j = i; j < csn; j++) {
+                for (j = i; j < csn; j++) {
-                                struct cpuset *b = csa[j];
+                        struct cpuset *b = csa[j];
-                                if (apn == b->pn) {
+                        if (apn == b->pn) {
-                                        cpus_or(*dp, *dp, b->cpus_allowed);
+                                cpus_or(*dp, *dp, b->cpus_allowed);
-                                        b->pn = -1;
+                                if (dattr)
-                                        if (dattr)
+                                        update_domain_attr_tree(dattr + nslot, b);
-                                                update_domain_attr_tree(dattr
-                                                                   + nslot, b);
+                                /* Done with this partition */
-                                }
+                                b->pn = -1;
                        }
-                        nslot++;
                }
+                nslot++;
        }
        BUG_ON(nslot != ndoms);
-rebuild:
+done:
-        /* Have scheduler rebuild sched domains */
+        kfree(csa);
+        *domains    = doms;
+        *attributes = dattr;
+        return ndoms;
+}
+/*
+ * Rebuild scheduler domains.
+ *
+ * Call with neither cgroup_mutex held nor within get_online_cpus().
+ * Takes both cgroup_mutex and get_online_cpus().
+ *
+ * Cannot be directly called from cpuset code handling changes
+ * to the cpuset pseudo-filesystem, because it cannot be called
+ * from code that already holds cgroup_mutex.
+ */
+static void do_rebuild_sched_domains(struct work_struct *unused)
+{
+        struct sched_domain_attr *attr;
+        cpumask_t *doms;
+        int ndoms;
        get_online_cpus();
-        partition_sched_domains(ndoms, doms, dattr);
+        /* Generate domain masks and attrs */
+        cgroup_lock();
+        ndoms = generate_sched_domains(&doms, &attr);
+        cgroup_unlock();
+        /* Have scheduler rebuild the domains */
+        partition_sched_domains(ndoms, doms, attr);
        put_online_cpus();
+}
-done:
+static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
-        kfree(csa);
-        /* Don't kfree(doms) -- partition_sched_domains() does that. */
+/*
-        /* Don't kfree(dattr) -- partition_sched_domains() does that. */
+ * Rebuild scheduler domains, asynchronously via workqueue.
+ *
+ * If the flag 'sched_load_balance' of any cpuset with non-empty
+ * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
+ * which has that flag enabled, or if any cpuset with a non-empty
+ * 'cpus' is removed, then call this routine to rebuild the
+ * scheduler's dynamic sched domains.
+ *
+ * The rebuild_sched_domains() and partition_sched_domains()
+ * routines must nest cgroup_lock() inside get_online_cpus(),
+ * but such cpuset changes as these must nest that locking the
+ * other way, holding cgroup_lock() for much of the code.
+ *
+ * So in order to avoid an ABBA deadlock, the cpuset code handling
+ * these user changes delegates the actual sched domain rebuilding
+ * to a separate workqueue thread, which ends up processing the
+ * above do_rebuild_sched_domains() function.
+ */
+static void async_rebuild_sched_domains(void)
+{
+        schedule_work(&rebuild_sched_domains_work);
+}
+/*
+ * Accomplishes the same scheduler domain rebuild as the above
+ * async_rebuild_sched_domains(), however it directly calls the
+ * rebuild routine synchronously rather than calling it via an
+ * asynchronous work thread.
+ *
+ * This can only be called from code that is not holding
+ * cgroup_mutex (not nested in a cgroup_lock() call.)
+ */
+void rebuild_sched_domains(void)
+{
+        do_rebuild_sched_domains(NULL);
 }
 /**
@@ -774,37 +843,25 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
 /**
 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
+ * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
 *
 * Called with cgroup_mutex held
 *
 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
 * calling callback functions for each.
 *
- * Return 0 if successful, -errno if not.
+ * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
+ * if @heap != NULL.
 */
-static int update_tasks_cpumask(struct cpuset *cs)
+static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
 {
        struct cgroup_scanner scan;
-        struct ptr_heap heap;
-        int retval;
-        /*
-         * cgroup_scan_tasks() will initialize heap->gt for us.
-         * heap_init() is still needed here for we should not change
-         * cs->cpus_allowed when heap_init() fails.
-         */
-        retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
-        if (retval)
-                return retval;
        scan.cg = cs->css.cgroup;
        scan.test_task = cpuset_test_cpumask;
        scan.process_task = cpuset_change_cpumask;
-        scan.heap = &heap;
+        scan.heap = heap;
-        retval = cgroup_scan_tasks(&scan);
+        cgroup_scan_tasks(&scan);
-        heap_free(&heap);
-        return retval;
 }
 /**
@@ -814,6 +871,7 @@ static int update_tasks_cpumask(struct cpuset *cs)
 */
 static int update_cpumask(struct cpuset *cs, const char *buf)
 {
+        struct ptr_heap heap;
        struct cpuset trialcs;
        int retval;
        int is_load_balanced;
@@ -848,6 +906,10 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
        if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
                return 0;
+        retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
+        if (retval)
+                return retval;
        is_load_balanced = is_sched_load_balance(&trialcs);
        mutex_lock(&callback_mutex);
@@ -858,12 +920,12 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
         * Scan tasks in the cpuset, and update the cpumasks of any
         * that need an update.
         */
-        retval = update_tasks_cpumask(cs);
+        update_tasks_cpumask(cs, &heap);
-        if (retval < 0)
-                return retval;
+        heap_free(&heap);
        if (is_load_balanced)
-                rebuild_sched_domains();
+                async_rebuild_sched_domains();
        return 0;
 }
@@ -1090,7 +1152,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
        if (val != cs->relax_domain_level) {
                cs->relax_domain_level = val;
                if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
-                        rebuild_sched_domains();
+                        async_rebuild_sched_domains();
        }
        return 0;
@@ -1131,7 +1193,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
        mutex_unlock(&callback_mutex);
        if (cpus_nonempty && balance_flag_changed)
-                rebuild_sched_domains();
+                async_rebuild_sched_domains();
        return 0;
 }
@@ -1492,6 +1554,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
        default:
                BUG();
        }
+        /* Unreachable but makes gcc happy */
+        return 0;
 }
 static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
@@ -1504,6 +1569,9 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
        default:
                BUG();
        }
+        /* Unreachable but makes gcc happy */
+        return 0;
 }
@@ -1692,15 +1760,9 @@ static struct cgroup_subsys_state *cpuset_create(
 }
 /*
- * Locking note on the strange update_flag() call below:
- *
 * If the cpuset being removed has its flag 'sched_load_balance'
 * enabled, then simulate turning sched_load_balance off, which
- * will call rebuild_sched_domains().  The get_online_cpus()
+ * will call async_rebuild_sched_domains().
- * call in rebuild_sched_domains() must not be made while holding
- * callback_mutex.  Elsewhere the kernel nests callback_mutex inside
- * get_online_cpus() calls.  So the reverse nesting would risk an
- * ABBA deadlock.
 */
 static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
@@ -1719,7 +1781,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
 struct cgroup_subsys cpuset_subsys = {
        .name = "cpuset",
        .create = cpuset_create,
-        .destroy  = cpuset_destroy,
+        .destroy = cpuset_destroy,
        .can_attach = cpuset_can_attach,
        .attach = cpuset_attach,
        .populate = cpuset_populate,
@@ -1811,7 +1873,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
 }
 /*
- * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
+ * If CPU and/or memory hotplug handlers, below, unplug any CPUs
 * or memory nodes, we need to walk over the cpuset hierarchy,
 * removing that CPU or node from all cpusets.  If this removes the
 * last CPU or node from a cpuset, then move the tasks in the empty
@@ -1859,7 +1921,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
 * that has tasks along with an empty 'mems'.  But if we did see such
 * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
 */
-static void scan_for_empty_cpusets(const struct cpuset *root)
+static void scan_for_empty_cpusets(struct cpuset *root)
 {
        LIST_HEAD(queue);
        struct cpuset *cp;      /* scans cpusets being updated */
@@ -1896,42 +1958,13 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
                     nodes_empty(cp->mems_allowed))
                        remove_tasks_in_empty_cpuset(cp);
                else {
-                        update_tasks_cpumask(cp);
+                        update_tasks_cpumask(cp, NULL);
                        update_tasks_nodemask(cp, &oldmems);
                }
        }
 }
 /*
- * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
- * cpu_online_map and node_states[N_HIGH_MEMORY].  Force the top cpuset to
- * track what's online after any CPU or memory node hotplug or unplug event.
- *
- * Since there are two callers of this routine, one for CPU hotplug
- * events and one for memory node hotplug events, we could have coded
- * two separate routines here.  We code it as a single common routine
- * in order to minimize text size.
- */
-static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
-{
-        cgroup_lock();
-        top_cpuset.cpus_allowed = cpu_online_map;
-        top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
-        scan_for_empty_cpusets(&top_cpuset);
-        /*
-         * Scheduler destroys domains on hotplug events.
-         * Rebuild them based on the current settings.
-         */
-        if (rebuild_sd)
-                rebuild_sched_domains();
-        cgroup_unlock();
-}
-/*
 * The top_cpuset tracks what CPUs and Memory Nodes are online,
 * period.  This is necessary in order to make cpusets transparent
 * (of no affect) on systems that are actively using CPU hotplug
@@ -1939,40 +1972,52 @@ static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
 *
 * This routine ensures that top_cpuset.cpus_allowed tracks
 * cpu_online_map on each CPU hotplug (cpuhp) event.
+ *
+ * Called within get_online_cpus().  Needs to call cgroup_lock()
+ * before calling generate_sched_domains().
 */
+static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
-static int cpuset_handle_cpuhp(struct notifier_block *unused_nb,
                                unsigned long phase, void *unused_cpu)
 {
+        struct sched_domain_attr *attr;
+        cpumask_t *doms;
+        int ndoms;
        switch (phase) {
-        case CPU_UP_CANCELED:
-        case CPU_UP_CANCELED_FROZEN:
-        case CPU_DOWN_FAILED:
-        case CPU_DOWN_FAILED_FROZEN:
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
-                common_cpu_mem_hotplug_unplug(1);
                break;
        default:
                return NOTIFY_DONE;
        }
+        cgroup_lock();
+        top_cpuset.cpus_allowed = cpu_online_map;
+        scan_for_empty_cpusets(&top_cpuset);
+        ndoms = generate_sched_domains(&doms, &attr);
+        cgroup_unlock();
+        /* Have scheduler rebuild the domains */
+        partition_sched_domains(ndoms, doms, attr);
        return NOTIFY_OK;
 }
 #ifdef CONFIG_MEMORY_HOTPLUG
 /*
 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
- * Call this routine anytime after you change
+ * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
- * node_states[N_HIGH_MEMORY].
+ * See also the previous routine cpuset_track_online_cpus().
- * See also the previous routine cpuset_handle_cpuhp().
 */
 void cpuset_track_online_nodes(void)
 {
-        common_cpu_mem_hotplug_unplug(0);
+        cgroup_lock();
+        top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
+        scan_for_empty_cpusets(&top_cpuset);
+        cgroup_unlock();
 }
 #endif
@@ -1987,7 +2032,7 @@ void __init cpuset_init_smp(void)
        top_cpuset.cpus_allowed = cpu_online_map;
        top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
-        hotcpu_notifier(cpuset_handle_cpuhp, 0);
+        hotcpu_notifier(cpuset_track_online_cpus, 0);
 }
 /**
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
index c1d4d5b4c61c..f013a0c2e111 100644
--- a/kernel/dma-coherent.c
+++ b/kernel/dma-coherent.c
@@ -124,6 +124,7 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size,
        }
        return (mem != NULL);
 }
+EXPORT_SYMBOL(dma_alloc_from_coherent);
 /**
 * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool
@@ -151,3 +152,4 @@ int dma_release_from_coherent(struct device *dev, int order, void *vaddr)
        }
        return 0;
 }
+EXPORT_SYMBOL(dma_release_from_coherent);
diff --git a/kernel/dma.c b/kernel/dma.c
index d2c60a822790..f903189c5304 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -1,4 +1,4 @@
-/* $Id: dma.c,v 1.7 1994/12/28 03:35:33 root Exp root $
+/*
 * linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c.
 *
 * Written by Hennus Bergman, 1992.
diff --git a/kernel/exit.c b/kernel/exit.c
index 38ec40630149..059b38cae384 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -112,9 +112,7 @@ static void __exit_signal(struct task_struct *tsk)
                 * We won't ever get here for the group leader, since it
                 * will have been the last reference on the signal_struct.
                 */
-                sig->utime = cputime_add(sig->utime, tsk->utime);
+                sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
-                sig->stime = cputime_add(sig->stime, tsk->stime);
-                sig->gtime = cputime_add(sig->gtime, tsk->gtime);
                sig->min_flt += tsk->min_flt;
                sig->maj_flt += tsk->maj_flt;
                sig->nvcsw += tsk->nvcsw;
@@ -122,7 +120,6 @@ static void __exit_signal(struct task_struct *tsk)
                sig->inblock += task_io_get_inblock(tsk);
                sig->oublock += task_io_get_oublock(tsk);
                task_io_accounting_add(&sig->ioac, &tsk->ioac);
-                sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
                sig = NULL; /* Marker for below. */
        }
@@ -583,8 +580,6 @@ mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
         * If there are other users of the mm and the owner (us) is exiting
         * we need to find a new owner to take on the responsibility.
         */
-        if (!mm)
-                return 0;
        if (atomic_read(&mm->mm_users) <= 1)
                return 0;
        if (mm->owner != p)
@@ -627,29 +622,38 @@ retry:
        } while_each_thread(g, c);
        read_unlock(&tasklist_lock);
+        /*
+         * We found no owner yet mm_users > 1: this implies that we are
+         * most likely racing with swapoff (try_to_unuse()) or /proc or
+         * ptrace or page migration (get_task_mm()).  Mark owner as NULL,
+         * so that subsystems can understand the callback and take action.
+         */
+        down_write(&mm->mmap_sem);
+        cgroup_mm_owner_callbacks(mm->owner, NULL);
+        mm->owner = NULL;
+        up_write(&mm->mmap_sem);
        return;
 assign_new_owner:
        BUG_ON(c == p);
        get_task_struct(c);
+        read_unlock(&tasklist_lock);
+        down_write(&mm->mmap_sem);
        /*
         * The task_lock protects c->mm from changing.
         * We always want mm->owner->mm == mm
         */
        task_lock(c);
-        /*
-         * Delay read_unlock() till we have the task_lock()
-         * to ensure that c does not slip away underneath us
-         */
-        read_unlock(&tasklist_lock);
        if (c->mm != mm) {
                task_unlock(c);
+                up_write(&mm->mmap_sem);
                put_task_struct(c);
                goto retry;
        }
        cgroup_mm_owner_callbacks(mm->owner, c);
        mm->owner = c;
        task_unlock(c);
+        up_write(&mm->mmap_sem);
        put_task_struct(c);
 }
 #endif /* CONFIG_MM_OWNER */
@@ -831,26 +835,50 @@ static void reparent_thread(struct task_struct *p, struct task_struct *father)
 * the child reaper process (ie "init") in our pid
 * space.
 */
+static struct task_struct *find_new_reaper(struct task_struct *father)
+{
+        struct pid_namespace *pid_ns = task_active_pid_ns(father);
+        struct task_struct *thread;
+        thread = father;
+        while_each_thread(father, thread) {
+                if (thread->flags & PF_EXITING)
+                        continue;
+                if (unlikely(pid_ns->child_reaper == father))
+                        pid_ns->child_reaper = thread;
+                return thread;
+        }
+        if (unlikely(pid_ns->child_reaper == father)) {
+                write_unlock_irq(&tasklist_lock);
+                if (unlikely(pid_ns == &init_pid_ns))
+                        panic("Attempted to kill init!");
+                zap_pid_ns_processes(pid_ns);
+                write_lock_irq(&tasklist_lock);
+                /*
+                 * We can not clear ->child_reaper or leave it alone.
+                 * There may be stealth EXIT_DEAD tasks on ->children,
+                 * forget_original_parent() must move them somewhere.
+                 */
+                pid_ns->child_reaper = init_pid_ns.child_reaper;
+        }
+        return pid_ns->child_reaper;
+}
 static void forget_original_parent(struct task_struct *father)
 {
-        struct task_struct *p, *n, *reaper = father;
+        struct task_struct *p, *n, *reaper;
        LIST_HEAD(ptrace_dead);
        write_lock_irq(&tasklist_lock);
+        reaper = find_new_reaper(father);
        /*
         * First clean up ptrace if we were using it.
         */
        ptrace_exit(father, &ptrace_dead);
-        do {
-                reaper = next_thread(reaper);
-                if (reaper == father) {
-                        reaper = task_child_reaper(father);
-                        break;
-                }
-        } while (reaper->flags & PF_EXITING);
        list_for_each_entry_safe(p, n, &father->children, sibling) {
                p->real_parent = reaper;
                if (p->parent == father) {
@@ -918,8 +946,8 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
        /* mt-exec, de_thread() is waiting for us */
        if (thread_group_leader(tsk) &&
-            tsk->signal->notify_count < 0 &&
+            tsk->signal->group_exit_task &&
-            tsk->signal->group_exit_task)
+            tsk->signal->notify_count < 0)
                wake_up_process(tsk->signal->group_exit_task);
        write_unlock_irq(&tasklist_lock);
@@ -959,39 +987,6 @@ static void check_stack_usage(void)
 static inline void check_stack_usage(void) {}
 #endif
-static inline void exit_child_reaper(struct task_struct *tsk)
-{
-        if (likely(tsk->group_leader != task_child_reaper(tsk)))
-                return;
-        if (tsk->nsproxy->pid_ns == &init_pid_ns)
-                panic("Attempted to kill init!");
-        /*
-         * @tsk is the last thread in the 'cgroup-init' and is exiting.
-         * Terminate all remaining processes in the namespace and reap them
-         * before exiting @tsk.
-         *
-         * Note that @tsk (last thread of cgroup-init) may not necessarily
-         * be the child-reaper (i.e main thread of cgroup-init) of the
-         * namespace i.e the child_reaper may have already exited.
-         *
-         * Even after a child_reaper exits, we let it inherit orphaned children,
-         * because, pid_ns->child_reaper remains valid as long as there is
-         * at least one living sub-thread in the cgroup init.
-         * This living sub-thread of the cgroup-init will be notified when
-         * a child inherited by the 'child-reaper' exits (do_notify_parent()
-         * uses __group_send_sig_info()). Further, when reaping child processes,
-         * do_wait() iterates over children of all living sub threads.
-         * i.e even though 'child_reaper' thread is listed as the parent of the
-         * orphaned children, any living sub-thread in the cgroup-init can
-         * perform the role of the child_reaper.
-         */
-        zap_pid_ns_processes(tsk->nsproxy->pid_ns);
-}
 NORET_TYPE void do_exit(long code)
 {
        struct task_struct *tsk = current;
@@ -1051,7 +1046,6 @@ NORET_TYPE void do_exit(long code)
        }
        group_dead = atomic_dec_and_test(&tsk->signal->live);
        if (group_dead) {
-                exit_child_reaper(tsk);
                hrtimer_cancel(&tsk->signal->real_timer);
                exit_itimers(tsk->signal);
        }
@@ -1304,6 +1298,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
        if (likely(!traced)) {
                struct signal_struct *psig;
                struct signal_struct *sig;
+                struct task_cputime cputime;
                /*
                 * The resource counters for the group leader are in its
@@ -1319,20 +1314,23 @@ static int wait_task_zombie(struct task_struct *p, int options,
                 * need to protect the access to p->parent->signal fields,
                 * as other threads in the parent group can be right
                 * here reaping other children at the same time.
+                 *
+                 * We use thread_group_cputime() to get times for the thread
+                 * group, which consolidates times for all threads in the
+                 * group including the group leader.
                 */
                spin_lock_irq(&p->parent->sighand->siglock);
                psig = p->parent->signal;
                sig = p->signal;
+                thread_group_cputime(p, &cputime);
                psig->cutime =
                        cputime_add(psig->cutime,
-                        cputime_add(p->utime,
+                        cputime_add(cputime.utime,
-                        cputime_add(sig->utime,
+                                    sig->cutime));
-                                    sig->cutime)));
                psig->cstime =
                        cputime_add(psig->cstime,
-                        cputime_add(p->stime,
+                        cputime_add(cputime.stime,
-                        cputime_add(sig->stime,
+                                    sig->cstime));
-                                    sig->cstime)));
                psig->cgtime =
                        cputime_add(psig->cgtime,
                        cputime_add(p->gtime,
diff --git a/kernel/fork.c b/kernel/fork.c
index 7ce2ebe84796..44e64d7ba29b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -759,15 +759,44 @@ void __cleanup_sighand(struct sighand_struct *sighand)
                kmem_cache_free(sighand_cachep, sighand);
 }
+/*
+ * Initialize POSIX timer handling for a thread group.
+ */
+static void posix_cpu_timers_init_group(struct signal_struct *sig)
+{
+        /* Thread group counters. */
+        thread_group_cputime_init(sig);
+        /* Expiration times and increments. */
+        sig->it_virt_expires = cputime_zero;
+        sig->it_virt_incr = cputime_zero;
+        sig->it_prof_expires = cputime_zero;
+        sig->it_prof_incr = cputime_zero;
+        /* Cached expiration times. */
+        sig->cputime_expires.prof_exp = cputime_zero;
+        sig->cputime_expires.virt_exp = cputime_zero;
+        sig->cputime_expires.sched_exp = 0;
+        /* The timer lists. */
+        INIT_LIST_HEAD(&sig->cpu_timers[0]);
+        INIT_LIST_HEAD(&sig->cpu_timers[1]);
+        INIT_LIST_HEAD(&sig->cpu_timers[2]);
+}
 static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 {
        struct signal_struct *sig;
        int ret;
        if (clone_flags & CLONE_THREAD) {
-                atomic_inc(&current->signal->count);
+                ret = thread_group_cputime_clone_thread(current);
-                atomic_inc(&current->signal->live);
+                if (likely(!ret)) {
-                return 0;
+                        atomic_inc(&current->signal->count);
+                        atomic_inc(&current->signal->live);
+                }
+                return ret;
        }
        sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
        tsk->signal = sig;
@@ -795,39 +824,25 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
        sig->it_real_incr.tv64 = 0;
        sig->real_timer.function = it_real_fn;
-        sig->it_virt_expires = cputime_zero;
-        sig->it_virt_incr = cputime_zero;
-        sig->it_prof_expires = cputime_zero;
-        sig->it_prof_incr = cputime_zero;
        sig->leader = 0;        /* session leadership doesn't inherit */
        sig->tty_old_pgrp = NULL;
+        sig->tty = NULL;
-        sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
+        sig->cutime = sig->cstime = cputime_zero;
        sig->gtime = cputime_zero;
        sig->cgtime = cputime_zero;
        sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
        sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
        sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
        task_io_accounting_init(&sig->ioac);
-        sig->sum_sched_runtime = 0;
-        INIT_LIST_HEAD(&sig->cpu_timers[0]);
-        INIT_LIST_HEAD(&sig->cpu_timers[1]);
-        INIT_LIST_HEAD(&sig->cpu_timers[2]);
        taskstats_tgid_init(sig);
        task_lock(current->group_leader);
        memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
        task_unlock(current->group_leader);
-        if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
+        posix_cpu_timers_init_group(sig);
-                /*
-                 * New sole thread in the process gets an expiry time
-                 * of the whole CPU time limit.
-                 */
-                tsk->it_prof_expires =
-                        secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
-        }
        acct_init_pacct(&sig->pacct);
        tty_audit_fork(sig);
@@ -837,7 +852,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 void __cleanup_signal(struct signal_struct *sig)
 {
+        thread_group_cputime_free(sig);
        exit_thread_group_keys(sig);
+        tty_kref_put(sig->tty);
        kmem_cache_free(signal_cachep, sig);
 }
@@ -886,6 +903,19 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
 #endif /* CONFIG_MM_OWNER */
 /*
+ * Initialize POSIX timer handling for a single task.
+ */
+static void posix_cpu_timers_init(struct task_struct *tsk)
+{
+        tsk->cputime_expires.prof_exp = cputime_zero;
+        tsk->cputime_expires.virt_exp = cputime_zero;
+        tsk->cputime_expires.sched_exp = 0;
+        INIT_LIST_HEAD(&tsk->cpu_timers[0]);
+        INIT_LIST_HEAD(&tsk->cpu_timers[1]);
+        INIT_LIST_HEAD(&tsk->cpu_timers[2]);
+}
+/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
@@ -995,12 +1025,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        task_io_accounting_init(&p->ioac);
        acct_clear_integrals(p);
-        p->it_virt_expires = cputime_zero;
+        posix_cpu_timers_init(p);
-        p->it_prof_expires = cputime_zero;
-        p->it_sched_expires = 0;
-        INIT_LIST_HEAD(&p->cpu_timers[0]);
-        INIT_LIST_HEAD(&p->cpu_timers[1]);
-        INIT_LIST_HEAD(&p->cpu_timers[2]);
        p->lock_depth = -1;             /* -1 = no lock */
        do_posix_clock_monotonic_gettime(&p->start_time);
@@ -1201,21 +1226,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        if (clone_flags & CLONE_THREAD) {
                p->group_leader = current->group_leader;
                list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
-                if (!cputime_eq(current->signal->it_virt_expires,
-                                cputime_zero) ||
-                    !cputime_eq(current->signal->it_prof_expires,
-                                cputime_zero) ||
-                    current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY ||
-                    !list_empty(&current->signal->cpu_timers[0]) ||
-                    !list_empty(&current->signal->cpu_timers[1]) ||
-                    !list_empty(&current->signal->cpu_timers[2])) {
-                        /*
-                         * Have child wake up on its first tick to check
-                         * for process CPU timers.
-                         */
-                        p->it_prof_expires = jiffies_to_cputime(1);
-                }
        }
        if (likely(p->pid)) {
@@ -1227,7 +1237,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                                p->nsproxy->pid_ns->child_reaper = p;
                        p->signal->leader_pid = pid;
-                        p->signal->tty = current->signal->tty;
+                        tty_kref_put(p->signal->tty);
+                        p->signal->tty = tty_kref_get(current->signal->tty);
                        set_task_pgrp(p, task_pgrp_nr(current));
                        set_task_session(p, task_session_nr(current));
                        attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 4d761d50c529..95978f48e039 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -672,13 +672,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
                         */
                        BUG_ON(timer->function(timer) != HRTIMER_NORESTART);
                        return 1;
-                case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ:
+                case HRTIMER_CB_IRQSAFE_PERCPU:
+                case HRTIMER_CB_IRQSAFE_UNLOCKED:
                        /*
                         * This is solely for the sched tick emulation with
                         * dynamic tick support to ensure that we do not
                         * restart the tick right on the edge and end up with
                         * the tick timer in the softirq ! The calling site
-                         * takes care of this.
+                         * takes care of this. Also used for hrtimer sleeper !
                         */
                        debug_hrtimer_deactivate(timer);
                        return 1;
@@ -1245,7 +1246,8 @@ static void __run_hrtimer(struct hrtimer *timer)
        timer_stats_account_hrtimer(timer);
        fn = timer->function;
-        if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
+        if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
+            timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) {
                /*
                 * Used for scheduler timers, avoid lock inversion with
                 * rq->lock and tasklist_lock.
@@ -1450,7 +1452,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
        sl->timer.function = hrtimer_wakeup;
        sl->task = task;
 #ifdef CONFIG_HIGH_RES_TIMERS
-        sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+        sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
 #endif
 }
@@ -1589,29 +1591,95 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
 #ifdef CONFIG_HOTPLUG_CPU
-static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
+static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
-                                struct hrtimer_clock_base *new_base)
+                                struct hrtimer_clock_base *new_base, int dcpu)
 {
        struct hrtimer *timer;
        struct rb_node *node;
+        int raise = 0;
        while ((node = rb_first(&old_base->active))) {
                timer = rb_entry(node, struct hrtimer, node);
                BUG_ON(hrtimer_callback_running(timer));
                debug_hrtimer_deactivate(timer);
-                __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0);
+                /*
+                 * Should not happen. Per CPU timers should be
+                 * canceled _before_ the migration code is called
+                 */
+                if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) {
+                        __remove_hrtimer(timer, old_base,
+                                         HRTIMER_STATE_INACTIVE, 0);
+                        WARN(1, "hrtimer (%p %p)active but cpu %d dead\n",
+                             timer, timer->function, dcpu);
+                        continue;
+                }
+                /*
+                 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
+                 * timer could be seen as !active and just vanish away
+                 * under us on another CPU
+                 */
+                __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
                timer->base = new_base;
                /*
                 * Enqueue the timer. Allow reprogramming of the event device
                 */
                enqueue_hrtimer(timer, new_base, 1);
+#ifdef CONFIG_HIGH_RES_TIMERS
+                /*
+                 * Happens with high res enabled when the timer was
+                 * already expired and the callback mode is
+                 * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The
+                 * enqueue code does not move them to the soft irq
+                 * pending list for performance/latency reasons, but
+                 * in the migration state, we need to do that
+                 * otherwise we end up with a stale timer.
+                 */
+                if (timer->state == HRTIMER_STATE_MIGRATE) {
+                        timer->state = HRTIMER_STATE_PENDING;
+                        list_add_tail(&timer->cb_entry,
+                                      &new_base->cpu_base->cb_pending);
+                        raise = 1;
+                }
+#endif
+                /* Clear the migration state bit */
+                timer->state &= ~HRTIMER_STATE_MIGRATE;
+        }
+        return raise;
+}
+#ifdef CONFIG_HIGH_RES_TIMERS
+static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
+                                   struct hrtimer_cpu_base *new_base)
+{
+        struct hrtimer *timer;
+        int raise = 0;
+        while (!list_empty(&old_base->cb_pending)) {
+                timer = list_entry(old_base->cb_pending.next,
+                                   struct hrtimer, cb_entry);
+                __remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0);
+                timer->base = &new_base->clock_base[timer->base->index];
+                list_add_tail(&timer->cb_entry, &new_base->cb_pending);
+                raise = 1;
        }
+        return raise;
+}
+#else
+static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
+                                   struct hrtimer_cpu_base *new_base)
+{
+        return 0;
 }
+#endif
 static void migrate_hrtimers(int cpu)
 {
        struct hrtimer_cpu_base *old_base, *new_base;
-        int i;
+        int i, raise = 0;
        BUG_ON(cpu_online(cpu));
        old_base = &per_cpu(hrtimer_bases, cpu);
@@ -1626,13 +1694,20 @@ static void migrate_hrtimers(int cpu)
        spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
        for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
-                migrate_hrtimer_list(&old_base->clock_base[i],
+                if (migrate_hrtimer_list(&old_base->clock_base[i],
-                                     &new_base->clock_base[i]);
+                                         &new_base->clock_base[i], cpu))
+                        raise = 1;
        }
+        if (migrate_hrtimer_pending(old_base, new_base))
+                raise = 1;
        spin_unlock(&old_base->lock);
        spin_unlock_irq(&new_base->lock);
        put_cpu_var(hrtimer_bases);
+        if (raise)
+                hrtimer_raise_softirq();
 }
 #endif /* CONFIG_HOTPLUG_CPU */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0314074fa232..60c49e324390 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -89,7 +89,14 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
        set_balance_irq_affinity(irq, cpumask);
 #ifdef CONFIG_GENERIC_PENDING_IRQ
-        set_pending_irq(irq, cpumask);
+        if (desc->status & IRQ_MOVE_PCNTXT) {
+                unsigned long flags;
+                spin_lock_irqsave(&desc->lock, flags);
+                desc->chip->set_affinity(irq, cpumask);
+                spin_unlock_irqrestore(&desc->lock, flags);
+        } else
+                set_pending_irq(irq, cpumask);
 #else
        desc->affinity = cpumask;
        desc->chip->set_affinity(irq, cpumask);
diff --git a/kernel/itimer.c b/kernel/itimer.c
index ab982747d9bd..db7c358b9a02 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -55,17 +55,15 @@ int do_getitimer(int which, struct itimerval *value)
                spin_unlock_irq(&tsk->sighand->siglock);
                break;
        case ITIMER_VIRTUAL:
-                read_lock(&tasklist_lock);
                spin_lock_irq(&tsk->sighand->siglock);
                cval = tsk->signal->it_virt_expires;
                cinterval = tsk->signal->it_virt_incr;
                if (!cputime_eq(cval, cputime_zero)) {
-                        struct task_struct *t = tsk;
+                        struct task_cputime cputime;
-                        cputime_t utime = tsk->signal->utime;
+                        cputime_t utime;
-                        do {
-                                utime = cputime_add(utime, t->utime);
+                        thread_group_cputime(tsk, &cputime);
-                                t = next_thread(t);
+                        utime = cputime.utime;
-                        } while (t != tsk);
                        if (cputime_le(cval, utime)) { /* about to fire */
                                cval = jiffies_to_cputime(1);
                        } else {
@@ -73,25 +71,19 @@ int do_getitimer(int which, struct itimerval *value)
                        }
                }
                spin_unlock_irq(&tsk->sighand->siglock);
-                read_unlock(&tasklist_lock);
                cputime_to_timeval(cval, &value->it_value);
                cputime_to_timeval(cinterval, &value->it_interval);
                break;
        case ITIMER_PROF:
-                read_lock(&tasklist_lock);
                spin_lock_irq(&tsk->sighand->siglock);
                cval = tsk->signal->it_prof_expires;
                cinterval = tsk->signal->it_prof_incr;
                if (!cputime_eq(cval, cputime_zero)) {
-                        struct task_struct *t = tsk;
+                        struct task_cputime times;
-                        cputime_t ptime = cputime_add(tsk->signal->utime,
+                        cputime_t ptime;
-                                                      tsk->signal->stime);
-                        do {
+                        thread_group_cputime(tsk, &times);
-                                ptime = cputime_add(ptime,
+                        ptime = cputime_add(times.utime, times.stime);
-                                                    cputime_add(t->utime,
-                                                                t->stime));
-                                t = next_thread(t);
-                        } while (t != tsk);
                        if (cputime_le(cval, ptime)) { /* about to fire */
                                cval = jiffies_to_cputime(1);
                        } else {
@@ -99,7 +91,6 @@ int do_getitimer(int which, struct itimerval *value)
                        }
                }
                spin_unlock_irq(&tsk->sighand->siglock);
-                read_unlock(&tasklist_lock);
                cputime_to_timeval(cval, &value->it_value);
                cputime_to_timeval(cinterval, &value->it_interval);
                break;
@@ -185,7 +176,6 @@ again:
        case ITIMER_VIRTUAL:
                nval = timeval_to_cputime(&value->it_value);
                ninterval = timeval_to_cputime(&value->it_interval);
-                read_lock(&tasklist_lock);
                spin_lock_irq(&tsk->sighand->siglock);
                cval = tsk->signal->it_virt_expires;
                cinterval = tsk->signal->it_virt_incr;
@@ -200,7 +190,6 @@ again:
                tsk->signal->it_virt_expires = nval;
                tsk->signal->it_virt_incr = ninterval;
                spin_unlock_irq(&tsk->sighand->siglock);
-                read_unlock(&tasklist_lock);
                if (ovalue) {
                        cputime_to_timeval(cval, &ovalue->it_value);
                        cputime_to_timeval(cinterval, &ovalue->it_interval);
@@ -209,7 +198,6 @@ again:
        case ITIMER_PROF:
                nval = timeval_to_cputime(&value->it_value);
                ninterval = timeval_to_cputime(&value->it_interval);
-                read_lock(&tasklist_lock);
                spin_lock_irq(&tsk->sighand->siglock);
                cval = tsk->signal->it_prof_expires;
                cinterval = tsk->signal->it_prof_incr;
@@ -224,7 +212,6 @@ again:
                tsk->signal->it_prof_expires = nval;
                tsk->signal->it_prof_incr = ninterval;
                spin_unlock_irq(&tsk->sighand->siglock);
-                read_unlock(&tasklist_lock);
                if (ovalue) {
                        cputime_to_timeval(cval, &ovalue->it_value);
                        cputime_to_timeval(cinterval, &ovalue->it_interval);
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 38fc10ac7541..5072cf1685a2 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -260,7 +260,6 @@ const char *kallsyms_lookup(unsigned long addr,
        /* see if it's in a module */
        return module_address_lookup(addr, symbolsize, offset, modname,
                                     namebuf);
-        return NULL;
 }
 int lookup_symbol_name(unsigned long addr, char *symname)
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 59f3f0df35d4..aef265325cd3 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -753,8 +753,14 @@ static struct page *kimage_alloc_page(struct kimage *image,
                        *old = addr | (*old & ~PAGE_MASK);
                        /* The old page I have found cannot be a
-                         * destination page, so return it.
+                         * destination page, so return it if its
+                         * gfp_flags honor the ones passed in.
                         */
+                        if (!(gfp_mask & __GFP_HIGHMEM) &&
+                            PageHighMem(old_page)) {
+                                kimage_free_pages(old_page);
+                                continue;
+                        }
                        addr = old_addr;
                        page = old_page;
                        break;
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index eaa21fc9ad1d..e4dcfb2272a4 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -488,7 +488,7 @@ static int write_mem_msg(int binary)
                if (err)
                        return err;
                if (CACHE_FLUSH_IS_SAFE)
-                        flush_icache_range(addr, addr + length + 1);
+                        flush_icache_range(addr, addr + length);
                return 0;
        }
@@ -590,6 +590,7 @@ static void kgdb_wait(struct pt_regs *regs)
        /* Signal the primary CPU that we are done: */
        atomic_set(&cpu_in_kgdb[cpu], 0);
+        touch_softlockup_watchdog();
        clocksource_touch_watchdog();
        local_irq_restore(flags);
 }
@@ -1432,6 +1433,7 @@ acquirelock:
            atomic_read(&kgdb_cpu_doing_single_step) != cpu) {
                atomic_set(&kgdb_active, -1);
+                touch_softlockup_watchdog();
                clocksource_touch_watchdog();
                local_irq_restore(flags);
@@ -1462,7 +1464,7 @@ acquirelock:
         * Get the passive CPU lock which will hold all the non-primary
         * CPU in a spin state while the debugger is active
         */
-        if (!kgdb_single_step || !kgdb_contthread) {
+        if (!kgdb_single_step) {
                for (i = 0; i < NR_CPUS; i++)
                        atomic_set(&passive_cpu_wait[i], 1);
        }
@@ -1475,7 +1477,7 @@ acquirelock:
 #ifdef CONFIG_SMP
        /* Signal the other CPUs to enter kgdb_wait() */
-        if ((!kgdb_single_step || !kgdb_contthread) && kgdb_do_roundup)
+        if ((!kgdb_single_step) && kgdb_do_roundup)
                kgdb_roundup_cpus(flags);
 #endif
@@ -1494,7 +1496,7 @@ acquirelock:
        kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code);
        kgdb_deactivate_sw_breakpoints();
        kgdb_single_step = 0;
-        kgdb_contthread = NULL;
+        kgdb_contthread = current;
        exception_level = 0;
        /* Talk to debugger with gdbserial protocol */
@@ -1508,7 +1510,7 @@ acquirelock:
        kgdb_info[ks->cpu].task = NULL;
        atomic_set(&cpu_in_kgdb[ks->cpu], 0);
-        if (!kgdb_single_step || !kgdb_contthread) {
+        if (!kgdb_single_step) {
                for (i = NR_CPUS-1; i >= 0; i--)
                        atomic_set(&passive_cpu_wait[i], 0);
                /*
@@ -1524,6 +1526,7 @@ acquirelock:
 kgdb_restore:
        /* Free kgdb_active */
        atomic_set(&kgdb_active, -1);
+        touch_softlockup_watchdog();
        clocksource_touch_watchdog();
        local_irq_restore(flags);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 2456d1a0befb..3d3c3ea3a023 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -113,7 +113,7 @@ int request_module(const char *fmt, ...)
        return ret;
 }
 EXPORT_SYMBOL(request_module);
-#endif /* CONFIG_KMOD */
+#endif /* CONFIG_MODULES */
 struct subprocess_info {
        struct work_struct work;
@@ -265,7 +265,7 @@ static void __call_usermodehelper(struct work_struct *work)
        }
 }
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_SLEEP
 /*
 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
 * (used for preventing user land processes from being created after the user
@@ -288,39 +288,37 @@ static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
 */
 #define RUNNING_HELPERS_TIMEOUT (5 * HZ)
-static int usermodehelper_pm_callback(struct notifier_block *nfb,
+/**
-                                        unsigned long action,
+ * usermodehelper_disable - prevent new helpers from being started
-                                        void *ignored)
+ */
+int usermodehelper_disable(void)
 {
        long retval;
-        switch (action) {
+        usermodehelper_disabled = 1;
-        case PM_HIBERNATION_PREPARE:
+        smp_mb();
-        case PM_SUSPEND_PREPARE:
+        /*
-                usermodehelper_disabled = 1;
+         * From now on call_usermodehelper_exec() won't start any new
-                smp_mb();
+         * helpers, so it is sufficient if running_helpers turns out to
-                /*
+         * be zero at one point (it may be increased later, but that
-                 * From now on call_usermodehelper_exec() won't start any new
+         * doesn't matter).
-                 * helpers, so it is sufficient if running_helpers turns out to
+         */
-                 * be zero at one point (it may be increased later, but that
+        retval = wait_event_timeout(running_helpers_waitq,
-                 * doesn't matter).
-                 */
-                retval = wait_event_timeout(running_helpers_waitq,
                                        atomic_read(&running_helpers) == 0,
                                        RUNNING_HELPERS_TIMEOUT);
-                if (retval) {
+        if (retval)
-                        return NOTIFY_OK;
+                return 0;
-                } else {
-                        usermodehelper_disabled = 0;
-                        return NOTIFY_BAD;
-                }
-        case PM_POST_HIBERNATION:
-        case PM_POST_SUSPEND:
-                usermodehelper_disabled = 0;
-                return NOTIFY_OK;
-        }
-        return NOTIFY_DONE;
+        usermodehelper_disabled = 0;
+        return -EAGAIN;
+}
+/**
+ * usermodehelper_enable - allow new helpers to be started again
+ */
+void usermodehelper_enable(void)
+{
+        usermodehelper_disabled = 0;
 }
 static void helper_lock(void)
@@ -334,18 +332,12 @@ static void helper_unlock(void)
        if (atomic_dec_and_test(&running_helpers))
                wake_up(&running_helpers_waitq);
 }
+#else /* CONFIG_PM_SLEEP */
-static void register_pm_notifier_callback(void)
-{
-        pm_notifier(usermodehelper_pm_callback, 0);
-}
-#else /* CONFIG_PM */
 #define usermodehelper_disabled 0
 static inline void helper_lock(void) {}
 static inline void helper_unlock(void) {}
-static inline void register_pm_notifier_callback(void) {}
+#endif /* CONFIG_PM_SLEEP */
-#endif /* CONFIG_PM */
 /**
 * call_usermodehelper_setup - prepare to call a usermode helper
@@ -515,5 +507,4 @@ void __init usermodehelper_init(void)
 {
        khelper_wq = create_singlethread_workqueue("khelper");
        BUG_ON(!khelper_wq);
-        register_pm_notifier_callback();
 }
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 75bc2cd9ebc6..8b57a2597f21 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -404,7 +404,7 @@ void kretprobe_hash_lock(struct task_struct *tsk,
        spin_lock_irqsave(hlist_lock, *flags);
 }
-void kretprobe_table_lock(unsigned long hash, unsigned long *flags)
+static void kretprobe_table_lock(unsigned long hash, unsigned long *flags)
 {
        spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
        spin_lock_irqsave(hlist_lock, *flags);
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index e53bc30e9ba5..08dd8ed86c77 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -14,6 +14,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/kexec.h>
+#include <linux/profile.h>
 #include <linux/sched.h>
 #define KERNEL_ATTR_RO(_name) \
@@ -53,6 +54,37 @@ static ssize_t uevent_helper_store(struct kobject *kobj,
 KERNEL_ATTR_RW(uevent_helper);
 #endif
+#ifdef CONFIG_PROFILING
+/*
+ * Show handler for the "profiling" kernel attribute: formats the current
+ * value of prof_on into buf as a decimal integer followed by a newline
+ * and returns the sprintf() result.
+ */
+static ssize_t profiling_show(struct kobject *kobj,
+                                  struct kobj_attribute *attr, char *buf)
+{
+        return sprintf(buf, "%d\n", prof_on);
+}
+/*
+ * Store handler for the "profiling" kernel attribute.  Refuses with
+ * -EEXIST when prof_on is already non-zero; otherwise hands buf to
+ * profile_setup(), then runs profile_init() and create_proc_profile().
+ * Returns the first non-zero error from those two calls, or count when
+ * both succeed.
+ */
+static ssize_t profiling_store(struct kobject *kobj,
+                                   struct kobj_attribute *attr,
+                                   const char *buf, size_t count)
+{
+        int err;
+        if (prof_on)
+                return -EEXIST;
+        /*
+         * profile_setup() eventually reaches get_option(), which has
+         * many callers and takes a non-const pointer, so dropping the
+         * const qualifier here is the simplest option.
+         */
+        profile_setup((char *)buf);
+        err = profile_init();
+        if (!err)
+                err = create_proc_profile();
+        if (err)
+                return err;
+        return count;
+}
+KERNEL_ATTR_RW(profiling);
+#endif
 #ifdef CONFIG_KEXEC
 static ssize_t kexec_loaded_show(struct kobject *kobj,
                                 struct kobj_attribute *attr, char *buf)
@@ -109,6 +141,9 @@ static struct attribute * kernel_attrs[] = {
        &uevent_seqnum_attr.attr,
        &uevent_helper_attr.attr,
 #endif
+#ifdef CONFIG_PROFILING
+        &profiling_attr.attr,
+#endif
 #ifdef CONFIG_KEXEC
        &kexec_loaded_attr.attr,
        &kexec_crash_loaded_attr.attr,
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 3bfb1877a003..dbda475b13bd 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -875,11 +875,11 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
        if (!entry)
                return 0;
-        entry->class = this;
-        entry->distance = distance;
        if (!save_trace(&entry->trace))
                return 0;
+        entry->class = this;
+        entry->distance = distance;
        /*
         * Since we never remove from the dependency list, the list can
         * be walked lockless by other CPUs, it's only allocation
@@ -3029,7 +3029,7 @@ found_it:
        stats = get_lock_stats(hlock_class(hlock));
        if (point < ARRAY_SIZE(stats->contention_point))
-                stats->contention_point[i]++;
+                stats->contention_point[point]++;
        if (lock->cpu != smp_processor_id())
                stats->bounces[bounce_contended + !!hlock->read]++;
        put_lock_stats(stats);
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 4b194d34d77f..20dbcbf9c7dd 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -472,8 +472,9 @@ static void snprint_time(char *buf, size_t bufsiz, s64 nr)
 {
        unsigned long rem;
+        nr += 5; /* for display rounding */
        rem = do_div(nr, 1000); /* XXX: do_div_signed */
-        snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, ((int)rem+5)/10);
+        snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, (int)rem/10);
 }
 static void seq_time(struct seq_file *m, s64 time)
diff --git a/kernel/module.c b/kernel/module.c
index 08864d257eb0..25bc9ac9e226 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -100,7 +100,7 @@ static inline int strong_try_module_get(struct module *mod)
 static inline void add_taint_module(struct module *mod, unsigned flag)
 {
        add_taint(flag);
-        mod->taints |= flag;
+        mod->taints |= (1U << flag);
 }
 /*
@@ -784,6 +784,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
        mutex_lock(&module_mutex);
        /* Store the name of the last unloaded module for diagnostic purposes */
        strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
+        unregister_dynamic_debug_module(mod->name);
        free_module(mod);
 out:
@@ -923,7 +924,7 @@ static const char vermagic[] = VERMAGIC_STRING;
 static int try_to_force_load(struct module *mod, const char *symname)
 {
 #ifdef CONFIG_MODULE_FORCE_LOAD
-        if (!(tainted & TAINT_FORCED_MODULE))
+        if (!test_taint(TAINT_FORCED_MODULE))
                printk("%s: no version for \"%s\" found: kernel tainted.\n",
                       mod->name, symname);
        add_taint_module(mod, TAINT_FORCED_MODULE);
@@ -1033,7 +1034,7 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
        const unsigned long *crc;
        ret = find_symbol(name, &owner, &crc,
-                          !(mod->taints & TAINT_PROPRIETARY_MODULE), true);
+                          !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
        if (!IS_ERR_VALUE(ret)) {
                /* use_module can fail due to OOM,
                   or module initialization or unloading */
@@ -1173,7 +1174,7 @@ static void free_notes_attrs(struct module_notes_attrs *notes_attrs,
                while (i-- > 0)
                        sysfs_remove_bin_file(notes_attrs->dir,
                                              &notes_attrs->attrs[i]);
-                kobject_del(notes_attrs->dir);
+                kobject_put(notes_attrs->dir);
        }
        kfree(notes_attrs);
 }
@@ -1634,7 +1635,7 @@ static void set_license(struct module *mod, const char *license)
                license = "unspecified";
        if (!license_is_gpl_compatible(license)) {
-                if (!(tainted & TAINT_PROPRIETARY_MODULE))
+                if (!test_taint(TAINT_PROPRIETARY_MODULE))
                        printk(KERN_WARNING "%s: module license '%s' taints "
                                "kernel.\n", mod->name, license);
                add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
@@ -1783,6 +1784,33 @@ static inline void add_kallsyms(struct module *mod,
 }
 #endif /* CONFIG_KALLSYMS */
+#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG
+/*
+ * Treat the section at sechdrs[verboseindex] as an array of struct
+ * mod_debug and pass each entry's fields to
+ * register_dynamic_debug_module().  Only whole entries are visited: the
+ * count is the section size divided by the structure size, so any
+ * trailing partial entry is ignored.
+ */
+static void dynamic_printk_setup(Elf_Shdr *sechdrs, unsigned int verboseindex)
+{
+        struct mod_debug *entry, *limit;
+        unsigned int nr_entries;
+        entry = (struct mod_debug *)sechdrs[verboseindex].sh_addr;
+        nr_entries = sechdrs[verboseindex].sh_size / sizeof(struct mod_debug);
+        limit = entry + nr_entries;
+        while (entry < limit) {
+                register_dynamic_debug_module(entry->modname,
+                        entry->type, entry->logical_modname,
+                        entry->flag_names, entry->hash,
+                        entry->hash2);
+                entry++;
+        }
+}
+#else
+static inline void dynamic_printk_setup(Elf_Shdr *sechdrs,
+                                        unsigned int verboseindex)
+{
+}
+#endif /* CONFIG_DYNAMIC_PRINTK_DEBUG */
 static void *module_alloc_update_bounds(unsigned long size)
 {
        void *ret = module_alloc(size);
@@ -1799,13 +1827,14 @@ static void *module_alloc_update_bounds(unsigned long size)
 /* Allocate and load the module: note that size of section 0 is always
   zero, and we rely on this for optional sections. */
-static struct module *load_module(void __user *umod,
+static noinline struct module *load_module(void __user *umod,
                                  unsigned long len,
                                  const char __user *uargs)
 {
        Elf_Ehdr *hdr;
        Elf_Shdr *sechdrs;
        char *secstrings, *args, *modmagic, *strtab = NULL;
+        char *staging;
        unsigned int i;
        unsigned int symindex = 0;
        unsigned int strindex = 0;
@@ -1831,6 +1860,7 @@ static struct module *load_module(void __user *umod,
 #endif
        unsigned int markersindex;
        unsigned int markersstringsindex;
+        unsigned int verboseindex;
        struct module *mod;
        long err = 0;
        void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
@@ -1960,6 +1990,14 @@ static struct module *load_module(void __user *umod,
                goto free_hdr;
        }
+        staging = get_modinfo(sechdrs, infoindex, "staging");
+        if (staging) {
+                add_taint_module(mod, TAINT_CRAP);
+                printk(KERN_WARNING "%s: module is from the staging directory,"
+                       " the quality is unknown, you have been warned.\n",
+                       mod->name);
+        }
        /* Now copy in args */
        args = strndup_user(uargs, ~0UL >> 1);
        if (IS_ERR(args)) {
@@ -2117,6 +2155,7 @@ static struct module *load_module(void __user *umod,
        markersindex = find_sec(hdr, sechdrs, secstrings, "__markers");
        markersstringsindex = find_sec(hdr, sechdrs, secstrings,
                                        "__markers_strings");
+        verboseindex = find_sec(hdr, sechdrs, secstrings, "__verbose");
        /* Now do relocations. */
        for (i = 1; i < hdr->e_shnum; i++) {
@@ -2167,6 +2206,7 @@ static struct module *load_module(void __user *umod,
                marker_update_probe_range(mod->markers,
                        mod->markers + mod->num_markers);
 #endif
+        dynamic_printk_setup(sechdrs, verboseindex);
        err = module_finalize(hdr, sechdrs, mod);
        if (err < 0)
                goto cleanup;
@@ -2552,10 +2592,12 @@ static char *module_flags(struct module *mod, char *buf)
            mod->state == MODULE_STATE_GOING ||
            mod->state == MODULE_STATE_COMING) {
                buf[bx++] = '(';
-                if (mod->taints & TAINT_PROPRIETARY_MODULE)
+                if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE))
                        buf[bx++] = 'P';
-                if (mod->taints & TAINT_FORCED_MODULE)
+                if (mod->taints & (1 << TAINT_FORCED_MODULE))
                        buf[bx++] = 'F';
+                if (mod->taints & (1 << TAINT_CRAP))
+                        buf[bx++] = 'C';
                /*
                 * TAINT_FORCED_RMMOD: could be added.
                 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 21575fc46d05..1d3ef29a2583 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -14,7 +14,6 @@
 */
 #include <linux/module.h>
-#include <linux/version.h>
 #include <linux/nsproxy.h>
 #include <linux/init_task.h>
 #include <linux/mnt_namespace.h>
diff --git a/kernel/panic.c b/kernel/panic.c
index 12c5a0a6c89b..bda561ef3cdf 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -23,7 +23,7 @@
 #include <linux/kallsyms.h>
 int panic_on_oops;
-int tainted;
+static unsigned long tainted_mask;
 static int pause_on_oops;
 static int pause_on_oops_flag;
 static DEFINE_SPINLOCK(pause_on_oops_lock);
@@ -143,6 +143,27 @@ NORET_TYPE void panic(const char * fmt, ...)
 EXPORT_SYMBOL(panic);
+/* One taint flag and the characters print_tainted() emits for it. */
+struct tnt {
+        u8 bit;         /* bit number tested in tainted_mask */
+        char true;      /* character emitted when the bit is set */
+        char false;     /* character emitted when the bit is clear */
+};
+/*
+ * Table walked by print_tainted(): entries are listed in the order their
+ * characters appear in the "Tainted: " string, one character per entry.
+ */
+static const struct tnt tnts[] = {
+        { TAINT_PROPRIETARY_MODULE, 'P', 'G' },
+        { TAINT_FORCED_MODULE, 'F', ' ' },
+        { TAINT_UNSAFE_SMP, 'S', ' ' },
+        { TAINT_FORCED_RMMOD, 'R', ' ' },
+        { TAINT_MACHINE_CHECK, 'M', ' ' },
+        { TAINT_BAD_PAGE, 'B', ' ' },
+        { TAINT_USER, 'U', ' ' },
+        { TAINT_DIE, 'D', ' ' },
+        { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' },
+        { TAINT_WARN, 'W', ' ' },
+        { TAINT_CRAP, 'C', ' ' },
+};
 /**
 *      print_tainted - return a string to represent the kernel taint state.
 *
@@ -155,35 +176,45 @@ EXPORT_SYMBOL(panic);
 *  'U' - Userspace-defined naughtiness.
 *  'A' - ACPI table overridden.
 *  'W' - Taint on warning.
+ *  'C' - modules from drivers/staging are loaded.
 *
 *      The string is overwritten by the next call to print_taint().
 */
 const char *print_tainted(void)
 {
-        static char buf[20];
+        static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ") + 1];
-        if (tainted) {
-                snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c%c",
+        if (tainted_mask) {
-                        tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G',
+                char *s;
-                        tainted & TAINT_FORCED_MODULE ? 'F' : ' ',
+                int i;
-                        tainted & TAINT_UNSAFE_SMP ? 'S' : ' ',
-                        tainted & TAINT_FORCED_RMMOD ? 'R' : ' ',
+                s = buf + sprintf(buf, "Tainted: ");
-                        tainted & TAINT_MACHINE_CHECK ? 'M' : ' ',
+                for (i = 0; i < ARRAY_SIZE(tnts); i++) {
-                        tainted & TAINT_BAD_PAGE ? 'B' : ' ',
+                        const struct tnt *t = &tnts[i];
-                        tainted & TAINT_USER ? 'U' : ' ',
+                        *s++ = test_bit(t->bit, &tainted_mask) ?
-                        tainted & TAINT_DIE ? 'D' : ' ',
+                                        t->true : t->false;
-                        tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' ',
+                }
-                        tainted & TAINT_WARN ? 'W' : ' ');
+                *s = 0;
-        }
+        } else
-        else
                snprintf(buf, sizeof(buf), "Not tainted");
        return(buf);
 }
+/* Return the test_bit() result for bit number @flag in tainted_mask. */
+int test_taint(unsigned flag)
+{
+        return test_bit(flag, &tainted_mask);
+}
+EXPORT_SYMBOL(test_taint);
+/* Return the current value of the whole tainted_mask bitmask. */
+unsigned long get_taint(void)
+{
+        return tainted_mask;
+}
 void add_taint(unsigned flag)
 {
        debug_locks = 0; /* can't trust the integrity of the kernel anymore */
-        tainted |= flag;
+        set_bit(flag, &tainted_mask);
 }
 EXPORT_SYMBOL(add_taint);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index ea567b78d1aa..fab8ea86fac3 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -179,9 +179,6 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
                rc = sys_wait4(-1, NULL, __WALL, NULL);
        } while (rc != -ECHILD);
-        /* Child reaper for the pid namespace is going away */
-        pid_ns->child_reaper = NULL;
        acct_exit_ns(pid_ns);
        return;
 }
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index da9c2dda6a4e..dfdec524d1b7 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -43,7 +43,7 @@
 #include <linux/uaccess.h>
 /*
- * locking rule: all changes to target_value or requirements or notifiers lists
+ * locking rule: all changes to requirements or notifiers lists
 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
 * held, taken with _irqsave.  One lock to rule them all
 */
@@ -66,7 +66,7 @@ struct pm_qos_object {
        struct miscdevice pm_qos_power_miscdev;
        char *name;
        s32 default_value;
-        s32 target_value;
+        atomic_t target_value;
        s32 (*comparitor)(s32, s32);
 };
@@ -77,7 +77,7 @@ static struct pm_qos_object cpu_dma_pm_qos = {
        .notifiers = &cpu_dma_lat_notifier,
        .name = "cpu_dma_latency",
        .default_value = 2000 * USEC_PER_SEC,
-        .target_value = 2000 * USEC_PER_SEC,
+        .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC),
        .comparitor = min_compare
 };
@@ -87,7 +87,7 @@ static struct pm_qos_object network_lat_pm_qos = {
        .notifiers = &network_lat_notifier,
        .name = "network_latency",
        .default_value = 2000 * USEC_PER_SEC,
-        .target_value = 2000 * USEC_PER_SEC,
+        .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC),
        .comparitor = min_compare
 };
@@ -99,7 +99,7 @@ static struct pm_qos_object network_throughput_pm_qos = {
        .notifiers = &network_throughput_notifier,
        .name = "network_throughput",
        .default_value = 0,
-        .target_value = 0,
+        .target_value = ATOMIC_INIT(0),
        .comparitor = max_compare
 };
@@ -150,11 +150,11 @@ static void update_target(int target)
                extreme_value = pm_qos_array[target]->comparitor(
                                extreme_value, node->value);
        }
-        if (pm_qos_array[target]->target_value != extreme_value) {
+        if (atomic_read(&pm_qos_array[target]->target_value) != extreme_value) {
                call_notifier = 1;
-                pm_qos_array[target]->target_value = extreme_value;
+                atomic_set(&pm_qos_array[target]->target_value, extreme_value);
                pr_debug(KERN_ERR "new target for qos %d is %d\n", target,
-                        pm_qos_array[target]->target_value);
+                        atomic_read(&pm_qos_array[target]->target_value));
        }
        spin_unlock_irqrestore(&pm_qos_lock, flags);
@@ -193,14 +193,7 @@ static int find_pm_qos_object_by_minor(int minor)
 */
 int pm_qos_requirement(int pm_qos_class)
 {
-        int ret_val;
+        return atomic_read(&pm_qos_array[pm_qos_class]->target_value);
-        unsigned long flags;
-        spin_lock_irqsave(&pm_qos_lock, flags);
-        ret_val = pm_qos_array[pm_qos_class]->target_value;
-        spin_unlock_irqrestore(&pm_qos_lock, flags);
-        return ret_val;
 }
 EXPORT_SYMBOL_GPL(pm_qos_requirement);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index c42a03aef36f..153dcb2639c3 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -7,6 +7,93 @@
 #include <linux/errno.h>
 #include <linux/math64.h>
 #include <asm/uaccess.h>
+#include <linux/kernel_stat.h>
+/*
+ * Allocate the thread_group_cputime structure appropriately and fill in the
+ * current values of the fields.  Called from copy_signal() via
+ * thread_group_cputime_clone_thread() when adding a second or subsequent
+ * thread to a thread group.  Assumes interrupts are enabled when called
+ * (the siglock is taken and released with the _irq variants).
+ *
+ * Returns 0 on success, including the case where the totals were already
+ * installed by the time the lock was taken, or -ENOMEM when the per-CPU
+ * allocation fails.
+ */
+int thread_group_cputime_alloc(struct task_struct *tsk)
+{
+        struct signal_struct *sig = tsk->signal;
+        struct task_cputime *cputime;
+        /*
+         * If we have multiple threads and we don't already have a
+         * per-CPU task_cputime struct (checked in the caller), allocate
+         * one and fill it in with the times accumulated so far.  We may
+         * race with another thread so recheck after we pick up the sighand
+         * lock.
+         */
+        cputime = alloc_percpu(struct task_cputime);
+        if (cputime == NULL)
+                return -ENOMEM;
+        spin_lock_irq(&tsk->sighand->siglock);
+        if (sig->cputime.totals) {
+                /* Totals already installed: drop the lock, discard ours. */
+                spin_unlock_irq(&tsk->sighand->siglock);
+                free_percpu(cputime);
+                return 0;
+        }
+        sig->cputime.totals = cputime;
+        /* Seed the current CPU's slot with tsk's own accumulated times. */
+        cputime = per_cpu_ptr(sig->cputime.totals, smp_processor_id());
+        cputime->utime = tsk->utime;
+        cputime->stime = tsk->stime;
+        cputime->sum_exec_runtime = tsk->se.sum_exec_runtime;
+        spin_unlock_irq(&tsk->sighand->siglock);
+        return 0;
+}
+/**
+ * thread_group_cputime - Sum the thread group time fields across all CPUs.
+ *
+ * @tsk:        The task we use to identify the thread group.
+ * @times:      task_cputime structure in which we return the summed fields.
+ *
+ * Walk the list of possible CPUs to sum the per-CPU time fields in the
+ * thread group time structure.  If @tsk has no signal struct, or no
+ * per-CPU totals are installed for the group, @times is filled from
+ * @tsk's own utime, stime and se.sum_exec_runtime instead.
+ */
+void thread_group_cputime(
+        struct task_struct *tsk,
+        struct task_cputime *times)
+{
+        struct signal_struct *sig;
+        int i;
+        struct task_cputime *tot;
+        sig = tsk->signal;
+        if (unlikely(!sig) || !sig->cputime.totals) {
+                times->utime = tsk->utime;
+                times->stime = tsk->stime;
+                times->sum_exec_runtime = tsk->se.sum_exec_runtime;
+                return;
+        }
+        times->stime = times->utime = cputime_zero;
+        times->sum_exec_runtime = 0;
+        for_each_possible_cpu(i) {
+                /* Use the sig pointer checked above, not a fresh load. */
+                tot = per_cpu_ptr(sig->cputime.totals, i);
+                times->utime = cputime_add(times->utime, tot->utime);
+                times->stime = cputime_add(times->stime, tot->stime);
+                times->sum_exec_runtime += tot->sum_exec_runtime;
+        }
+}
+/*
+ * Called after updating RLIMIT_CPU to set timer expiration if necessary.
+ *
+ * @rlim_new is converted with secs_to_cputime().  The process-wide
+ * CPUCLOCK_PROF timer is set, under the siglock, only when no expiry is
+ * pending (it_prof_expires is zero) or the pending expiry lies beyond the
+ * new limit.  A pending expiry that is already earlier than the new limit
+ * fires first anyway and is left alone.
+ */
+void update_rlimit_cpu(unsigned long rlim_new)
+{
+        cputime_t cputime;
+        cputime = secs_to_cputime(rlim_new);
+        if (cputime_eq(current->signal->it_prof_expires, cputime_zero) ||
+            cputime_gt(current->signal->it_prof_expires, cputime)) {
+                spin_lock_irq(&current->sighand->siglock);
+                set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
+                spin_unlock_irq(&current->sighand->siglock);
+        }
+}
 static int check_clock(const clockid_t which_clock)
 {
@@ -158,10 +245,6 @@ static inline cputime_t virt_ticks(struct task_struct *p)
 {
        return p->utime;
 }
-static inline unsigned long long sched_ns(struct task_struct *p)
-{
-        return task_sched_runtime(p);
-}
 int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
 {
@@ -211,7 +294,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
                cpu->cpu = virt_ticks(p);
                break;
        case CPUCLOCK_SCHED:
-                cpu->sched = sched_ns(p);
+                cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p);
                break;
        }
        return 0;
@@ -220,59 +303,30 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
 /*
 * Sample a process (thread group) clock for the given group_leader task.
 * Must be called with tasklist_lock held for reading.
- * Must be called with tasklist_lock held for reading, and p->sighand->siglock.
 */
-static int cpu_clock_sample_group_locked(unsigned int clock_idx,
+static int cpu_clock_sample_group(const clockid_t which_clock,
-                                         struct task_struct *p,
+                                  struct task_struct *p,
-                                         union cpu_time_count *cpu)
+                                  union cpu_time_count *cpu)
 {
-        struct task_struct *t = p;
+        struct task_cputime cputime;
-        switch (clock_idx) {
+        thread_group_cputime(p, &cputime);
+        switch (which_clock) {
        default:
                return -EINVAL;
        case CPUCLOCK_PROF:
-                cpu->cpu = cputime_add(p->signal->utime, p->signal->stime);
+                cpu->cpu = cputime_add(cputime.utime, cputime.stime);
-                do {
-                        cpu->cpu = cputime_add(cpu->cpu, prof_ticks(t));
-                        t = next_thread(t);
-                } while (t != p);
                break;
        case CPUCLOCK_VIRT:
-                cpu->cpu = p->signal->utime;
+                cpu->cpu = cputime.utime;
-                do {
-                        cpu->cpu = cputime_add(cpu->cpu, virt_ticks(t));
-                        t = next_thread(t);
-                } while (t != p);
                break;
        case CPUCLOCK_SCHED:
-                cpu->sched = p->signal->sum_sched_runtime;
+                cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
-                /* Add in each other live thread.  */
-                while ((t = next_thread(t)) != p) {
-                        cpu->sched += t->se.sum_exec_runtime;
-                }
-                cpu->sched += sched_ns(p);
                break;
        }
        return 0;
 }
-/*
- * Sample a process (thread group) clock for the given group_leader task.
- * Must be called with tasklist_lock held for reading.
- */
-static int cpu_clock_sample_group(const clockid_t which_clock,
-                                  struct task_struct *p,
-                                  union cpu_time_count *cpu)
-{
-        int ret;
-        unsigned long flags;
-        spin_lock_irqsave(&p->sighand->siglock, flags);
-        ret = cpu_clock_sample_group_locked(CPUCLOCK_WHICH(which_clock), p,
-                                            cpu);
-        spin_unlock_irqrestore(&p->sighand->siglock, flags);
-        return ret;
-}
 int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
 {
@@ -471,80 +525,11 @@ void posix_cpu_timers_exit(struct task_struct *tsk)
 }
 void posix_cpu_timers_exit_group(struct task_struct *tsk)
 {
-        cleanup_timers(tsk->signal->cpu_timers,
+        struct task_cputime cputime;
-                       cputime_add(tsk->utime, tsk->signal->utime),
-                       cputime_add(tsk->stime, tsk->signal->stime),
-                     tsk->se.sum_exec_runtime + tsk->signal->sum_sched_runtime);
-}
+        thread_group_cputime(tsk, &cputime);
-/*
+        cleanup_timers(tsk->signal->cpu_timers,
- * Set the expiry times of all the threads in the process so one of them
+                       cputime.utime, cputime.stime, cputime.sum_exec_runtime);
- * will go off before the process cumulative expiry total is reached.
- */
-static void process_timer_rebalance(struct task_struct *p,
-                                    unsigned int clock_idx,
-                                    union cpu_time_count expires,
-                                    union cpu_time_count val)
-{
-        cputime_t ticks, left;
-        unsigned long long ns, nsleft;
-        struct task_struct *t = p;
-        unsigned int nthreads = atomic_read(&p->signal->live);
-        if (!nthreads)
-                return;
-        switch (clock_idx) {
-        default:
-                BUG();
-                break;
-        case CPUCLOCK_PROF:
-                left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu),
-                                       nthreads);
-                do {
-                        if (likely(!(t->flags & PF_EXITING))) {
-                                ticks = cputime_add(prof_ticks(t), left);
-                                if (cputime_eq(t->it_prof_expires,
-                                               cputime_zero) ||
-                                    cputime_gt(t->it_prof_expires, ticks)) {
-                                        t->it_prof_expires = ticks;
-                                }
-                        }
-                        t = next_thread(t);
-                } while (t != p);
-                break;
-        case CPUCLOCK_VIRT:
-                left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu),
-                                       nthreads);
-                do {
-                        if (likely(!(t->flags & PF_EXITING))) {
-                                ticks = cputime_add(virt_ticks(t), left);
-                                if (cputime_eq(t->it_virt_expires,
-                                               cputime_zero) ||
-                                    cputime_gt(t->it_virt_expires, ticks)) {
-                                        t->it_virt_expires = ticks;
-                                }
-                        }
-                        t = next_thread(t);
-                } while (t != p);
-                break;
-        case CPUCLOCK_SCHED:
-                nsleft = expires.sched - val.sched;
-                do_div(nsleft, nthreads);
-                nsleft = max_t(unsigned long long, nsleft, 1);
-                do {
-                        if (likely(!(t->flags & PF_EXITING))) {
-                                ns = t->se.sum_exec_runtime + nsleft;
-                                if (t->it_sched_expires == 0 ||
-                                    t->it_sched_expires > ns) {
-                                        t->it_sched_expires = ns;
-                                }
-                        }
-                        t = next_thread(t);
-                } while (t != p);
-                break;
-        }
 }
 static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
@@ -608,29 +593,32 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
                        default:
                                BUG();
                        case CPUCLOCK_PROF:
-                                if (cputime_eq(p->it_prof_expires,
+                                if (cputime_eq(p->cputime_expires.prof_exp,
                                               cputime_zero) ||
-                                    cputime_gt(p->it_prof_expires,
+                                    cputime_gt(p->cputime_expires.prof_exp,
                                               nt->expires.cpu))
-                                        p->it_prof_expires = nt->expires.cpu;
+                                        p->cputime_expires.prof_exp =
+                                                nt->expires.cpu;
                                break;
                        case CPUCLOCK_VIRT:
-                                if (cputime_eq(p->it_virt_expires,
+                                if (cputime_eq(p->cputime_expires.virt_exp,
                                               cputime_zero) ||
-                                    cputime_gt(p->it_virt_expires,
+                                    cputime_gt(p->cputime_expires.virt_exp,
                                               nt->expires.cpu))
-                                        p->it_virt_expires = nt->expires.cpu;
+                                        p->cputime_expires.virt_exp =
+                                                nt->expires.cpu;
                                break;
                        case CPUCLOCK_SCHED:
-                                if (p->it_sched_expires == 0 ||
+                                if (p->cputime_expires.sched_exp == 0 ||
-                                    p->it_sched_expires > nt->expires.sched)
+                                    p->cputime_expires.sched_exp >
-                                        p->it_sched_expires = nt->expires.sched;
+                                                        nt->expires.sched)
+                                        p->cputime_expires.sched_exp =
+                                                nt->expires.sched;
                                break;
                        }
                } else {
                        /*
-                         * For a process timer, we must balance
+                         * For a process timer, set the cached expiration time.
-                         * all the live threads' expirations.
                         */
                        switch (CPUCLOCK_WHICH(timer->it_clock)) {
                        default:
@@ -641,7 +629,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
                                    cputime_lt(p->signal->it_virt_expires,
                                               timer->it.cpu.expires.cpu))
                                        break;
-                                goto rebalance;
+                                p->signal->cputime_expires.virt_exp =
+                                        timer->it.cpu.expires.cpu;
+                                break;
                        case CPUCLOCK_PROF:
                                if (!cputime_eq(p->signal->it_prof_expires,
                                                cputime_zero) &&
@@ -652,13 +642,12 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
                                if (i != RLIM_INFINITY &&
                                    i <= cputime_to_secs(timer->it.cpu.expires.cpu))
                                        break;
-                                goto rebalance;
+                                p->signal->cputime_expires.prof_exp =
+                                        timer->it.cpu.expires.cpu;
+                                break;
                        case CPUCLOCK_SCHED:
-                        rebalance:
+                                p->signal->cputime_expires.sched_exp =
-                                process_timer_rebalance(
+                                        timer->it.cpu.expires.sched;
-                                        timer->it.cpu.task,
-                                        CPUCLOCK_WHICH(timer->it_clock),
-                                        timer->it.cpu.expires, now);
                                break;
                        }
                }
@@ -969,13 +958,13 @@ static void check_thread_timers(struct task_struct *tsk,
        struct signal_struct *const sig = tsk->signal;
        maxfire = 20;
-        tsk->it_prof_expires = cputime_zero;
+        tsk->cputime_expires.prof_exp = cputime_zero;
        while (!list_empty(timers)) {
                struct cpu_timer_list *t = list_first_entry(timers,
                                                      struct cpu_timer_list,
                                                      entry);
                if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
-                        tsk->it_prof_expires = t->expires.cpu;
+                        tsk->cputime_expires.prof_exp = t->expires.cpu;
                        break;
                }
                t->firing = 1;
@@ -984,13 +973,13 @@ static void check_thread_timers(struct task_struct *tsk,
        ++timers;
        maxfire = 20;
-        tsk->it_virt_expires = cputime_zero;
+        tsk->cputime_expires.virt_exp = cputime_zero;
        while (!list_empty(timers)) {
                struct cpu_timer_list *t = list_first_entry(timers,
                                                      struct cpu_timer_list,
                                                      entry);
                if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
-                        tsk->it_virt_expires = t->expires.cpu;
+                        tsk->cputime_expires.virt_exp = t->expires.cpu;
                        break;
                }
                t->firing = 1;
@@ -999,13 +988,13 @@ static void check_thread_timers(struct task_struct *tsk,
        ++timers;
        maxfire = 20;
-        tsk->it_sched_expires = 0;
+        tsk->cputime_expires.sched_exp = 0;
        while (!list_empty(timers)) {
                struct cpu_timer_list *t = list_first_entry(timers,
                                                      struct cpu_timer_list,
                                                      entry);
                if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
-                        tsk->it_sched_expires = t->expires.sched;
+                        tsk->cputime_expires.sched_exp = t->expires.sched;
                        break;
                }
                t->firing = 1;
@@ -1055,10 +1044,10 @@ static void check_process_timers(struct task_struct *tsk,
 {
        int maxfire;
        struct signal_struct *const sig = tsk->signal;
-        cputime_t utime, stime, ptime, virt_expires, prof_expires;
+        cputime_t utime, ptime, virt_expires, prof_expires;
        unsigned long long sum_sched_runtime, sched_expires;
-        struct task_struct *t;
        struct list_head *timers = sig->cpu_timers;
+        struct task_cputime cputime;
        /*
         * Don't sample the current process CPU clocks if there are no timers.
@@ -1074,18 +1063,10 @@ static void check_process_timers(struct task_struct *tsk,
        /*
         * Collect the current process totals.
         */
-        utime = sig->utime;
+        thread_group_cputime(tsk, &cputime);
-        stime = sig->stime;
+        utime = cputime.utime;
-        sum_sched_runtime = sig->sum_sched_runtime;
+        ptime = cputime_add(utime, cputime.stime);
-        t = tsk;
+        sum_sched_runtime = cputime.sum_exec_runtime;
-        do {
-                utime = cputime_add(utime, t->utime);
-                stime = cputime_add(stime, t->stime);
-                sum_sched_runtime += t->se.sum_exec_runtime;
-                t = next_thread(t);
-        } while (t != tsk);
-        ptime = cputime_add(utime, stime);
        maxfire = 20;
        prof_expires = cputime_zero;
        while (!list_empty(timers)) {
@@ -1193,60 +1174,18 @@ static void check_process_timers(struct task_struct *tsk,
                }
        }
-        if (!cputime_eq(prof_expires, cputime_zero) ||
+        if (!cputime_eq(prof_expires, cputime_zero) &&
-            !cputime_eq(virt_expires, cputime_zero) ||
+            (cputime_eq(sig->cputime_expires.prof_exp, cputime_zero) ||
-            sched_expires != 0) {
+             cputime_gt(sig->cputime_expires.prof_exp, prof_expires)))
-                /*
+                sig->cputime_expires.prof_exp = prof_expires;
-                 * Rebalance the threads' expiry times for the remaining
+        if (!cputime_eq(virt_expires, cputime_zero) &&
-                 * process CPU timers.
+            (cputime_eq(sig->cputime_expires.virt_exp, cputime_zero) ||
-                 */
+             cputime_gt(sig->cputime_expires.virt_exp, virt_expires)))
+                sig->cputime_expires.virt_exp = virt_expires;
-                cputime_t prof_left, virt_left, ticks;
+        if (sched_expires != 0 &&
-                unsigned long long sched_left, sched;
+            (sig->cputime_expires.sched_exp == 0 ||
-                const unsigned int nthreads = atomic_read(&sig->live);
+             sig->cputime_expires.sched_exp > sched_expires))
+                sig->cputime_expires.sched_exp = sched_expires;
-                if (!nthreads)
-                        return;
-                prof_left = cputime_sub(prof_expires, utime);
-                prof_left = cputime_sub(prof_left, stime);
-                prof_left = cputime_div_non_zero(prof_left, nthreads);
-                virt_left = cputime_sub(virt_expires, utime);
-                virt_left = cputime_div_non_zero(virt_left, nthreads);
-                if (sched_expires) {
-                        sched_left = sched_expires - sum_sched_runtime;
-                        do_div(sched_left, nthreads);
-                        sched_left = max_t(unsigned long long, sched_left, 1);
-                } else {
-                        sched_left = 0;
-                }
-                t = tsk;
-                do {
-                        if (unlikely(t->flags & PF_EXITING))
-                                continue;
-                        ticks = cputime_add(cputime_add(t->utime, t->stime),
-                                            prof_left);
-                        if (!cputime_eq(prof_expires, cputime_zero) &&
-                            (cputime_eq(t->it_prof_expires, cputime_zero) ||
-                             cputime_gt(t->it_prof_expires, ticks))) {
-                                t->it_prof_expires = ticks;
-                        }
-                        ticks = cputime_add(t->utime, virt_left);
-                        if (!cputime_eq(virt_expires, cputime_zero) &&
-                            (cputime_eq(t->it_virt_expires, cputime_zero) ||
-                             cputime_gt(t->it_virt_expires, ticks))) {
-                                t->it_virt_expires = ticks;
-                        }
-                        sched = t->se.sum_exec_runtime + sched_left;
-                        if (sched_expires && (t->it_sched_expires == 0 ||
-                                              t->it_sched_expires > sched)) {
-                                t->it_sched_expires = sched;
-                        }
-                } while ((t = next_thread(t)) != tsk);
-        }
 }
 /*
@@ -1314,6 +1253,86 @@ out:
        ++timer->it_requeue_pending;
 }
+/**
+ * task_cputime_zero - Test whether a task_cputime struct holds no values.
+ *
+ * @cputime:    The struct to inspect.
+ *
+ * Looks at the utime, stime and sum_exec_runtime fields of @cputime.
+ * Returns 1 when all three are zero and 0 as soon as one of them is not.
+ */
+static inline int task_cputime_zero(const struct task_cputime *cputime)
+{
+        if (!cputime_eq(cputime->utime, cputime_zero))
+                return 0;
+        if (!cputime_eq(cputime->stime, cputime_zero))
+                return 0;
+        return cputime->sum_exec_runtime == 0;
+}
+/**
+ * task_cputime_expired - Check a time sample against a set of expiry times.
+ *
+ * @sample:     The task_cputime structure to be checked for expiration.
+ * @expires:    Expiration times, against which @sample will be checked.
+ *
+ * A field of @expires that is zero counts as "not set" and is skipped.
+ * For the fields that are set:
+ *   - @expires->utime is compared with @sample->utime;
+ *   - @expires->stime is compared with the sum of @sample->utime and
+ *     @sample->stime, not with @sample->stime alone (presumably this is
+ *     the slot callers fill through the prof_exp name - confirm against
+ *     the struct task_cputime definition);
+ *   - @expires->sum_exec_runtime is compared with
+ *     @sample->sum_exec_runtime.
+ * Returns 1 if any sampled value is greater than or equal to its expiry
+ * time, otherwise returns 0.
+ */
+static inline int task_cputime_expired(const struct task_cputime *sample,
+                                        const struct task_cputime *expires)
+{
+        /* Checks run in the order utime, utime+stime, sum_exec_runtime. */
+        return (!cputime_eq(expires->utime, cputime_zero) &&
+                cputime_ge(sample->utime, expires->utime)) ||
+               (!cputime_eq(expires->stime, cputime_zero) &&
+                cputime_ge(cputime_add(sample->utime, sample->stime),
+                           expires->stime)) ||
+               (expires->sum_exec_runtime != 0 &&
+                sample->sum_exec_runtime >= expires->sum_exec_runtime);
+}
+/**
+ * fastpath_timer_check - POSIX CPU timers fast path.
+ *
+ * @tsk:        The task (thread) being checked.
+ *
+ * Returns 0 straight away if @tsk has no signal struct.  Otherwise, if any
+ * per-thread expiry time is set in @tsk->cputime_expires, the thread's own
+ * utime, stime and se.sum_exec_runtime are sampled and compared with it.
+ * If that does not report an expiry and any thread group expiry time is set
+ * in @tsk->signal->cputime_expires, the totals obtained from
+ * thread_group_cputime() are compared with those.  Returns 1 as soon as one
+ * comparison reports an expired timer, else returns 0.
+ */
+static inline int fastpath_timer_check(struct task_struct *tsk)
+{
+        struct signal_struct *sig = tsk->signal;
+        struct task_cputime now;
+        if (unlikely(!sig))
+                return 0;
+        /* Per-thread timers: sample only when an expiry is actually set. */
+        if (!task_cputime_zero(&tsk->cputime_expires)) {
+                now.utime = tsk->utime;
+                now.stime = tsk->stime;
+                now.sum_exec_runtime = tsk->se.sum_exec_runtime;
+                if (task_cputime_expired(&now, &tsk->cputime_expires))
+                        return 1;
+        }
+        /* Thread group timers: nothing set means nothing can have expired. */
+        if (task_cputime_zero(&sig->cputime_expires))
+                return 0;
+        thread_group_cputime(tsk, &now);
+        return task_cputime_expired(&now, &sig->cputime_expires);
+}
 /*
 * This is called from the timer interrupt handler.  The irq handler has
 * already updated our counts.  We need to check if any timers fire now.
@@ -1326,42 +1345,31 @@ void run_posix_cpu_timers(struct task_struct *tsk)
        BUG_ON(!irqs_disabled());
-#define UNEXPIRED(clock) \
+        /*
-                (cputime_eq(tsk->it_##clock##_expires, cputime_zero) || \
+         * The fast path checks that there are no expired thread or thread
-                 cputime_lt(clock##_ticks(tsk), tsk->it_##clock##_expires))
+         * group timers.  If that's so, just return.
+         */
-        if (UNEXPIRED(prof) && UNEXPIRED(virt) &&
+        if (!fastpath_timer_check(tsk))
-            (tsk->it_sched_expires == 0 ||
-             tsk->se.sum_exec_runtime < tsk->it_sched_expires))
                return;
-#undef  UNEXPIRED
+        spin_lock(&tsk->sighand->siglock);
        /*
-         * Double-check with locks held.
+         * Here we take off tsk->signal->cpu_timers[N] and
+         * tsk->cpu_timers[N] all the timers that are firing, and
+         * put them on the firing list.
         */
-        read_lock(&tasklist_lock);
+        check_thread_timers(tsk, &firing);
-        if (likely(tsk->signal != NULL)) {
+        check_process_timers(tsk, &firing);
-                spin_lock(&tsk->sighand->siglock);
-                /*
+        /*
-                 * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N]
+         * We must release these locks before taking any timer's lock.
-                 * all the timers that are firing, and put them on the firing list.
+         * There is a potential race with timer deletion here, as the
-                 */
+         * siglock now protects our private firing list.  We have set
-                check_thread_timers(tsk, &firing);
+         * the firing flag in each timer, so that a deletion attempt
-                check_process_timers(tsk, &firing);
+         * that gets the timer lock before we do will give it up and
+         * spin until we've taken care of that timer below.
-                /*
+         */
-                 * We must release these locks before taking any timer's lock.
+        spin_unlock(&tsk->sighand->siglock);
-                 * There is a potential race with timer deletion here, as the
-                 * siglock now protects our private firing list.  We have set
-                 * the firing flag in each timer, so that a deletion attempt
-                 * that gets the timer lock before we do will give it up and
-                 * spin until we've taken care of that timer below.
-                 */
-                spin_unlock(&tsk->sighand->siglock);
-        }
-        read_unlock(&tasklist_lock);
        /*
         * Now that all the timers on our list have the firing flag,
@@ -1389,10 +1397,9 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 /*
 * Set one of the process-wide special case CPU timers.
- * The tasklist_lock and tsk->sighand->siglock must be held by the caller.
+ * The tsk->sighand->siglock must be held by the caller.
- * The oldval argument is null for the RLIMIT_CPU timer, where *newval is
+ * The *newval argument is relative and we update it to be absolute, *oldval
- * absolute; non-null for ITIMER_*, where *newval is relative and we update
+ * is absolute and we update it to be relative.
- * it to be absolute, *oldval is absolute and we update it to be relative.
 */
 void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
                           cputime_t *newval, cputime_t *oldval)
@@ -1401,7 +1408,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
        struct list_head *head;
        BUG_ON(clock_idx == CPUCLOCK_SCHED);
-        cpu_clock_sample_group_locked(clock_idx, tsk, &now);
+        cpu_clock_sample_group(clock_idx, tsk, &now);
        if (oldval) {
                if (!cputime_eq(*oldval, cputime_zero)) {
@@ -1435,13 +1442,14 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
            cputime_ge(list_first_entry(head,
                                  struct cpu_timer_list, entry)->expires.cpu,
                       *newval)) {
-                /*
+                switch (clock_idx) {
-                 * Rejigger each thread's expiry time so that one will
+                case CPUCLOCK_PROF:
-                 * notice before we hit the process-cumulative expiry time.
+                        tsk->signal->cputime_expires.prof_exp = *newval;
-                 */
+                        break;
-                union cpu_time_count expires = { .sched = 0 };
+                case CPUCLOCK_VIRT:
-                expires.cpu = *newval;
+                        tsk->signal->cputime_expires.virt_exp = *newval;
-                process_timer_rebalance(tsk, clock_idx, expires, now);
+                        break;
+                }
        }
 }
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index e36d5798cbff..b931d7cedbfa 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -223,6 +223,15 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
 }
 /*
+ * Get raw monotonic time for posix timers: hands tp to getrawmonotonic()
+ * and always returns 0.  The which_clock argument is not consulted.
+ */
+static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
+{
+        getrawmonotonic(tp);
+        return 0;
+}
+/*
 * Initialize everything, well, just everything in Posix clocks/timers ;)
 */
 static __init int init_posix_timers(void)
@@ -235,9 +244,15 @@ static __init int init_posix_timers(void)
                .clock_get = posix_ktime_get_ts,
                .clock_set = do_posix_clock_nosettime,
        };
+        struct k_clock clock_monotonic_raw = {
+                .clock_getres = hrtimer_get_res,
+                .clock_get = posix_get_monotonic_raw,
+                .clock_set = do_posix_clock_nosettime,
+        };
        register_posix_clock(CLOCK_REALTIME, &clock_realtime);
        register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
+        register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
        posix_timers_cache = kmem_cache_create("posix_timers_cache",
                                        sizeof (struct k_itimer), 0, SLAB_PANIC,
@@ -298,6 +313,7 @@ void do_schedule_next_timer(struct siginfo *info)
 int posix_timer_event(struct k_itimer *timr, int si_private)
 {
+        int shared, ret;
        /*
         * FIXME: if ->sigq is queued we can race with
         * dequeue_signal()->do_schedule_next_timer().
@@ -311,25 +327,10 @@ int posix_timer_event(struct k_itimer *timr, int si_private)
         */
        timr->sigq->info.si_sys_private = si_private;
-        timr->sigq->info.si_signo = timr->it_sigev_signo;
+        shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID);
-        timr->sigq->info.si_code = SI_TIMER;
+        ret = send_sigqueue(timr->sigq, timr->it_process, shared);
-        timr->sigq->info.si_tid = timr->it_id;
+        /* If we failed to send the signal the timer stops. */
-        timr->sigq->info.si_value = timr->it_sigev_value;
+        return ret > 0;
-        if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
-                struct task_struct *leader;
-                int ret = send_sigqueue(timr->sigq, timr->it_process, 0);
-                if (likely(ret >= 0))
-                        return ret;
-                timr->it_sigev_notify = SIGEV_SIGNAL;
-                leader = timr->it_process->group_leader;
-                put_task_struct(timr->it_process);
-                timr->it_process = leader;
-        }
-        return send_sigqueue(timr->sigq, timr->it_process, 1);
 }
 EXPORT_SYMBOL_GPL(posix_timer_event);
@@ -441,7 +442,7 @@ static struct k_itimer * alloc_posix_timer(void)
                return tmr;
        if (unlikely(!(tmr->sigq = sigqueue_alloc()))) {
                kmem_cache_free(posix_timers_cache, tmr);
-                tmr = NULL;
+                return NULL;
        }
        memset(&tmr->sigq->info, 0, sizeof(siginfo_t));
        return tmr;
@@ -468,11 +469,9 @@ sys_timer_create(const clockid_t which_clock,
                 struct sigevent __user *timer_event_spec,
                 timer_t __user * created_timer_id)
 {
-        int error = 0;
+        struct k_itimer *new_timer;
-        struct k_itimer *new_timer = NULL;
+        int error, new_timer_id;
-        int new_timer_id;
+        struct task_struct *process;
-        struct task_struct *process = NULL;
-        unsigned long flags;
        sigevent_t event;
        int it_id_set = IT_ID_NOT_SET;
@@ -490,12 +489,11 @@ sys_timer_create(const clockid_t which_clock,
                goto out;
        }
        spin_lock_irq(&idr_lock);
-        error = idr_get_new(&posix_timers_id, (void *) new_timer,
+        error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id);
-                            &new_timer_id);
        spin_unlock_irq(&idr_lock);
-        if (error == -EAGAIN)
+        if (error) {
-                goto retry;
+                if (error == -EAGAIN)
-        else if (error) {
+                        goto retry;
                /*
                 * Weird looking, but we return EAGAIN if the IDR is
                 * full (proper POSIX return value for this)
@@ -526,67 +524,43 @@ sys_timer_create(const clockid_t which_clock,
                        error = -EFAULT;
                        goto out;
                }
-                new_timer->it_sigev_notify = event.sigev_notify;
+                rcu_read_lock();
-                new_timer->it_sigev_signo = event.sigev_signo;
+                process = good_sigevent(&event);
-                new_timer->it_sigev_value = event.sigev_value;
+                if (process)
+                        get_task_struct(process);
-                read_lock(&tasklist_lock);
+                rcu_read_unlock();
-                if ((process = good_sigevent(&event))) {
-                        /*
-                         * We may be setting up this process for another
-                         * thread.  It may be exiting.  To catch this
-                         * case the we check the PF_EXITING flag.  If
-                         * the flag is not set, the siglock will catch
-                         * him before it is too late (in exit_itimers).
-                         *
-                         * The exec case is a bit more invloved but easy
-                         * to code.  If the process is in our thread
-                         * group (and it must be or we would not allow
-                         * it here) and is doing an exec, it will cause
-                         * us to be killed.  In this case it will wait
-                         * for us to die which means we can finish this
-                         * linkage with our last gasp. I.e. no code :)
-                         */
-                        spin_lock_irqsave(&process->sighand->siglock, flags);
-                        if (!(process->flags & PF_EXITING)) {
-                                new_timer->it_process = process;
-                                list_add(&new_timer->list,
-                                         &process->signal->posix_timers);
-                                if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
-                                        get_task_struct(process);
-                                spin_unlock_irqrestore(&process->sighand->siglock, flags);
-                        } else {
-                                spin_unlock_irqrestore(&process->sighand->siglock, flags);
-                                process = NULL;
-                        }
-                }
-                read_unlock(&tasklist_lock);
                if (!process) {
                        error = -EINVAL;
                        goto out;
                }
        } else {
-                new_timer->it_sigev_notify = SIGEV_SIGNAL;
+                event.sigev_notify = SIGEV_SIGNAL;
-                new_timer->it_sigev_signo = SIGALRM;
+                event.sigev_signo = SIGALRM;
-                new_timer->it_sigev_value.sival_int = new_timer->it_id;
+                event.sigev_value.sival_int = new_timer->it_id;
                process = current->group_leader;
-                spin_lock_irqsave(&process->sighand->siglock, flags);
+                get_task_struct(process);
-                new_timer->it_process = process;
-                list_add(&new_timer->list, &process->signal->posix_timers);
-                spin_unlock_irqrestore(&process->sighand->siglock, flags);
        }
+        new_timer->it_sigev_notify     = event.sigev_notify;
+        new_timer->sigq->info.si_signo = event.sigev_signo;
+        new_timer->sigq->info.si_value = event.sigev_value;
+        new_timer->sigq->info.si_tid   = new_timer->it_id;
+        new_timer->sigq->info.si_code  = SI_TIMER;
+        spin_lock_irq(&current->sighand->siglock);
+        new_timer->it_process = process;
+        list_add(&new_timer->list, &current->signal->posix_timers);
+        spin_unlock_irq(&current->sighand->siglock);
+        return 0;
        /*
         * In the case of the timer belonging to another task, after
         * the task is unlocked, the timer is owned by the other task
         * and may cease to exist at any time.  Don't use or modify
         * new_timer after the unlock call.
         */
 out:
-        if (error)
+        release_posix_timer(new_timer, it_id_set);
-                release_posix_timer(new_timer, it_id_set);
        return error;
 }
@@ -597,7 +571,7 @@ out:
 * the find to the timer lock.  To avoid a dead lock, the timer id MUST
 * be release with out holding the timer lock.
 */
-static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags)
+static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags)
 {
        struct k_itimer *timr;
        /*
@@ -605,23 +579,20 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags)
         * flags part over to the timer lock.  Must not let interrupts in
         * while we are moving the lock.
         */
        spin_lock_irqsave(&idr_lock, *flags);
-        timr = (struct k_itimer *) idr_find(&posix_timers_id, (int) timer_id);
+        timr = idr_find(&posix_timers_id, (int)timer_id);
        if (timr) {
                spin_lock(&timr->it_lock);
+                if (timr->it_process &&
-                if ((timr->it_id != timer_id) || !(timr->it_process) ||
+                    same_thread_group(timr->it_process, current)) {
-                                !same_thread_group(timr->it_process, current)) {
-                        spin_unlock(&timr->it_lock);
-                        spin_unlock_irqrestore(&idr_lock, *flags);
-                        timr = NULL;
-                } else
                        spin_unlock(&idr_lock);
-        } else
+                        return timr;
-                spin_unlock_irqrestore(&idr_lock, *flags);
+                }
+                spin_unlock(&timr->it_lock);
+        }
+        spin_unlock_irqrestore(&idr_lock, *flags);
-        return timr;
+        return NULL;
 }
 /*
@@ -862,8 +833,7 @@ retry_delete:
         * This keeps any tasks waiting on the spin lock from thinking
         * they got something (see the lock code above).
         */
-        if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+        put_task_struct(timer->it_process);
-                put_task_struct(timer->it_process);
        timer->it_process = NULL;
        unlock_timer(timer, flags);
@@ -890,8 +860,7 @@ retry_delete:
         * This keeps any tasks waiting on the spin lock from thinking
         * they got something (see the lock code above).
         */
-        if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+        put_task_struct(timer->it_process);
-                put_task_struct(timer->it_process);
        timer->it_process = NULL;
        unlock_timer(timer, flags);
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index f011e0870b52..331f9836383f 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -14,6 +14,7 @@
 #include <linux/reboot.h>
 #include <linux/string.h>
 #include <linux/device.h>
+#include <linux/kmod.h>
 #include <linux/delay.h>
 #include <linux/fs.h>
 #include <linux/mount.h>
@@ -21,6 +22,7 @@
 #include <linux/console.h>
 #include <linux/cpu.h>
 #include <linux/freezer.h>
+#include <linux/ftrace.h>
 #include "power.h"
@@ -255,7 +257,7 @@ static int create_image(int platform_mode)
 int hibernation_snapshot(int platform_mode)
 {
-        int error;
+        int error, ftrace_save;
        /* Free memory before shutting down devices. */
        error = swsusp_shrink_memory();
@@ -267,6 +269,7 @@ int hibernation_snapshot(int platform_mode)
                goto Close;
        suspend_console();
+        ftrace_save = __ftrace_enabled_save();
        error = device_suspend(PMSG_FREEZE);
        if (error)
                goto Recover_platform;
@@ -296,6 +299,7 @@ int hibernation_snapshot(int platform_mode)
 Resume_devices:
        device_resume(in_suspend ?
                (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
+        __ftrace_enabled_restore(ftrace_save);
        resume_console();
 Close:
        platform_end(platform_mode);
@@ -366,10 +370,11 @@ static int resume_target_kernel(void)
 int hibernation_restore(int platform_mode)
 {
-        int error;
+        int error, ftrace_save;
        pm_prepare_console();
        suspend_console();
+        ftrace_save = __ftrace_enabled_save();
        error = device_suspend(PMSG_QUIESCE);
        if (error)
                goto Finish;
@@ -384,6 +389,7 @@ int hibernation_restore(int platform_mode)
        platform_restore_cleanup(platform_mode);
        device_resume(PMSG_RECOVER);
 Finish:
+        __ftrace_enabled_restore(ftrace_save);
        resume_console();
        pm_restore_console();
        return error;
@@ -396,7 +402,7 @@ int hibernation_restore(int platform_mode)
 int hibernation_platform_enter(void)
 {
-        int error;
+        int error, ftrace_save;
        if (!hibernation_ops)
                return -ENOSYS;
@@ -411,6 +417,7 @@ int hibernation_platform_enter(void)
                goto Close;
        suspend_console();
+        ftrace_save = __ftrace_enabled_save();
        error = device_suspend(PMSG_HIBERNATE);
        if (error) {
                if (hibernation_ops->recover)
@@ -445,6 +452,7 @@ int hibernation_platform_enter(void)
        hibernation_ops->finish();
 Resume_devices:
        device_resume(PMSG_RESTORE);
+        __ftrace_enabled_restore(ftrace_save);
        resume_console();
 Close:
        hibernation_ops->end();
@@ -513,6 +521,10 @@ int hibernate(void)
        if (error)
                goto Exit;
+        error = usermodehelper_disable();
+        if (error)
+                goto Exit;
        /* Allocate memory management structures */
        error = create_basic_memory_bitmaps();
        if (error)
@@ -551,6 +563,7 @@ int hibernate(void)
        thaw_processes();
 Finish:
        free_basic_memory_bitmaps();
+        usermodehelper_enable();
 Exit:
        pm_notifier_call_chain(PM_POST_HIBERNATION);
        pm_restore_console();
@@ -627,6 +640,10 @@ static int software_resume(void)
        if (error)
                goto Finish;
+        error = usermodehelper_disable();
+        if (error)
+                goto Finish;
        error = create_basic_memory_bitmaps();
        if (error)
                goto Finish;
@@ -649,6 +666,7 @@ static int software_resume(void)
        thaw_processes();
 Done:
        free_basic_memory_bitmaps();
+        usermodehelper_enable();
 Finish:
        pm_notifier_call_chain(PM_POST_RESTORE);
        pm_restore_console();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 0b7476f5d2a6..19122cf6d827 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -14,6 +14,7 @@
 #include <linux/string.h>
 #include <linux/delay.h>
 #include <linux/errno.h>
+#include <linux/kmod.h>
 #include <linux/init.h>
 #include <linux/console.h>
 #include <linux/cpu.h>
@@ -21,6 +22,7 @@
 #include <linux/freezer.h>
 #include <linux/vmstat.h>
 #include <linux/syscalls.h>
+#include <linux/ftrace.h>
 #include "power.h"
@@ -236,6 +238,10 @@ static int suspend_prepare(void)
        if (error)
                goto Finish;
+        error = usermodehelper_disable();
+        if (error)
+                goto Finish;
        if (suspend_freeze_processes()) {
                error = -EAGAIN;
                goto Thaw;
@@ -255,6 +261,7 @@ static int suspend_prepare(void)
 Thaw:
        suspend_thaw_processes();
+        usermodehelper_enable();
 Finish:
        pm_notifier_call_chain(PM_POST_SUSPEND);
        pm_restore_console();
@@ -310,7 +317,7 @@ static int suspend_enter(suspend_state_t state)
 */
 int suspend_devices_and_enter(suspend_state_t state)
 {
-        int error;
+        int error, ftrace_save;
        if (!suspend_ops)
                return -ENOSYS;
@@ -321,6 +328,7 @@ int suspend_devices_and_enter(suspend_state_t state)
                        goto Close;
        }
        suspend_console();
+        ftrace_save = __ftrace_enabled_save();
        suspend_test_start();
        error = device_suspend(PMSG_SUSPEND);
        if (error) {
@@ -352,6 +360,7 @@ int suspend_devices_and_enter(suspend_state_t state)
        suspend_test_start();
        device_resume(PMSG_RESUME);
        suspend_test_finish("resume devices");
+        __ftrace_enabled_restore(ftrace_save);
        resume_console();
 Close:
        if (suspend_ops->end)
@@ -373,6 +382,7 @@ int suspend_devices_and_enter(suspend_state_t state)
 static void suspend_finish(void)
 {
        suspend_thaw_processes();
+        /*
+         * Pairs with the usermodehelper_disable() call made in
+         * suspend_prepare() before the tasks were frozen.
+         */
+        usermodehelper_enable();
        pm_notifier_call_chain(PM_POST_SUSPEND);
        pm_restore_console();
 }
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index a0abf9a463f9..80ccac849e46 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -14,7 +14,6 @@
 #include <linux/module.h>
 #include <linux/file.h>
 #include <linux/utsname.h>
-#include <linux/version.h>
 #include <linux/delay.h>
 #include <linux/bitops.h>
 #include <linux/genhd.h>
diff --git a/kernel/power/user.c b/kernel/power/user.c
index a6332a313262..005b93d839ba 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -212,13 +212,20 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
        case SNAPSHOT_FREEZE:
                if (data->frozen)
                        break;
                printk("Syncing filesystems ... ");
                sys_sync();
                printk("done.\n");
-                error = freeze_processes();
+                error = usermodehelper_disable();
                if (error)
+                        break;
+                error = freeze_processes();
+                if (error) {
                        thaw_processes();
+                        usermodehelper_enable();
+                }
                if (!error)
                        data->frozen = 1;
                break;
@@ -227,6 +234,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
                if (!data->frozen || data->ready)
                        break;
                thaw_processes();
+                usermodehelper_enable();
                data->frozen = 0;
                break;
diff --git a/kernel/printk.c b/kernel/printk.c
index b51b1567bb55..6341af77eb65 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -13,7 +13,7 @@
 * Fixed SMP synchronization, 08/08/99, Manfred Spraul
 *     manfred@colorfullife.com
 * Rewrote bits to get rid of console_lock
- *      01Mar01 Andrew Morton <andrewm@uow.edu.au>
+ *      01Mar01 Andrew Morton
 */
 #include <linux/kernel.h>
@@ -577,9 +577,6 @@ static int have_callable_console(void)
 * @fmt: format string
 *
 * This is printk().  It can be called from any context.  We want it to work.
- * Be aware of the fact that if oops_in_progress is not set, we might try to
- * wake klogd up which could deadlock on runqueue lock if printk() is called
- * from scheduler code.
 *
 * We try to grab the console_sem.  If we succeed, it's easy - we log the output and
 * call the console drivers.  If we fail to get the semaphore we place the output
@@ -593,6 +590,8 @@ static int have_callable_console(void)
 *
 * See also:
 * printf(3)
+ *
+ * See the vsnprintf() documentation for format string extensions over C99.
 */
 asmlinkage int printk(const char *fmt, ...)
@@ -982,10 +981,25 @@ int is_console_locked(void)
        return console_locked;
 }
-void wake_up_klogd(void)
+/* Per-CPU flag: set by wake_up_klogd(), consumed by printk_tick(). */
+static DEFINE_PER_CPU(int, printk_pending);
+/*
+ * If a wakeup was requested on this CPU, clear the request and wake the
+ * tasks sleeping on log_wait.
+ */
+void printk_tick(void)
 {
-        if (!oops_in_progress && waitqueue_active(&log_wait))
+        if (__get_cpu_var(printk_pending)) {
+                __get_cpu_var(printk_pending) = 0;
                wake_up_interruptible(&log_wait);
+        }
+}
+/* Returns non-zero while @cpu still has a wakeup request pending. */
+int printk_needs_cpu(int cpu)
+{
+        return per_cpu(printk_pending, cpu);
+}
+/*
+ * Only records that a wakeup is wanted (and only if log_wait has waiters);
+ * the wake_up_interruptible() call itself is made from printk_tick().
+ */
+void wake_up_klogd(void)
+{
+        if (waitqueue_active(&log_wait))
+                __raw_get_cpu_var(printk_pending) = 1;
 }
 /**
@@ -1291,22 +1305,6 @@ static int __init disable_boot_consoles(void)
 }
 late_initcall(disable_boot_consoles);
-/**
- * tty_write_message - write a message to a certain tty, not just the console.
- * @tty: the destination tty_struct
- * @msg: the message to write
- *
- * This is used for messages that need to be redirected to a specific tty.
- * We don't put it into the syslog queue right now maybe in the future if
- * really needed.
- */
-void tty_write_message(struct tty_struct *tty, char *msg)
-{
-        if (tty && tty->ops->write)
-                tty->ops->write(tty, msg, strlen(msg));
-        return;
-}
 #if defined CONFIG_PRINTK
 /*
diff --git a/kernel/profile.c b/kernel/profile.c
index cd26bed4cc26..a9e422df6bf6 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -22,6 +22,8 @@
 #include <linux/cpu.h>
 #include <linux/highmem.h>
 #include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
 #include <asm/sections.h>
 #include <asm/irq_regs.h>
 #include <asm/ptrace.h>
@@ -50,11 +52,11 @@ static DEFINE_PER_CPU(int, cpu_profile_flip);
 static DEFINE_MUTEX(profile_flip_mutex);
 #endif /* CONFIG_SMP */
-static int __init profile_setup(char *str)
+int profile_setup(char *str)
 {
-        static char __initdata schedstr[] = "schedule";
+        static char schedstr[] = "schedule";
-        static char __initdata sleepstr[] = "sleep";
+        static char sleepstr[] = "sleep";
-        static char __initdata kvmstr[] = "kvm";
+        static char kvmstr[] = "kvm";
        int par;
        if (!strncmp(str, sleepstr, strlen(sleepstr))) {
@@ -100,14 +102,33 @@ static int __init profile_setup(char *str)
 __setup("profile=", profile_setup);
-void __init profile_init(void)
+/*
+ * profile_init - allocate the profiling hit buffer.
+ *
+ * Does nothing (and succeeds) when profiling is off.  Before the slab
+ * allocator is up the buffer comes from bootmem; afterwards kzalloc(),
+ * alloc_pages_exact() and vmalloc() are tried in that order.
+ *
+ * Returns 0 on success or -ENOMEM if every allocator failed.
+ */
+int profile_init(void)
 {
+        int buffer_bytes;
        if (!prof_on)
-                return;
+                return 0;
        /* only text is profiled */
        prof_len = (_etext - _stext) >> prof_shift;
-        prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t));
+        buffer_bytes = prof_len*sizeof(atomic_t);
+        if (!slab_is_available()) {
+                prof_buffer = alloc_bootmem(buffer_bytes);
+                return 0;
+        }
+        prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL);
+        if (prof_buffer)
+                return 0;
+        prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO);
+        if (prof_buffer)
+                return 0;
+        prof_buffer = vmalloc(buffer_bytes);
+        if (prof_buffer) {
+                /*
+                 * Unlike the two allocators above, vmalloc() does not
+                 * zero the memory; clear it so the hit counters start
+                 * at zero and stale data is never exposed to readers.
+                 */
+                memset(prof_buffer, 0, buffer_bytes);
+                return 0;
+        }
+        return -ENOMEM;
 }
 /* Profile event notifications */
@@ -527,7 +548,7 @@ static void __init profile_nop(void *unused)
 {
 }
-static int __init create_hash_tables(void)
+static int create_hash_tables(void)
 {
        int cpu;
@@ -575,14 +596,14 @@ out_cleanup:
 #define create_hash_tables()                    ({ 0; })
 #endif
-static int __init create_proc_profile(void)
+int create_proc_profile(void)
 {
        struct proc_dir_entry *entry;
        if (!prof_on)
                return 0;
        if (create_hash_tables())
-                return -1;
+                return -ENOMEM;
        entry = proc_create("profile", S_IWUSR | S_IRUGO,
                            NULL, &proc_profile_operations);
        if (!entry)
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index aad93cdc9f68..37f72e551542 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -47,6 +47,7 @@
 #include <linux/notifier.h>
 #include <linux/cpu.h>
 #include <linux/mutex.h>
+#include <linux/time.h>
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 static struct lock_class_key rcu_lock_key;
@@ -60,12 +61,14 @@ EXPORT_SYMBOL_GPL(rcu_lock_map);
+/*
+ * Control block passed to __call_rcu() by call_rcu().  The cur, completed
+ * and pending batch numbers all start from the same value.
+ */
 static struct rcu_ctrlblk rcu_ctrlblk = {
        .cur = -300,
        .completed = -300,
+        .pending = -300,
        .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
        .cpumask = CPU_MASK_NONE,
 };
+/* Same layout and initial values, used by call_rcu_bh(). */
 static struct rcu_ctrlblk rcu_bh_ctrlblk = {
        .cur = -300,
        .completed = -300,
+        .pending = -300,
        .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
        .cpumask = CPU_MASK_NONE,
 };
@@ -83,7 +86,10 @@ static void force_quiescent_state(struct rcu_data *rdp,
 {
        int cpu;
        cpumask_t cpumask;
+        unsigned long flags;
        set_need_resched();
+        spin_lock_irqsave(&rcp->lock, flags);
        if (unlikely(!rcp->signaled)) {
                rcp->signaled = 1;
                /*
@@ -109,6 +115,7 @@ static void force_quiescent_state(struct rcu_data *rdp,
                for_each_cpu_mask_nr(cpu, cpumask)
                        smp_send_reschedule(cpu);
        }
+        spin_unlock_irqrestore(&rcp->lock, flags);
 }
 #else
 static inline void force_quiescent_state(struct rcu_data *rdp,
@@ -118,6 +125,126 @@ static inline void force_quiescent_state(struct rcu_data *rdp,
 }
 #endif
+/*
+ * __call_rcu - common queueing path shared by call_rcu() and call_rcu_bh().
+ * @head: callback to queue; the caller has already set head->func
+ * @rcp:  control block of the RCU flavour the callback belongs to
+ * @rdp:  this CPU's per-CPU data for that flavour
+ *
+ * Both callers in this file invoke it between local_irq_save() and
+ * local_irq_restore().
+ */
+static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
+                struct rcu_data *rdp)
+{
+        long batch;
+        head->next = NULL;
+        smp_mb(); /* Read of rcp->cur must happen after any change by caller. */
+        /*
+         * Determine the batch number of this callback.
+         *
+         * Using ACCESS_ONCE to avoid the following error when gcc eliminates
+         * local variable "batch" and emits codes like this:
+         *      1) rdp->batch = rcp->cur + 1 # gets old value
+         *      ......
+         *      2)rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value
+         * then [*nxttail[0], *nxttail[1]) may contain callbacks
+         * that batch# = rdp->batch, see the comment of struct rcu_data.
+         */
+        batch = ACCESS_ONCE(rcp->cur) + 1;
+        if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) {
+                /* process callbacks */
+                rdp->nxttail[0] = rdp->nxttail[1];
+                rdp->nxttail[1] = rdp->nxttail[2];
+                if (rcu_batch_after(batch - 1, rdp->batch))
+                        rdp->nxttail[0] = rdp->nxttail[2];
+        }
+        rdp->batch = batch;
+        *rdp->nxttail[2] = head;
+        rdp->nxttail[2] = &head->next;
+        if (unlikely(++rdp->qlen > qhimark)) {
+                rdp->blimit = INT_MAX;
+                /*
+                 * Force the flavour this callback was queued on: use the
+                 * caller's control block, not always rcu_ctrlblk, so that
+                 * call_rcu_bh() keeps kicking rcu_bh_ctrlblk.
+                 */
+                force_quiescent_state(rdp, rcp);
+        }
+}
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+/*
+ * Record the current jiffies value as the grace-period start and set the
+ * time at which check_cpu_stall() starts treating the period as stalled.
+ */
+static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
+{
+        rcp->gp_start = jiffies;
+        rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
+}
+/*
+ * Report the CPUs that are still set in rcp->cpumask, i.e. that have not
+ * yet checked in for the current grace period.  Called from
+ * check_cpu_stall() once the stall deadline has been exceeded by at
+ * least two jiffies while a grace period is in progress.
+ */
+static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+        int cpu;
+        long delta;
+        unsigned long flags;
+        /* Only let one CPU complain about others per time interval. */
+        spin_lock_irqsave(&rcp->lock, flags);
+        delta = jiffies - rcp->jiffies_stall;
+        /*
+         * Recheck under the lock: give up if another CPU already pushed
+         * the deadline out, or if the grace period has ended meanwhile
+         * (cur == completed), in which case nothing is stalled any more.
+         */
+        if (delta < 2 || rcp->cur == rcp->completed) {
+                spin_unlock_irqrestore(&rcp->lock, flags);
+                return;
+        }
+        rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
+        spin_unlock_irqrestore(&rcp->lock, flags);
+        /* OK, time to rat on our buddy... */
+        printk(KERN_ERR "RCU detected CPU stalls:");
+        for_each_possible_cpu(cpu) {
+                if (cpu_isset(cpu, rcp->cpumask))
+                        printk(" %d", cpu);
+        }
+        printk(" (detected by %d, t=%ld jiffies)\n",
+               smp_processor_id(), (long)(jiffies - rcp->gp_start));
+}
+/*
+ * Report that the current CPU itself is stalling the grace period: log a
+ * message, dump the stack, push the next report time out and request a
+ * reschedule.
+ */
+static void print_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+        unsigned long flags;
+        printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
+                        smp_processor_id(), jiffies,
+                        jiffies - rcp->gp_start);
+        dump_stack();
+        spin_lock_irqsave(&rcp->lock, flags);
+        /* Only move the deadline if it has not already been moved forward. */
+        if ((long)(jiffies - rcp->jiffies_stall) >= 0)
+                rcp->jiffies_stall =
+                        jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
+        spin_unlock_irqrestore(&rcp->lock, flags);
+        set_need_resched();  /* kick ourselves to get things going. */
+}
+/*
+ * Compare jiffies against the stall deadline in rcp->jiffies_stall and
+ * report either this CPU or the other CPUs as stalled.
+ */
+static void check_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+        long delta;
+        delta = jiffies - rcp->jiffies_stall;
+        if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) {
+                /* We haven't checked in, so go dump stack. */
+                print_cpu_stall(rcp);
+        } else if (rcp->cur != rcp->completed && delta >= 2) {
+                /* They had two jiffies past the deadline to dump stack, so complain. */
+                print_other_cpu_stall(rcp);
+        }
+}
+#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+/* Stall detection compiled out: recording the grace-period start is a no-op. */
+static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
+{
+}
+/* Stall detection compiled out: nothing to check. */
+static inline void check_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+}
+#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
 /**
 * call_rcu - Queue an RCU callback for invocation after a grace period.
 * @head: structure to be used for queueing the RCU updates.
@@ -133,18 +260,10 @@ void call_rcu(struct rcu_head *head,
                                void (*func)(struct rcu_head *rcu))
 {
        unsigned long flags;
-        struct rcu_data *rdp;
        head->func = func;
-        head->next = NULL;
        local_irq_save(flags);
-        rdp = &__get_cpu_var(rcu_data);
+        __call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data));
-        *rdp->nxttail = head;
-        rdp->nxttail = &head->next;
-        if (unlikely(++rdp->qlen > qhimark)) {
-                rdp->blimit = INT_MAX;
-                force_quiescent_state(rdp, &rcu_ctrlblk);
-        }
        local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(call_rcu);
@@ -169,20 +288,10 @@ void call_rcu_bh(struct rcu_head *head,
                                void (*func)(struct rcu_head *rcu))
 {
        unsigned long flags;
-        struct rcu_data *rdp;
        head->func = func;
-        head->next = NULL;
        local_irq_save(flags);
-        rdp = &__get_cpu_var(rcu_bh_data);
+        __call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
-        *rdp->nxttail = head;
-        rdp->nxttail = &head->next;
-        if (unlikely(++rdp->qlen > qhimark)) {
-                rdp->blimit = INT_MAX;
-                force_quiescent_state(rdp, &rcu_bh_ctrlblk);
-        }
        local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(call_rcu_bh);
@@ -211,12 +320,6 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
 static inline void raise_rcu_softirq(void)
 {
        raise_softirq(RCU_SOFTIRQ);
-        /*
-         * The smp_mb() here is required to ensure that this cpu's
-         * __rcu_process_callbacks() reads the most recently updated
-         * value of rcu->cur.
-         */
-        smp_mb();
 }
 /*
@@ -225,6 +328,7 @@ static inline void raise_rcu_softirq(void)
 */
 static void rcu_do_batch(struct rcu_data *rdp)
 {
+        unsigned long flags;
        struct rcu_head *next, *list;
        int count = 0;
@@ -239,9 +343,9 @@ static void rcu_do_batch(struct rcu_data *rdp)
        }
        rdp->donelist = list;
-        local_irq_disable();
+        local_irq_save(flags);
        rdp->qlen -= count;
-        local_irq_enable();
+        local_irq_restore(flags);
        if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
                rdp->blimit = blimit;
@@ -269,6 +373,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
 *   rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
 *   period (if necessary).
 */
 /*
 * Register a new batch of callbacks, and start it up if there is currently no
 * active batch and the batch to be registered has not already occurred.
@@ -276,15 +381,10 @@ static void rcu_do_batch(struct rcu_data *rdp)
 */
 static void rcu_start_batch(struct rcu_ctrlblk *rcp)
 {
-        if (rcp->next_pending &&
+        if (rcp->cur != rcp->pending &&
                        rcp->completed == rcp->cur) {
-                rcp->next_pending = 0;
-                /*
-                 * next_pending == 0 must be visible in
-                 * __rcu_process_callbacks() before it can see new value of cur.
-                 */
-                smp_wmb();
                rcp->cur++;
+                record_gp_stall_check_time(rcp);
                /*
                 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
@@ -322,6 +422,8 @@ static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
 static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
                                        struct rcu_data *rdp)
 {
+        unsigned long flags;
        if (rdp->quiescbatch != rcp->cur) {
                /* start new grace period: */
                rdp->qs_pending = 1;
@@ -345,7 +447,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
                return;
        rdp->qs_pending = 0;
-        spin_lock(&rcp->lock);
+        spin_lock_irqsave(&rcp->lock, flags);
        /*
         * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
         * during cpu startup. Ignore the quiescent state.
@@ -353,7 +455,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
        if (likely(rdp->quiescbatch == rcp->cur))
                cpu_quiet(rdp->cpu, rcp);
-        spin_unlock(&rcp->lock);
+        spin_unlock_irqrestore(&rcp->lock, flags);
 }
@@ -364,33 +466,38 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
 * which is dead and hence not processing interrupts.
 */
 static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
-                                struct rcu_head **tail)
+                                struct rcu_head **tail, long batch)
 {
-        local_irq_disable();
+        unsigned long flags;
-        *this_rdp->nxttail = list;
-        if (list)
+        /* An empty list leaves this_rdp (including its batch) untouched. */
+        if (list) {
-                this_rdp->nxttail = tail;
+                local_irq_save(flags);
-        local_irq_enable();
+                this_rdp->batch = batch;
+                /* Splice the whole list in after the last queued callback. */
+                *this_rdp->nxttail[2] = list;
+                this_rdp->nxttail[2] = tail;
+                local_irq_restore(flags);
+        }
 }
 static void __rcu_offline_cpu(struct rcu_data *this_rdp,
                                struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
 {
-        /* if the cpu going offline owns the grace period
+        unsigned long flags;
+        /*
+         * if the cpu going offline owns the grace period
         * we can block indefinitely waiting for it, so flush
         * it here
         */
-        spin_lock_bh(&rcp->lock);
+        spin_lock_irqsave(&rcp->lock, flags);
        if (rcp->cur != rcp->completed)
                cpu_quiet(rdp->cpu, rcp);
-        spin_unlock_bh(&rcp->lock);
+        /* Adopt rdp's done and pending callbacks, tagged as batch cur + 1. */
+        rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1);
-        rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail);
+        rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1);
-        rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
+        /* Drop only the lock; interrupts stay off until local_irq_restore(). */
+        spin_unlock(&rcp->lock);
-        rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
-        local_irq_disable();
        this_rdp->qlen += rdp->qlen;
-        local_irq_enable();
+        local_irq_restore(flags);
 }
 static void rcu_offline_cpu(int cpu)
@@ -420,38 +527,52 @@ static void rcu_offline_cpu(int cpu)
 static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
                                        struct rcu_data *rdp)
 {
-        if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) {
+        unsigned long flags;
-                *rdp->donetail = rdp->curlist;
+        long completed_snap;
-                rdp->donetail = rdp->curtail;
-                rdp->curlist = NULL;
-                rdp->curtail = &rdp->curlist;
-        }
-        if (rdp->nxtlist && !rdp->curlist) {
+        if (rdp->nxtlist) {
-                local_irq_disable();
+                local_irq_save(flags);
-                rdp->curlist = rdp->nxtlist;
+                completed_snap = ACCESS_ONCE(rcp->completed);
-                rdp->curtail = rdp->nxttail;
-                rdp->nxtlist = NULL;
-                rdp->nxttail = &rdp->nxtlist;
-                local_irq_enable();
                /*
-                 * start the next batch of callbacks
+                 * move the other grace-period-completed entries to
+                 * [rdp->nxtlist, *rdp->nxttail[0]) temporarily
                 */
+                if (!rcu_batch_before(completed_snap, rdp->batch))
+                        rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2];
+                else if (!rcu_batch_before(completed_snap, rdp->batch - 1))
+                        rdp->nxttail[0] = rdp->nxttail[1];
-                /* determine batch number */
+                /*
-                rdp->batch = rcp->cur + 1;
+                 * the grace period for entries in
-                /* see the comment and corresponding wmb() in
+                 * [rdp->nxtlist, *rdp->nxttail[0]) has completed and
-                 * the rcu_start_batch()
+                 * move these entries to donelist
                 */
-                smp_rmb();
+                if (rdp->nxttail[0] != &rdp->nxtlist) {
+                        *rdp->donetail = rdp->nxtlist;
+                        rdp->donetail = rdp->nxttail[0];
+                        rdp->nxtlist = *rdp->nxttail[0];
+                        *rdp->donetail = NULL;
+                        if (rdp->nxttail[1] == rdp->nxttail[0])
+                                rdp->nxttail[1] = &rdp->nxtlist;
+                        if (rdp->nxttail[2] == rdp->nxttail[0])
+                                rdp->nxttail[2] = &rdp->nxtlist;
+                        rdp->nxttail[0] = &rdp->nxtlist;
+                }
+                local_irq_restore(flags);
+                if (rcu_batch_after(rdp->batch, rcp->pending)) {
+                        unsigned long flags2;
-                if (!rcp->next_pending) {
                        /* and start it/schedule start if it's a new batch */
-                        spin_lock(&rcp->lock);
+                        spin_lock_irqsave(&rcp->lock, flags2);
-                        rcp->next_pending = 1;
+                        if (rcu_batch_after(rdp->batch, rcp->pending)) {
-                        rcu_start_batch(rcp);
+                                rcp->pending = rdp->batch;
-                        spin_unlock(&rcp->lock);
+                                rcu_start_batch(rcp);
+                        }
+                        spin_unlock_irqrestore(&rcp->lock, flags2);
                }
        }
@@ -462,21 +583,53 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
 static void rcu_process_callbacks(struct softirq_action *unused)
 {
+        /*
+         * Memory references from any prior RCU read-side critical sections
+         * executed by the interrupted code must be seen before any RCU
+         * grace-period manipulations below.
+         */
+        smp_mb(); /* See above block comment. */
        __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
        __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
+        /*
+         * Memory references from any later RCU read-side critical sections
+         * executed by the interrupted code must be seen after any RCU
+         * grace-period manipulations above.
+         */
+        smp_mb(); /* See above block comment. */
 }
 static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
 {
-        /* This cpu has pending rcu entries and the grace period
+        /* Check for CPU stalls, if enabled. */
-         * for them has completed.
+        check_cpu_stall(rcp);
-         */
-        if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch))
-                return 1;
-        /* This cpu has no pending entries, but there are new entries */
+        if (rdp->nxtlist) {
-        if (!rdp->curlist && rdp->nxtlist)
+                long completed_snap = ACCESS_ONCE(rcp->completed);
-                return 1;
+                /*
+                 * This cpu has pending rcu entries and the grace period
+                 * for them has completed.
+                 */
+                if (!rcu_batch_before(completed_snap, rdp->batch))
+                        return 1;
+                if (!rcu_batch_before(completed_snap, rdp->batch - 1) &&
+                                rdp->nxttail[0] != rdp->nxttail[1])
+                        return 1;
+                if (rdp->nxttail[0] != &rdp->nxtlist)
+                        return 1;
+                /*
+                 * This cpu has pending rcu entries and the new batch
+                 * for them has neither been started nor scheduled to start
+                 */
+                if (rcu_batch_after(rdp->batch, rcp->pending))
+                        return 1;
+        }
        /* This cpu has finished callbacks to invoke */
        if (rdp->donelist)
@@ -512,9 +665,15 @@ int rcu_needs_cpu(int cpu)
        struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
        struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
-        return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu));
+        return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
 }
+/*
+ * Top-level function driving RCU grace-period detection, normally
+ * invoked from the scheduler-clock interrupt.  This function simply
+ * increments counters that are read only from softirq by this same
+ * CPU, so there are no memory barriers required.
+ */
 void rcu_check_callbacks(int cpu, int user)
 {
        if (user ||
@@ -558,14 +717,17 @@ void rcu_check_callbacks(int cpu, int user)
 static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
                                                struct rcu_data *rdp)
 {
+        unsigned long flags;
+        /* Hold rcp->lock (irqs off) while rdp is reset and rcp is sampled. */
+        spin_lock_irqsave(&rcp->lock, flags);
        memset(rdp, 0, sizeof(*rdp));
-        rdp->curtail = &rdp->curlist;
+        /* Empty queue: all three nxttail[] entries point at nxtlist itself. */
+        rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist;
-        rdp->nxttail = &rdp->nxtlist;
        rdp->donetail = &rdp->donelist;
        rdp->quiescbatch = rcp->completed;
        rdp->qs_pending = 0;
        rdp->cpu = cpu;
        rdp->blimit = blimit;
+        spin_unlock_irqrestore(&rcp->lock, flags);
 }
 static void __cpuinit rcu_online_cpu(int cpu)
@@ -610,6 +772,9 @@ static struct notifier_block __cpuinitdata rcu_nb = {
 */
 void __init __rcu_init(void)
 {
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+        printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
        rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
                        (void *)(long)smp_processor_id());
        /* Register notifier for non-boot CPUs */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index f14f372cf6f5..467d5940f624 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -77,6 +77,7 @@ void wakeme_after_rcu(struct rcu_head  *head)
 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
 * and may be nested.
 */
+void synchronize_rcu(void);     /* Makes kernel-doc tools happy */
 synchronize_rcu_xxx(synchronize_rcu, call_rcu)
 EXPORT_SYMBOL_GPL(synchronize_rcu);
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 27827931ca0d..ca4bbbe04aa4 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -59,14 +59,6 @@
 #include <linux/rcupreempt_trace.h>
 /*
- * Macro that prevents the compiler from reordering accesses, but does
- * absolutely -nothing- to prevent CPUs from reordering.  This is used
- * only to mediate communication between mainline code and hardware
- * interrupt and NMI handlers.
- */
-#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
-/*
 * PREEMPT_RCU data structures.
 */
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
index 5edf82c34bbc..35c2d3360ecf 100644
--- a/kernel/rcupreempt_trace.c
+++ b/kernel/rcupreempt_trace.c
@@ -308,11 +308,16 @@ out:
 static int __init rcupreempt_trace_init(void)
 {
+        int ret;
        mutex_init(&rcupreempt_trace_mutex);
        rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
        if (!rcupreempt_trace_buf)
                return 1;
-        return rcupreempt_debugfs_init();
+        ret = rcupreempt_debugfs_init();
+        if (ret)
+                kfree(rcupreempt_trace_buf);
+        return ret;
 }
 static void __exit rcupreempt_trace_cleanup(void)
diff --git a/kernel/resource.c b/kernel/resource.c
index f5b518eabefe..4089d12af6e0 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -38,10 +38,6 @@ EXPORT_SYMBOL(iomem_resource);
 static DEFINE_RWLOCK(resource_lock);
-#ifdef CONFIG_PROC_FS
-enum { MAX_IORES_LEVEL = 5 };
 static void *r_next(struct seq_file *m, void *v, loff_t *pos)
 {
        struct resource *p = v;
@@ -53,6 +49,10 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos)
        return p->sibling;
 }
+#ifdef CONFIG_PROC_FS
+enum { MAX_IORES_LEVEL = 5 };
 static void *r_start(struct seq_file *m, loff_t *pos)
        __acquires(resource_lock)
 {
@@ -362,35 +362,21 @@ int allocate_resource(struct resource *root, struct resource *new,
 EXPORT_SYMBOL(allocate_resource);
-/**
+/*
- * insert_resource - Inserts a resource in the resource tree
+ * Insert a resource into the resource tree. If successful, return NULL,
- * @parent: parent of the new resource
+ * otherwise return the conflicting resource (compare to __request_resource())
- * @new: new resource to insert
- *
- * Returns 0 on success, -EBUSY if the resource can't be inserted.
- *
- * This function is equivalent to request_resource when no conflict
- * happens. If a conflict happens, and the conflicting resources
- * entirely fit within the range of the new resource, then the new
- * resource is inserted and the conflicting resources become children of
- * the new resource.
 */
-int insert_resource(struct resource *parent, struct resource *new)
+static struct resource * __insert_resource(struct resource *parent, struct resource *new)
 {
-        int result;
        struct resource *first, *next;
-        write_lock(&resource_lock);
        for (;; parent = first) {
-                result = 0;
                first = __request_resource(parent, new);
                if (!first)
-                        goto out;
+                        return first;
-                result = -EBUSY;
                if (first == parent)
-                        goto out;
+                        return first;
                if ((first->start > new->start) || (first->end < new->end))
                        break;
@@ -401,15 +387,13 @@ int insert_resource(struct resource *parent, struct resource *new)
        for (next = first; ; next = next->sibling) {
                /* Partial overlap? Bad, and unfixable */
                if (next->start < new->start || next->end > new->end)
-                        goto out;
+                        return next;
                if (!next->sibling)
                        break;
                if (next->sibling->start > new->end)
                        break;
        }
-        result = 0;
        new->parent = parent;
        new->sibling = next->sibling;
        new->child = first;
@@ -426,10 +410,64 @@ int insert_resource(struct resource *parent, struct resource *new)
                        next = next->sibling;
                next->sibling = new;
        }
+        return NULL;
+}
- out:
+/**
+ * insert_resource - Inserts a resource in the resource tree
+ * @parent: parent of the new resource
+ * @new: new resource to insert
+ *
+ * Returns 0 on success, -EBUSY if the resource can't be inserted.
+ *
+ * This function is equivalent to request_resource when no conflict
+ * happens. If a conflict happens, and the conflicting resources
+ * entirely fit within the range of the new resource, then the new
+ * resource is inserted and the conflicting resources become children of
+ * the new resource.
+ */
+int insert_resource(struct resource *parent, struct resource *new)
+{
+        struct resource *conflict;
+        write_lock(&resource_lock);
+        conflict = __insert_resource(parent, new);
+        write_unlock(&resource_lock);
+        return conflict ? -EBUSY : 0;
+}
+/**
+ * insert_resource_expand_to_fit - Insert a resource into the resource tree
+ * @root: root resource descriptor
+ * @new: new resource to insert
+ *
+ * Insert a resource into the resource tree, possibly expanding it in order
+ * to make it encompass any conflicting resources.
+ */
+void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
+{
+        if (new->parent)
+                return;
+        write_lock(&resource_lock);
+        for (;;) {
+                struct resource *conflict;
+                conflict = __insert_resource(root, new);
+                if (!conflict)
+                        break;
+                if (conflict == root)
+                        break;
+                /* Ok, expand resource to cover the conflict, then try again .. */
+                if (conflict->start < new->start)
+                        new->start = conflict->start;
+                if (conflict->end > new->end)
+                        new->end = conflict->end;
+                printk("Expanded resource %s due to conflict with %s\n", new->name, conflict->name);
+        }
        write_unlock(&resource_lock);
-        return result;
 }
 /**
@@ -478,6 +516,70 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
        return result;
 }
+/*
+ * Reserve [start, end] below @root as an IORESOURCE_BUSY region called @name.
+ * If an existing resource is in the way, the range is split around it and
+ * each remaining piece is reserved recursively.
+ *
+ * reserve_region_with_split() calls this with resource_lock write-held, and
+ * the recursion below runs under that same lock, so the allocation must not
+ * sleep: use GFP_ATOMIC rather than GFP_KERNEL.
+ *
+ * Allocation failure is silent: the (sub)range is simply left unreserved.
+ */
+static void __init __reserve_region_with_split(struct resource *root,
+                resource_size_t start, resource_size_t end,
+                const char *name)
+{
+        struct resource *parent = root;
+        struct resource *conflict;
+        struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC);
+        if (!res)
+                return;
+        res->name = name;
+        res->start = start;
+        res->end = end;
+        res->flags = IORESOURCE_BUSY;
+        for (;;) {
+                conflict = __request_resource(parent, res);
+                if (!conflict)
+                        break;
+                if (conflict != parent) {
+                        /* Descend into a non-busy conflict and retry there */
+                        parent = conflict;
+                        if (!(conflict->flags & IORESOURCE_BUSY))
+                                continue;
+                }
+                /* Uhhuh, that didn't work out.. */
+                kfree(res);
+                res = NULL;
+                break;
+        }
+        if (!res) {
+                /* failed, split and try again */
+                /* conflict covered whole area */
+                if (conflict->start <= start && conflict->end >= end)
+                        return;
+                /* Part of the range that lies before the conflict */
+                if (conflict->start > start)
+                        __reserve_region_with_split(root, start, conflict->start-1, name);
+                /* Overlap with a non-busy conflict can still be reserved */
+                if (!(conflict->flags & IORESOURCE_BUSY)) {
+                        resource_size_t common_start, common_end;
+                        common_start = max(conflict->start, start);
+                        common_end = min(conflict->end, end);
+                        if (common_start < common_end)
+                                __reserve_region_with_split(root, common_start, common_end, name);
+                }
+                /* Part of the range that lies after the conflict */
+                if (conflict->end < end)
+                        __reserve_region_with_split(root, conflict->end+1, end, name);
+        }
+}
+void reserve_region_with_split(struct resource *root,
+                resource_size_t start, resource_size_t end,
+                const char *name)
+{
+        write_lock(&resource_lock);
+        __reserve_region_with_split(root, start, end, name);
+        write_unlock(&resource_lock);
+}
 EXPORT_SYMBOL(adjust_resource);
 /**
@@ -524,33 +626,34 @@ struct resource * __request_region(struct resource *parent,
 {
        struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
-        if (res) {
+        if (!res)
-                res->name = name;
+                return NULL;
-                res->start = start;
-                res->end = start + n - 1;
-                res->flags = IORESOURCE_BUSY;
-                write_lock(&resource_lock);
+        res->name = name;
+        res->start = start;
+        res->end = start + n - 1;
+        res->flags = IORESOURCE_BUSY;
-                for (;;) {
+        write_lock(&resource_lock);
-                        struct resource *conflict;
-                        conflict = __request_resource(parent, res);
+        for (;;) {
-                        if (!conflict)
+                struct resource *conflict;
-                                break;
-                        if (conflict != parent) {
-                                parent = conflict;
-                                if (!(conflict->flags & IORESOURCE_BUSY))
-                                        continue;
-                        }
-                        /* Uhhuh, that didn't work out.. */
+                conflict = __request_resource(parent, res);
-                        kfree(res);
+                if (!conflict)
-                        res = NULL;
                        break;
+                if (conflict != parent) {
+                        parent = conflict;
+                        if (!(conflict->flags & IORESOURCE_BUSY))
+                                continue;
                }
-                write_unlock(&resource_lock);
+                /* Uhhuh, that didn't work out.. */
+                kfree(res);
+                res = NULL;
+                break;
        }
+        write_unlock(&resource_lock);
        return res;
 }
 EXPORT_SYMBOL(__request_region);
@@ -725,3 +828,40 @@ static int __init reserve_setup(char *str)
 }
 __setup("reserve=", reserve_setup);
+/*
+ * Check whether the requested range [addr, addr + size - 1] only partially
+ * overlaps a resource in the iomem resource tree, i.e. it intersects the
+ * resource but is not entirely contained in it.
+ *
+ * Returns 0 if no such resource is found, -1 (after logging a warning)
+ * on the first conflict.
+ */
+int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
+{
+        struct resource *p = &iomem_resource;
+        int err = 0;
+        /*
+         * Dummy position for r_next(); initialised so that r_next() never
+         * operates on an indeterminate value.
+         */
+        loff_t l = 0;
+        read_lock(&resource_lock);
+        for (p = p->child; p ; p = r_next(NULL, p, &l)) {
+                /*
+                 * We can probably skip the resources without
+                 * IORESOURCE_IO attribute?
+                 */
+                /* Resource lies entirely above the requested range */
+                if (p->start >= addr + size)
+                        continue;
+                /* Resource lies entirely below the requested range */
+                if (p->end < addr)
+                        continue;
+                /* Requested range is fully contained in this resource */
+                if (p->start <= addr && (p->end >= addr + size - 1))
+                        continue;
+                printk(KERN_WARNING "resource map sanity check conflict: "
+                       "0x%llx 0x%llx 0x%llx 0x%llx %s\n",
+                       (unsigned long long)addr,
+                       (unsigned long long)(addr + size - 1),
+                       (unsigned long long)p->start,
+                       (unsigned long long)p->end,
+                       p->name);
+                err = -1;
+                break;
+        }
+        read_unlock(&resource_lock);
+        return err;
+}
diff --git a/kernel/sched.c b/kernel/sched.c
index 9a1ddb84e26d..09a8c15748f1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -201,14 +201,19 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
        hrtimer_init(&rt_b->rt_period_timer,
                        CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        rt_b->rt_period_timer.function = sched_rt_period_timer;
-        rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+        rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
+}
+static inline int rt_bandwidth_enabled(void)
+{
+        return sysctl_sched_rt_runtime >= 0;
 }
 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 {
        ktime_t now;
-        if (rt_b->rt_runtime == RUNTIME_INF)
+        if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
                return;
        if (hrtimer_active(&rt_b->rt_period_timer))
@@ -298,9 +303,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
 static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
 static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
 #endif /* CONFIG_RT_GROUP_SCHED */
-#else /* !CONFIG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_USER_SCHED */
 #define root_task_group init_task_group
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* CONFIG_USER_SCHED */
 /* task_group_lock serializes add/remove of task groups and also changes to
 * a task group's cpu shares.
@@ -604,9 +609,9 @@ struct rq {
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
-static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
+static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
 {
-        rq->curr->sched_class->check_preempt_curr(rq, p);
+        rq->curr->sched_class->check_preempt_curr(rq, p, sync);
 }
 static inline int cpu_of(struct rq *rq)
@@ -1087,7 +1092,7 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
        return NOTIFY_DONE;
 }
-static void init_hrtick(void)
+static __init void init_hrtick(void)
 {
        hotcpu_notifier(hotplug_hrtick, 0);
 }
@@ -1102,7 +1107,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
        hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
 }
-static void init_hrtick(void)
+static inline void init_hrtick(void)
 {
 }
 #endif /* CONFIG_SMP */
@@ -1119,9 +1124,9 @@ static void init_rq_hrtick(struct rq *rq)
        hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        rq->hrtick_timer.function = hrtick;
-        rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+        rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
 }
-#else
+#else   /* CONFIG_SCHED_HRTICK */
 static inline void hrtick_clear(struct rq *rq)
 {
 }
@@ -1133,7 +1138,7 @@ static inline void init_rq_hrtick(struct rq *rq)
 static inline void init_hrtick(void)
 {
 }
-#endif
+#endif  /* CONFIG_SCHED_HRTICK */
 /*
 * resched_task - mark a task 'to be rescheduled now'.
@@ -1380,38 +1385,24 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
        update_load_sub(&rq->load, load);
 }
-#ifdef CONFIG_SMP
+#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
-static unsigned long source_load(int cpu, int type);
+typedef int (*tg_visitor)(struct task_group *, void *);
-static unsigned long target_load(int cpu, int type);
-static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
-static unsigned long cpu_avg_load_per_task(int cpu)
-{
-        struct rq *rq = cpu_rq(cpu);
-        if (rq->nr_running)
-                rq->avg_load_per_task = rq->load.weight / rq->nr_running;
-        return rq->avg_load_per_task;
-}
-#ifdef CONFIG_FAIR_GROUP_SCHED
-typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
 /*
 * Iterate the full tree, calling @down when first entering a node and @up when
 * leaving it for the final time.
 */
-static void
+static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
-walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
 {
        struct task_group *parent, *child;
+        int ret;
        rcu_read_lock();
        parent = &root_task_group;
 down:
-        (*down)(parent, cpu, sd);
+        ret = (*down)(parent, data);
+        if (ret)
+                goto out_unlock;
        list_for_each_entry_rcu(child, &parent->children, siblings) {
                parent = child;
                goto down;
@@ -1419,14 +1410,42 @@ down:
 up:
                continue;
        }
-        (*up)(parent, cpu, sd);
+        ret = (*up)(parent, data);
+        if (ret)
+                goto out_unlock;
        child = parent;
        parent = parent->parent;
        if (parent)
                goto up;
+out_unlock:
        rcu_read_unlock();
+        return ret;
+}
+static int tg_nop(struct task_group *tg, void *data)
+{
+        return 0;
 }
+#endif
+#ifdef CONFIG_SMP
+static unsigned long source_load(int cpu, int type);
+static unsigned long target_load(int cpu, int type);
+static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
+static unsigned long cpu_avg_load_per_task(int cpu)
+{
+        struct rq *rq = cpu_rq(cpu);
+        if (rq->nr_running)
+                rq->avg_load_per_task = rq->load.weight / rq->nr_running;
+        return rq->avg_load_per_task;
+}
+#ifdef CONFIG_FAIR_GROUP_SCHED
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
@@ -1486,11 +1505,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
 * This needs to be done in a bottom-up fashion because the rq weight of a
 * parent group depends on the shares of its child groups.
 */
-static void
+static int tg_shares_up(struct task_group *tg, void *data)
-tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
        unsigned long rq_weight = 0;
        unsigned long shares = 0;
+        struct sched_domain *sd = data;
        int i;
        for_each_cpu_mask(i, sd->span) {
@@ -1515,6 +1534,8 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
                __update_group_shares_cpu(tg, i, shares, rq_weight);
                spin_unlock_irqrestore(&rq->lock, flags);
        }
+        return 0;
 }
 /*
@@ -1522,10 +1543,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
 * This needs to be done in a top-down fashion because the load of a child
 * group is a fraction of its parents load.
 */
-static void
+static int tg_load_down(struct task_group *tg, void *data)
-tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
        unsigned long load;
+        long cpu = (long)data;
        if (!tg->parent) {
                load = cpu_rq(cpu)->load.weight;
@@ -1536,11 +1557,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
        }
        tg->cfs_rq[cpu]->h_load = load;
-}
-static void
+        return 0;
-tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
 }
 static void update_shares(struct sched_domain *sd)
@@ -1550,7 +1568,7 @@ static void update_shares(struct sched_domain *sd)
        if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
                sd->last_update = now;
-                walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
+                walk_tg_tree(tg_nop, tg_shares_up, sd);
        }
 }
@@ -1561,9 +1579,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
        spin_lock(&rq->lock);
 }
-static void update_h_load(int cpu)
+static void update_h_load(long cpu)
 {
-        walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
+        walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
 }
 #else
@@ -1921,11 +1939,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
                running = task_running(rq, p);
                on_rq = p->se.on_rq;
                ncsw = 0;
-                if (!match_state || p->state == match_state) {
+                if (!match_state || p->state == match_state)
-                        ncsw = p->nivcsw + p->nvcsw;
+                        ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
-                        if (unlikely(!ncsw))
-                                ncsw = 1;
-                }
                task_rq_unlock(rq, &flags);
                /*
@@ -2285,7 +2300,7 @@ out_running:
        trace_mark(kernel_sched_wakeup,
                "pid %d state %ld ## rq %p task %p rq->curr %p",
                p->pid, p->state, rq, p, rq->curr);
-        check_preempt_curr(rq, p);
+        check_preempt_curr(rq, p, sync);
        p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
@@ -2420,7 +2435,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
        trace_mark(kernel_sched_wakeup_new,
                "pid %d state %ld ## rq %p task %p rq->curr %p",
                p->pid, p->state, rq, p, rq->curr);
-        check_preempt_curr(rq, p);
+        check_preempt_curr(rq, p, 0);
 #ifdef CONFIG_SMP
        if (p->sched_class->task_wake_up)
                p->sched_class->task_wake_up(rq, p);
@@ -2880,7 +2895,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
         * Note that idle threads have a prio of MAX_PRIO, for this test
         * to be always true for them.
         */
-        check_preempt_curr(this_rq, p);
+        check_preempt_curr(this_rq, p, 0);
 }
 /*
@@ -4037,23 +4052,26 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
 EXPORT_PER_CPU_SYMBOL(kstat);
 /*
- * Return p->sum_exec_runtime plus any more ns on the sched_clock
+ * Return any ns on the sched_clock that have not yet been banked in
- * that have not yet been banked in case the task is currently running.
+ * @p in case that task is currently running.
 */
-unsigned long long task_sched_runtime(struct task_struct *p)
+unsigned long long task_delta_exec(struct task_struct *p)
 {
        unsigned long flags;
-        u64 ns, delta_exec;
        struct rq *rq;
+        u64 ns = 0;
        rq = task_rq_lock(p, &flags);
-        ns = p->se.sum_exec_runtime;
        if (task_current(rq, p)) {
+                u64 delta_exec;
                update_rq_clock(rq);
                delta_exec = rq->clock - p->se.exec_start;
                if ((s64)delta_exec > 0)
-                        ns += delta_exec;
+                        ns = delta_exec;
        }
        task_rq_unlock(rq, &flags);
        return ns;
@@ -4070,6 +4088,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
        cputime64_t tmp;
        p->utime = cputime_add(p->utime, cputime);
+        account_group_user_time(p, cputime);
        /* Add user time to cpustat. */
        tmp = cputime_to_cputime64(cputime);
@@ -4094,6 +4113,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime)
        tmp = cputime_to_cputime64(cputime);
        p->utime = cputime_add(p->utime, cputime);
+        account_group_user_time(p, cputime);
        p->gtime = cputime_add(p->gtime, cputime);
        cpustat->user = cputime64_add(cpustat->user, tmp);
@@ -4129,6 +4149,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
        }
        p->stime = cputime_add(p->stime, cputime);
+        account_group_system_time(p, cputime);
        /* Add system time to cpustat. */
        tmp = cputime_to_cputime64(cputime);
@@ -4170,6 +4191,7 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
        if (p == rq->idle) {
                p->stime = cputime_add(p->stime, steal);
+                account_group_system_time(p, steal);
                if (atomic_read(&rq->nr_iowait) > 0)
                        cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
                else
@@ -4179,6 +4201,65 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
 }
 /*
+ * Use precise platform statistics if available:
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+cputime_t task_utime(struct task_struct *p)
+{
+        return p->utime;
+}
+cputime_t task_stime(struct task_struct *p)
+{
+        return p->stime;
+}
+#else
+cputime_t task_utime(struct task_struct *p)
+{
+        clock_t utime = cputime_to_clock_t(p->utime),
+                total = utime + cputime_to_clock_t(p->stime);
+        u64 temp;
+        /*
+         * Use CFS's precise accounting:
+         */
+        temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
+        if (total) {
+                temp *= utime;
+                do_div(temp, total);
+        }
+        utime = (clock_t)temp;
+        p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
+        return p->prev_utime;
+}
+cputime_t task_stime(struct task_struct *p)
+{
+        clock_t stime;
+        /*
+         * Use CFS's precise accounting. (we subtract utime from
+         * the total, to make sure the total observed by userspace
+         * grows monotonically - apps rely on that):
+         */
+        stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
+                        cputime_to_clock_t(task_utime(p));
+        if (stime >= 0)
+                p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
+        return p->prev_stime;
+}
+#endif
+inline cputime_t task_gtime(struct task_struct *p)
+{
+        return p->gtime;
+}
+/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 *
@@ -4568,6 +4649,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
 }
 EXPORT_SYMBOL_GPL(__wake_up_sync);      /* For internal use only */
+/**
+ * complete: - signals a single thread waiting on this completion
+ * @x:  holds the state of this particular completion
+ *
+ * This will wake up a single thread waiting on this completion. Threads will be
+ * awakened in the same order in which they were queued.
+ *
+ * See also complete_all(), wait_for_completion() and related routines.
+ */
 void complete(struct completion *x)
 {
        unsigned long flags;
@@ -4579,6 +4669,12 @@ void complete(struct completion *x)
 }
 EXPORT_SYMBOL(complete);
+/**
+ * complete_all: - signals all threads waiting on this completion
+ * @x:  holds the state of this particular completion
+ *
+ * This will wake up all threads waiting on this particular completion event.
+ */
 void complete_all(struct completion *x)
 {
        unsigned long flags;
@@ -4599,10 +4695,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
                wait.flags |= WQ_FLAG_EXCLUSIVE;
                __add_wait_queue_tail(&x->wait, &wait);
                do {
-                        if ((state == TASK_INTERRUPTIBLE &&
+                        if (signal_pending_state(state, current)) {
-                             signal_pending(current)) ||
-                            (state == TASK_KILLABLE &&
-                             fatal_signal_pending(current))) {
                                timeout = -ERESTARTSYS;
                                break;
                        }
@@ -4630,12 +4723,31 @@ wait_for_common(struct completion *x, long timeout, int state)
        return timeout;
 }
+/**
+ * wait_for_completion: - waits for completion of a task
+ * @x:  holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout.
+ *
+ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
+ * and interrupt capability. Also see complete().
+ */
 void __sched wait_for_completion(struct completion *x)
 {
        wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(wait_for_completion);
+/**
+ * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
+ * @x:  holds the state of this particular completion
+ * @timeout:  timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible.
+ */
 unsigned long __sched
 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
 {
@@ -4643,6 +4755,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout)
 }
 EXPORT_SYMBOL(wait_for_completion_timeout);
+/**
+ * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
+ * @x:  holds the state of this particular completion
+ *
+ * This waits for completion of a specific task to be signaled. It is
+ * interruptible.
+ */
 int __sched wait_for_completion_interruptible(struct completion *x)
 {
        long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
@@ -4652,6 +4771,14 @@ int __sched wait_for_completion_interruptible(struct completion *x)
 }
 EXPORT_SYMBOL(wait_for_completion_interruptible);
+/**
+ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
+ * @x:  holds the state of this particular completion
+ * @timeout:  timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. It is interruptible. The timeout is in jiffies.
+ */
 unsigned long __sched
 wait_for_completion_interruptible_timeout(struct completion *x,
                                          unsigned long timeout)
@@ -4660,6 +4787,13 @@ wait_for_completion_interruptible_timeout(struct completion *x,
 }
 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
+/**
+ * wait_for_completion_killable: - waits for completion of a task (killable)
+ * @x:  holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It can be
+ * interrupted by a kill signal.
+ */
 int __sched wait_for_completion_killable(struct completion *x)
 {
        long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
@@ -5062,7 +5196,8 @@ recheck:
                 * Do not allow realtime tasks into groups that have no runtime
                 * assigned.
                 */
-                if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
+                if (rt_bandwidth_enabled() && rt_policy(policy) &&
+                                task_group(p)->rt_bandwidth.rt_runtime == 0)
                        return -EPERM;
 #endif
@@ -5898,7 +6033,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
        set_task_cpu(p, dest_cpu);
        if (on_rq) {
                activate_task(rq_dest, p, 0);
-                check_preempt_curr(rq_dest, p);
+                check_preempt_curr(rq_dest, p, 0);
        }
 done:
        ret = 1;
@@ -6223,7 +6358,7 @@ set_table_entry(struct ctl_table *entry,
 static struct ctl_table *
 sd_alloc_ctl_domain_table(struct sched_domain *sd)
 {
-        struct ctl_table *table = sd_alloc_ctl_entry(12);
+        struct ctl_table *table = sd_alloc_ctl_entry(13);
        if (table == NULL)
                return NULL;
@@ -6251,7 +6386,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
                sizeof(int), 0644, proc_dointvec_minmax);
        set_table_entry(&table[10], "flags", &sd->flags,
                sizeof(int), 0644, proc_dointvec_minmax);
-        /* &table[11] is terminator */
+        set_table_entry(&table[11], "name", sd->name,
+                CORENAME_MAX_SIZE, 0444, proc_dostring);
+        /* &table[12] is terminator */
        return table;
 }
@@ -7135,13 +7272,21 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
 */
+#ifdef CONFIG_SCHED_DEBUG
+# define SD_INIT_NAME(sd, type)         sd->name = #type
+#else
+# define SD_INIT_NAME(sd, type)         do { } while (0)
+#endif
 #define SD_INIT(sd, type)       sd_init_##type(sd)
 #define SD_INIT_FUNC(type)      \
 static noinline void sd_init_##type(struct sched_domain *sd)    \
 {                                                               \
        memset(sd, 0, sizeof(*sd));                             \
        *sd = SD_##type##_INIT;                                 \
        sd->level = SD_LV_##type;                               \
+        SD_INIT_NAME(sd, type);                                 \
 }
 SD_INIT_FUNC(CPU)
@@ -7637,24 +7782,27 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
 * and partition_sched_domains() will fallback to the single partition
 * 'fallback_doms', it also forces the domains to be rebuilt.
 *
+ * If doms_new==NULL it is replaced with cpu_online_map & ~cpu_isolated_map.
+ * ndoms_new==0 is a special case for destroying existing domains.
+ * It will not create the default domain.
+ *
 * Call with hotplug lock held
 */
 void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
                             struct sched_domain_attr *dattr_new)
 {
-        int i, j;
+        int i, j, n;
        mutex_lock(&sched_domains_mutex);
        /* always unregister in case we don't destroy any domains */
        unregister_sched_domain_sysctl();
-        if (doms_new == NULL)
+        n = doms_new ? ndoms_new : 0;
-                ndoms_new = 0;
        /* Destroy deleted domains */
        for (i = 0; i < ndoms_cur; i++) {
-                for (j = 0; j < ndoms_new; j++) {
+                for (j = 0; j < n; j++) {
                        if (cpus_equal(doms_cur[i], doms_new[j])
                            && dattrs_equal(dattr_cur, i, dattr_new, j))
                                goto match1;
@@ -7667,7 +7815,6 @@ match1:
        if (doms_new == NULL) {
                ndoms_cur = 0;
-                ndoms_new = 1;
                doms_new = &fallback_doms;
                cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
                dattr_new = NULL;
@@ -7704,8 +7851,13 @@ match2:
 int arch_reinit_sched_domains(void)
 {
        get_online_cpus();
+        /* Destroy domains first to force the rebuild */
+        partition_sched_domains(0, NULL, NULL);
        rebuild_sched_domains();
        put_online_cpus();
        return 0;
 }
@@ -7789,7 +7941,7 @@ static int update_sched_domains(struct notifier_block *nfb,
        case CPU_ONLINE_FROZEN:
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
-                partition_sched_domains(0, NULL, NULL);
+                partition_sched_domains(1, NULL, NULL);
                return NOTIFY_OK;
        default:
@@ -8176,20 +8328,25 @@ void __might_sleep(char *file, int line)
 #ifdef in_atomic
        static unsigned long prev_jiffy;        /* ratelimiting */
-        if ((in_atomic() || irqs_disabled()) &&
+        if ((!in_atomic() && !irqs_disabled()) ||
-            system_state == SYSTEM_RUNNING && !oops_in_progress) {
+                    system_state != SYSTEM_RUNNING || oops_in_progress)
-                if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+                return;
-                        return;
+        if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
-                prev_jiffy = jiffies;
+                return;
-                printk(KERN_ERR "BUG: sleeping function called from invalid"
+        prev_jiffy = jiffies;
-                                " context at %s:%d\n", file, line);
-                printk("in_atomic():%d, irqs_disabled():%d\n",
+        printk(KERN_ERR
-                        in_atomic(), irqs_disabled());
+                "BUG: sleeping function called from invalid context at %s:%d\n",
-                debug_show_held_locks(current);
+                        file, line);
-                if (irqs_disabled())
+        printk(KERN_ERR
-                        print_irqtrace_events(current);
+                "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
-                dump_stack();
+                        in_atomic(), irqs_disabled(),
-        }
+                        current->pid, current->comm);
+        debug_show_held_locks(current);
+        if (irqs_disabled())
+                print_irqtrace_events(current);
+        dump_stack();
 #endif
 }
 EXPORT_SYMBOL(__might_sleep);
@@ -8687,73 +8844,95 @@ static DEFINE_MUTEX(rt_constraints_mutex);
 static unsigned long to_ratio(u64 period, u64 runtime)
 {
        if (runtime == RUNTIME_INF)
-                return 1ULL << 16;
+                return 1ULL << 20;
-        return div64_u64(runtime << 16, period);
+        return div64_u64(runtime << 20, period);
 }
-#ifdef CONFIG_CGROUP_SCHED
+/* Must be called with tasklist_lock held */
-static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
+static inline int tg_has_rt_tasks(struct task_group *tg)
 {
-        struct task_group *tgi, *parent = tg->parent;
+        struct task_struct *g, *p;
-        unsigned long total = 0;
-        if (!parent) {
+        do_each_thread(g, p) {
-                if (global_rt_period() < period)
+                if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
-                        return 0;
+                        return 1;
+        } while_each_thread(g, p);
-                return to_ratio(period, runtime) <
+        return 0;
-                        to_ratio(global_rt_period(), global_rt_runtime());
+}
-        }
-        if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period)
+struct rt_schedulable_data {
-                return 0;
+        struct task_group *tg;
+        u64 rt_period;
+        u64 rt_runtime;
+};
-        rcu_read_lock();
+static int tg_schedulable(struct task_group *tg, void *data)
-        list_for_each_entry_rcu(tgi, &parent->children, siblings) {
+{
-                if (tgi == tg)
+        struct rt_schedulable_data *d = data;
-                        continue;
+        struct task_group *child;
+        unsigned long total, sum = 0;
+        u64 period, runtime;
+        period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+        runtime = tg->rt_bandwidth.rt_runtime;
-                total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
+        if (tg == d->tg) {
-                                tgi->rt_bandwidth.rt_runtime);
+                period = d->rt_period;
+                runtime = d->rt_runtime;
        }
-        rcu_read_unlock();
-        return total + to_ratio(period, runtime) <=
+        /*
-                to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
+         * Cannot have more runtime than the period.
-                                parent->rt_bandwidth.rt_runtime);
+         */
-}
+        if (runtime > period && runtime != RUNTIME_INF)
-#elif defined CONFIG_USER_SCHED
+                return -EINVAL;
-static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
-{
-        struct task_group *tgi;
-        unsigned long total = 0;
-        unsigned long global_ratio =
-                to_ratio(global_rt_period(), global_rt_runtime());
-        rcu_read_lock();
+        /*
-        list_for_each_entry_rcu(tgi, &task_groups, list) {
+         * Ensure we don't starve existing RT tasks.
-                if (tgi == tg)
+         */
-                        continue;
+        if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
+                return -EBUSY;
+        total = to_ratio(period, runtime);
-                total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
+        /*
-                                tgi->rt_bandwidth.rt_runtime);
+         * Nobody can have more than the global setting allows.
+         */
+        if (total > to_ratio(global_rt_period(), global_rt_runtime()))
+                return -EINVAL;
+        /*
+         * The sum of our children's runtime should not exceed our own.
+         */
+        list_for_each_entry_rcu(child, &tg->children, siblings) {
+                period = ktime_to_ns(child->rt_bandwidth.rt_period);
+                runtime = child->rt_bandwidth.rt_runtime;
+                if (child == d->tg) {
+                        period = d->rt_period;
+                        runtime = d->rt_runtime;
+                }
+                sum += to_ratio(period, runtime);
        }
-        rcu_read_unlock();
-        return total + to_ratio(period, runtime) < global_ratio;
+        if (sum > total)
+                return -EINVAL;
+        return 0;
 }
-#endif
-/* Must be called with tasklist_lock held */
+static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
-static inline int tg_has_rt_tasks(struct task_group *tg)
 {
-        struct task_struct *g, *p;
+        struct rt_schedulable_data data = {
-        do_each_thread(g, p) {
+                .tg = tg,
-                if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
+                .rt_period = period,
-                        return 1;
+                .rt_runtime = runtime,
-        } while_each_thread(g, p);
+        };
-        return 0;
+        return walk_tg_tree(tg_schedulable, tg_nop, &data);
 }
 static int tg_set_bandwidth(struct task_group *tg,
@@ -8763,14 +8942,9 @@ static int tg_set_bandwidth(struct task_group *tg,
        mutex_lock(&rt_constraints_mutex);
        read_lock(&tasklist_lock);
-        if (rt_runtime == 0 && tg_has_rt_tasks(tg)) {
+        err = __rt_schedulable(tg, rt_period, rt_runtime);
-                err = -EBUSY;
+        if (err)
                goto unlock;
-        }
-        if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
-                err = -EINVAL;
-                goto unlock;
-        }
        spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
        tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
@@ -8839,16 +9013,25 @@ long sched_group_rt_period(struct task_group *tg)
 static int sched_rt_global_constraints(void)
 {
-        struct task_group *tg = &root_task_group;
+        u64 runtime, period;
-        u64 rt_runtime, rt_period;
        int ret = 0;
-        rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+        if (sysctl_sched_rt_period <= 0)
-        rt_runtime = tg->rt_bandwidth.rt_runtime;
+                return -EINVAL;
+        runtime = global_rt_runtime();
+        period = global_rt_period();
+        /*
+         * Sanity check on the sysctl variables.
+         */
+        if (runtime > period && runtime != RUNTIME_INF)
+                return -EINVAL;
        mutex_lock(&rt_constraints_mutex);
-        if (!__rt_schedulable(tg, rt_period, rt_runtime))
+        read_lock(&tasklist_lock);
-                ret = -EINVAL;
+        ret = __rt_schedulable(NULL, 0, 0);
+        read_unlock(&tasklist_lock);
        mutex_unlock(&rt_constraints_mutex);
        return ret;
@@ -8859,6 +9042,9 @@ static int sched_rt_global_constraints(void)
        unsigned long flags;
        int i;
+        if (sysctl_sched_rt_period <= 0)
+                return -EINVAL;
        spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
        for_each_possible_cpu(i) {
                struct rt_rq *rt_rq = &cpu_rq(i)->rt;
@@ -8919,7 +9105,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
        if (!cgrp->parent) {
                /* This is early initialization for the top cgroup */
-                init_task_group.css.cgroup = cgrp;
                return &init_task_group.css;
        }
@@ -8928,9 +9113,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
        if (IS_ERR(tg))
                return ERR_PTR(-ENOMEM);
-        /* Bind the cgroup to task_group object we just created */
-        tg->css.cgroup = cgrp;
        return &tg->css;
 }
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 204991a0bfa7..81787248b60f 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -12,19 +12,17 @@
 *
 * Create a semi stable clock from a mixture of other events, including:
 *  - gtod
- *  - jiffies
 *  - sched_clock()
 *  - explicit idle events
 *
 * We use gtod as base and the unstable clock deltas. The deltas are filtered,
- * making it monotonic and keeping it within an expected window.  This window
+ * making it monotonic and keeping it within an expected window.
- * is set up using jiffies.
 *
 * Furthermore, explicit sleep and wakeup hooks allow us to account for time
 * that is otherwise invisible (TSC gets stopped).
 *
 * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
- * consistent between cpus (never more than 1 jiffies difference).
+ * consistent between cpus (never more than 2 jiffies difference).
 */
 #include <linux/sched.h>
 #include <linux/percpu.h>
@@ -54,7 +52,6 @@ struct sched_clock_data {
         */
        raw_spinlock_t          lock;
-        unsigned long           tick_jiffies;
        u64                     tick_raw;
        u64                     tick_gtod;
        u64                     clock;
@@ -75,14 +72,12 @@ static inline struct sched_clock_data *cpu_sdc(int cpu)
 void sched_clock_init(void)
 {
        u64 ktime_now = ktime_to_ns(ktime_get());
-        unsigned long now_jiffies = jiffies;
        int cpu;
        for_each_possible_cpu(cpu) {
                struct sched_clock_data *scd = cpu_sdc(cpu);
                scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
-                scd->tick_jiffies = now_jiffies;
                scd->tick_raw = 0;
                scd->tick_gtod = ktime_now;
                scd->clock = ktime_now;
@@ -92,46 +87,51 @@ void sched_clock_init(void)
 }
 /*
+ * min/max variants that take u64 wrap-around into account
+ */
+static inline u64 wrap_min(u64 x, u64 y)
+{
+        return (s64)(x - y) < 0 ? x : y;
+}
+static inline u64 wrap_max(u64 x, u64 y)
+{
+        return (s64)(x - y) > 0 ? x : y;
+}
+/*
 * update the percpu scd from the raw @now value
 *
 *  - filter out backward motion
- *  - use jiffies to generate a min,max window to clip the raw values
+ *  - use the GTOD tick value to create a window to filter crazy TSC values
 */
 static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
 {
-        unsigned long now_jiffies = jiffies;
-        long delta_jiffies = now_jiffies - scd->tick_jiffies;
-        u64 clock = scd->clock;
-        u64 min_clock, max_clock;
        s64 delta = now - scd->tick_raw;
+        u64 clock, min_clock, max_clock;
        WARN_ON_ONCE(!irqs_disabled());
-        min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC;
-        if (unlikely(delta < 0)) {
+        if (unlikely(delta < 0))
-                clock++;
+                delta = 0;
-                goto out;
-        }
-        max_clock = min_clock + TICK_NSEC;
+        /*
+         * scd->clock = clamp(scd->tick_gtod + delta,
+         *                    max(scd->tick_gtod, scd->clock),
+         *                    max(scd->clock, scd->tick_gtod + TICK_NSEC));
+         */
-        if (unlikely(clock + delta > max_clock)) {
+        clock = scd->tick_gtod + delta;
-                if (clock < max_clock)
+        min_clock = wrap_max(scd->tick_gtod, scd->clock);
-                        clock = max_clock;
+        max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC);
-                else
-                        clock++;
-        } else {
-                clock += delta;
-        }
- out:
+        clock = wrap_max(clock, min_clock);
-        if (unlikely(clock < min_clock))
+        clock = wrap_min(clock, max_clock);
-                clock = min_clock;
-        scd->tick_jiffies = now_jiffies;
        scd->clock = clock;
-        return clock;
+        return scd->clock;
 }
 static void lock_double_clock(struct sched_clock_data *data1,
@@ -171,7 +171,7 @@ u64 sched_clock_cpu(int cpu)
                 * larger time as the latest time for both
                 * runqueues. (this creates monotonic movement)
                 */
-                if (likely(remote_clock < this_clock)) {
+                if (likely((s64)(remote_clock - this_clock) < 0)) {
                        clock = this_clock;
                        scd->clock = clock;
                } else {
@@ -207,14 +207,9 @@ void sched_clock_tick(void)
        now = sched_clock();
        __raw_spin_lock(&scd->lock);
-        __update_sched_clock(scd, now);
-        /*
-         * update tick_gtod after __update_sched_clock() because that will
-         * already observe 1 new jiffy; adding a new tick_gtod to that would
-         * increase the clock 2 jiffies.
-         */
        scd->tick_raw = now;
        scd->tick_gtod = now_gtod;
+        __update_sched_clock(scd, now);
        __raw_spin_unlock(&scd->lock);
 }
@@ -232,18 +227,7 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
 */
 void sched_clock_idle_wakeup_event(u64 delta_ns)
 {
-        struct sched_clock_data *scd = this_scd();
+        sched_clock_tick();
-        /*
-         * Override the previous timestamp and ignore all
-         * sched_clock() deltas that occured while we idled,
-         * and use the PM-provided delta_ns to advance the
-         * rq clock:
-         */
-        __raw_spin_lock(&scd->lock);
-        scd->clock += delta_ns;
-        __raw_spin_unlock(&scd->lock);
        touch_softlockup_watchdog();
 }
 EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index bbe6b31c3c56..ad958c1ec708 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -333,12 +333,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
        unsigned long flags;
        int num_threads = 1;
-        rcu_read_lock();
        if (lock_task_sighand(p, &flags)) {
                num_threads = atomic_read(&p->signal->count);
                unlock_task_sighand(p, &flags);
        }
-        rcu_read_unlock();
        SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads);
        SEQ_printf(m,
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index fb8994c6d4bb..f604dae71316 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -409,64 +409,6 @@ static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 /*
- * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
- * that it favours >=0 over <0.
- *
- *   -20         |
- *               |
- *     0 --------+-------
- *             .'
- *    19     .'
- *
- */
-static unsigned long
-calc_delta_asym(unsigned long delta, struct sched_entity *se)
-{
-        struct load_weight lw = {
-                .weight = NICE_0_LOAD,
-                .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
-        };
-        for_each_sched_entity(se) {
-                struct load_weight *se_lw = &se->load;
-                unsigned long rw = cfs_rq_of(se)->load.weight;
-#ifdef CONFIG_FAIR_SCHED_GROUP
-                struct cfs_rq *cfs_rq = se->my_q;
-                struct task_group *tg = NULL
-                if (cfs_rq)
-                        tg = cfs_rq->tg;
-                if (tg && tg->shares < NICE_0_LOAD) {
-                        /*
-                         * scale shares to what it would have been had
-                         * tg->weight been NICE_0_LOAD:
-                         *
-                         *   weight = 1024 * shares / tg->weight
-                         */
-                        lw.weight *= se->load.weight;
-                        lw.weight /= tg->shares;
-                        lw.inv_weight = 0;
-                        se_lw = &lw;
-                        rw += lw.weight - se->load.weight;
-                } else
-#endif
-                if (se->load.weight < NICE_0_LOAD) {
-                        se_lw = &lw;
-                        rw += NICE_0_LOAD - se->load.weight;
-                }
-                delta = calc_delta_mine(delta, rw, se_lw);
-        }
-        return delta;
-}
-/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
@@ -507,6 +449,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
                struct task_struct *curtask = task_of(curr);
                cpuacct_charge(curtask, delta_exec);
+                account_group_exec_runtime(curtask, delta_exec);
        }
 }
@@ -586,11 +529,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
        update_load_add(&cfs_rq->load, se->load.weight);
        if (!parent_entity(se))
                inc_cpu_load(rq_of(cfs_rq), se->load.weight);
-        if (entity_is_task(se))
+        if (entity_is_task(se)) {
                add_cfs_task_weight(cfs_rq, se->load.weight);
+                list_add(&se->group_node, &cfs_rq->tasks);
+        }
        cfs_rq->nr_running++;
        se->on_rq = 1;
-        list_add(&se->group_node, &cfs_rq->tasks);
 }
 static void
@@ -599,11 +543,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
        update_load_sub(&cfs_rq->load, se->load.weight);
        if (!parent_entity(se))
                dec_cpu_load(rq_of(cfs_rq), se->load.weight);
-        if (entity_is_task(se))
+        if (entity_is_task(se)) {
                add_cfs_task_weight(cfs_rq, -se->load.weight);
+                list_del_init(&se->group_node);
+        }
        cfs_rq->nr_running--;
        se->on_rq = 0;
-        list_del_init(&se->group_node);
 }
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -1085,7 +1030,6 @@ static long effective_load(struct task_group *tg, int cpu,
                long wl, long wg)
 {
        struct sched_entity *se = tg->se[cpu];
-        long more_w;
        if (!tg->parent)
                return wl;
@@ -1097,18 +1041,17 @@ static long effective_load(struct task_group *tg, int cpu,
        if (!wl && sched_feat(ASYM_EFF_LOAD))
                return wl;
-        /*
-         * Instead of using this increment, also add the difference
-         * between when the shares were last updated and now.
-         */
-        more_w = se->my_q->load.weight - se->my_q->rq_weight;
-        wl += more_w;
-        wg += more_w;
        for_each_sched_entity(se) {
-#define D(n) (likely(n) ? (n) : 1)
                long S, rw, s, a, b;
+                long more_w;
+                /*
+                 * Instead of using this increment, also add the difference
+                 * between when the shares were last updated and now.
+                 */
+                more_w = se->my_q->load.weight - se->my_q->rq_weight;
+                wl += more_w;
+                wg += more_w;
                S = se->my_q->tg->shares;
                s = se->my_q->shares;
@@ -1117,7 +1060,11 @@ static long effective_load(struct task_group *tg, int cpu,
                a = S*(rw + wl);
                b = S*rw + s*wg;
-                wl = s*(a-b)/D(b);
+                wl = s*(a-b);
+                if (likely(b))
+                        wl /= b;
                /*
                 * Assume the group is already running and will
                 * thus already be accounted for in the weight.
@@ -1126,7 +1073,6 @@ static long effective_load(struct task_group *tg, int cpu,
                 * alter the group weight.
                 */
                wg = 0;
-#undef D
        }
        return wl;
@@ -1143,7 +1089,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
 #endif
 static int
-wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
+wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
            struct task_struct *p, int prev_cpu, int this_cpu, int sync,
            int idx, unsigned long load, unsigned long this_load,
            unsigned int imbalance)
@@ -1158,6 +1104,11 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
        if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
                return 0;
+        if (!sync && sched_feat(SYNC_WAKEUPS) &&
+            curr->se.avg_overlap < sysctl_sched_migration_cost &&
+            p->se.avg_overlap < sysctl_sched_migration_cost)
+                sync = 1;
        /*
         * If sync wakeup then subtract the (maximum possible)
         * effect of the currently running task from the load
@@ -1182,17 +1133,14 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
         * a reasonable amount of time then attract this newly
         * woken task:
         */
-        if (sync && balanced) {
+        if (sync && balanced)
-                if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
+                return 1;
-                    p->se.avg_overlap < sysctl_sched_migration_cost)
-                        return 1;
-        }
        schedstat_inc(p, se.nr_wakeups_affine_attempts);
        tl_per_task = cpu_avg_load_per_task(this_cpu);
-        if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) ||
+        if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <=
-                        balanced) {
+                        tl_per_task)) {
                /*
                 * This domain has SD_WAKE_AFFINE and
                 * p is cache cold in this domain, and
@@ -1211,16 +1159,17 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
        struct sched_domain *sd, *this_sd = NULL;
        int prev_cpu, this_cpu, new_cpu;
        unsigned long load, this_load;
-        struct rq *rq, *this_rq;
+        struct rq *this_rq;
        unsigned int imbalance;
        int idx;
        prev_cpu        = task_cpu(p);
-        rq              = task_rq(p);
        this_cpu        = smp_processor_id();
        this_rq         = cpu_rq(this_cpu);
        new_cpu         = prev_cpu;
+        if (prev_cpu == this_cpu)
+                goto out;
        /*
         * 'this_sd' is the first domain that both
         * this_cpu and prev_cpu are present in:
@@ -1248,13 +1197,10 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
        load = source_load(prev_cpu, idx);
        this_load = target_load(this_cpu, idx);
-        if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
+        if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
                                     load, this_load, imbalance))
                return this_cpu;
-        if (prev_cpu == this_cpu)
-                goto out;
        /*
         * Start passive balancing when half the imbalance_pct
         * limit is reached.
@@ -1281,62 +1227,20 @@ static unsigned long wakeup_gran(struct sched_entity *se)
         * + nice tasks.
         */
        if (sched_feat(ASYM_GRAN))
-                gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
+                gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load);
-        else
-                gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
        return gran;
 }
 /*
- * Should 'se' preempt 'curr'.
- *
- *             |s1
- *        |s2
- *   |s3
- *         g
- *      |<--->|c
- *
- *  w(c, s1) = -1
- *  w(c, s2) =  0
- *  w(c, s3) =  1
- *
- */
-static int
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
-{
-        s64 gran, vdiff = curr->vruntime - se->vruntime;
-        if (vdiff < 0)
-                return -1;
-        gran = wakeup_gran(curr);
-        if (vdiff > gran)
-                return 1;
-        return 0;
-}
-/* return depth at which a sched entity is present in the hierarchy */
-static inline int depth_se(struct sched_entity *se)
-{
-        int depth = 0;
-        for_each_sched_entity(se)
-                depth++;
-        return depth;
-}
-/*
 * Preempt the current task with a newly woken task if needed:
 */
-static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
+static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 {
        struct task_struct *curr = rq->curr;
        struct cfs_rq *cfs_rq = task_cfs_rq(curr);
        struct sched_entity *se = &curr->se, *pse = &p->se;
-        int se_depth, pse_depth;
+        s64 delta_exec;
        if (unlikely(rt_prio(p->prio))) {
                update_rq_clock(rq);
@@ -1351,6 +1255,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
        cfs_rq_of(pse)->next = pse;
        /*
+         * We can get here with TIF_NEED_RESCHED already set by the
+         * new-task wakeup path.
+         */
+        if (test_tsk_need_resched(curr))
+                return;
+        /*
         * Batch tasks do not preempt (their preemption is driven by
         * the tick):
         */
@@ -1360,33 +1271,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
        if (!sched_feat(WAKEUP_PREEMPT))
                return;
-        /*
+        if (sched_feat(WAKEUP_OVERLAP) && (sync ||
-         * preemption test can be made between sibling entities who are in the
+                        (se->avg_overlap < sysctl_sched_migration_cost &&
-         * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
+                         pse->avg_overlap < sysctl_sched_migration_cost))) {
-         * both tasks until we find their ancestors who are siblings of common
+                resched_task(curr);
-         * parent.
+                return;
-         */
-        /* First walk up until both entities are at same depth */
-        se_depth = depth_se(se);
-        pse_depth = depth_se(pse);
-        while (se_depth > pse_depth) {
-                se_depth--;
-                se = parent_entity(se);
-        }
-        while (pse_depth > se_depth) {
-                pse_depth--;
-                pse = parent_entity(pse);
-        }
-        while (!is_same_group(se, pse)) {
-                se = parent_entity(se);
-                pse = parent_entity(pse);
        }
-        if (wakeup_preempt_entity(se, pse) == 1)
+        delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+        if (delta_exec > wakeup_gran(pse))
                resched_task(curr);
 }
@@ -1445,19 +1338,9 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
        if (next == &cfs_rq->tasks)
                return NULL;
-        /* Skip over entities that are not tasks */
+        se = list_entry(next, struct sched_entity, group_node);
-        do {
+        p = task_of(se);
-                se = list_entry(next, struct sched_entity, group_node);
+        cfs_rq->balance_iterator = next->next;
-                next = next->next;
-        } while (next != &cfs_rq->tasks && !entity_is_task(se));
-        if (next == &cfs_rq->tasks)
-                return NULL;
-        cfs_rq->balance_iterator = next;
-        if (entity_is_task(se))
-                p = task_of(se);
        return p;
 }
@@ -1507,7 +1390,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
        rcu_read_lock();
        update_h_load(busiest_cpu);
-        list_for_each_entry(tg, &task_groups, list) {
+        list_for_each_entry_rcu(tg, &task_groups, list) {
                struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
                unsigned long busiest_h_load = busiest_cfs_rq->h_load;
                unsigned long busiest_weight = busiest_cfs_rq->load.weight;
@@ -1620,10 +1503,10 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
                 * 'current' within the tree based on its new key value.
                 */
                swap(curr->vruntime, se->vruntime);
+                resched_task(rq->curr);
        }
        enqueue_task_fair(rq, p, 0);
-        resched_task(rq->curr);
 }
 /*
@@ -1642,7 +1525,7 @@ static void prio_changed_fair(struct rq *rq, struct task_struct *p,
                if (p->prio > oldprio)
                        resched_task(rq->curr);
        } else
-                check_preempt_curr(rq, p);
+                check_preempt_curr(rq, p, 0);
 }
 /*
@@ -1659,7 +1542,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p,
        if (running)
                resched_task(rq->curr);
        else
-                check_preempt_curr(rq, p);
+                check_preempt_curr(rq, p, 0);
 }
 /* Account for a task changing its policy or group.
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 862b06bd560a..7c9e8f4a049f 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -8,6 +8,7 @@ SCHED_FEAT(SYNC_WAKEUPS, 1)
 SCHED_FEAT(HRTICK, 1)
 SCHED_FEAT(DOUBLE_TICK, 0)
 SCHED_FEAT(ASYM_GRAN, 1)
-SCHED_FEAT(LB_BIAS, 0)
+SCHED_FEAT(LB_BIAS, 1)
 SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
 SCHED_FEAT(ASYM_EFF_LOAD, 1)
+SCHED_FEAT(WAKEUP_OVERLAP, 0)
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 3a4f92dbbe66..dec4ccabe2f5 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync)
 /*
 * Idle tasks are unconditionally rescheduled:
 */
-static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p)
+static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync)
 {
        resched_task(rq->idle);
 }
@@ -76,7 +76,7 @@ static void switched_to_idle(struct rq *rq, struct task_struct *p,
        if (running)
                resched_task(rq->curr);
        else
-                check_preempt_curr(rq, p);
+                check_preempt_curr(rq, p, 0);
 }
 static void prio_changed_idle(struct rq *rq, struct task_struct *p,
@@ -93,7 +93,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
                if (p->prio > oldprio)
                        resched_task(rq->curr);
        } else
-                check_preempt_curr(rq, p);
+                check_preempt_curr(rq, p, 0);
 }
 /*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 998ba54b4543..b446dc87494f 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -102,12 +102,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
 static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 {
+        struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
        struct sched_rt_entity *rt_se = rt_rq->rt_se;
-        if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) {
+        if (rt_rq->rt_nr_running) {
-                struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
+                if (rt_se && !on_rt_rq(rt_se))
+                        enqueue_rt_entity(rt_se);
-                enqueue_rt_entity(rt_se);
                if (rt_rq->highest_prio < curr->prio)
                        resched_task(curr);
        }
@@ -199,6 +199,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 {
+        if (rt_rq->rt_nr_running)
+                resched_task(rq_of_rt_rq(rt_rq)->curr);
 }
 static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
@@ -229,6 +231,9 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 #endif /* CONFIG_RT_GROUP_SCHED */
 #ifdef CONFIG_SMP
+/*
+ * We ran out of runtime, see if we can borrow some from our neighbours.
+ */
 static int do_balance_runtime(struct rt_rq *rt_rq)
 {
        struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
@@ -248,9 +253,18 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
                        continue;
                spin_lock(&iter->rt_runtime_lock);
+                /*
+                 * Either all rqs have inf runtime and there's nothing to steal
+                 * or __disable_runtime() below sets a specific rq to inf to
+                 * indicate it's been disabled and disallow stealing.
+                 */
                if (iter->rt_runtime == RUNTIME_INF)
                        goto next;
+                /*
+                 * From runqueues with spare time, take 1/n part of their
+                 * spare time, but no more than our period.
+                 */
                diff = iter->rt_runtime - iter->rt_time;
                if (diff > 0) {
                        diff = div_u64((u64)diff, weight);
@@ -272,6 +286,9 @@ next:
        return more;
 }
+/*
+ * Ensure this RQ takes back all the runtime it lent to its neighbours.
+ */
 static void __disable_runtime(struct rq *rq)
 {
        struct root_domain *rd = rq->rd;
@@ -287,17 +304,33 @@ static void __disable_runtime(struct rq *rq)
                spin_lock(&rt_b->rt_runtime_lock);
                spin_lock(&rt_rq->rt_runtime_lock);
+                /*
+                 * Either we're all inf and nobody needs to borrow, or we're
+                 * already disabled and thus have nothing to do, or we have
+                 * exactly the right amount of runtime to take out.
+                 */
                if (rt_rq->rt_runtime == RUNTIME_INF ||
                                rt_rq->rt_runtime == rt_b->rt_runtime)
                        goto balanced;
                spin_unlock(&rt_rq->rt_runtime_lock);
+                /*
+                 * Calculate the difference between what we started out with
+                 * and what we currently have, that's the amount of runtime
+                 * we lent and now have to reclaim.
+                 */
                want = rt_b->rt_runtime - rt_rq->rt_runtime;
+                /*
+                 * Greedy reclaim, take back as much as we can.
+                 */
                for_each_cpu_mask(i, rd->span) {
                        struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
                        s64 diff;
+                        /*
+                         * Can't reclaim from ourselves or disabled runqueues.
+                         */
                        if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
                                continue;
@@ -317,8 +350,16 @@ static void __disable_runtime(struct rq *rq)
                }
                spin_lock(&rt_rq->rt_runtime_lock);
+                /*
+                 * We cannot be left wanting - that would mean some runtime
+                 * leaked out of the system.
+                 */
                BUG_ON(want);
 balanced:
+                /*
+                 * Disable all the borrow logic by pretending we have inf
+                 * runtime - in which case borrowing doesn't make sense.
+                 */
                rt_rq->rt_runtime = RUNTIME_INF;
                spin_unlock(&rt_rq->rt_runtime_lock);
                spin_unlock(&rt_b->rt_runtime_lock);
@@ -341,6 +382,9 @@ static void __enable_runtime(struct rq *rq)
        if (unlikely(!scheduler_running))
                return;
+        /*
+         * Reset each runqueue's bandwidth settings
+         */
        for_each_leaf_rt_rq(rt_rq, rq) {
                struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
@@ -348,6 +392,7 @@ static void __enable_runtime(struct rq *rq)
                spin_lock(&rt_rq->rt_runtime_lock);
                rt_rq->rt_runtime = rt_b->rt_runtime;
                rt_rq->rt_time = 0;
+                rt_rq->rt_throttled = 0;
                spin_unlock(&rt_rq->rt_runtime_lock);
                spin_unlock(&rt_b->rt_runtime_lock);
        }
@@ -386,7 +431,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
        int i, idle = 1;
        cpumask_t span;
-        if (rt_b->rt_runtime == RUNTIME_INF)
+        if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
                return 1;
        span = sched_rt_period_mask();
@@ -438,9 +483,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
 {
        u64 runtime = sched_rt_runtime(rt_rq);
-        if (runtime == RUNTIME_INF)
-                return 0;
        if (rt_rq->rt_throttled)
                return rt_rq_throttled(rt_rq);
@@ -484,16 +526,23 @@ static void update_curr_rt(struct rq *rq)
        schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
        curr->se.sum_exec_runtime += delta_exec;
+        account_group_exec_runtime(curr, delta_exec);
        curr->se.exec_start = rq->clock;
        cpuacct_charge(curr, delta_exec);
+        if (!rt_bandwidth_enabled())
+                return;
        for_each_sched_rt_entity(rt_se) {
                rt_rq = rt_rq_of_se(rt_se);
                spin_lock(&rt_rq->rt_runtime_lock);
-                rt_rq->rt_time += delta_exec;
+                if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
-                if (sched_rt_runtime_exceeded(rt_rq))
+                        rt_rq->rt_time += delta_exec;
-                        resched_task(curr);
+                        if (sched_rt_runtime_exceeded(rt_rq))
+                                resched_task(curr);
+                }
                spin_unlock(&rt_rq->rt_runtime_lock);
        }
 }
@@ -782,7 +831,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
 /*
 * Preempt the current task with a newly woken task if needed:
 */
-static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
+static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync)
 {
        if (p->prio < rq->curr->prio) {
                resched_task(rq->curr);
@@ -1411,7 +1460,7 @@ static void watchdog(struct rq *rq, struct task_struct *p)
                p->rt.timeout++;
                next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
                if (p->rt.timeout > next)
-                        p->it_sched_expires = p->se.sum_exec_runtime;
+                        p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
        }
 }
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 8385d43987e2..b8c156979cf2 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -270,3 +270,89 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
 #define sched_info_switch(t, next)              do { } while (0)
 #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
+/*
+ * The following are functions that support scheduler-internal time accounting.
+ * These functions are generally called at the timer tick.  None of this depends
+ * on CONFIG_SCHEDSTATS.
+ */
+/**
+ * account_group_user_time - Maintain utime for a thread group.
+ *
+ * @tsk:        Pointer to task structure.
+ * @cputime:    Time value by which to increment the utime field of the
+ *              thread_group_cputime structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the utime field there.
+ */
+static inline void account_group_user_time(struct task_struct *tsk,
+                                           cputime_t cputime)
+{
+        struct signal_struct *sig;
+        sig = tsk->signal;
+        if (unlikely(!sig))
+                return;
+        if (sig->cputime.totals) {
+                struct task_cputime *times;
+                times = per_cpu_ptr(sig->cputime.totals, get_cpu());
+                times->utime = cputime_add(times->utime, cputime);
+                put_cpu_no_resched();
+        }
+}
+/**
+ * account_group_system_time - Maintain stime for a thread group.
+ *
+ * @tsk:        Pointer to task structure.
+ * @cputime:    Time value by which to increment the stime field of the
+ *              thread_group_cputime structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the stime field there.
+ */
+static inline void account_group_system_time(struct task_struct *tsk,
+                                             cputime_t cputime)
+{
+        struct signal_struct *sig;
+        sig = tsk->signal;
+        if (unlikely(!sig))
+                return;
+        if (sig->cputime.totals) {
+                struct task_cputime *times;
+                times = per_cpu_ptr(sig->cputime.totals, get_cpu());
+                times->stime = cputime_add(times->stime, cputime);
+                put_cpu_no_resched();
+        }
+}
+/**
+ * account_group_exec_runtime - Maintain exec runtime for a thread group.
+ *
+ * @tsk:        Pointer to task structure.
+ * @ns:         Time value by which to increment the sum_exec_runtime field
+ *              of the thread_group_cputime structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the sum_exec_runtime field there.
+ */
+static inline void account_group_exec_runtime(struct task_struct *tsk,
+                                              unsigned long long ns)
+{
+        struct signal_struct *sig;
+        sig = tsk->signal;
+        if (unlikely(!sig))
+                return;
+        if (sig->cputime.totals) {
+                struct task_cputime *times;
+                times = per_cpu_ptr(sig->cputime.totals, get_cpu());
+                times->sum_exec_runtime += ns;
+                put_cpu_no_resched();
+        }
+}
diff --git a/kernel/signal.c b/kernel/signal.c
index e661b01d340f..6eea5826d618 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1338,6 +1338,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
        struct siginfo info;
        unsigned long flags;
        struct sighand_struct *psig;
+        struct task_cputime cputime;
        int ret = sig;
        BUG_ON(sig == -1);
@@ -1368,10 +1369,9 @@ int do_notify_parent(struct task_struct *tsk, int sig)
        info.si_uid = tsk->uid;
-        info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime,
+        thread_group_cputime(tsk, &cputime);
-                                                       tsk->signal->utime));
+        info.si_utime = cputime_to_jiffies(cputime.utime);
-        info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
+        info.si_stime = cputime_to_jiffies(cputime.stime);
-                                                       tsk->signal->stime));
        info.si_status = tsk->exit_code & 0x7f;
        if (tsk->exit_code & 0x80)
diff --git a/kernel/smp.c b/kernel/smp.c
index 782e2b93e465..f362a8553777 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -210,8 +210,10 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 {
        struct call_single_data d;
        unsigned long flags;
-        /* prevent preemption and reschedule on another processor */
+        /* prevent preemption and reschedule on another processor,
+           as well as CPU removal */
        int me = get_cpu();
+        int err = 0;
        /* Can deadlock when called with interrupts disabled */
        WARN_ON(irqs_disabled());
@@ -220,7 +222,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
                local_irq_save(flags);
                func(info);
                local_irq_restore(flags);
-        } else {
+        } else if ((unsigned)cpu < NR_CPUS && cpu_online(cpu)) {
                struct call_single_data *data = NULL;
                if (!wait) {
@@ -236,10 +238,12 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
                data->func = func;
                data->info = info;
                generic_exec_single(cpu, data);
+        } else {
+                err = -ENXIO;   /* CPU not online */
        }
        put_cpu();
-        return 0;
+        return err;
 }
 EXPORT_SYMBOL(smp_call_function_single);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index c506f266a6b9..7110daeb9a90 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -6,6 +6,8 @@
 *      Distribute under GPLv2.
 *
 *      Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
+ *
+ *      Remote softirq infrastructure is by Jens Axboe.
 */
 #include <linux/module.h>
@@ -46,7 +48,7 @@ irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned;
 EXPORT_SYMBOL(irq_stat);
 #endif
-static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp;
+static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
 static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
@@ -205,7 +207,18 @@ restart:
        do {
                if (pending & 1) {
+                        int prev_count = preempt_count();
                        h->action(h);
+                        if (unlikely(prev_count != preempt_count())) {
+                                printk(KERN_ERR "huh, entered softirq %td %p"
+                                       "with preempt_count %08x,"
+                                       " exited with %08x?\n", h - softirq_vec,
+                                       h->action, prev_count, preempt_count());
+                                preempt_count() = prev_count;
+                        }
                        rcu_bh_qsctr_inc(cpu);
                }
                h++;
@@ -254,16 +267,12 @@ asmlinkage void do_softirq(void)
 */
 void irq_enter(void)
 {
-#ifdef CONFIG_NO_HZ
        int cpu = smp_processor_id();
        if (idle_cpu(cpu) && !in_interrupt())
-                tick_nohz_stop_idle(cpu);
+                tick_check_idle(cpu);
-#endif
        __irq_enter();
-#ifdef CONFIG_NO_HZ
-        if (idle_cpu(cpu))
-                tick_nohz_update_jiffies();
-#endif
 }
 #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
@@ -463,17 +472,144 @@ void tasklet_kill(struct tasklet_struct *t)
 EXPORT_SYMBOL(tasklet_kill);
+DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
+EXPORT_PER_CPU_SYMBOL(softirq_work_list);
+static void __local_trigger(struct call_single_data *cp, int softirq)
+{
+        struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]);
+        list_add_tail(&cp->list, head);
+        /* Trigger the softirq only if the list was previously empty.  */
+        if (head->next == &cp->list)
+                raise_softirq_irqoff(softirq);
+}
+#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
+static void remote_softirq_receive(void *data)
+{
+        struct call_single_data *cp = data;
+        unsigned long flags;
+        int softirq;
+        softirq = cp->priv;
+        local_irq_save(flags);
+        __local_trigger(cp, softirq);
+        local_irq_restore(flags);
+}
+static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
+{
+        if (cpu_online(cpu)) {
+                cp->func = remote_softirq_receive;
+                cp->info = cp;
+                cp->flags = 0;
+                cp->priv = softirq;
+                __smp_call_function_single(cpu, cp);
+                return 0;
+        }
+        return 1;
+}
+#else /* CONFIG_USE_GENERIC_SMP_HELPERS */
+static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
+{
+        return 1;
+}
+#endif
+/**
+ * __send_remote_softirq - try to schedule softirq work on a remote cpu
+ * @cp: private SMP call function data area
+ * @cpu: the remote cpu
+ * @this_cpu: the currently executing cpu
+ * @softirq: the softirq for the work
+ *
+ * Attempt to schedule softirq work on a remote cpu.  If this cannot be
+ * done, the work is instead queued up on the local cpu.
+ *
+ * Interrupts must be disabled.
+ */
+void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq)
+{
+        if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq))
+                __local_trigger(cp, softirq);
+}
+EXPORT_SYMBOL(__send_remote_softirq);
+/**
+ * send_remote_softirq - try to schedule softirq work on a remote cpu
+ * @cp: private SMP call function data area
+ * @cpu: the remote cpu
+ * @softirq: the softirq for the work
+ *
+ * Like __send_remote_softirq except that disabling interrupts and
+ * computing the current cpu is done for the caller.
+ */
+void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
+{
+        unsigned long flags;
+        int this_cpu;
+        local_irq_save(flags);
+        this_cpu = smp_processor_id();
+        __send_remote_softirq(cp, cpu, this_cpu, softirq);
+        local_irq_restore(flags);
+}
+EXPORT_SYMBOL(send_remote_softirq);
+static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self,
+                                               unsigned long action, void *hcpu)
+{
+        /*
+         * If a CPU goes away, splice its entries to the current CPU
+         * and trigger a run of the softirq
+         */
+        if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+                int cpu = (unsigned long) hcpu;
+                int i;
+                local_irq_disable();
+                for (i = 0; i < NR_SOFTIRQS; i++) {
+                        struct list_head *head = &per_cpu(softirq_work_list[i], cpu);
+                        struct list_head *local_head;
+                        if (list_empty(head))
+                                continue;
+                        local_head = &__get_cpu_var(softirq_work_list[i]);
+                        list_splice_init(head, local_head);
+                        raise_softirq_irqoff(i);
+                }
+                local_irq_enable();
+        }
+        return NOTIFY_OK;
+}
+static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = {
+        .notifier_call  = remote_softirq_cpu_notify,
+};
 void __init softirq_init(void)
 {
        int cpu;
        for_each_possible_cpu(cpu) {
+                int i;
                per_cpu(tasklet_vec, cpu).tail =
                        &per_cpu(tasklet_vec, cpu).head;
                per_cpu(tasklet_hi_vec, cpu).tail =
                        &per_cpu(tasklet_hi_vec, cpu).head;
+                for (i = 0; i < NR_SOFTIRQS; i++)
+                        INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu));
        }
+        register_hotcpu_notifier(&remote_softirq_cpu_notifier);
        open_softirq(TASKLET_SOFTIRQ, tasklet_action);
        open_softirq(HI_SOFTIRQ, tasklet_hi_action);
 }
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index b75b492fbfcf..3953e4aed733 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -226,14 +226,15 @@ static void check_hung_uninterruptible_tasks(int this_cpu)
         * If the system crashed already then all bets are off,
         * do not report extra hung tasks:
         */
-        if ((tainted & TAINT_DIE) || did_panic)
+        if (test_taint(TAINT_DIE) || did_panic)
                return;
        read_lock(&tasklist_lock);
        do_each_thread(g, t) {
                if (!--max_count)
                        goto unlock;
-                if (t->state & TASK_UNINTERRUPTIBLE)
+                /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
+                if (t->state == TASK_UNINTERRUPTIBLE)
                        check_hung_task(t, now);
        } while_each_thread(g, t);
 unlock:
diff --git a/kernel/sys.c b/kernel/sys.c
index 038a7bc0901d..53879cdae483 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -853,38 +853,28 @@ asmlinkage long sys_setfsgid(gid_t gid)
        return old_fsgid;
 }
+void do_sys_times(struct tms *tms)
+{
+        struct task_cputime cputime;
+        cputime_t cutime, cstime;
+        spin_lock_irq(&current->sighand->siglock);
+        thread_group_cputime(current, &cputime);
+        cutime = current->signal->cutime;
+        cstime = current->signal->cstime;
+        spin_unlock_irq(&current->sighand->siglock);
+        tms->tms_utime = cputime_to_clock_t(cputime.utime);
+        tms->tms_stime = cputime_to_clock_t(cputime.stime);
+        tms->tms_cutime = cputime_to_clock_t(cutime);
+        tms->tms_cstime = cputime_to_clock_t(cstime);
+}
 asmlinkage long sys_times(struct tms __user * tbuf)
 {
-        /*
-         *      In the SMP world we might just be unlucky and have one of
-         *      the times increment as we use it. Since the value is an
-         *      atomically safe type this is just fine. Conceptually its
-         *      as if the syscall took an instant longer to occur.
-         */
        if (tbuf) {
                struct tms tmp;
-                struct task_struct *tsk = current;
-                struct task_struct *t;
+                do_sys_times(&tmp);
-                cputime_t utime, stime, cutime, cstime;
-                spin_lock_irq(&tsk->sighand->siglock);
-                utime = tsk->signal->utime;
-                stime = tsk->signal->stime;
-                t = tsk;
-                do {
-                        utime = cputime_add(utime, t->utime);
-                        stime = cputime_add(stime, t->stime);
-                        t = next_thread(t);
-                } while (t != tsk);
-                cutime = tsk->signal->cutime;
-                cstime = tsk->signal->cstime;
-                spin_unlock_irq(&tsk->sighand->siglock);
-                tmp.tms_utime = cputime_to_clock_t(utime);
-                tmp.tms_stime = cputime_to_clock_t(stime);
-                tmp.tms_cutime = cputime_to_clock_t(cutime);
-                tmp.tms_cstime = cputime_to_clock_t(cstime);
                if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
                        return -EFAULT;
        }
@@ -1060,9 +1050,7 @@ asmlinkage long sys_setsid(void)
        group_leader->signal->leader = 1;
        __set_special_pids(sid);
-        spin_lock(&group_leader->sighand->siglock);
+        proc_clear_tty(group_leader);
-        group_leader->signal->tty = NULL;
-        spin_unlock(&group_leader->sighand->siglock);
        err = session;
 out:
@@ -1351,8 +1339,10 @@ asmlinkage long sys_sethostname(char __user *name, int len)
        down_write(&uts_sem);
        errno = -EFAULT;
        if (!copy_from_user(tmp, name, len)) {
-                memcpy(utsname()->nodename, tmp, len);
+                struct new_utsname *u = utsname();
-                utsname()->nodename[len] = 0;
+                memcpy(u->nodename, tmp, len);
+                memset(u->nodename + len, 0, sizeof(u->nodename) - len);
                errno = 0;
        }
        up_write(&uts_sem);
@@ -1364,15 +1354,17 @@ asmlinkage long sys_sethostname(char __user *name, int len)
 asmlinkage long sys_gethostname(char __user *name, int len)
 {
        int i, errno;
+        struct new_utsname *u;
        if (len < 0)
                return -EINVAL;
        down_read(&uts_sem);
-        i = 1 + strlen(utsname()->nodename);
+        u = utsname();
+        i = 1 + strlen(u->nodename);
        if (i > len)
                i = len;
        errno = 0;
-        if (copy_to_user(name, utsname()->nodename, i))
+        if (copy_to_user(name, u->nodename, i))
                errno = -EFAULT;
        up_read(&uts_sem);
        return errno;
@@ -1397,8 +1389,10 @@ asmlinkage long sys_setdomainname(char __user *name, int len)
        down_write(&uts_sem);
        errno = -EFAULT;
        if (!copy_from_user(tmp, name, len)) {
-                memcpy(utsname()->domainname, tmp, len);
+                struct new_utsname *u = utsname();
-                utsname()->domainname[len] = 0;
+                memcpy(u->domainname, tmp, len);
+                memset(u->domainname + len, 0, sizeof(u->domainname) - len);
                errno = 0;
        }
        up_write(&uts_sem);
@@ -1445,21 +1439,28 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r
 asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
 {
        struct rlimit new_rlim, *old_rlim;
-        unsigned long it_prof_secs;
        int retval;
        if (resource >= RLIM_NLIMITS)
                return -EINVAL;
        if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
                return -EFAULT;
-        if (new_rlim.rlim_cur > new_rlim.rlim_max)
-                return -EINVAL;
        old_rlim = current->signal->rlim + resource;
        if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
            !capable(CAP_SYS_RESOURCE))
                return -EPERM;
-        if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open)
-                return -EPERM;
+        if (resource == RLIMIT_NOFILE) {
+                if (new_rlim.rlim_max == RLIM_INFINITY)
+                        new_rlim.rlim_max = sysctl_nr_open;
+                if (new_rlim.rlim_cur == RLIM_INFINITY)
+                        new_rlim.rlim_cur = sysctl_nr_open;
+                if (new_rlim.rlim_max > sysctl_nr_open)
+                        return -EPERM;
+        }
+        if (new_rlim.rlim_cur > new_rlim.rlim_max)
+                return -EINVAL;
        retval = security_task_setrlimit(resource, &new_rlim);
        if (retval)
@@ -1491,18 +1492,7 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
        if (new_rlim.rlim_cur == RLIM_INFINITY)
                goto out;
-        it_prof_secs = cputime_to_secs(current->signal->it_prof_expires);
+        update_rlimit_cpu(new_rlim.rlim_cur);
-        if (it_prof_secs == 0 || new_rlim.rlim_cur <= it_prof_secs) {
-                unsigned long rlim_cur = new_rlim.rlim_cur;
-                cputime_t cputime;
-                cputime = secs_to_cputime(rlim_cur);
-                read_lock(&tasklist_lock);
-                spin_lock_irq(&current->sighand->siglock);
-                set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
-                spin_unlock_irq(&current->sighand->siglock);
-                read_unlock(&tasklist_lock);
-        }
 out:
        return 0;
 }
@@ -1540,11 +1530,8 @@ out:
 *
 */
-static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r,
+static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r)
-                                     cputime_t *utimep, cputime_t *stimep)
 {
-        *utimep = cputime_add(*utimep, t->utime);
-        *stimep = cputime_add(*stimep, t->stime);
        r->ru_nvcsw += t->nvcsw;
        r->ru_nivcsw += t->nivcsw;
        r->ru_minflt += t->min_flt;
@@ -1558,12 +1545,13 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
        struct task_struct *t;
        unsigned long flags;
        cputime_t utime, stime;
+        struct task_cputime cputime;
        memset((char *) r, 0, sizeof *r);
        utime = stime = cputime_zero;
        if (who == RUSAGE_THREAD) {
-                accumulate_thread_rusage(p, r, &utime, &stime);
+                accumulate_thread_rusage(p, r);
                goto out;
        }
@@ -1586,8 +1574,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
                                break;
                case RUSAGE_SELF:
-                        utime = cputime_add(utime, p->signal->utime);
+                        thread_group_cputime(p, &cputime);
-                        stime = cputime_add(stime, p->signal->stime);
+                        utime = cputime_add(utime, cputime.utime);
+                        stime = cputime_add(stime, cputime.stime);
                        r->ru_nvcsw += p->signal->nvcsw;
                        r->ru_nivcsw += p->signal->nivcsw;
                        r->ru_minflt += p->signal->min_flt;
@@ -1596,7 +1585,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
                        r->ru_oublock += p->signal->oublock;
                        t = p;
                        do {
-                                accumulate_thread_rusage(t, r, &utime, &stime);
+                                accumulate_thread_rusage(t, r);
                                t = next_thread(t);
                        } while (t != p);
                        break;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 08d6e1bb99ac..a77b27b11b04 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -125,6 +125,12 @@ cond_syscall(sys_vm86old);
 cond_syscall(sys_vm86);
 cond_syscall(compat_sys_ipc);
 cond_syscall(compat_sys_sysctl);
+cond_syscall(sys_flock);
+cond_syscall(sys_io_setup);
+cond_syscall(sys_io_destroy);
+cond_syscall(sys_io_submit);
+cond_syscall(sys_io_cancel);
+cond_syscall(sys_io_getevents);
 /* arch-specific weak syscall entries */
 cond_syscall(sys_pciconfig_read);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index fe4713347275..617d41e4d6a0 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -80,7 +80,6 @@ extern int pid_max_min, pid_max_max;
 extern int sysctl_drop_caches;
 extern int percpu_pagelist_fraction;
 extern int compat_log;
-extern int maps_protect;
 extern int latencytop_enabled;
 extern int sysctl_nr_open_min, sysctl_nr_open_max;
 #ifdef CONFIG_RCU_TORTURE_TEST
@@ -97,7 +96,7 @@ static int sixty = 60;
 static int neg_one = -1;
 #endif
-#ifdef CONFIG_MMU
+#if defined(CONFIG_MMU) && defined(CONFIG_FILE_LOCKING)
 static int two = 2;
 #endif
@@ -118,10 +117,8 @@ extern char modprobe_path[];
 extern int sg_big_buff;
 #endif
-#ifdef __sparc__
+#ifdef CONFIG_SPARC
-extern char reboot_command [];
+#include <asm/system.h>
-extern int stop_a_enabled;
-extern int scons_pwroff;
 #endif
 #ifdef __hppa__
@@ -152,13 +149,14 @@ extern int max_lock_depth;
 #ifdef CONFIG_PROC_SYSCTL
 static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
                  void __user *buffer, size_t *lenp, loff_t *ppos);
-static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp,
+static int proc_taint(struct ctl_table *table, int write, struct file *filp,
                               void __user *buffer, size_t *lenp, loff_t *ppos);
 #endif
 static struct ctl_table root_table[];
 static struct ctl_table_root sysctl_table_root;
 static struct ctl_table_header root_table_header = {
+        .count = 1,
        .ctl_table = root_table,
        .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),
        .root = &sysctl_table_root,
@@ -381,10 +379,9 @@ static struct ctl_table kern_table[] = {
 #ifdef CONFIG_PROC_SYSCTL
        {
                .procname       = "tainted",
-                .data           = &tainted,
+                .maxlen         = sizeof(long),
-                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec_taint,
+                .proc_handler   = &proc_taint,
        },
 #endif
 #ifdef CONFIG_LATENCYTOP
@@ -414,7 +411,7 @@ static struct ctl_table kern_table[] = {
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
-#ifdef __sparc__
+#ifdef CONFIG_SPARC
        {
                .ctl_name       = KERN_SPARC_REBOOT,
                .procname       = "reboot-cmd",
@@ -809,16 +806,6 @@ static struct ctl_table kern_table[] = {
                .proc_handler   = &proc_dointvec,
        },
 #endif
-#ifdef CONFIG_PROC_FS
-        {
-                .ctl_name       = CTL_UNNUMBERED,
-                .procname       = "maps_protect",
-                .data           = &maps_protect,
-                .maxlen         = sizeof(int),
-                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
-        },
-#endif
        {
                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "poweroff_cmd",
@@ -1260,6 +1247,7 @@ static struct ctl_table fs_table[] = {
                .extra1         = &minolduid,
                .extra2         = &maxolduid,
        },
+#ifdef CONFIG_FILE_LOCKING
        {
                .ctl_name       = FS_LEASES,
                .procname       = "leases-enable",
@@ -1268,6 +1256,7 @@ static struct ctl_table fs_table[] = {
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
+#endif
 #ifdef CONFIG_DNOTIFY
        {
                .ctl_name       = FS_DIR_NOTIFY,
@@ -1279,6 +1268,7 @@ static struct ctl_table fs_table[] = {
        },
 #endif
 #ifdef CONFIG_MMU
+#ifdef CONFIG_FILE_LOCKING
        {
                .ctl_name       = FS_LEASE_TIME,
                .procname       = "lease-break-time",
@@ -1290,6 +1280,8 @@ static struct ctl_table fs_table[] = {
                .extra1         = &zero,
                .extra2         = &two,
        },
+#endif
+#ifdef CONFIG_AIO
        {
                .procname       = "aio-nr",
                .data           = &aio_nr,
@@ -1304,6 +1296,7 @@ static struct ctl_table fs_table[] = {
                .mode           = 0644,
                .proc_handler   = &proc_doulongvec_minmax,
        },
+#endif /* CONFIG_AIO */
 #ifdef CONFIG_INOTIFY_USER
        {
                .ctl_name       = FS_INOTIFY,
@@ -1509,7 +1502,6 @@ void register_sysctl_root(struct ctl_table_root *root)
 /* Perform the actual read/write of a sysctl table entry. */
 static int do_sysctl_strategy(struct ctl_table_root *root,
                        struct ctl_table *table,
-                        int __user *name, int nlen,
                        void __user *oldval, size_t __user *oldlenp,
                        void __user *newval, size_t newlen)
 {
@@ -1523,8 +1515,7 @@ static int do_sysctl_strategy(struct ctl_table_root *root,
                return -EPERM;
        if (table->strategy) {
-                rc = table->strategy(table, name, nlen, oldval, oldlenp,
+                rc = table->strategy(table, oldval, oldlenp, newval, newlen);
-                                     newval, newlen);
                if (rc < 0)
                        return rc;
                if (rc > 0)
@@ -1534,8 +1525,7 @@ static int do_sysctl_strategy(struct ctl_table_root *root,
        /* If there is no strategy routine, or if the strategy returns
         * zero, proceed with automatic r/w */
        if (table->data && table->maxlen) {
-                rc = sysctl_data(table, name, nlen, oldval, oldlenp,
+                rc = sysctl_data(table, oldval, oldlenp, newval, newlen);
-                                 newval, newlen);
                if (rc < 0)
                        return rc;
        }
@@ -1567,7 +1557,7 @@ repeat:
                                table = table->child;
                                goto repeat;
                        }
-                        error = do_sysctl_strategy(root, table, name, nlen,
+                        error = do_sysctl_strategy(root, table,
                                                   oldval, oldlenp,
                                                   newval, newlen);
                        return error;
@@ -2236,49 +2226,39 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
                            NULL,NULL);
 }
-#define OP_SET  0
-#define OP_AND  1
-#define OP_OR   2
-static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp,
-                                      int *valp,
-                                      int write, void *data)
-{
-        int op = *(int *)data;
-        if (write) {
-                int val = *negp ? -*lvalp : *lvalp;
-                switch(op) {
-                case OP_SET:    *valp = val; break;
-                case OP_AND:    *valp &= val; break;
-                case OP_OR:     *valp |= val; break;
-                }
-        } else {
-                int val = *valp;
-                if (val < 0) {
-                        *negp = -1;
-                        *lvalp = (unsigned long)-val;
-                } else {
-                        *negp = 0;
-                        *lvalp = (unsigned long)val;
-                }
-        }
-        return 0;
-}
 /*
- *      Taint values can only be increased
+ * Taint values can only be increased, never cleared.
+ * This means we can safely parse into a temporary and OR its bits in.
 */
-static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp,
+static int proc_taint(struct ctl_table *table, int write, struct file *filp,
                               void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-        int op;
+        struct ctl_table t;
+        unsigned long tmptaint = get_taint();
+        int err;
        if (write && !capable(CAP_SYS_ADMIN))
                return -EPERM;
-        op = OP_OR;
+        t = *table;
-        return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
+        t.data = &tmptaint;
-                                do_proc_dointvec_bset_conv,&op);
+        err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos);
+        if (err < 0)
+                return err;
+        if (write) {
+                /*
+                 * Poor man's atomic or. Not worth adding a primitive
+                 * to everyone's atomic.h for this
+                 */
+                int i;
+                for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) {
+                        if ((tmptaint >> i) & 1)
+                                add_taint(i);
+                }
+        }
+        return err;
 }
 struct do_proc_dointvec_minmax_conv_param {
@@ -2726,7 +2706,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
 */
 /* The generic sysctl data routine (used if no strategy routine supplied) */
-int sysctl_data(struct ctl_table *table, int __user *name, int nlen,
+int sysctl_data(struct ctl_table *table,
                void __user *oldval, size_t __user *oldlenp,
                void __user *newval, size_t newlen)
 {
@@ -2760,7 +2740,7 @@ int sysctl_data(struct ctl_table *table, int __user *name, int nlen,
 }
 /* The generic string strategy routine: */
-int sysctl_string(struct ctl_table *table, int __user *name, int nlen,
+int sysctl_string(struct ctl_table *table,
                  void __user *oldval, size_t __user *oldlenp,
                  void __user *newval, size_t newlen)
 {
@@ -2806,7 +2786,7 @@ int sysctl_string(struct ctl_table *table, int __user *name, int nlen,
 * are between the minimum and maximum values given in the arrays
 * table->extra1 and table->extra2, respectively.
 */
-int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen,
+int sysctl_intvec(struct ctl_table *table,
                void __user *oldval, size_t __user *oldlenp,
                void __user *newval, size_t newlen)
 {
@@ -2842,7 +2822,7 @@ int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen,
 }
 /* Strategy function to convert jiffies to seconds */ 
-int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen,
+int sysctl_jiffies(struct ctl_table *table,
                void __user *oldval, size_t __user *oldlenp,
                void __user *newval, size_t newlen)
 {
@@ -2876,7 +2856,7 @@ int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen,
 }
 /* Strategy function to convert jiffies to seconds */ 
-int sysctl_ms_jiffies(struct ctl_table *table, int __user *name, int nlen,
+int sysctl_ms_jiffies(struct ctl_table *table,
                void __user *oldval, size_t __user *oldlenp,
                void __user *newval, size_t newlen)
 {
@@ -2931,35 +2911,35 @@ asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
        return error;
 }
-int sysctl_data(struct ctl_table *table, int __user *name, int nlen,
+int sysctl_data(struct ctl_table *table,
                  void __user *oldval, size_t __user *oldlenp,
                  void __user *newval, size_t newlen)
 {
        return -ENOSYS;
 }
-int sysctl_string(struct ctl_table *table, int __user *name, int nlen,
+int sysctl_string(struct ctl_table *table,
                  void __user *oldval, size_t __user *oldlenp,
                  void __user *newval, size_t newlen)
 {
        return -ENOSYS;
 }
-int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen,
+int sysctl_intvec(struct ctl_table *table,
                void __user *oldval, size_t __user *oldlenp,
                void __user *newval, size_t newlen)
 {
        return -ENOSYS;
 }
-int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen,
+int sysctl_jiffies(struct ctl_table *table,
                void __user *oldval, size_t __user *oldlenp,
                void __user *newval, size_t newlen)
 {
        return -ENOSYS;
 }
-int sysctl_ms_jiffies(struct ctl_table *table, int __user *name, int nlen,
+int sysctl_ms_jiffies(struct ctl_table *table,
                void __user *oldval, size_t __user *oldlenp,
                void __user *newval, size_t newlen)
 {
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 8d53106a0a92..95ed42951e0a 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -3,7 +3,6 @@
 #
 config TICK_ONESHOT
        bool
-        default n
 config NO_HZ
        bool "Tickless System (Dynamic Ticks)"
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 3d1e3e1a1971..f8d968063cea 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -72,6 +72,16 @@ void clockevents_set_mode(struct clock_event_device *dev,
 }
 /**
+ * clockevents_shutdown - shutdown the device and clear next_event
+ * @dev:        device to shutdown
+ */
+void clockevents_shutdown(struct clock_event_device *dev)
+{
+        clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
+        dev->next_event.tv64 = KTIME_MAX;
+}
+/**
 * clockevents_program_event - Reprogram the clock event device.
 * @expires:    absolute expiry time (monotonic clock)
 *
@@ -177,7 +187,7 @@ void clockevents_register_device(struct clock_event_device *dev)
 /*
 * Noop handler when we shut down an event device
 */
-static void clockevents_handle_noop(struct clock_event_device *dev)
+void clockevents_handle_noop(struct clock_event_device *dev)
 {
 }
@@ -199,7 +209,6 @@ void clockevents_exchange_device(struct clock_event_device *old,
         * released list and do a notify add later.
         */
        if (old) {
-                old->event_handler = clockevents_handle_noop;
                clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);
                list_del(&old->list);
                list_add(&old->list, &clockevents_released);
@@ -207,7 +216,7 @@ void clockevents_exchange_device(struct clock_event_device *old,
        if (new) {
                BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED);
-                clockevents_set_mode(new, CLOCK_EVT_MODE_SHUTDOWN);
+                clockevents_shutdown(new);
        }
        local_irq_restore(flags);
 }
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 093d4acf993b..9ed2eec97526 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -325,6 +325,9 @@ int clocksource_register(struct clocksource *c)
        unsigned long flags;
        int ret;
+        /* save mult_orig on registration */
+        c->mult_orig = c->mult;
        spin_lock_irqsave(&clocksource_lock, flags);
        ret = clocksource_enqueue(c);
        if (!ret)
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 4c256fdb8875..1ca99557e929 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -61,6 +61,7 @@ struct clocksource clocksource_jiffies = {
        .read           = jiffies_read,
        .mask           = 0xffffffff, /*32bits*/
        .mult           = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
+        .mult_orig      = NSEC_PER_JIFFY << JIFFIES_SHIFT,
        .shift          = JIFFIES_SHIFT,
 };
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 5125ddd8196b..1a20715bfd6e 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -10,13 +10,13 @@
 #include <linux/mm.h>
 #include <linux/time.h>
-#include <linux/timer.h>
 #include <linux/timex.h>
 #include <linux/jiffies.h>
 #include <linux/hrtimer.h>
 #include <linux/capability.h>
 #include <linux/math64.h>
 #include <linux/clocksource.h>
+#include <linux/workqueue.h>
 #include <asm/timex.h>
 /*
@@ -218,11 +218,11 @@ void second_overflow(void)
 /* Disable the cmos update - used by virtualization and embedded */
 int no_sync_cmos_clock  __read_mostly;
-static void sync_cmos_clock(unsigned long dummy);
+static void sync_cmos_clock(struct work_struct *work);
-static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
+static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
-static void sync_cmos_clock(unsigned long dummy)
+static void sync_cmos_clock(struct work_struct *work)
 {
        struct timespec now, next;
        int fail = 1;
@@ -245,7 +245,7 @@ static void sync_cmos_clock(unsigned long dummy)
        if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2)
                fail = update_persistent_clock(now);
-        next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec;
+        next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2);
        if (next.tv_nsec <= 0)
                next.tv_nsec += NSEC_PER_SEC;
@@ -258,13 +258,13 @@ static void sync_cmos_clock(unsigned long dummy)
                next.tv_sec++;
                next.tv_nsec -= NSEC_PER_SEC;
        }
-        mod_timer(&sync_cmos_timer, jiffies + timespec_to_jiffies(&next));
+        schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next));
 }
 static void notify_cmos_timer(void)
 {
        if (!no_sync_cmos_clock)
-                mod_timer(&sync_cmos_timer, jiffies + 1);
+                schedule_delayed_work(&sync_cmos_work, 0);
 }
 #else
@@ -277,38 +277,50 @@ static inline void notify_cmos_timer(void) { }
 int do_adjtimex(struct timex *txc)
 {
        struct timespec ts;
-        long save_adjust, sec;
        int result;
-        /* In order to modify anything, you gotta be super-user! */
+        /* Validate the data before disabling interrupts */
-        if (txc->modes && !capable(CAP_SYS_TIME))
+        if (txc->modes & ADJ_ADJTIME) {
-                return -EPERM;
-        /* Now we validate the data before disabling interrupts */
-        if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) {
                /* singleshot must not be used with any other mode bits */
-                if (txc->modes & ~ADJ_OFFSET_SS_READ)
+                if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))
                        return -EINVAL;
+                if (!(txc->modes & ADJ_OFFSET_READONLY) &&
+                    !capable(CAP_SYS_TIME))
+                        return -EPERM;
+        } else {
+                /* In order to modify anything, you gotta be super-user! */
+                 if (txc->modes && !capable(CAP_SYS_TIME))
+                        return -EPERM;
+                /* if the quartz is off by more than 10% something is VERY wrong! */
+                if (txc->modes & ADJ_TICK &&
+                    (txc->tick <  900000/USER_HZ ||
+                     txc->tick > 1100000/USER_HZ))
+                                return -EINVAL;
+                if (txc->modes & ADJ_STATUS && time_state != TIME_OK)
+                        hrtimer_cancel(&leap_timer);
        }
-        /* if the quartz is off by more than 10% something is VERY wrong ! */
-        if (txc->modes & ADJ_TICK)
-                if (txc->tick <  900000/USER_HZ ||
-                    txc->tick > 1100000/USER_HZ)
-                        return -EINVAL;
-        if (time_state != TIME_OK && txc->modes & ADJ_STATUS)
-                hrtimer_cancel(&leap_timer);
        getnstimeofday(&ts);
        write_seqlock_irq(&xtime_lock);
-        /* Save for later - semantics of adjtime is to return old value */
-        save_adjust = time_adjust;
        /* If there are input parameters, then process them */
+        if (txc->modes & ADJ_ADJTIME) {
+                long save_adjust = time_adjust;
+                if (!(txc->modes & ADJ_OFFSET_READONLY)) {
+                        /* adjtime() is independent from ntp_adjtime() */
+                        time_adjust = txc->offset;
+                        ntp_update_frequency();
+                }
+                txc->offset = save_adjust;
+                goto adj_done;
+        }
        if (txc->modes) {
+                long sec;
                if (txc->modes & ADJ_STATUS) {
                        if ((time_status & STA_PLL) &&
                            !(txc->status & STA_PLL)) {
@@ -375,13 +387,8 @@ int do_adjtimex(struct timex *txc)
                if (txc->modes & ADJ_TAI && txc->constant > 0)
                        time_tai = txc->constant;
-                if (txc->modes & ADJ_OFFSET) {
+                if (txc->modes & ADJ_OFFSET)
-                        if (txc->modes == ADJ_OFFSET_SINGLESHOT)
+                        ntp_update_offset(txc->offset);
-                                /* adjtime() is independent from ntp_adjtime() */
-                                time_adjust = txc->offset;
-                        else
-                                ntp_update_offset(txc->offset);
-                }
                if (txc->modes & ADJ_TICK)
                        tick_usec = txc->tick;
@@ -389,22 +396,18 @@ int do_adjtimex(struct timex *txc)
                        ntp_update_frequency();
        }
+        txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
+                                  NTP_SCALE_SHIFT);
+        if (!(time_status & STA_NANO))
+                txc->offset /= NSEC_PER_USEC;
+adj_done:
        result = time_state;    /* mostly `TIME_OK' */
        if (time_status & (STA_UNSYNC|STA_CLOCKERR))
                result = TIME_ERROR;
-        if ((txc->modes == ADJ_OFFSET_SINGLESHOT) ||
+        txc->freq          = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) *
-            (txc->modes == ADJ_OFFSET_SS_READ))
+                                         (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT);
-                txc->offset = save_adjust;
-        else {
-                txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
-                                          NTP_SCALE_SHIFT);
-                if (!(time_status & STA_NANO))
-                        txc->offset /= NSEC_PER_USEC;
-        }
-        txc->freq          = shift_right((s32)(time_freq >> PPM_SCALE_INV_SHIFT) *
-                                         (s64)PPM_SCALE_INV,
-                                         NTP_SCALE_SHIFT);
        txc->maxerror      = time_maxerror;
        txc->esterror      = time_esterror;
        txc->status        = time_status;
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 31463d370b94..f98a1b7b16e9 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -175,6 +175,8 @@ static void tick_do_periodic_broadcast(void)
 */
 static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
 {
+        ktime_t next;
        tick_do_periodic_broadcast();
        /*
@@ -185,10 +187,13 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
        /*
         * Setup the next period for devices, which do not have
-         * periodic mode:
+         * periodic mode. We read dev->next_event first and add to it
+         * when the event already expired. clockevents_program_event()
+         * sets dev->next_event only when the event is really
+         * programmed to the device.
         */
-        for (;;) {
+        for (next = dev->next_event; ;) {
-                ktime_t next = ktime_add(dev->next_event, tick_period);
+                next = ktime_add(next, tick_period);
                if (!clockevents_program_event(dev, next, ktime_get()))
                        return;
@@ -205,7 +210,7 @@ static void tick_do_broadcast_on_off(void *why)
        struct clock_event_device *bc, *dev;
        struct tick_device *td;
        unsigned long flags, *reason = why;
-        int cpu;
+        int cpu, bc_stopped;
        spin_lock_irqsave(&tick_broadcast_lock, flags);
@@ -223,14 +228,16 @@ static void tick_do_broadcast_on_off(void *why)
        if (!tick_device_is_functional(dev))
                goto out;
+        bc_stopped = cpus_empty(tick_broadcast_mask);
        switch (*reason) {
        case CLOCK_EVT_NOTIFY_BROADCAST_ON:
        case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
                if (!cpu_isset(cpu, tick_broadcast_mask)) {
                        cpu_set(cpu, tick_broadcast_mask);
-                        if (td->mode == TICKDEV_MODE_PERIODIC)
+                        if (tick_broadcast_device.mode ==
-                                clockevents_set_mode(dev,
+                            TICKDEV_MODE_PERIODIC)
-                                                     CLOCK_EVT_MODE_SHUTDOWN);
+                                clockevents_shutdown(dev);
                }
                if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
                        tick_broadcast_force = 1;
@@ -239,15 +246,17 @@ static void tick_do_broadcast_on_off(void *why)
                if (!tick_broadcast_force &&
                    cpu_isset(cpu, tick_broadcast_mask)) {
                        cpu_clear(cpu, tick_broadcast_mask);
-                        if (td->mode == TICKDEV_MODE_PERIODIC)
+                        if (tick_broadcast_device.mode ==
+                            TICKDEV_MODE_PERIODIC)
                                tick_setup_periodic(dev, 0);
                }
                break;
        }
-        if (cpus_empty(tick_broadcast_mask))
+        if (cpus_empty(tick_broadcast_mask)) {
-                clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
+                if (!bc_stopped)
-        else {
+                        clockevents_shutdown(bc);
+        } else if (bc_stopped) {
                if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
                        tick_broadcast_start_periodic(bc);
                else
@@ -298,7 +307,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
        if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
                if (bc && cpus_empty(tick_broadcast_mask))
-                        clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
+                        clockevents_shutdown(bc);
        }
        spin_unlock_irqrestore(&tick_broadcast_lock, flags);
@@ -313,7 +322,7 @@ void tick_suspend_broadcast(void)
        bc = tick_broadcast_device.evtdev;
        if (bc)
-                clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
+                clockevents_shutdown(bc);
        spin_unlock_irqrestore(&tick_broadcast_lock, flags);
 }
@@ -364,16 +373,8 @@ cpumask_t *tick_get_broadcast_oneshot_mask(void)
 static int tick_broadcast_set_event(ktime_t expires, int force)
 {
        struct clock_event_device *bc = tick_broadcast_device.evtdev;
-        ktime_t now = ktime_get();
-        int res;
+        return tick_dev_program_event(bc, expires, force);
-        for(;;) {
-                res = clockevents_program_event(bc, expires, now);
-                if (!res || !force)
-                        return res;
-                now = ktime_get();
-                expires = ktime_add(now, ktime_set(0, bc->min_delta_ns));
-        }
 }
 int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
@@ -383,6 +384,19 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
 }
 /*
+ * Called from irq_enter() when idle was interrupted to reenable the
+ * per cpu device.
+ */
+void tick_check_oneshot_broadcast(int cpu)
+{
+        if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) {
+                struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
+                clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT);
+        }
+}
+/*
 * Handle oneshot mode broadcasting
 */
 static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
@@ -491,14 +505,52 @@ static void tick_broadcast_clear_oneshot(int cpu)
        cpu_clear(cpu, tick_broadcast_oneshot_mask);
 }
+static void tick_broadcast_init_next_event(cpumask_t *mask, ktime_t expires)
+{
+        struct tick_device *td;
+        int cpu;
+        for_each_cpu_mask_nr(cpu, *mask) {
+                td = &per_cpu(tick_cpu_device, cpu);
+                if (td->evtdev)
+                        td->evtdev->next_event = expires;
+        }
+}
 /**
 * tick_broadcast_setup_oneshot - setup the broadcast device
 */
 void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
 {
-        bc->event_handler = tick_handle_oneshot_broadcast;
+        /* Set it up only once ! */
-        clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
+        if (bc->event_handler != tick_handle_oneshot_broadcast) {
-        bc->next_event.tv64 = KTIME_MAX;
+                int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
+                int cpu = smp_processor_id();
+                cpumask_t mask;
+                bc->event_handler = tick_handle_oneshot_broadcast;
+                clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
+                /* Take the do_timer update */
+                tick_do_timer_cpu = cpu;
+                /*
+                 * We must be careful here. There might be other CPUs
+                 * waiting for periodic broadcast. We need to set the
+                 * oneshot_mask bits for those and program the
+                 * broadcast device to fire.
+                 */
+                mask = tick_broadcast_mask;
+                cpu_clear(cpu, mask);
+                cpus_or(tick_broadcast_oneshot_mask,
+                        tick_broadcast_oneshot_mask, mask);
+                if (was_periodic && !cpus_empty(mask)) {
+                        tick_broadcast_init_next_event(&mask, tick_next_period);
+                        tick_broadcast_set_event(tick_next_period, 1);
+                } else
+                        bc->next_event.tv64 = KTIME_MAX;
+        }
 }
 /*
@@ -538,4 +590,12 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
        spin_unlock_irqrestore(&tick_broadcast_lock, flags);
 }
+/*
+ * Check whether the broadcast device is in oneshot mode
+ */
+int tick_broadcast_oneshot_active(void)
+{
+        return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT;
+}
 #endif
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 80c4336f4188..df12434b43ca 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -33,7 +33,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
 */
 ktime_t tick_next_period;
 ktime_t tick_period;
-int tick_do_timer_cpu __read_mostly = -1;
+int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
 DEFINE_SPINLOCK(tick_device_lock);
 /*
@@ -109,7 +109,8 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
        if (!tick_device_is_functional(dev))
                return;
-        if (dev->features & CLOCK_EVT_FEAT_PERIODIC) {
+        if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
+            !tick_broadcast_oneshot_active()) {
                clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC);
        } else {
                unsigned long seq;
@@ -148,7 +149,7 @@ static void tick_setup_device(struct tick_device *td,
                 * If no cpu took the do_timer update, assign it to
                 * this cpu:
                 */
-                if (tick_do_timer_cpu == -1) {
+                if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
                        tick_do_timer_cpu = cpu;
                        tick_next_period = ktime_get();
                        tick_period = ktime_set(0, NSEC_PER_SEC / HZ);
@@ -161,6 +162,7 @@ static void tick_setup_device(struct tick_device *td,
        } else {
                handler = td->evtdev->event_handler;
                next_event = td->evtdev->next_event;
+                td->evtdev->event_handler = clockevents_handle_noop;
        }
        td->evtdev = newdev;
@@ -248,7 +250,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
         * not give it back to the clockevents layer !
         */
        if (tick_is_broadcast_device(curdev)) {
-                clockevents_set_mode(curdev, CLOCK_EVT_MODE_SHUTDOWN);
+                clockevents_shutdown(curdev);
                curdev = NULL;
        }
        clockevents_exchange_device(curdev, newdev);
@@ -299,7 +301,8 @@ static void tick_shutdown(unsigned int *cpup)
        if (*cpup == tick_do_timer_cpu) {
                int cpu = first_cpu(cpu_online_map);
-                tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : -1;
+                tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu :
+                        TICK_DO_TIMER_NONE;
        }
        spin_unlock_irqrestore(&tick_device_lock, flags);
 }
@@ -310,7 +313,7 @@ static void tick_suspend(void)
        unsigned long flags;
        spin_lock_irqsave(&tick_device_lock, flags);
-        clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_SHUTDOWN);
+        clockevents_shutdown(td->evtdev);
        spin_unlock_irqrestore(&tick_device_lock, flags);
 }
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index f13f2b7f4fd4..b1c05bf75ee0 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -1,6 +1,10 @@
 /*
 * tick internal variable and functions used by low/high res code
 */
+#define TICK_DO_TIMER_NONE      -1
+#define TICK_DO_TIMER_BOOT      -2
 DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
 extern spinlock_t tick_device_lock;
 extern ktime_t tick_next_period;
@@ -10,6 +14,8 @@ extern int tick_do_timer_cpu __read_mostly;
 extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
 extern void tick_handle_periodic(struct clock_event_device *dev);
+extern void clockevents_shutdown(struct clock_event_device *dev);
 /*
 * NO_HZ / high resolution timer shared code
 */
@@ -17,6 +23,8 @@ extern void tick_handle_periodic(struct clock_event_device *dev);
 extern void tick_setup_oneshot(struct clock_event_device *newdev,
                               void (*handler)(struct clock_event_device *),
                               ktime_t nextevt);
+extern int tick_dev_program_event(struct clock_event_device *dev,
+                                  ktime_t expires, int force);
 extern int tick_program_event(ktime_t expires, int force);
 extern void tick_oneshot_notify(void);
 extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
@@ -27,6 +35,8 @@ extern void tick_broadcast_oneshot_control(unsigned long reason);
 extern void tick_broadcast_switch_to_oneshot(void);
 extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
 extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
+extern int tick_broadcast_oneshot_active(void);
+extern void tick_check_oneshot_broadcast(int cpu);
 # else /* BROADCAST */
 static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
 {
@@ -35,6 +45,8 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
 static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
 static inline void tick_broadcast_switch_to_oneshot(void) { }
 static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
+static inline int tick_broadcast_oneshot_active(void) { return 0; }
+static inline void tick_check_oneshot_broadcast(int cpu) { }
 # endif /* !BROADCAST */
 #else /* !ONESHOT */
@@ -64,6 +76,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
 {
        return 0;
 }
+static inline int tick_broadcast_oneshot_active(void) { return 0; }
 #endif /* !TICK_ONESHOT */
 /*
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 450c04935b66..2e8de678e767 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -23,24 +23,56 @@
 #include "tick-internal.h"
 /**
- * tick_program_event
+ * tick_dev_program_event - program a clock event device (internal worker)
+ * @dev:     clock event device to program
+ * @expires: requested expiry time
+ * @force:   zero: give up after the first attempt; nonzero: keep retrying
+ *           with an expiry of now + dev->min_delta_ns until it succeeds
+ *
+ * Returns the result of clockevents_program_event(): 0 once the device
+ * was programmed, or the nonzero result of the first attempt if !@force.
 */
-int tick_program_event(ktime_t expires, int force)
+int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
+                           int force)
 {
-        struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
         ktime_t now = ktime_get();
+        int i;
-        while (1) {
+        for (i = 0;;) {
                 int ret = clockevents_program_event(dev, expires, now);
                 if (!ret || !force)
                         return ret;
+                /*
+                 * Every third consecutive failure with the current
+                 * min_delta_ns, grow it by 50% (or start at 5000 if it
+                 * was zero), emit a warning and restart the count.
+                 */
+                if (++i > 2) {
+                        /* Increase the min. delta and try again */
+                        if (!dev->min_delta_ns)
+                                dev->min_delta_ns = 5000;
+                        else
+                                dev->min_delta_ns += dev->min_delta_ns >> 1;
+                        /* Report the value that is actually in effect now */
+                        printk(KERN_WARNING
+                               "CE: %s increasing min_delta_ns to %lu nsec\n",
+                               dev->name ? dev->name : "?",
+                               dev->min_delta_ns);
+                        i = 0;
+                }
                 now = ktime_get();
-                expires = ktime_add(now, ktime_set(0, dev->min_delta_ns));
+                expires = ktime_add_ns(now, dev->min_delta_ns);
         }
 }
 /**
+ * tick_program_event - program the tick device of the current CPU
+ * @expires: requested expiry time, handed on unchanged
+ * @force:   handed on unchanged to tick_dev_program_event()
+ *
+ * Returns whatever tick_dev_program_event() returns.
+ */
+int tick_program_event(ktime_t expires, int force)
+{
+        return tick_dev_program_event(__get_cpu_var(tick_cpu_device).evtdev,
+                                      expires, force);
+}
+/**
 * tick_resume_onshot - resume oneshot mode
 */
 void tick_resume_oneshot(void)
@@ -61,7 +93,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
 {
        newdev->event_handler = handler;
        clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT);
-        clockevents_program_event(newdev, next_event, ktime_get());
+        tick_dev_program_event(newdev, next_event, 1);
 }
 /**
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f5da526424a9..0581c11fe6c6 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -20,6 +20,7 @@
 #include <linux/profile.h>
 #include <linux/sched.h>
 #include <linux/tick.h>
+#include <linux/module.h>
 #include <asm/irq_regs.h>
@@ -75,6 +76,9 @@ static void tick_do_update_jiffies64(ktime_t now)
                                                           incr * ticks);
                }
                do_timer(++ticks);
+                /* Keep the tick_next_period variable up to date */
+                tick_next_period = ktime_add(last_jiffies_update, tick_period);
        }
        write_sequnlock(&xtime_lock);
 }
@@ -151,7 +155,7 @@ void tick_nohz_update_jiffies(void)
        touch_softlockup_watchdog();
 }
-void tick_nohz_stop_idle(int cpu)
+static void tick_nohz_stop_idle(int cpu)
 {
        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
@@ -162,6 +166,8 @@ void tick_nohz_stop_idle(int cpu)
                ts->idle_lastupdate = now;
                ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
                ts->idle_active = 0;
+                sched_clock_idle_wakeup_event(0);
        }
 }
@@ -177,6 +183,7 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
        }
        ts->idle_entrytime = now;
        ts->idle_active = 1;
+        sched_clock_idle_sleep_event();
        return now;
 }
@@ -184,9 +191,17 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
 {
        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
-        *last_update_time = ktime_to_us(ts->idle_lastupdate);
+        if (!tick_nohz_enabled)
+                return -1;
+        if (ts->idle_active)
+                *last_update_time = ktime_to_us(ts->idle_lastupdate);
+        else
+                *last_update_time = ktime_to_us(ktime_get());
        return ktime_to_us(ts->idle_sleeptime);
 }
+EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
 /**
 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
@@ -218,7 +233,7 @@ void tick_nohz_stop_sched_tick(int inidle)
         */
        if (unlikely(!cpu_online(cpu))) {
                if (cpu == tick_do_timer_cpu)
-                        tick_do_timer_cpu = -1;
+                        tick_do_timer_cpu = TICK_DO_TIMER_NONE;
        }
        if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
@@ -255,7 +270,7 @@ void tick_nohz_stop_sched_tick(int inidle)
        next_jiffies = get_next_timer_interrupt(last_jiffies);
        delta_jiffies = next_jiffies - last_jiffies;
-        if (rcu_needs_cpu(cpu))
+        if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu))
                delta_jiffies = 1;
        /*
         * Do not stop the tick, if we are only one off
@@ -300,7 +315,7 @@ void tick_nohz_stop_sched_tick(int inidle)
                 * invoked.
                 */
                if (cpu == tick_do_timer_cpu)
-                        tick_do_timer_cpu = -1;
+                        tick_do_timer_cpu = TICK_DO_TIMER_NONE;
                ts->idle_sleeps++;
@@ -362,6 +377,32 @@ ktime_t tick_nohz_get_sleep_length(void)
        return ts->sleep_length;
 }
+/*
+ * Re-arm ts->sched_timer, starting from the expiry saved in ts->idle_tick
+ * and forwarded relative to @now in tick_period steps.
+ *
+ * In NOHZ_MODE_HIGHRES the hrtimer is started directly; in the other modes
+ * the tick device is programmed via tick_program_event(). If the expiry
+ * could not be armed, jiffies are updated, the time is re-read and the
+ * forward/arm step is repeated.
+ */
+static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
+{
+        hrtimer_cancel(&ts->sched_timer);
+        ts->sched_timer.expires = ts->idle_tick;
+        while (1) {
+                /* Forward the time to expire in the future */
+                hrtimer_forward(&ts->sched_timer, now, tick_period);
+                if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
+                        hrtimer_start(&ts->sched_timer,
+                                      ts->sched_timer.expires,
+                                      HRTIMER_MODE_ABS);
+                        /* Check, if the timer was already in the past */
+                        if (hrtimer_active(&ts->sched_timer))
+                                break;
+                } else {
+                        /* A zero return means the event was programmed */
+                        if (!tick_program_event(ts->sched_timer.expires, 0))
+                                break;
+                }
+                /* Update jiffies and reread time */
+                tick_do_update_jiffies64(now);
+                now = ktime_get();
+        }
+}
 /**
 * tick_nohz_restart_sched_tick - restart the idle tick from the idle task
 *
@@ -415,28 +456,7 @@ void tick_nohz_restart_sched_tick(void)
         */
        ts->tick_stopped  = 0;
        ts->idle_exittime = now;
-        hrtimer_cancel(&ts->sched_timer);
+        tick_nohz_restart(ts, now);
-        ts->sched_timer.expires = ts->idle_tick;
-        while (1) {
-                /* Forward the time to expire in the future */
-                hrtimer_forward(&ts->sched_timer, now, tick_period);
-                if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
-                        hrtimer_start(&ts->sched_timer,
-                                      ts->sched_timer.expires,
-                                      HRTIMER_MODE_ABS);
-                        /* Check, if the timer was already in the past */
-                        if (hrtimer_active(&ts->sched_timer))
-                                break;
-                } else {
-                        if (!tick_program_event(ts->sched_timer.expires, 0))
-                                break;
-                }
-                /* Update jiffies and reread time */
-                tick_do_update_jiffies64(now);
-                now = ktime_get();
-        }
        local_irq_enable();
 }
@@ -465,7 +485,7 @@ static void tick_nohz_handler(struct clock_event_device *dev)
         * this duty, then the jiffies update is still serialized by
         * xtime_lock.
         */
-        if (unlikely(tick_do_timer_cpu == -1))
+        if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
                tick_do_timer_cpu = cpu;
        /* Check, if the jiffies need an update */
@@ -488,10 +508,6 @@ static void tick_nohz_handler(struct clock_event_device *dev)
        update_process_times(user_mode(regs));
        profile_tick(CPU_PROFILING);
-        /* Do not restart, when we are in the idle loop */
-        if (ts->tick_stopped)
-                return;
        while (tick_nohz_reprogram(ts, now)) {
                now = ktime_get();
                tick_do_update_jiffies64(now);
@@ -537,6 +553,27 @@ static void tick_nohz_switch_to_nohz(void)
               smp_processor_id());
 }
+/*
+ * With NOHZ enabled and the tick stopped, irq_enter() has to kick the
+ * tick timer so that jiffies keep being updated while long running
+ * softirqs execute. This is ugly; the offending softirq is what really
+ * needs fixing, but correctness comes first.
+ *
+ * Unlike the full restart done when idle is left, this only re-arms the
+ * timer through tick_nohz_restart() and leaves the remaining idle-exit
+ * bookkeeping alone.
+ */
+static void tick_nohz_kick_tick(int cpu)
+{
+        struct tick_sched *sched = &per_cpu(tick_cpu_sched, cpu);
+        /* Only a stopped tick needs to be kicked */
+        if (sched->tick_stopped)
+                tick_nohz_restart(sched, ktime_get());
+}
 #else
 static inline void tick_nohz_switch_to_nohz(void) { }
@@ -544,6 +581,19 @@ static inline void tick_nohz_switch_to_nohz(void) { }
 #endif /* NO_HZ */
 /*
+ * Called from irq_enter to notify about the possible interruption of idle()
+ *
+ * @cpu is handed to the per-cpu helpers below. The oneshot broadcast
+ * check is always performed; the remaining steps exist only when the
+ * kernel is built with CONFIG_NO_HZ.
+ */
+void tick_check_idle(int cpu)
+{
+        tick_check_oneshot_broadcast(cpu);
+#ifdef CONFIG_NO_HZ
+        /* close the idle accounting period of this cpu, if one is active */
+        tick_nohz_stop_idle(cpu);
+        tick_nohz_update_jiffies();
+        /* re-arm the tick timer in case the tick is currently stopped */
+        tick_nohz_kick_tick(cpu);
+#endif
+}
+/*
 * High resolution timer specific code
 */
 #ifdef CONFIG_HIGH_RES_TIMERS
@@ -567,7 +617,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
         * this duty, then the jiffies update is still serialized by
         * xtime_lock.
         */
-        if (unlikely(tick_do_timer_cpu == -1))
+        if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
                tick_do_timer_cpu = cpu;
 #endif
@@ -596,10 +646,6 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
                profile_tick(CPU_PROFILING);
        }
-        /* Do not restart, when we are in the idle loop */
-        if (ts->tick_stopped)
-                return HRTIMER_NORESTART;
        hrtimer_forward(timer, now, tick_period);
        return HRTIMER_RESTART;
@@ -619,7 +665,7 @@ void tick_setup_sched_timer(void)
         */
        hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
        ts->sched_timer.function = tick_sched_timer;
-        ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+        ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
        /* Get the next period (per cpu) */
        ts->sched_timer.expires = tick_init_jiffy_update();
@@ -643,17 +689,21 @@ void tick_setup_sched_timer(void)
                ts->nohz_mode = NOHZ_MODE_HIGHRES;
 #endif
 }
+#endif /* HIGH_RES_TIMERS */
+#if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS
+/*
+ * Mark the tick_sched of @cpu as NOHZ_MODE_INACTIVE. When built with
+ * CONFIG_HIGH_RES_TIMERS, its sched_timer is cancelled first, provided
+ * the timer has a base assigned.
+ */
 void tick_cancel_sched_timer(int cpu)
 {
         struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+# ifdef CONFIG_HIGH_RES_TIMERS
         if (ts->sched_timer.base)
                 hrtimer_cancel(&ts->sched_timer);
+# endif
         ts->nohz_mode = NOHZ_MODE_INACTIVE;
 }
-#endif /* HIGH_RES_TIMERS */
+#endif
 /**
 * Async notification about clocksource changes
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e91c29f961c9..e7acfb482a68 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -58,27 +58,26 @@ struct clocksource *clock;
 #ifdef CONFIG_GENERIC_TIME
 /**
- * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook
+ * clocksource_forward_now - update clock to the current time
 *
- * private function, must hold xtime_lock lock when being
+ * Forward the current clock to update its state since the last call to
- * called. Returns the number of nanoseconds since the
+ * update_wall_time(). This is useful before significant clock changes,
- * last call to update_wall_time() (adjusted by NTP scaling)
+ * as it avoids having to deal with this time offset explicitly.
+ *
+ * Writes clock->cycle_last, xtime and clock->raw_time. The callers in
+ * do_settimeofday() and timekeeping_suspend() invoke it with xtime_lock
+ * held for writing.
 */
-static inline s64 __get_nsec_offset(void)
+static void clocksource_forward_now(void)
 {
         cycle_t cycle_now, cycle_delta;
-        s64 ns_offset;
+        s64 nsec;
-        /* read clocksource: */
         cycle_now = clocksource_read(clock);
-        /* calculate the delta since the last update_wall_time: */
         cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+        /* the elapsed cycles are accounted below, so restart from now */
+        clock->cycle_last = cycle_now;
-        /* convert to nanoseconds: */
+        /* add the elapsed interval, converted by cyc2ns(), to xtime */
+        nsec = cyc2ns(clock, cycle_delta);
-        ns_offset = cyc2ns(clock, cycle_delta);
+        timespec_add_ns(&xtime, nsec);
-        return ns_offset;
+        /* account the same interval to raw_time, scaled with mult_orig */
+        nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
+        clock->raw_time.tv_nsec += nsec;
 }
 /**
@@ -89,6 +88,7 @@ static inline s64 __get_nsec_offset(void)
 */
 void getnstimeofday(struct timespec *ts)
 {
+        cycle_t cycle_now, cycle_delta;
        unsigned long seq;
        s64 nsecs;
@@ -96,7 +96,15 @@ void getnstimeofday(struct timespec *ts)
                seq = read_seqbegin(&xtime_lock);
                *ts = xtime;
-                nsecs = __get_nsec_offset();
+                /* read clocksource: */
+                cycle_now = clocksource_read(clock);
+                /* calculate the delta since the last update_wall_time: */
+                cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+                /* convert to nanoseconds: */
+                nsecs = cyc2ns(clock, cycle_delta);
        } while (read_seqretry(&xtime_lock, seq));
@@ -129,22 +137,22 @@ EXPORT_SYMBOL(do_gettimeofday);
 */
 int do_settimeofday(struct timespec *tv)
 {
+        struct timespec ts_delta;
        unsigned long flags;
-        time_t wtm_sec, sec = tv->tv_sec;
-        long wtm_nsec, nsec = tv->tv_nsec;
        if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
                return -EINVAL;
        write_seqlock_irqsave(&xtime_lock, flags);
-        nsec -= __get_nsec_offset();
+        clocksource_forward_now();
+        ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec;
+        ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec;
+        wall_to_monotonic = timespec_sub(wall_to_monotonic, ts_delta);
-        wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
+        xtime = *tv;
-        wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
-        set_normalized_timespec(&xtime, sec, nsec);
-        set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
        update_xtime_cache(0);
        clock->error = 0;
@@ -170,22 +178,19 @@ EXPORT_SYMBOL(do_settimeofday);
 static void change_clocksource(void)
 {
        struct clocksource *new;
-        cycle_t now;
-        u64 nsec;
        new = clocksource_get_next();
        if (clock == new)
                return;
-        new->cycle_last = 0;
+        clocksource_forward_now();
-        now = clocksource_read(new);
-        nsec =  __get_nsec_offset();
-        timespec_add_ns(&xtime, nsec);
-        clock = new;
+        new->raw_time = clock->raw_time;
-        clock->cycle_last = now;
+        clock = new;
+        clock->cycle_last = 0;
+        clock->cycle_last = clocksource_read(new);
        clock->error = 0;
        clock->xtime_nsec = 0;
        clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
@@ -200,11 +205,44 @@ static void change_clocksource(void)
         */
 }
 #else
+static inline void clocksource_forward_now(void) { }
 static inline void change_clocksource(void) { }
-static inline s64 __get_nsec_offset(void) { return 0; }
 #endif
 /**
+ * getrawmonotonic - Returns the raw monotonic time in a timespec
+ * @ts:         pointer to the timespec to be set
+ *
+ * Returns the raw monotonic time (completely un-modified by ntp)
+ *
+ * The result is clock->raw_time plus the cycles elapsed since
+ * clock->cycle_last, scaled with clock->mult_orig. Both parts are
+ * sampled inside an xtime_lock read section that is retried until it
+ * completes without a concurrent writer.
+ */
+void getrawmonotonic(struct timespec *ts)
+{
+        unsigned long seq;
+        s64 nsecs;
+        cycle_t cycle_now, cycle_delta;
+        do {
+                seq = read_seqbegin(&xtime_lock);
+                /* read clocksource: */
+                cycle_now = clocksource_read(clock);
+                /* calculate the delta since the last update_wall_time: */
+                cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+                /* convert to nanoseconds: */
+                nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
+                /* snapshot the accumulated raw time in the same section */
+                *ts = clock->raw_time;
+        } while (read_seqretry(&xtime_lock, seq));
+        /* combine the two values once a consistent pair has been read */
+        timespec_add_ns(ts, nsecs);
+}
+EXPORT_SYMBOL(getrawmonotonic);
+/**
 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
 */
 int timekeeping_valid_for_hres(void)
@@ -265,8 +303,6 @@ void __init timekeeping_init(void)
 static int timekeeping_suspended;
 /* time in seconds when suspend began */
 static unsigned long timekeeping_suspend_time;
-/* xtime offset when we went into suspend */
-static s64 timekeeping_suspend_nsecs;
 /**
 * timekeeping_resume - Resumes the generic timekeeping subsystem.
@@ -292,8 +328,6 @@ static int timekeeping_resume(struct sys_device *dev)
                wall_to_monotonic.tv_sec -= sleep_length;
                total_sleep_time += sleep_length;
        }
-        /* Make sure that we have the correct xtime reference */
-        timespec_add_ns(&xtime, timekeeping_suspend_nsecs);
        update_xtime_cache(0);
        /* re-base the last cycle value */
        clock->cycle_last = 0;
@@ -319,8 +353,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
        timekeeping_suspend_time = read_persistent_clock();
        write_seqlock_irqsave(&xtime_lock, flags);
-        /* Get the current xtime offset */
+        clocksource_forward_now();
-        timekeeping_suspend_nsecs = __get_nsec_offset();
        timekeeping_suspended = 1;
        write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -454,23 +487,29 @@ void update_wall_time(void)
 #else
        offset = clock->cycle_interval;
 #endif
-        clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
+        clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift;
        /* normally this loop will run just once, however in the
         * case of lost or late ticks, it will accumulate correctly.
         */
        while (offset >= clock->cycle_interval) {
                /* accumulate one interval */
-                clock->xtime_nsec += clock->xtime_interval;
-                clock->cycle_last += clock->cycle_interval;
                offset -= clock->cycle_interval;
+                clock->cycle_last += clock->cycle_interval;
+                clock->xtime_nsec += clock->xtime_interval;
                if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
                        clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
                        xtime.tv_sec++;
                        second_overflow();
                }
+                clock->raw_time.tv_nsec += clock->raw_interval;
+                if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) {
+                        clock->raw_time.tv_nsec -= NSEC_PER_SEC;
+                        clock->raw_time.tv_sec++;
+                }
                /* accumulate error between NTP and clock interval */
                clock->error += tick_length;
                clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift);
@@ -479,9 +518,12 @@ void update_wall_time(void)
        /* correct the clock when NTP error is too big */
        clocksource_adjust(offset);
-        /* store full nanoseconds into xtime */
+        /* store full nanoseconds into xtime after rounding it up and
-        xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift;
+         * add the remainder to the error difference.
+         */
+        xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1;
        clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
+        clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift);
        update_xtime_cache(cyc2ns(clock, offset));
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index a40e20fd0001..f6426911e35a 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -47,13 +47,14 @@ static void print_name_offset(struct seq_file *m, void *sym)
 }
 static void
-print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now)
+print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
+            int idx, u64 now)
 {
 #ifdef CONFIG_TIMER_STATS
        char tmp[TASK_COMM_LEN + 1];
 #endif
        SEQ_printf(m, " #%d: ", idx);
-        print_name_offset(m, timer);
+        print_name_offset(m, taddr);
        SEQ_printf(m, ", ");
        print_name_offset(m, timer->function);
        SEQ_printf(m, ", S:%02lx", timer->state);
@@ -99,7 +100,7 @@ next_one:
                tmp = *timer;
                spin_unlock_irqrestore(&base->cpu_base->lock, flags);
-                print_timer(m, &tmp, i, now);
+                print_timer(m, timer, &tmp, i, now);
                next++;
                goto next_one;
        }
@@ -109,6 +110,7 @@ next_one:
 static void
 print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
 {
+        SEQ_printf(m, "  .base:       %p\n", base);
        SEQ_printf(m, "  .index:      %d\n",
                        base->index);
        SEQ_printf(m, "  .resolution: %Lu nsecs\n",
@@ -183,12 +185,16 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
 #ifdef CONFIG_GENERIC_CLOCKEVENTS
 static void
-print_tickdevice(struct seq_file *m, struct tick_device *td)
+print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
 {
        struct clock_event_device *dev = td->evtdev;
        SEQ_printf(m, "\n");
        SEQ_printf(m, "Tick Device: mode:     %d\n", td->mode);
+        if (cpu < 0)
+                SEQ_printf(m, "Broadcast device\n");
+        else
+                SEQ_printf(m, "Per CPU device: %d\n", cpu);
        SEQ_printf(m, "Clock Event Device: ");
        if (!dev) {
@@ -222,7 +228,7 @@ static void timer_list_show_tickdevices(struct seq_file *m)
        int cpu;
 #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
-        print_tickdevice(m, tick_get_broadcast_device());
+        print_tickdevice(m, tick_get_broadcast_device(), -1);
        SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
                   tick_get_broadcast_mask()->bits[0]);
 #ifdef CONFIG_TICK_ONESHOT
@@ -232,7 +238,7 @@ static void timer_list_show_tickdevices(struct seq_file *m)
        SEQ_printf(m, "\n");
 #endif
        for_each_online_cpu(cpu)
-                   print_tickdevice(m, tick_get_device(cpu));
+                print_tickdevice(m, tick_get_device(cpu), cpu);
        SEQ_printf(m, "\n");
 }
 #else
@@ -244,7 +250,7 @@ static int timer_list_show(struct seq_file *m, void *v)
        u64 now = ktime_to_ns(ktime_get());
        int cpu;
-        SEQ_printf(m, "Timer List Version: v0.3\n");
+        SEQ_printf(m, "Timer List Version: v0.4\n");
        SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
        SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
diff --git a/kernel/timer.c b/kernel/timer.c
index e8019cc3418d..56becf373c58 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -978,6 +978,7 @@ void update_process_times(int user_tick)
        run_local_timers();
        if (rcu_pending(cpu))
                rcu_check_callbacks(cpu, user_tick);
+        printk_tick();
        scheduler_tick();
        run_posix_cpu_timers(p);
 }
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index bb948e52ce20..db58fb66a135 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -202,7 +202,7 @@ static void start_stack_timer(int cpu)
        hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        hrtimer->function = stack_trace_timer_fn;
-        hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+        hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
        hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
 }
diff --git a/kernel/user.c b/kernel/user.c
index 865ecf57a096..39d6159fae43 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -169,7 +169,7 @@ static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
 {
        struct user_struct *up = container_of(kobj, struct user_struct, kobj);
-        return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg));
+        return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg));
 }
 static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
@@ -180,7 +180,7 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
        unsigned long rt_runtime;
        int rc;
-        sscanf(buf, "%lu", &rt_runtime);
+        sscanf(buf, "%ld", &rt_runtime);
        rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index a9ab0596de44..532858fa5b88 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -6,7 +6,6 @@
 */
 #include <linux/module.h>
-#include <linux/version.h>
 #include <linux/nsproxy.h>
 #include <linux/slab.h>
 #include <linux/user_namespace.h>
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 64d398f12444..815237a55af8 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -12,7 +12,6 @@
 #include <linux/module.h>
 #include <linux/uts.h>
 #include <linux/utsname.h>
-#include <linux/version.h>
 #include <linux/err.h>
 #include <linux/slab.h>
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index fe3a56c2256d..3b34b3545936 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -12,7 +12,6 @@
 #include <linux/module.h>
 #include <linux/uts.h>
 #include <linux/utsname.h>
-#include <linux/version.h>
 #include <linux/sysctl.h>
 static void *get_uts(ctl_table *table, int write)
@@ -61,7 +60,7 @@ static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
 #ifdef CONFIG_SYSCTL_SYSCALL
 /* The generic string strategy routine: */
-static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
+static int sysctl_uts_string(ctl_table *table,
                  void __user *oldval, size_t __user *oldlenp,
                  void __user *newval, size_t newlen)
 {
@@ -70,8 +69,7 @@ static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
        write = newval && newlen;
        memcpy(&uts_table, table, sizeof(uts_table));
        uts_table.data = get_uts(table, write);
-        r = sysctl_string(&uts_table, name, nlen,
+        r = sysctl_string(&uts_table, oldval, oldlenp, newval, newlen);
-                oldval, oldlenp, newval, newlen);
        put_uts(table, write, uts_table.data);
        return r;
 }
diff --git a/kernel/wait.c b/kernel/wait.c
index c275c56cf2d3..cd87131f2fc2 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -72,12 +72,7 @@ prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
        spin_lock_irqsave(&q->lock, flags);
        if (list_empty(&wait->task_list))
                __add_wait_queue(q, wait);
-        /*
+        set_current_state(state);
-         * don't alter the task state if this is just going to
-         * queue an async wait queue callback
-         */
-        if (is_sync_wait(wait))
-                set_current_state(state);
        spin_unlock_irqrestore(&q->lock, flags);
 }
 EXPORT_SYMBOL(prepare_to_wait);
@@ -91,12 +86,7 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
        spin_lock_irqsave(&q->lock, flags);
        if (list_empty(&wait->task_list))
                __add_wait_queue_tail(q, wait);
-        /*
+        set_current_state(state);
-         * don't alter the task state if this is just going to
-         * queue an async wait queue callback
-         */
-        if (is_sync_wait(wait))
-                set_current_state(state);
        spin_unlock_irqrestore(&q->lock, flags);
 }
 EXPORT_SYMBOL(prepare_to_wait_exclusive);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4048e92aa04f..714afad46539 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -9,7 +9,7 @@
 * Derived from the taskqueue/keventd code by:
 *
 *   David Woodhouse <dwmw2@infradead.org>
- *   Andrew Morton <andrewm@uow.edu.au>
+ *   Andrew Morton
 *   Kai Petzke <wpp@marie.physik.tu-berlin.de>
 *   Theodore Ts'o <tytso@mit.edu>
 *