Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.preempt     |  15
-rw-r--r--  kernel/audit.c             |  19
-rw-r--r--  kernel/auditsc.c           |   2
-rw-r--r--  kernel/cgroup.c            |   4
-rw-r--r--  kernel/cpuset.c            |   4
-rw-r--r--  kernel/exit.c              |  98
-rw-r--r--  kernel/kprobes.c           |  52
-rw-r--r--  kernel/lockdep.c           |   8
-rw-r--r--  kernel/marker.c            |   9
-rw-r--r--  kernel/module.c            |  24
-rw-r--r--  kernel/power/process.c     |  29
-rw-r--r--  kernel/printk.c            |   2
-rw-r--r--  kernel/rcupreempt.c        | 233
-rw-r--r--  kernel/res_counter.c       |   1
-rw-r--r--  kernel/sched.c             | 338
-rw-r--r--  kernel/sched_fair.c        | 142
-rw-r--r--  kernel/sched_rt.c          |  10
-rw-r--r--  kernel/signal.c            |  16
-rw-r--r--  kernel/softirq.c           |   1
-rw-r--r--  kernel/softlockup.c        |  13
-rw-r--r--  kernel/sysctl.c            |  18
-rw-r--r--  kernel/time/ntp.c          |  23
-rw-r--r--  kernel/time/tick-sched.c   |   5
-rw-r--r--  kernel/time/timekeeping.c  |   6
24 files changed, 555 insertions, 517 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 0669b70fa6a3..9fdba03dc1fc 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -52,8 +52,23 @@ config PREEMPT
52 52
53endchoice 53endchoice
54 54
55config PREEMPT_RCU
56 bool "Preemptible RCU"
57 depends on PREEMPT
58 default n
59 help
60 This option reduces the latency of the kernel by making certain
61 RCU sections preemptible. Normally RCU code is non-preemptible, if
62 this option is selected then read-only RCU sections become
63 preemptible. This helps latency, but may expose bugs due to
64 now-naive assumptions about each RCU read-side critical section
65 remaining on a given CPU through its execution.
66
67 Say N if you are unsure.
68
55config RCU_TRACE 69config RCU_TRACE
56 bool "Enable tracing for RCU - currently stats in debugfs" 70 bool "Enable tracing for RCU - currently stats in debugfs"
71 depends on PREEMPT_RCU
57 select DEBUG_FS 72 select DEBUG_FS
58 default y 73 default y
59 help 74 help
diff --git a/kernel/audit.c b/kernel/audit.c
index 2eeea9a14240..10c4930c2bbf 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -170,7 +170,9 @@ void audit_panic(const char *message)
170 printk(KERN_ERR "audit: %s\n", message); 170 printk(KERN_ERR "audit: %s\n", message);
171 break; 171 break;
172 case AUDIT_FAIL_PANIC: 172 case AUDIT_FAIL_PANIC:
173 panic("audit: %s\n", message); 173 /* test audit_pid since printk is always lossy, why bother? */
174 if (audit_pid)
175 panic("audit: %s\n", message);
174 break; 176 break;
175 } 177 }
176} 178}
@@ -352,6 +354,7 @@ static int kauditd_thread(void *dummy)
352 if (err < 0) { 354 if (err < 0) {
353 BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ 355 BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */
354 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); 356 printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
357 audit_log_lost("auditd disappeared\n");
355 audit_pid = 0; 358 audit_pid = 0;
356 } 359 }
357 } else { 360 } else {
@@ -1350,17 +1353,19 @@ void audit_log_end(struct audit_buffer *ab)
1350 if (!audit_rate_check()) { 1353 if (!audit_rate_check()) {
1351 audit_log_lost("rate limit exceeded"); 1354 audit_log_lost("rate limit exceeded");
1352 } else { 1355 } else {
1356 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
1353 if (audit_pid) { 1357 if (audit_pid) {
1354 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
1355 nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0); 1358 nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0);
1356 skb_queue_tail(&audit_skb_queue, ab->skb); 1359 skb_queue_tail(&audit_skb_queue, ab->skb);
1357 ab->skb = NULL; 1360 ab->skb = NULL;
1358 wake_up_interruptible(&kauditd_wait); 1361 wake_up_interruptible(&kauditd_wait);
1359 } else if (printk_ratelimit()) { 1362 } else if (nlh->nlmsg_type != AUDIT_EOE) {
1360 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); 1363 if (printk_ratelimit()) {
1361 printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, ab->skb->data + NLMSG_SPACE(0)); 1364 printk(KERN_NOTICE "type=%d %s\n",
1362 } else { 1365 nlh->nlmsg_type,
1363 audit_log_lost("printk limit exceeded\n"); 1366 ab->skb->data + NLMSG_SPACE(0));
1367 } else
1368 audit_log_lost("printk limit exceeded\n");
1364 } 1369 }
1365 } 1370 }
1366 audit_buffer_free(ab); 1371 audit_buffer_free(ab);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 2087d6de67ea..782262e4107d 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1070,7 +1070,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
1070 * so we can be sure nothing was lost. 1070 * so we can be sure nothing was lost.
1071 */ 1071 */
1072 if ((i == 0) && (too_long)) 1072 if ((i == 0) && (too_long))
1073 audit_log_format(*ab, "a%d_len=%ld ", arg_num, 1073 audit_log_format(*ab, "a%d_len=%zu ", arg_num,
1074 has_cntl ? 2*len : len); 1074 has_cntl ? 2*len : len);
1075 1075
1076 /* 1076 /*
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d8abe996e009..e9c2fb01e89b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2232,7 +2232,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2232 2232
2233 mutex_lock(&cgroup_mutex); 2233 mutex_lock(&cgroup_mutex);
2234 2234
2235 cgrp->flags = 0;
2236 INIT_LIST_HEAD(&cgrp->sibling); 2235 INIT_LIST_HEAD(&cgrp->sibling);
2237 INIT_LIST_HEAD(&cgrp->children); 2236 INIT_LIST_HEAD(&cgrp->children);
2238 INIT_LIST_HEAD(&cgrp->css_sets); 2237 INIT_LIST_HEAD(&cgrp->css_sets);
@@ -2242,6 +2241,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2242 cgrp->root = parent->root; 2241 cgrp->root = parent->root;
2243 cgrp->top_cgroup = parent->top_cgroup; 2242 cgrp->top_cgroup = parent->top_cgroup;
2244 2243
2244 if (notify_on_release(parent))
2245 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
2246
2245 for_each_subsys(root, ss) { 2247 for_each_subsys(root, ss) {
2246 struct cgroup_subsys_state *css = ss->create(ss, cgrp); 2248 struct cgroup_subsys_state *css = ss->create(ss, cgrp);
2247 if (IS_ERR(css)) { 2249 if (IS_ERR(css)) {
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3e296ed81d4d..a1b61f414228 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -322,8 +322,8 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
322 * Call without callback_mutex or task_lock() held. May be 322 * Call without callback_mutex or task_lock() held. May be
323 * called with or without cgroup_mutex held. Thanks in part to 323 * called with or without cgroup_mutex held. Thanks in part to
324 * 'the_top_cpuset_hack', the task's cpuset pointer will never 324 * 'the_top_cpuset_hack', the task's cpuset pointer will never
325 * be NULL. This routine also might acquire callback_mutex and 325 * be NULL. This routine also might acquire callback_mutex during
326 * current->mm->mmap_sem during call. 326 * call.
327 * 327 *
328 * Reading current->cpuset->mems_generation doesn't need task_lock 328 * Reading current->cpuset->mems_generation doesn't need task_lock
329 * to guard the current->cpuset derefence, because it is guarded 329 * to guard the current->cpuset derefence, because it is guarded
diff --git a/kernel/exit.c b/kernel/exit.c
index 506a957b665a..53872bf993fa 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -214,20 +214,19 @@ struct pid *session_of_pgrp(struct pid *pgrp)
214static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) 214static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task)
215{ 215{
216 struct task_struct *p; 216 struct task_struct *p;
217 int ret = 1;
218 217
219 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 218 do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
220 if (p == ignored_task 219 if ((p == ignored_task) ||
221 || p->exit_state 220 (p->exit_state && thread_group_empty(p)) ||
222 || is_global_init(p->real_parent)) 221 is_global_init(p->real_parent))
223 continue; 222 continue;
223
224 if (task_pgrp(p->real_parent) != pgrp && 224 if (task_pgrp(p->real_parent) != pgrp &&
225 task_session(p->real_parent) == task_session(p)) { 225 task_session(p->real_parent) == task_session(p))
226 ret = 0; 226 return 0;
227 break;
228 }
229 } while_each_pid_task(pgrp, PIDTYPE_PGID, p); 227 } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
230 return ret; /* (sighing) "Often!" */ 228
229 return 1;
231} 230}
232 231
233int is_current_pgrp_orphaned(void) 232int is_current_pgrp_orphaned(void)
@@ -255,6 +254,37 @@ static int has_stopped_jobs(struct pid *pgrp)
255 return retval; 254 return retval;
256} 255}
257 256
257/*
258 * Check to see if any process groups have become orphaned as
259 * a result of our exiting, and if they have any stopped jobs,
260 * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
261 */
262static void
263kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
264{
265 struct pid *pgrp = task_pgrp(tsk);
266 struct task_struct *ignored_task = tsk;
267
268 if (!parent)
269 /* exit: our father is in a different pgrp than
270 * we are and we were the only connection outside.
271 */
272 parent = tsk->real_parent;
273 else
274 /* reparent: our child is in a different pgrp than
275 * we are, and it was the only connection outside.
276 */
277 ignored_task = NULL;
278
279 if (task_pgrp(parent) != pgrp &&
280 task_session(parent) == task_session(tsk) &&
281 will_become_orphaned_pgrp(pgrp, ignored_task) &&
282 has_stopped_jobs(pgrp)) {
283 __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
284 __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
285 }
286}
287
258/** 288/**
259 * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd 289 * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
260 * 290 *
@@ -635,22 +665,7 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
635 p->exit_signal != -1 && thread_group_empty(p)) 665 p->exit_signal != -1 && thread_group_empty(p))
636 do_notify_parent(p, p->exit_signal); 666 do_notify_parent(p, p->exit_signal);
637 667
638 /* 668 kill_orphaned_pgrp(p, father);
639 * process group orphan check
640 * Case ii: Our child is in a different pgrp
641 * than we are, and it was the only connection
642 * outside, so the child pgrp is now orphaned.
643 */
644 if ((task_pgrp(p) != task_pgrp(father)) &&
645 (task_session(p) == task_session(father))) {
646 struct pid *pgrp = task_pgrp(p);
647
648 if (will_become_orphaned_pgrp(pgrp, NULL) &&
649 has_stopped_jobs(pgrp)) {
650 __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
651 __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
652 }
653 }
654} 669}
655 670
656/* 671/*
@@ -735,11 +750,9 @@ static void forget_original_parent(struct task_struct *father)
735 * Send signals to all our closest relatives so that they know 750 * Send signals to all our closest relatives so that they know
736 * to properly mourn us.. 751 * to properly mourn us..
737 */ 752 */
738static void exit_notify(struct task_struct *tsk) 753static void exit_notify(struct task_struct *tsk, int group_dead)
739{ 754{
740 int state; 755 int state;
741 struct task_struct *t;
742 struct pid *pgrp;
743 756
744 /* 757 /*
745 * This does two things: 758 * This does two things:
@@ -753,25 +766,8 @@ static void exit_notify(struct task_struct *tsk)
753 exit_task_namespaces(tsk); 766 exit_task_namespaces(tsk);
754 767
755 write_lock_irq(&tasklist_lock); 768 write_lock_irq(&tasklist_lock);
756 /* 769 if (group_dead)
757 * Check to see if any process groups have become orphaned 770 kill_orphaned_pgrp(tsk->group_leader, NULL);
758 * as a result of our exiting, and if they have any stopped
759 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
760 *
761 * Case i: Our father is in a different pgrp than we are
762 * and we were the only connection outside, so our pgrp
763 * is about to become orphaned.
764 */
765 t = tsk->real_parent;
766
767 pgrp = task_pgrp(tsk);
768 if ((task_pgrp(t) != pgrp) &&
769 (task_session(t) == task_session(tsk)) &&
770 will_become_orphaned_pgrp(pgrp, tsk) &&
771 has_stopped_jobs(pgrp)) {
772 __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
773 __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
774 }
775 771
776 /* Let father know we died 772 /* Let father know we died
777 * 773 *
@@ -788,8 +784,8 @@ static void exit_notify(struct task_struct *tsk)
788 * the same after a fork. 784 * the same after a fork.
789 */ 785 */
790 if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 && 786 if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 &&
791 ( tsk->parent_exec_id != t->self_exec_id || 787 (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
792 tsk->self_exec_id != tsk->parent_exec_id) 788 tsk->self_exec_id != tsk->parent_exec_id)
793 && !capable(CAP_KILL)) 789 && !capable(CAP_KILL))
794 tsk->exit_signal = SIGCHLD; 790 tsk->exit_signal = SIGCHLD;
795 791
@@ -986,7 +982,7 @@ NORET_TYPE void do_exit(long code)
986 module_put(tsk->binfmt->module); 982 module_put(tsk->binfmt->module);
987 983
988 proc_exit_connector(tsk); 984 proc_exit_connector(tsk);
989 exit_notify(tsk); 985 exit_notify(tsk, group_dead);
990#ifdef CONFIG_NUMA 986#ifdef CONFIG_NUMA
991 mpol_free(tsk->mempolicy); 987 mpol_free(tsk->mempolicy);
992 tsk->mempolicy = NULL; 988 tsk->mempolicy = NULL;
@@ -1382,7 +1378,7 @@ unlock_sig:
1382 if (!retval && infop) 1378 if (!retval && infop)
1383 retval = put_user(0, &infop->si_errno); 1379 retval = put_user(0, &infop->si_errno);
1384 if (!retval && infop) 1380 if (!retval && infop)
1385 retval = put_user(why, &infop->si_code); 1381 retval = put_user((short)why, &infop->si_code);
1386 if (!retval && infop) 1382 if (!retval && infop)
1387 retval = put_user(exit_code, &infop->si_status); 1383 retval = put_user(exit_code, &infop->si_status);
1388 if (!retval && infop) 1384 if (!retval && infop)
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 7a86e6432338..fcfb580c3afc 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -498,27 +498,36 @@ static int __kprobes in_kprobes_functions(unsigned long addr)
498 return 0; 498 return 0;
499} 499}
500 500
501/*
502 * If we have a symbol_name argument, look it up and add the offset field
503 * to it. This way, we can specify a relative address to a symbol.
504 */
505static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
506{
507 kprobe_opcode_t *addr = p->addr;
508 if (p->symbol_name) {
509 if (addr)
510 return NULL;
511 kprobe_lookup_name(p->symbol_name, addr);
512 }
513
514 if (!addr)
515 return NULL;
516 return (kprobe_opcode_t *)(((char *)addr) + p->offset);
517}
518
501static int __kprobes __register_kprobe(struct kprobe *p, 519static int __kprobes __register_kprobe(struct kprobe *p,
502 unsigned long called_from) 520 unsigned long called_from)
503{ 521{
504 int ret = 0; 522 int ret = 0;
505 struct kprobe *old_p; 523 struct kprobe *old_p;
506 struct module *probed_mod; 524 struct module *probed_mod;
525 kprobe_opcode_t *addr;
507 526
508 /* 527 addr = kprobe_addr(p);
509 * If we have a symbol_name argument look it up, 528 if (!addr)
510 * and add it to the address. That way the addr
511 * field can either be global or relative to a symbol.
512 */
513 if (p->symbol_name) {
514 if (p->addr)
515 return -EINVAL;
516 kprobe_lookup_name(p->symbol_name, p->addr);
517 }
518
519 if (!p->addr)
520 return -EINVAL; 529 return -EINVAL;
521 p->addr = (kprobe_opcode_t *)(((char *)p->addr)+ p->offset); 530 p->addr = addr;
522 531
523 if (!kernel_text_address((unsigned long) p->addr) || 532 if (!kernel_text_address((unsigned long) p->addr) ||
524 in_kprobes_functions((unsigned long) p->addr)) 533 in_kprobes_functions((unsigned long) p->addr))
@@ -678,8 +687,7 @@ void __kprobes unregister_jprobe(struct jprobe *jp)
678 unregister_kprobe(&jp->kp); 687 unregister_kprobe(&jp->kp);
679} 688}
680 689
681#ifdef ARCH_SUPPORTS_KRETPROBES 690#ifdef CONFIG_KRETPROBES
682
683/* 691/*
684 * This kprobe pre_handler is registered with every kretprobe. When probe 692 * This kprobe pre_handler is registered with every kretprobe. When probe
685 * hits it will set up the return probe. 693 * hits it will set up the return probe.
@@ -722,12 +730,12 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
722 int ret = 0; 730 int ret = 0;
723 struct kretprobe_instance *inst; 731 struct kretprobe_instance *inst;
724 int i; 732 int i;
725 void *addr = rp->kp.addr; 733 void *addr;
726 734
727 if (kretprobe_blacklist_size) { 735 if (kretprobe_blacklist_size) {
728 if (addr == NULL) 736 addr = kprobe_addr(&rp->kp);
729 kprobe_lookup_name(rp->kp.symbol_name, addr); 737 if (!addr)
730 addr += rp->kp.offset; 738 return -EINVAL;
731 739
732 for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { 740 for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
733 if (kretprobe_blacklist[i].addr == addr) 741 if (kretprobe_blacklist[i].addr == addr)
@@ -769,8 +777,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
769 return ret; 777 return ret;
770} 778}
771 779
772#else /* ARCH_SUPPORTS_KRETPROBES */ 780#else /* CONFIG_KRETPROBES */
773
774int __kprobes register_kretprobe(struct kretprobe *rp) 781int __kprobes register_kretprobe(struct kretprobe *rp)
775{ 782{
776 return -ENOSYS; 783 return -ENOSYS;
@@ -781,8 +788,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
781{ 788{
782 return 0; 789 return 0;
783} 790}
784 791#endif /* CONFIG_KRETPROBES */
785#endif /* ARCH_SUPPORTS_KRETPROBES */
786 792
787void __kprobes unregister_kretprobe(struct kretprobe *rp) 793void __kprobes unregister_kretprobe(struct kretprobe *rp)
788{ 794{
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 3574379f4d62..81a4e4a3f087 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -779,6 +779,10 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
779 * parallel walking of the hash-list safe: 779 * parallel walking of the hash-list safe:
780 */ 780 */
781 list_add_tail_rcu(&class->hash_entry, hash_head); 781 list_add_tail_rcu(&class->hash_entry, hash_head);
782 /*
783 * Add it to the global list of classes:
784 */
785 list_add_tail_rcu(&class->lock_entry, &all_lock_classes);
782 786
783 if (verbose(class)) { 787 if (verbose(class)) {
784 graph_unlock(); 788 graph_unlock();
@@ -2282,10 +2286,6 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2282 return 0; 2286 return 0;
2283 break; 2287 break;
2284 case LOCK_USED: 2288 case LOCK_USED:
2285 /*
2286 * Add it to the global list of classes:
2287 */
2288 list_add_tail_rcu(&this->class->lock_entry, &all_lock_classes);
2289 debug_atomic_dec(&nr_unused_locks); 2289 debug_atomic_dec(&nr_unused_locks);
2290 break; 2290 break;
2291 default: 2291 default:
diff --git a/kernel/marker.c b/kernel/marker.c
index 50effc01d9a2..48a4ea5afffd 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -698,14 +698,12 @@ int marker_probe_unregister(const char *name,
698{ 698{
699 struct marker_entry *entry; 699 struct marker_entry *entry;
700 struct marker_probe_closure *old; 700 struct marker_probe_closure *old;
701 int ret = 0; 701 int ret = -ENOENT;
702 702
703 mutex_lock(&markers_mutex); 703 mutex_lock(&markers_mutex);
704 entry = get_marker(name); 704 entry = get_marker(name);
705 if (!entry) { 705 if (!entry)
706 ret = -ENOENT;
707 goto end; 706 goto end;
708 }
709 if (entry->rcu_pending) 707 if (entry->rcu_pending)
710 rcu_barrier(); 708 rcu_barrier();
711 old = marker_entry_remove_probe(entry, probe, probe_private); 709 old = marker_entry_remove_probe(entry, probe, probe_private);
@@ -713,12 +711,15 @@ int marker_probe_unregister(const char *name,
713 marker_update_probes(); /* may update entry */ 711 marker_update_probes(); /* may update entry */
714 mutex_lock(&markers_mutex); 712 mutex_lock(&markers_mutex);
715 entry = get_marker(name); 713 entry = get_marker(name);
714 if (!entry)
715 goto end;
716 entry->oldptr = old; 716 entry->oldptr = old;
717 entry->rcu_pending = 1; 717 entry->rcu_pending = 1;
718 /* write rcu_pending before calling the RCU callback */ 718 /* write rcu_pending before calling the RCU callback */
719 smp_wmb(); 719 smp_wmb();
720 call_rcu(&entry->rcu, free_old_closure); 720 call_rcu(&entry->rcu, free_old_closure);
721 remove_marker(name); /* Ignore busy error message */ 721 remove_marker(name); /* Ignore busy error message */
722 ret = 0;
722end: 723end:
723 mutex_unlock(&markers_mutex); 724 mutex_unlock(&markers_mutex);
724 return ret; 725 return ret;
diff --git a/kernel/module.c b/kernel/module.c
index 901cd6ac2f11..5d437bffd8dc 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1933,8 +1933,15 @@ static struct module *load_module(void __user *umod,
1933 /* Set up license info based on the info section */ 1933 /* Set up license info based on the info section */
1934 set_license(mod, get_modinfo(sechdrs, infoindex, "license")); 1934 set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
1935 1935
1936 /*
1937 * ndiswrapper is under GPL by itself, but loads proprietary modules.
1938 * Don't use add_taint_module(), as it would prevent ndiswrapper from
1939 * using GPL-only symbols it needs.
1940 */
1936 if (strcmp(mod->name, "ndiswrapper") == 0) 1941 if (strcmp(mod->name, "ndiswrapper") == 0)
1937 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 1942 add_taint(TAINT_PROPRIETARY_MODULE);
1943
1944 /* driverloader was caught wrongly pretending to be under GPL */
1938 if (strcmp(mod->name, "driverloader") == 0) 1945 if (strcmp(mod->name, "driverloader") == 0)
1939 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 1946 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
1940 1947
@@ -2171,10 +2178,20 @@ sys_init_module(void __user *umod,
2171 wake_up(&module_wq); 2178 wake_up(&module_wq);
2172 return ret; 2179 return ret;
2173 } 2180 }
2181 if (ret > 0) {
2182 printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, "
2183 "it should follow 0/-E convention\n"
2184 KERN_WARNING "%s: loading module anyway...\n",
2185 __func__, mod->name, ret,
2186 __func__);
2187 dump_stack();
2188 }
2174 2189
2175 /* Now it's a first class citizen! */ 2190 /* Now it's a first class citizen! Wake up anyone waiting for it. */
2176 mutex_lock(&module_mutex);
2177 mod->state = MODULE_STATE_LIVE; 2191 mod->state = MODULE_STATE_LIVE;
2192 wake_up(&module_wq);
2193
2194 mutex_lock(&module_mutex);
2178 /* Drop initial reference. */ 2195 /* Drop initial reference. */
2179 module_put(mod); 2196 module_put(mod);
2180 unwind_remove_table(mod->unwind_info, 1); 2197 unwind_remove_table(mod->unwind_info, 1);
@@ -2183,7 +2200,6 @@ sys_init_module(void __user *umod,
2183 mod->init_size = 0; 2200 mod->init_size = 0;
2184 mod->init_text_size = 0; 2201 mod->init_text_size = 0;
2185 mutex_unlock(&module_mutex); 2202 mutex_unlock(&module_mutex);
2186 wake_up(&module_wq);
2187 2203
2188 return 0; 2204 return 0;
2189} 2205}
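
Editorial sketch (not part of the patch): the new check above warns when a module's init function returns a positive value, since init is expected to follow the 0/-E convention. A minimal, hypothetical init illustrating that convention:

    #include <linux/module.h>
    #include <linux/slab.h>
    #include <linux/errno.h>

    static void *sketch_state;	/* hypothetical per-module state */

    static int __init initret_sketch_init(void)
    {
    	sketch_state = kmalloc(128, GFP_KERNEL);
    	if (!sketch_state)
    		return -ENOMEM;	/* failure: negative errno, the load is aborted */

    	return 0;		/* success: exactly 0; a positive value would hit the new warning */
    }

    static void __exit initret_sketch_exit(void)
    {
    	kfree(sketch_state);
    }

    module_init(initret_sketch_init);
    module_exit(initret_sketch_exit);
    MODULE_LICENSE("GPL");

Per the hunk above, a positive return still loads the module, but now logs the "suspiciously returned" warning plus a stack dump so the offending init can be fixed.
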
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 7c2118f9597f..f1d0b345c9ba 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -75,22 +75,15 @@ void refrigerator(void)
75 __set_current_state(save); 75 __set_current_state(save);
76} 76}
77 77
78static void fake_signal_wake_up(struct task_struct *p, int resume) 78static void fake_signal_wake_up(struct task_struct *p)
79{ 79{
80 unsigned long flags; 80 unsigned long flags;
81 81
82 spin_lock_irqsave(&p->sighand->siglock, flags); 82 spin_lock_irqsave(&p->sighand->siglock, flags);
83 signal_wake_up(p, resume); 83 signal_wake_up(p, 0);
84 spin_unlock_irqrestore(&p->sighand->siglock, flags); 84 spin_unlock_irqrestore(&p->sighand->siglock, flags);
85} 85}
86 86
87static void send_fake_signal(struct task_struct *p)
88{
89 if (task_is_stopped(p))
90 force_sig_specific(SIGSTOP, p);
91 fake_signal_wake_up(p, task_is_stopped(p));
92}
93
94static int has_mm(struct task_struct *p) 87static int has_mm(struct task_struct *p)
95{ 88{
96 return (p->mm && !(p->flags & PF_BORROWED_MM)); 89 return (p->mm && !(p->flags & PF_BORROWED_MM));
@@ -121,7 +114,7 @@ static int freeze_task(struct task_struct *p, int with_mm_only)
121 if (freezing(p)) { 114 if (freezing(p)) {
122 if (has_mm(p)) { 115 if (has_mm(p)) {
123 if (!signal_pending(p)) 116 if (!signal_pending(p))
124 fake_signal_wake_up(p, 0); 117 fake_signal_wake_up(p);
125 } else { 118 } else {
126 if (with_mm_only) 119 if (with_mm_only)
127 ret = 0; 120 ret = 0;
@@ -135,7 +128,7 @@ static int freeze_task(struct task_struct *p, int with_mm_only)
135 } else { 128 } else {
136 if (has_mm(p)) { 129 if (has_mm(p)) {
137 set_freeze_flag(p); 130 set_freeze_flag(p);
138 send_fake_signal(p); 131 fake_signal_wake_up(p);
139 } else { 132 } else {
140 if (with_mm_only) { 133 if (with_mm_only) {
141 ret = 0; 134 ret = 0;
@@ -182,15 +175,17 @@ static int try_to_freeze_tasks(int freeze_user_space)
182 if (frozen(p) || !freezeable(p)) 175 if (frozen(p) || !freezeable(p))
183 continue; 176 continue;
184 177
185 if (task_is_traced(p) && frozen(p->parent)) {
186 cancel_freezing(p);
187 continue;
188 }
189
190 if (!freeze_task(p, freeze_user_space)) 178 if (!freeze_task(p, freeze_user_space))
191 continue; 179 continue;
192 180
193 if (!freezer_should_skip(p)) 181 /*
182 * Now that we've done set_freeze_flag, don't
183 * perturb a task in TASK_STOPPED or TASK_TRACED.
184 * It is "frozen enough". If the task does wake
185 * up, it will immediately call try_to_freeze.
186 */
187 if (!task_is_stopped_or_traced(p) &&
188 !freezer_should_skip(p))
194 todo++; 189 todo++;
195 } while_each_thread(g, p); 190 } while_each_thread(g, p);
196 read_unlock(&tasklist_lock); 191 read_unlock(&tasklist_lock);
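
Editorial sketch (not part of the patch): the comment added above relies on a stopped or traced task entering the refrigerator on its own as soon as it runs again. The standard pattern it refers to, in a freezable kernel thread, looks roughly like this; the wait queue and work predicate are hypothetical, while set_freezable(), try_to_freeze(), freezing() and kthread_should_stop() are long-standing kernel APIs.

    #include <linux/kthread.h>
    #include <linux/freezer.h>
    #include <linux/wait.h>

    static DECLARE_WAIT_QUEUE_HEAD(sketch_wq);	/* hypothetical wait queue */

    static int sketch_work_ready(void)		/* hypothetical predicate */
    {
    	return 0;
    }

    static int freezable_thread(void *unused)
    {
    	set_freezable();	/* opt in: kernel threads are not freezable by default */

    	while (!kthread_should_stop()) {
    		/* If the freezer flagged us while we slept, park here until thawed. */
    		try_to_freeze();

    		wait_event_interruptible(sketch_wq,
    					 sketch_work_ready() ||
    					 kthread_should_stop() ||
    					 freezing(current));
    		/* ... do the hypothetical work ... */
    	}
    	return 0;
    }
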
diff --git a/kernel/printk.c b/kernel/printk.c
index bee36100f110..9adc2a473e6e 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -666,7 +666,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
666 } 666 }
667 /* Emit the output into the temporary buffer */ 667 /* Emit the output into the temporary buffer */
668 printed_len += vscnprintf(printk_buf + printed_len, 668 printed_len += vscnprintf(printk_buf + printed_len,
669 sizeof(printk_buf), fmt, args); 669 sizeof(printk_buf) - printed_len, fmt, args);
670 670
671 /* 671 /*
672 * Copy the output into log_buf. If the caller didn't provide 672 * Copy the output into log_buf. If the caller didn't provide
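
Editorial sketch (not part of the patch): the one-line fix above closes a classic append bug: the second vscnprintf() call was still told it had the whole buffer even though printed_len bytes were already consumed. The correct incremental pattern, with a hypothetical buffer:

    #include <linux/kernel.h>

    static char sketch_buf[1024];

    static int sketch_append(int a, int b)
    {
    	int len;

    	len  = scnprintf(sketch_buf, sizeof(sketch_buf), "a=%d ", a);
    	/* Both the destination and the remaining size must move forward.
    	 * Passing plain sizeof(sketch_buf) here, as the old code did with
    	 * vscnprintf(), lets the second write run past the end of the buffer. */
    	len += scnprintf(sketch_buf + len, sizeof(sketch_buf) - len, "b=%d\n", b);

    	return len;
    }
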
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 987cfb7ade89..e9517014b57c 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -23,6 +23,10 @@
23 * to Suparna Bhattacharya for pushing me completely away 23 * to Suparna Bhattacharya for pushing me completely away
24 * from atomic instructions on the read side. 24 * from atomic instructions on the read side.
25 * 25 *
26 * - Added handling of Dynamic Ticks
27 * Copyright 2007 - Paul E. McKenney <paulmck@us.ibm.com>
28 * - Steven Rostedt <srostedt@redhat.com>
29 *
26 * Papers: http://www.rdrop.com/users/paulmck/RCU 30 * Papers: http://www.rdrop.com/users/paulmck/RCU
27 * 31 *
28 * Design Document: http://lwn.net/Articles/253651/ 32 * Design Document: http://lwn.net/Articles/253651/
@@ -409,6 +413,212 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp)
409 } 413 }
410} 414}
411 415
416#ifdef CONFIG_NO_HZ
417
418DEFINE_PER_CPU(long, dynticks_progress_counter) = 1;
419static DEFINE_PER_CPU(long, rcu_dyntick_snapshot);
420static DEFINE_PER_CPU(int, rcu_update_flag);
421
422/**
423 * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
424 *
425 * If the CPU was idle with dynamic ticks active, this updates the
426 * dynticks_progress_counter to let the RCU handling know that the
427 * CPU is active.
428 */
429void rcu_irq_enter(void)
430{
431 int cpu = smp_processor_id();
432
433 if (per_cpu(rcu_update_flag, cpu))
434 per_cpu(rcu_update_flag, cpu)++;
435
436 /*
437 * Only update if we are coming from a stopped ticks mode
438 * (dynticks_progress_counter is even).
439 */
440 if (!in_interrupt() &&
441 (per_cpu(dynticks_progress_counter, cpu) & 0x1) == 0) {
442 /*
443 * The following might seem like we could have a race
444 * with NMI/SMIs. But this really isn't a problem.
445 * Here we do a read/modify/write, and the race happens
446 * when an NMI/SMI comes in after the read and before
447 * the write. But NMI/SMIs will increment this counter
448 * twice before returning, so the zero bit will not
449 * be corrupted by the NMI/SMI which is the most important
450 * part.
451 *
452 * The only thing is that we would bring back the counter
453 * to a position that it was in during the NMI/SMI.
454 * But the zero bit would be set, so the rest of the
455 * counter would again be ignored.
456 *
457 * On return from the IRQ, the counter may have the zero
458 * bit be 0 and the counter the same as the return from
459 * the NMI/SMI. If the state machine was so unlucky to
460 * see that, it still doesn't matter, since all
461 * RCU read-side critical sections on this CPU would
462 * have already completed.
463 */
464 per_cpu(dynticks_progress_counter, cpu)++;
465 /*
466 * The following memory barrier ensures that any
467 * rcu_read_lock() primitives in the irq handler
468 * are seen by other CPUs to follow the above
469 * increment to dynticks_progress_counter. This is
470 * required in order for other CPUs to correctly
471 * determine when it is safe to advance the RCU
472 * grace-period state machine.
473 */
474 smp_mb(); /* see above block comment. */
475 /*
476 * Since we can't determine the dynamic tick mode from
477 * the dynticks_progress_counter after this routine,
478 * we use a second flag to acknowledge that we came
479 * from an idle state with ticks stopped.
480 */
481 per_cpu(rcu_update_flag, cpu)++;
482 /*
483 * If we take an NMI/SMI now, they will also increment
484 * the rcu_update_flag, and will not update the
485 * dynticks_progress_counter on exit. That is for
486 * this IRQ to do.
487 */
488 }
489}
490
491/**
492 * rcu_irq_exit - Called from exiting Hard irq context.
493 *
494 * If the CPU was idle with dynamic ticks active, update the
495 * dynticks_progress_counter to let the RCU handling be
496 * aware that the CPU is going back to idle with no ticks.
497 */
498void rcu_irq_exit(void)
499{
500 int cpu = smp_processor_id();
501
502 /*
503 * rcu_update_flag is set if we interrupted the CPU
504 * when it was idle with ticks stopped.
505 * Once this occurs, we keep track of interrupt nesting
506 * because a NMI/SMI could also come in, and we still
507 * only want the IRQ that started the increment of the
508 * dynticks_progress_counter to be the one that modifies
509 * it on exit.
510 */
511 if (per_cpu(rcu_update_flag, cpu)) {
512 if (--per_cpu(rcu_update_flag, cpu))
513 return;
514
515 /* This must match the interrupt nesting */
516 WARN_ON(in_interrupt());
517
518 /*
519 * If an NMI/SMI happens now we are still
520 * protected by the dynticks_progress_counter being odd.
521 */
522
523 /*
524 * The following memory barrier ensures that any
525 * rcu_read_unlock() primitives in the irq handler
526 * are seen by other CPUs to precede the following
527 * increment to dynticks_progress_counter. This
528 * is required in order for other CPUs to determine
529 * when it is safe to advance the RCU grace-period
530 * state machine.
531 */
532 smp_mb(); /* see above block comment. */
533 per_cpu(dynticks_progress_counter, cpu)++;
534 WARN_ON(per_cpu(dynticks_progress_counter, cpu) & 0x1);
535 }
536}
537
538static void dyntick_save_progress_counter(int cpu)
539{
540 per_cpu(rcu_dyntick_snapshot, cpu) =
541 per_cpu(dynticks_progress_counter, cpu);
542}
543
544static inline int
545rcu_try_flip_waitack_needed(int cpu)
546{
547 long curr;
548 long snap;
549
550 curr = per_cpu(dynticks_progress_counter, cpu);
551 snap = per_cpu(rcu_dyntick_snapshot, cpu);
552 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
553
554 /*
555 * If the CPU remained in dynticks mode for the entire time
556 * and didn't take any interrupts, NMIs, SMIs, or whatever,
557 * then it cannot be in the middle of an rcu_read_lock(), so
558 * the next rcu_read_lock() it executes must use the new value
559 * of the counter. So we can safely pretend that this CPU
560 * already acknowledged the counter.
561 */
562
563 if ((curr == snap) && ((curr & 0x1) == 0))
564 return 0;
565
566 /*
567 * If the CPU passed through or entered a dynticks idle phase with
568 * no active irq handlers, then, as above, we can safely pretend
569 * that this CPU already acknowledged the counter.
570 */
571
572 if ((curr - snap) > 2 || (snap & 0x1) == 0)
573 return 0;
574
575 /* We need this CPU to explicitly acknowledge the counter flip. */
576
577 return 1;
578}
579
580static inline int
581rcu_try_flip_waitmb_needed(int cpu)
582{
583 long curr;
584 long snap;
585
586 curr = per_cpu(dynticks_progress_counter, cpu);
587 snap = per_cpu(rcu_dyntick_snapshot, cpu);
588 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
589
590 /*
591 * If the CPU remained in dynticks mode for the entire time
592 * and didn't take any interrupts, NMIs, SMIs, or whatever,
593 * then it cannot have executed an RCU read-side critical section
594 * during that time, so there is no need for it to execute a
595 * memory barrier.
596 */
597
598 if ((curr == snap) && ((curr & 0x1) == 0))
599 return 0;
600
601 /*
602 * If the CPU either entered or exited an outermost interrupt,
603 * SMI, NMI, or whatever handler, then we know that it executed
604 * a memory barrier when doing so. So we don't need another one.
605 */
606 if (curr != snap)
607 return 0;
608
609 /* We need the CPU to execute a memory barrier. */
610
611 return 1;
612}
613
614#else /* !CONFIG_NO_HZ */
615
616# define dyntick_save_progress_counter(cpu) do { } while (0)
617# define rcu_try_flip_waitack_needed(cpu) (1)
618# define rcu_try_flip_waitmb_needed(cpu) (1)
619
620#endif /* CONFIG_NO_HZ */
621
412/* 622/*
413 * Get here when RCU is idle. Decide whether we need to 623 * Get here when RCU is idle. Decide whether we need to
414 * move out of idle state, and return non-zero if so. 624 * move out of idle state, and return non-zero if so.
@@ -447,8 +657,10 @@ rcu_try_flip_idle(void)
447 657
448 /* Now ask each CPU for acknowledgement of the flip. */ 658 /* Now ask each CPU for acknowledgement of the flip. */
449 659
450 for_each_cpu_mask(cpu, rcu_cpu_online_map) 660 for_each_cpu_mask(cpu, rcu_cpu_online_map) {
451 per_cpu(rcu_flip_flag, cpu) = rcu_flipped; 661 per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
662 dyntick_save_progress_counter(cpu);
663 }
452 664
453 return 1; 665 return 1;
454} 666}
@@ -464,7 +676,8 @@ rcu_try_flip_waitack(void)
464 676
465 RCU_TRACE_ME(rcupreempt_trace_try_flip_a1); 677 RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
466 for_each_cpu_mask(cpu, rcu_cpu_online_map) 678 for_each_cpu_mask(cpu, rcu_cpu_online_map)
467 if (per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { 679 if (rcu_try_flip_waitack_needed(cpu) &&
680 per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
468 RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1); 681 RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
469 return 0; 682 return 0;
470 } 683 }
@@ -509,8 +722,10 @@ rcu_try_flip_waitzero(void)
509 smp_mb(); /* ^^^^^^^^^^^^ */ 722 smp_mb(); /* ^^^^^^^^^^^^ */
510 723
511 /* Call for a memory barrier from each CPU. */ 724 /* Call for a memory barrier from each CPU. */
512 for_each_cpu_mask(cpu, rcu_cpu_online_map) 725 for_each_cpu_mask(cpu, rcu_cpu_online_map) {
513 per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; 726 per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
727 dyntick_save_progress_counter(cpu);
728 }
514 729
515 RCU_TRACE_ME(rcupreempt_trace_try_flip_z2); 730 RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
516 return 1; 731 return 1;
@@ -528,7 +743,8 @@ rcu_try_flip_waitmb(void)
528 743
529 RCU_TRACE_ME(rcupreempt_trace_try_flip_m1); 744 RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
530 for_each_cpu_mask(cpu, rcu_cpu_online_map) 745 for_each_cpu_mask(cpu, rcu_cpu_online_map)
531 if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { 746 if (rcu_try_flip_waitmb_needed(cpu) &&
747 per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
532 RCU_TRACE_ME(rcupreempt_trace_try_flip_me1); 748 RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
533 return 0; 749 return 0;
534 } 750 }
@@ -702,8 +918,9 @@ void rcu_offline_cpu(int cpu)
702 * fix. 918 * fix.
703 */ 919 */
704 920
921 local_irq_save(flags);
705 rdp = RCU_DATA_ME(); 922 rdp = RCU_DATA_ME();
706 spin_lock_irqsave(&rdp->lock, flags); 923 spin_lock(&rdp->lock);
707 *rdp->nexttail = list; 924 *rdp->nexttail = list;
708 if (list) 925 if (list)
709 rdp->nexttail = tail; 926 rdp->nexttail = tail;
@@ -735,9 +952,11 @@ static void rcu_process_callbacks(struct softirq_action *unused)
735{ 952{
736 unsigned long flags; 953 unsigned long flags;
737 struct rcu_head *next, *list; 954 struct rcu_head *next, *list;
738 struct rcu_data *rdp = RCU_DATA_ME(); 955 struct rcu_data *rdp;
739 956
740 spin_lock_irqsave(&rdp->lock, flags); 957 local_irq_save(flags);
958 rdp = RCU_DATA_ME();
959 spin_lock(&rdp->lock);
741 list = rdp->donelist; 960 list = rdp->donelist;
742 if (list == NULL) { 961 if (list == NULL) {
743 spin_unlock_irqrestore(&rdp->lock, flags); 962 spin_unlock_irqrestore(&rdp->lock, flags);
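
Editorial sketch (not part of the patch): a compressed view of the even/odd protocol that the rcu_irq_enter()/rcu_irq_exit() comments above describe, with the rcu_update_flag nesting bookkeeping and the memory barriers left out. In the patch the counter is the per-CPU dynticks_progress_counter and the grace-period-side test is rcu_try_flip_waitack_needed().

    /* Even counter value: CPU is dynticks-idle, no RCU readers are possible.
     * Odd  counter value: CPU is active, an irq handler may be inside
     * rcu_read_lock().  (Per-CPU in the real code; one CPU shown here.)
     */
    static long sketch_counter = 1;		/* starts "active", like the = 1 initializer above */

    static void sketch_irq_enter(void)
    {
    	if ((sketch_counter & 0x1) == 0)
    		sketch_counter++;	/* even -> odd; the real code follows this with smp_mb() */
    }

    static void sketch_irq_exit(void)
    {
    	sketch_counter++;		/* odd -> even once the outermost irq returns */
    }

    /* Grace-period side: has this CPU been idle since the snapshot was taken? */
    static int sketch_ack_needed(long curr, long snap)
    {
    	if (curr == snap && (curr & 0x1) == 0)
    		return 0;	/* idle the whole time: no readers to wait for */
    	if (curr - snap > 2 || (snap & 0x1) == 0)
    		return 0;	/* passed through (or started in) a dynticks-idle phase */
    	return 1;		/* must still acknowledge the counter flip explicitly */
    }
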
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 16cbec2d5d60..efbfc0fc232f 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -113,6 +113,7 @@ ssize_t res_counter_write(struct res_counter *counter, int member,
113 113
114 ret = -EINVAL; 114 ret = -EINVAL;
115 115
116 strstrip(buf);
116 if (write_strategy) { 117 if (write_strategy) {
117 if (write_strategy(buf, &tmp)) { 118 if (write_strategy(buf, &tmp)) {
118 goto out_free; 119 goto out_free;
diff --git a/kernel/sched.c b/kernel/sched.c
index b387a8de26a5..1cb53fb1fe3d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -174,41 +174,6 @@ struct task_group {
174 struct sched_entity **se; 174 struct sched_entity **se;
175 /* runqueue "owned" by this group on each cpu */ 175 /* runqueue "owned" by this group on each cpu */
176 struct cfs_rq **cfs_rq; 176 struct cfs_rq **cfs_rq;
177
178 /*
179 * shares assigned to a task group governs how much of cpu bandwidth
180 * is allocated to the group. The more shares a group has, the more is
181 * the cpu bandwidth allocated to it.
182 *
183 * For ex, lets say that there are three task groups, A, B and C which
184 * have been assigned shares 1000, 2000 and 3000 respectively. Then,
185 * cpu bandwidth allocated by the scheduler to task groups A, B and C
186 * should be:
187 *
188 * Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66%
189 * Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33%
190 * Bw(C) = 3000/(1000+2000+3000) * 100 = 50%
191 *
192 * The weight assigned to a task group's schedulable entities on every
193 * cpu (task_group.se[a_cpu]->load.weight) is derived from the task
194 * group's shares. For ex: lets say that task group A has been
195 * assigned shares of 1000 and there are two CPUs in a system. Then,
196 *
197 * tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000;
198 *
199 * Note: It's not necessary that each of a task's group schedulable
200 * entity have the same weight on all CPUs. If the group
201 * has 2 of its tasks on CPU0 and 1 task on CPU1, then a
202 * better distribution of weight could be:
203 *
204 * tg_A->se[0]->load.weight = 2/3 * 2000 = 1333
205 * tg_A->se[1]->load.weight = 1/2 * 2000 = 667
206 *
207 * rebalance_shares() is responsible for distributing the shares of a
208 * task groups like this among the group's schedulable entities across
209 * cpus.
210 *
211 */
212 unsigned long shares; 177 unsigned long shares;
213#endif 178#endif
214 179
@@ -250,22 +215,12 @@ static DEFINE_SPINLOCK(task_group_lock);
250static DEFINE_MUTEX(doms_cur_mutex); 215static DEFINE_MUTEX(doms_cur_mutex);
251 216
252#ifdef CONFIG_FAIR_GROUP_SCHED 217#ifdef CONFIG_FAIR_GROUP_SCHED
253#ifdef CONFIG_SMP
254/* kernel thread that runs rebalance_shares() periodically */
255static struct task_struct *lb_monitor_task;
256static int load_balance_monitor(void *unused);
257#endif
258
259static void set_se_shares(struct sched_entity *se, unsigned long shares);
260
261#ifdef CONFIG_USER_SCHED 218#ifdef CONFIG_USER_SCHED
262# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 219# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
263#else 220#else
264# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 221# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
265#endif 222#endif
266 223
267#define MIN_GROUP_SHARES 2
268
269static int init_task_group_load = INIT_TASK_GROUP_LOAD; 224static int init_task_group_load = INIT_TASK_GROUP_LOAD;
270#endif 225#endif
271 226
@@ -668,6 +623,8 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
668 */ 623 */
669unsigned int sysctl_sched_rt_period = 1000000; 624unsigned int sysctl_sched_rt_period = 1000000;
670 625
626static __read_mostly int scheduler_running;
627
671/* 628/*
672 * part of the period that we allow rt tasks to run in us. 629 * part of the period that we allow rt tasks to run in us.
673 * default: 0.95s 630 * default: 0.95s
@@ -689,14 +646,16 @@ unsigned long long cpu_clock(int cpu)
689 unsigned long flags; 646 unsigned long flags;
690 struct rq *rq; 647 struct rq *rq;
691 648
692 local_irq_save(flags);
693 rq = cpu_rq(cpu);
694 /* 649 /*
695 * Only call sched_clock() if the scheduler has already been 650 * Only call sched_clock() if the scheduler has already been
696 * initialized (some code might call cpu_clock() very early): 651 * initialized (some code might call cpu_clock() very early):
697 */ 652 */
698 if (rq->idle) 653 if (unlikely(!scheduler_running))
699 update_rq_clock(rq); 654 return 0;
655
656 local_irq_save(flags);
657 rq = cpu_rq(cpu);
658 update_rq_clock(rq);
700 now = rq->clock; 659 now = rq->clock;
701 local_irq_restore(flags); 660 local_irq_restore(flags);
702 661
@@ -1241,16 +1200,6 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1241static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} 1200static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1242#endif 1201#endif
1243 1202
1244static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1245{
1246 update_load_add(&rq->load, load);
1247}
1248
1249static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1250{
1251 update_load_sub(&rq->load, load);
1252}
1253
1254#ifdef CONFIG_SMP 1203#ifdef CONFIG_SMP
1255static unsigned long source_load(int cpu, int type); 1204static unsigned long source_load(int cpu, int type);
1256static unsigned long target_load(int cpu, int type); 1205static unsigned long target_load(int cpu, int type);
@@ -1268,14 +1217,26 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1268 1217
1269#define sched_class_highest (&rt_sched_class) 1218#define sched_class_highest (&rt_sched_class)
1270 1219
1271static void inc_nr_running(struct rq *rq) 1220static inline void inc_load(struct rq *rq, const struct task_struct *p)
1221{
1222 update_load_add(&rq->load, p->se.load.weight);
1223}
1224
1225static inline void dec_load(struct rq *rq, const struct task_struct *p)
1226{
1227 update_load_sub(&rq->load, p->se.load.weight);
1228}
1229
1230static void inc_nr_running(struct task_struct *p, struct rq *rq)
1272{ 1231{
1273 rq->nr_running++; 1232 rq->nr_running++;
1233 inc_load(rq, p);
1274} 1234}
1275 1235
1276static void dec_nr_running(struct rq *rq) 1236static void dec_nr_running(struct task_struct *p, struct rq *rq)
1277{ 1237{
1278 rq->nr_running--; 1238 rq->nr_running--;
1239 dec_load(rq, p);
1279} 1240}
1280 1241
1281static void set_load_weight(struct task_struct *p) 1242static void set_load_weight(struct task_struct *p)
@@ -1367,7 +1328,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1367 rq->nr_uninterruptible--; 1328 rq->nr_uninterruptible--;
1368 1329
1369 enqueue_task(rq, p, wakeup); 1330 enqueue_task(rq, p, wakeup);
1370 inc_nr_running(rq); 1331 inc_nr_running(p, rq);
1371} 1332}
1372 1333
1373/* 1334/*
@@ -1379,7 +1340,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1379 rq->nr_uninterruptible++; 1340 rq->nr_uninterruptible++;
1380 1341
1381 dequeue_task(rq, p, sleep); 1342 dequeue_task(rq, p, sleep);
1382 dec_nr_running(rq); 1343 dec_nr_running(p, rq);
1383} 1344}
1384 1345
1385/** 1346/**
@@ -2019,7 +1980,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2019 * management (if any): 1980 * management (if any):
2020 */ 1981 */
2021 p->sched_class->task_new(rq, p); 1982 p->sched_class->task_new(rq, p);
2022 inc_nr_running(rq); 1983 inc_nr_running(p, rq);
2023 } 1984 }
2024 check_preempt_curr(rq, p); 1985 check_preempt_curr(rq, p);
2025#ifdef CONFIG_SMP 1986#ifdef CONFIG_SMP
@@ -3885,7 +3846,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
3885asmlinkage void __sched schedule(void) 3846asmlinkage void __sched schedule(void)
3886{ 3847{
3887 struct task_struct *prev, *next; 3848 struct task_struct *prev, *next;
3888 long *switch_count; 3849 unsigned long *switch_count;
3889 struct rq *rq; 3850 struct rq *rq;
3890 int cpu; 3851 int cpu;
3891 3852
@@ -4358,8 +4319,10 @@ void set_user_nice(struct task_struct *p, long nice)
4358 goto out_unlock; 4319 goto out_unlock;
4359 } 4320 }
4360 on_rq = p->se.on_rq; 4321 on_rq = p->se.on_rq;
4361 if (on_rq) 4322 if (on_rq) {
4362 dequeue_task(rq, p, 0); 4323 dequeue_task(rq, p, 0);
4324 dec_load(rq, p);
4325 }
4363 4326
4364 p->static_prio = NICE_TO_PRIO(nice); 4327 p->static_prio = NICE_TO_PRIO(nice);
4365 set_load_weight(p); 4328 set_load_weight(p);
@@ -4369,6 +4332,7 @@ void set_user_nice(struct task_struct *p, long nice)
4369 4332
4370 if (on_rq) { 4333 if (on_rq) {
4371 enqueue_task(rq, p, 0); 4334 enqueue_task(rq, p, 0);
4335 inc_load(rq, p);
4372 /* 4336 /*
4373 * If the task increased its priority or is running and 4337 * If the task increased its priority or is running and
4374 * lowered its priority, then reschedule its CPU: 4338 * lowered its priority, then reschedule its CPU:
@@ -4458,7 +4422,7 @@ int task_nice(const struct task_struct *p)
4458{ 4422{
4459 return TASK_NICE(p); 4423 return TASK_NICE(p);
4460} 4424}
4461EXPORT_SYMBOL_GPL(task_nice); 4425EXPORT_SYMBOL(task_nice);
4462 4426
4463/** 4427/**
4464 * idle_cpu - is a given cpu idle currently? 4428 * idle_cpu - is a given cpu idle currently?
@@ -5136,7 +5100,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
5136 time_slice = 0; 5100 time_slice = 0;
5137 if (p->policy == SCHED_RR) { 5101 if (p->policy == SCHED_RR) {
5138 time_slice = DEF_TIMESLICE; 5102 time_slice = DEF_TIMESLICE;
5139 } else { 5103 } else if (p->policy != SCHED_FIFO) {
5140 struct sched_entity *se = &p->se; 5104 struct sched_entity *se = &p->se;
5141 unsigned long flags; 5105 unsigned long flags;
5142 struct rq *rq; 5106 struct rq *rq;
@@ -5917,7 +5881,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5917 spin_unlock_irq(&rq->lock); 5881 spin_unlock_irq(&rq->lock);
5918 break; 5882 break;
5919 5883
5920 case CPU_DOWN_PREPARE: 5884 case CPU_DYING:
5885 case CPU_DYING_FROZEN:
5921 /* Update our root-domain */ 5886 /* Update our root-domain */
5922 rq = cpu_rq(cpu); 5887 rq = cpu_rq(cpu);
5923 spin_lock_irqsave(&rq->lock, flags); 5888 spin_lock_irqsave(&rq->lock, flags);
@@ -7083,21 +7048,6 @@ void __init sched_init_smp(void)
7083 if (set_cpus_allowed(current, non_isolated_cpus) < 0) 7048 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
7084 BUG(); 7049 BUG();
7085 sched_init_granularity(); 7050 sched_init_granularity();
7086
7087#ifdef CONFIG_FAIR_GROUP_SCHED
7088 if (nr_cpu_ids == 1)
7089 return;
7090
7091 lb_monitor_task = kthread_create(load_balance_monitor, NULL,
7092 "group_balance");
7093 if (!IS_ERR(lb_monitor_task)) {
7094 lb_monitor_task->flags |= PF_NOFREEZE;
7095 wake_up_process(lb_monitor_task);
7096 } else {
7097 printk(KERN_ERR "Could not create load balance monitor thread"
7098 "(error = %ld) \n", PTR_ERR(lb_monitor_task));
7099 }
7100#endif
7101} 7051}
7102#else 7052#else
7103void __init sched_init_smp(void) 7053void __init sched_init_smp(void)
@@ -7284,6 +7234,8 @@ void __init sched_init(void)
7284 * During early bootup we pretend to be a normal task: 7234 * During early bootup we pretend to be a normal task:
7285 */ 7235 */
7286 current->sched_class = &fair_sched_class; 7236 current->sched_class = &fair_sched_class;
7237
7238 scheduler_running = 1;
7287} 7239}
7288 7240
7289#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 7241#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
@@ -7418,157 +7370,6 @@ void set_curr_task(int cpu, struct task_struct *p)
7418 7370
7419#ifdef CONFIG_GROUP_SCHED 7371#ifdef CONFIG_GROUP_SCHED
7420 7372
7421#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
7422/*
7423 * distribute shares of all task groups among their schedulable entities,
7424 * to reflect load distribution across cpus.
7425 */
7426static int rebalance_shares(struct sched_domain *sd, int this_cpu)
7427{
7428 struct cfs_rq *cfs_rq;
7429 struct rq *rq = cpu_rq(this_cpu);
7430 cpumask_t sdspan = sd->span;
7431 int balanced = 1;
7432
7433 /* Walk thr' all the task groups that we have */
7434 for_each_leaf_cfs_rq(rq, cfs_rq) {
7435 int i;
7436 unsigned long total_load = 0, total_shares;
7437 struct task_group *tg = cfs_rq->tg;
7438
7439 /* Gather total task load of this group across cpus */
7440 for_each_cpu_mask(i, sdspan)
7441 total_load += tg->cfs_rq[i]->load.weight;
7442
7443 /* Nothing to do if this group has no load */
7444 if (!total_load)
7445 continue;
7446
7447 /*
7448 * tg->shares represents the number of cpu shares the task group
7449 * is eligible to hold on a single cpu. On N cpus, it is
7450 * eligible to hold (N * tg->shares) number of cpu shares.
7451 */
7452 total_shares = tg->shares * cpus_weight(sdspan);
7453
7454 /*
7455 * redistribute total_shares across cpus as per the task load
7456 * distribution.
7457 */
7458 for_each_cpu_mask(i, sdspan) {
7459 unsigned long local_load, local_shares;
7460
7461 local_load = tg->cfs_rq[i]->load.weight;
7462 local_shares = (local_load * total_shares) / total_load;
7463 if (!local_shares)
7464 local_shares = MIN_GROUP_SHARES;
7465 if (local_shares == tg->se[i]->load.weight)
7466 continue;
7467
7468 spin_lock_irq(&cpu_rq(i)->lock);
7469 set_se_shares(tg->se[i], local_shares);
7470 spin_unlock_irq(&cpu_rq(i)->lock);
7471 balanced = 0;
7472 }
7473 }
7474
7475 return balanced;
7476}
7477
7478/*
7479 * How frequently should we rebalance_shares() across cpus?
7480 *
7481 * The more frequently we rebalance shares, the more accurate is the fairness
7482 * of cpu bandwidth distribution between task groups. However higher frequency
7483 * also implies increased scheduling overhead.
7484 *
7485 * sysctl_sched_min_bal_int_shares represents the minimum interval between
7486 * consecutive calls to rebalance_shares() in the same sched domain.
7487 *
7488 * sysctl_sched_max_bal_int_shares represents the maximum interval between
7489 * consecutive calls to rebalance_shares() in the same sched domain.
7490 *
7491 * These settings allows for the appropriate trade-off between accuracy of
7492 * fairness and the associated overhead.
7493 *
7494 */
7495
7496/* default: 8ms, units: milliseconds */
7497const_debug unsigned int sysctl_sched_min_bal_int_shares = 8;
7498
7499/* default: 128ms, units: milliseconds */
7500const_debug unsigned int sysctl_sched_max_bal_int_shares = 128;
7501
7502/* kernel thread that runs rebalance_shares() periodically */
7503static int load_balance_monitor(void *unused)
7504{
7505 unsigned int timeout = sysctl_sched_min_bal_int_shares;
7506 struct sched_param schedparm;
7507 int ret;
7508
7509 /*
7510 * We don't want this thread's execution to be limited by the shares
7511 * assigned to default group (init_task_group). Hence make it run
7512 * as a SCHED_RR RT task at the lowest priority.
7513 */
7514 schedparm.sched_priority = 1;
7515 ret = sched_setscheduler(current, SCHED_RR, &schedparm);
7516 if (ret)
7517 printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance"
7518 " monitor thread (error = %d) \n", ret);
7519
7520 while (!kthread_should_stop()) {
7521 int i, cpu, balanced = 1;
7522
7523 /* Prevent cpus going down or coming up */
7524 get_online_cpus();
7525 /* lockout changes to doms_cur[] array */
7526 lock_doms_cur();
7527 /*
7528 * Enter a rcu read-side critical section to safely walk rq->sd
7529 * chain on various cpus and to walk task group list
7530 * (rq->leaf_cfs_rq_list) in rebalance_shares().
7531 */
7532 rcu_read_lock();
7533
7534 for (i = 0; i < ndoms_cur; i++) {
7535 cpumask_t cpumap = doms_cur[i];
7536 struct sched_domain *sd = NULL, *sd_prev = NULL;
7537
7538 cpu = first_cpu(cpumap);
7539
7540 /* Find the highest domain at which to balance shares */
7541 for_each_domain(cpu, sd) {
7542 if (!(sd->flags & SD_LOAD_BALANCE))
7543 continue;
7544 sd_prev = sd;
7545 }
7546
7547 sd = sd_prev;
7548 /* sd == NULL? No load balance reqd in this domain */
7549 if (!sd)
7550 continue;
7551
7552 balanced &= rebalance_shares(sd, cpu);
7553 }
7554
7555 rcu_read_unlock();
7556
7557 unlock_doms_cur();
7558 put_online_cpus();
7559
7560 if (!balanced)
7561 timeout = sysctl_sched_min_bal_int_shares;
7562 else if (timeout < sysctl_sched_max_bal_int_shares)
7563 timeout *= 2;
7564
7565 msleep_interruptible(timeout);
7566 }
7567
7568 return 0;
7569}
7570#endif /* CONFIG_SMP */
7571
7572#ifdef CONFIG_FAIR_GROUP_SCHED 7373#ifdef CONFIG_FAIR_GROUP_SCHED
7573static void free_fair_sched_group(struct task_group *tg) 7374static void free_fair_sched_group(struct task_group *tg)
7574{ 7375{
@@ -7825,6 +7626,11 @@ void sched_move_task(struct task_struct *tsk)
7825 7626
7826 set_task_rq(tsk, task_cpu(tsk)); 7627 set_task_rq(tsk, task_cpu(tsk));
7827 7628
7629#ifdef CONFIG_FAIR_GROUP_SCHED
7630 if (tsk->sched_class->moved_group)
7631 tsk->sched_class->moved_group(tsk);
7632#endif
7633
7828 if (on_rq) { 7634 if (on_rq) {
7829 if (unlikely(running)) 7635 if (unlikely(running))
7830 tsk->sched_class->set_curr_task(rq); 7636 tsk->sched_class->set_curr_task(rq);
@@ -7835,29 +7641,25 @@ void sched_move_task(struct task_struct *tsk)
7835} 7641}
7836 7642
7837#ifdef CONFIG_FAIR_GROUP_SCHED 7643#ifdef CONFIG_FAIR_GROUP_SCHED
7838/* rq->lock to be locked by caller */
7839static void set_se_shares(struct sched_entity *se, unsigned long shares) 7644static void set_se_shares(struct sched_entity *se, unsigned long shares)
7840{ 7645{
7841 struct cfs_rq *cfs_rq = se->cfs_rq; 7646 struct cfs_rq *cfs_rq = se->cfs_rq;
7842 struct rq *rq = cfs_rq->rq; 7647 struct rq *rq = cfs_rq->rq;
7843 int on_rq; 7648 int on_rq;
7844 7649
7845 if (!shares) 7650 spin_lock_irq(&rq->lock);
7846 shares = MIN_GROUP_SHARES;
7847 7651
7848 on_rq = se->on_rq; 7652 on_rq = se->on_rq;
7849 if (on_rq) { 7653 if (on_rq)
7850 dequeue_entity(cfs_rq, se, 0); 7654 dequeue_entity(cfs_rq, se, 0);
7851 dec_cpu_load(rq, se->load.weight);
7852 }
7853 7655
7854 se->load.weight = shares; 7656 se->load.weight = shares;
7855 se->load.inv_weight = div64_64((1ULL<<32), shares); 7657 se->load.inv_weight = div64_64((1ULL<<32), shares);
7856 7658
7857 if (on_rq) { 7659 if (on_rq)
7858 enqueue_entity(cfs_rq, se, 0); 7660 enqueue_entity(cfs_rq, se, 0);
7859 inc_cpu_load(rq, se->load.weight); 7661
7860 } 7662 spin_unlock_irq(&rq->lock);
7861} 7663}
7862 7664
7863static DEFINE_MUTEX(shares_mutex); 7665static DEFINE_MUTEX(shares_mutex);
@@ -7867,18 +7669,18 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7867 int i; 7669 int i;
7868 unsigned long flags; 7670 unsigned long flags;
7869 7671
7672 /*
7673 * A weight of 0 or 1 can cause arithmetic problems.
7674 * (The default weight is 1024 - so there's no practical
7675 * limitation from this.)
7676 */
7677 if (shares < 2)
7678 shares = 2;
7679
7870 mutex_lock(&shares_mutex); 7680 mutex_lock(&shares_mutex);
7871 if (tg->shares == shares) 7681 if (tg->shares == shares)
7872 goto done; 7682 goto done;
7873 7683
7874 if (shares < MIN_GROUP_SHARES)
7875 shares = MIN_GROUP_SHARES;
7876
7877 /*
7878 * Prevent any load balance activity (rebalance_shares,
7879 * load_balance_fair) from referring to this group first,
7880 * by taking it off the rq->leaf_cfs_rq_list on each cpu.
7881 */
7882 spin_lock_irqsave(&task_group_lock, flags); 7684 spin_lock_irqsave(&task_group_lock, flags);
7883 for_each_possible_cpu(i) 7685 for_each_possible_cpu(i)
7884 unregister_fair_sched_group(tg, i); 7686 unregister_fair_sched_group(tg, i);
@@ -7892,11 +7694,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7892 * w/o tripping rebalance_share or load_balance_fair. 7694 * w/o tripping rebalance_share or load_balance_fair.
7893 */ 7695 */
7894 tg->shares = shares; 7696 tg->shares = shares;
7895 for_each_possible_cpu(i) { 7697 for_each_possible_cpu(i)
7896 spin_lock_irq(&cpu_rq(i)->lock);
7897 set_se_shares(tg->se[i], shares); 7698 set_se_shares(tg->se[i], shares);
7898 spin_unlock_irq(&cpu_rq(i)->lock);
7899 }
7900 7699
7901 /* 7700 /*
7902 * Enable load balance activity on this group, by inserting it back on 7701 * Enable load balance activity on this group, by inserting it back on
@@ -7928,9 +7727,7 @@ static unsigned long to_ratio(u64 period, u64 runtime)
7928 if (runtime == RUNTIME_INF) 7727 if (runtime == RUNTIME_INF)
7929 return 1ULL << 16; 7728 return 1ULL << 16;
7930 7729
7931 runtime *= (1ULL << 16); 7730 return div64_64(runtime << 16, period);
7932 div64_64(runtime, period);
7933 return runtime;
7934} 7731}
7935 7732
7936static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 7733static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
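
The rewritten to_ratio() fixes a real bug: div64_64() returns its result rather than modifying its argument, so the old code computed runtime << 16, threw the quotient away, and returned the unscaled product. The new one-liner returns runtime/period in 16.16 fixed point, with RUNTIME_INF mapping to the full 1 << 16. A self-contained check of the arithmetic (userspace C, illustrative values only):

        #include <stdint.h>
        #include <stdio.h>

        /* 16.16 fixed-point runtime/period ratio, as in the fixed to_ratio(). */
        static uint64_t to_ratio(uint64_t period_ns, uint64_t runtime_ns)
        {
                return (runtime_ns << 16) / period_ns;
        }

        int main(void)
        {
                uint64_t period  = 1000000000ULL;       /* 1 s    */
                uint64_t runtime =  950000000ULL;       /* 0.95 s */

                printf("ratio = %llu (a full share would be %u)\n",
                       (unsigned long long)to_ratio(period, runtime), 1u << 16);
                return 0;                               /* prints 62259 */
        }
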
@@ -7954,25 +7751,40 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
7954 return total + to_ratio(period, runtime) < global_ratio; 7751 return total + to_ratio(period, runtime) < global_ratio;
7955} 7752}
7956 7753
7754/* Must be called with tasklist_lock held */
7755static inline int tg_has_rt_tasks(struct task_group *tg)
7756{
7757 struct task_struct *g, *p;
7758 do_each_thread(g, p) {
7759 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
7760 return 1;
7761 } while_each_thread(g, p);
7762 return 0;
7763}
7764
7957int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 7765int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7958{ 7766{
7959 u64 rt_runtime, rt_period; 7767 u64 rt_runtime, rt_period;
7960 int err = 0; 7768 int err = 0;
7961 7769
7962 rt_period = sysctl_sched_rt_period * NSEC_PER_USEC; 7770 rt_period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
7963 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; 7771 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
7964 if (rt_runtime_us == -1) 7772 if (rt_runtime_us == -1)
7965 rt_runtime = rt_period; 7773 rt_runtime = RUNTIME_INF;
7966 7774
7967 mutex_lock(&rt_constraints_mutex); 7775 mutex_lock(&rt_constraints_mutex);
7776 read_lock(&tasklist_lock);
7777 if (rt_runtime_us == 0 && tg_has_rt_tasks(tg)) {
7778 err = -EBUSY;
7779 goto unlock;
7780 }
7968 if (!__rt_schedulable(tg, rt_period, rt_runtime)) { 7781 if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
7969 err = -EINVAL; 7782 err = -EINVAL;
7970 goto unlock; 7783 goto unlock;
7971 } 7784 }
7972 if (rt_runtime_us == -1)
7973 rt_runtime = RUNTIME_INF;
7974 tg->rt_runtime = rt_runtime; 7785 tg->rt_runtime = rt_runtime;
7975 unlock: 7786 unlock:
7787 read_unlock(&tasklist_lock);
7976 mutex_unlock(&rt_constraints_mutex); 7788 mutex_unlock(&rt_constraints_mutex);
7977 7789
7978 return err; 7790 return err;
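
Three things change in sched_group_set_rt_runtime(): the period is widened to u64 before the NSEC_PER_USEC multiply (presumably to avoid a 32-bit overflow for long periods), -1 now maps straight to RUNTIME_INF, and a runtime of 0 is refused with -EBUSY while the group still contains real-time tasks, checked under read_lock(&tasklist_lock) so the thread list cannot change during the tg_has_rt_tasks() scan. The effect of the widening cast is easy to demonstrate on its own; in this userspace sketch period_us is a stand-in for the sysctl value, and the numbers are invented:

        #include <stdint.h>
        #include <stdio.h>

        #define NSEC_PER_USEC 1000ULL

        int main(void)
        {
                unsigned int period_us = 5 * 1000 * 1000;       /* a 5 s period, in us */

                /* Multiply done in 32 bits first: wraps around.        */
                uint64_t wrong = (uint64_t)(period_us * 1000u);
                /* Widen first, as the patched code does: no wraparound. */
                uint64_t right = (uint64_t)period_us * NSEC_PER_USEC;

                printf("wrong=%llu right=%llu\n",
                       (unsigned long long)wrong, (unsigned long long)right);
                return 0;
        }
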
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 6c091d6e159d..e2a530515619 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -202,17 +202,12 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
202 202
203static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) 203static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
204{ 204{
205 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; 205 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
206 struct sched_entity *se = NULL;
207 struct rb_node *parent;
208 206
209 while (*link) { 207 if (!last)
210 parent = *link; 208 return NULL;
211 se = rb_entry(parent, struct sched_entity, run_node);
212 link = &parent->rb_right;
213 }
214 209
215 return se; 210 return rb_entry(last, struct sched_entity, run_node);
216} 211}
217 212
218/************************************************************** 213/**************************************************************
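
__pick_last_entity() used to walk ->rb_right by hand; rb_last() performs exactly that walk and returns the rightmost node, or NULL for an empty tree, so the open-coded loop and its two temporaries go away. The same simplification applies to any "largest key" rbtree lookup; a sketch in kernel style, where struct item is a hypothetical node type and only rb_last()/rb_entry() are taken from the hunk:

        #include <linux/rbtree.h>
        #include <linux/types.h>

        struct item {
                struct rb_node node;
                u64 key;
        };

        /* Rightmost (largest-key) element, or NULL if the tree is empty. */
        static struct item *item_last(struct rb_root *root)
        {
                struct rb_node *last = rb_last(root);   /* walks ->rb_right internally */

                return last ? rb_entry(last, struct item, node) : NULL;
        }
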
@@ -732,8 +727,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
732 return se->parent; 727 return se->parent;
733} 728}
734 729
735#define GROUP_IMBALANCE_PCT 20
736
737#else /* CONFIG_FAIR_GROUP_SCHED */ 730#else /* CONFIG_FAIR_GROUP_SCHED */
738 731
739#define for_each_sched_entity(se) \ 732#define for_each_sched_entity(se) \
@@ -824,26 +817,15 @@ hrtick_start_fair(struct rq *rq, struct task_struct *p)
824static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) 817static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
825{ 818{
826 struct cfs_rq *cfs_rq; 819 struct cfs_rq *cfs_rq;
827 struct sched_entity *se = &p->se, 820 struct sched_entity *se = &p->se;
828 *topse = NULL; /* Highest schedulable entity */
829 int incload = 1;
830 821
831 for_each_sched_entity(se) { 822 for_each_sched_entity(se) {
832 topse = se; 823 if (se->on_rq)
833 if (se->on_rq) {
834 incload = 0;
835 break; 824 break;
836 }
837 cfs_rq = cfs_rq_of(se); 825 cfs_rq = cfs_rq_of(se);
838 enqueue_entity(cfs_rq, se, wakeup); 826 enqueue_entity(cfs_rq, se, wakeup);
839 wakeup = 1; 827 wakeup = 1;
840 } 828 }
841 /* Increment cpu load if we just enqueued the first task of a group on
842 * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
843 * at the highest grouping level.
844 */
845 if (incload)
846 inc_cpu_load(rq, topse->load.weight);
847 829
848 hrtick_start_fair(rq, rq->curr); 830 hrtick_start_fair(rq, rq->curr);
849} 831}
@@ -856,28 +838,16 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
856static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) 838static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
857{ 839{
858 struct cfs_rq *cfs_rq; 840 struct cfs_rq *cfs_rq;
859 struct sched_entity *se = &p->se, 841 struct sched_entity *se = &p->se;
860 *topse = NULL; /* Highest schedulable entity */
861 int decload = 1;
862 842
863 for_each_sched_entity(se) { 843 for_each_sched_entity(se) {
864 topse = se;
865 cfs_rq = cfs_rq_of(se); 844 cfs_rq = cfs_rq_of(se);
866 dequeue_entity(cfs_rq, se, sleep); 845 dequeue_entity(cfs_rq, se, sleep);
867 /* Don't dequeue parent if it has other entities besides us */ 846 /* Don't dequeue parent if it has other entities besides us */
868 if (cfs_rq->load.weight) { 847 if (cfs_rq->load.weight)
869 if (parent_entity(se))
870 decload = 0;
871 break; 848 break;
872 }
873 sleep = 1; 849 sleep = 1;
874 } 850 }
875 /* Decrement cpu load if we just dequeued the last task of a group on
876 * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
877 * at the highest grouping level.
878 */
879 if (decload)
880 dec_cpu_load(rq, topse->load.weight);
881 851
882 hrtick_start_fair(rq, rq->curr); 852 hrtick_start_fair(rq, rq->curr);
883} 853}
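
Both paths now walk the entity hierarchy with no side bookkeeping: enqueue stops at the first ancestor that is already on a runqueue, dequeue stops at the first ancestor that still has weight after removing the child, and the topse/incload/decload machinery with its explicit inc_cpu_load()/dec_cpu_load() calls is gone (the per-rq load accounting is presumably handled in the entity enqueue/dequeue paths elsewhere in this patch). A comment-annotated sketch of the hierarchy and the enqueue-direction walk, mirroring the hunk:

        /*
         * With CONFIG_FAIR_GROUP_SCHED a task's scheduling entity has one
         * group-level parent per level of the task-group hierarchy:
         *
         *   task se -> group "users/alice" se -> group "users" se -> NULL
         *
         * for_each_sched_entity(se) climbs that chain.
         */
        for_each_sched_entity(se) {
                if (se->on_rq)
                        break;          /* the rest of the chain is already queued */
                cfs_rq = cfs_rq_of(se);
                enqueue_entity(cfs_rq, se, wakeup);
                wakeup = 1;             /* ancestors wake as a consequence */
        }
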
@@ -1191,6 +1161,25 @@ static struct task_struct *load_balance_next_fair(void *arg)
1191 return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); 1161 return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
1192} 1162}
1193 1163
1164#ifdef CONFIG_FAIR_GROUP_SCHED
1165static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
1166{
1167 struct sched_entity *curr;
1168 struct task_struct *p;
1169
1170 if (!cfs_rq->nr_running || !first_fair(cfs_rq))
1171 return MAX_PRIO;
1172
1173 curr = cfs_rq->curr;
1174 if (!curr)
1175 curr = __pick_next_entity(cfs_rq);
1176
1177 p = task_of(curr);
1178
1179 return p->prio;
1180}
1181#endif
1182
1194static unsigned long 1183static unsigned long
1195load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1184load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1196 unsigned long max_load_move, 1185 unsigned long max_load_move,
@@ -1200,45 +1189,28 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1200 struct cfs_rq *busy_cfs_rq; 1189 struct cfs_rq *busy_cfs_rq;
1201 long rem_load_move = max_load_move; 1190 long rem_load_move = max_load_move;
1202 struct rq_iterator cfs_rq_iterator; 1191 struct rq_iterator cfs_rq_iterator;
1203 unsigned long load_moved;
1204 1192
1205 cfs_rq_iterator.start = load_balance_start_fair; 1193 cfs_rq_iterator.start = load_balance_start_fair;
1206 cfs_rq_iterator.next = load_balance_next_fair; 1194 cfs_rq_iterator.next = load_balance_next_fair;
1207 1195
1208 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { 1196 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
1209#ifdef CONFIG_FAIR_GROUP_SCHED 1197#ifdef CONFIG_FAIR_GROUP_SCHED
1210 struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu]; 1198 struct cfs_rq *this_cfs_rq;
1211 unsigned long maxload, task_load, group_weight; 1199 long imbalance;
1212 unsigned long thisload, per_task_load; 1200 unsigned long maxload;
1213 struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu];
1214
1215 task_load = busy_cfs_rq->load.weight;
1216 group_weight = se->load.weight;
1217 1201
1218 /* 1202 this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
1219 * 'group_weight' is contributed by tasks of total weight
1220 * 'task_load'. To move 'rem_load_move' worth of weight only,
1221 * we need to move a maximum task load of:
1222 *
1223 * maxload = (remload / group_weight) * task_load;
1224 */
1225 maxload = (rem_load_move * task_load) / group_weight;
1226 1203
1227 if (!maxload || !task_load) 1204 imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
1205 /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
1206 if (imbalance <= 0)
1228 continue; 1207 continue;
1229 1208
1230 per_task_load = task_load / busy_cfs_rq->nr_running; 1209 /* Don't pull more than imbalance/2 */
1231 /* 1210 imbalance /= 2;
1232 * balance_tasks will try to forcibly move atleast one task if 1211 maxload = min(rem_load_move, imbalance);
1233 * possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if
1234 * maxload is less than GROUP_IMBALANCE_FUZZ% the per_task_load.
1235 */
1236 if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load)
1237 continue;
1238 1212
1239 /* Disable priority-based load balance */ 1213 *this_best_prio = cfs_rq_best_prio(this_cfs_rq);
1240 *this_best_prio = 0;
1241 thisload = this_cfs_rq->load.weight;
1242#else 1214#else
1243# define maxload rem_load_move 1215# define maxload rem_load_move
1244#endif 1216#endif
@@ -1247,33 +1219,11 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1247 * load_balance_[start|next]_fair iterators 1219 * load_balance_[start|next]_fair iterators
1248 */ 1220 */
1249 cfs_rq_iterator.arg = busy_cfs_rq; 1221 cfs_rq_iterator.arg = busy_cfs_rq;
1250 load_moved = balance_tasks(this_rq, this_cpu, busiest, 1222 rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
1251 maxload, sd, idle, all_pinned, 1223 maxload, sd, idle, all_pinned,
1252 this_best_prio, 1224 this_best_prio,
1253 &cfs_rq_iterator); 1225 &cfs_rq_iterator);
1254 1226
1255#ifdef CONFIG_FAIR_GROUP_SCHED
1256 /*
1257 * load_moved holds the task load that was moved. The
1258 * effective (group) weight moved would be:
1259 * load_moved_eff = load_moved/task_load * group_weight;
1260 */
1261 load_moved = (group_weight * load_moved) / task_load;
1262
1263 /* Adjust shares on both cpus to reflect load_moved */
1264 group_weight -= load_moved;
1265 set_se_shares(se, group_weight);
1266
1267 se = busy_cfs_rq->tg->se[this_cpu];
1268 if (!thisload)
1269 group_weight = load_moved;
1270 else
1271 group_weight = se->load.weight + load_moved;
1272 set_se_shares(se, group_weight);
1273#endif
1274
1275 rem_load_move -= load_moved;
1276
1277 if (rem_load_move <= 0) 1227 if (rem_load_move <= 0)
1278 break; 1228 break;
1279 } 1229 }
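
The per-group balancing heuristic is now much simpler: for each busy group runqueue, compute the weight imbalance against the matching group runqueue on this CPU, skip the group if this CPU already carries as much or more, and otherwise pull at most half the imbalance, further capped by whatever load the caller still wants moved; the priority hint comes from cfs_rq_best_prio() on the destination instead of being forced to 0. The arithmetic is easy to sanity-check in isolation (userspace C, numbers invented for illustration):

        #include <stdio.h>

        static unsigned long pull_budget(long busy_weight, long this_weight,
                                         unsigned long rem_load_move)
        {
                long imbalance = busy_weight - this_weight;

                if (imbalance <= 0)             /* we already have at least as much */
                        return 0;
                imbalance /= 2;                 /* never pull more than half the gap */
                return rem_load_move < (unsigned long)imbalance ?
                                rem_load_move : (unsigned long)imbalance;
        }

        int main(void)
        {
                /* busy CPU holds 3072 weight in this group, we hold 1024, and the
                 * caller still wants 2048 moved overall: pull at most 1024. */
                printf("%lu\n", pull_budget(3072, 1024, 2048));
                return 0;
        }
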
@@ -1403,6 +1353,16 @@ static void set_curr_task_fair(struct rq *rq)
1403 set_next_entity(cfs_rq_of(se), se); 1353 set_next_entity(cfs_rq_of(se), se);
1404} 1354}
1405 1355
1356#ifdef CONFIG_FAIR_GROUP_SCHED
1357static void moved_group_fair(struct task_struct *p)
1358{
1359 struct cfs_rq *cfs_rq = task_cfs_rq(p);
1360
1361 update_curr(cfs_rq);
1362 place_entity(cfs_rq, &p->se, 1);
1363}
1364#endif
1365
1406/* 1366/*
1407 * All the scheduling class methods: 1367 * All the scheduling class methods:
1408 */ 1368 */
@@ -1431,6 +1391,10 @@ static const struct sched_class fair_sched_class = {
1431 1391
1432 .prio_changed = prio_changed_fair, 1392 .prio_changed = prio_changed_fair,
1433 .switched_to = switched_to_fair, 1393 .switched_to = switched_to_fair,
1394
1395#ifdef CONFIG_FAIR_GROUP_SCHED
1396 .moved_group = moved_group_fair,
1397#endif
1434}; 1398};
1435 1399
1436#ifdef CONFIG_SCHED_DEBUG 1400#ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index f54792b175b2..0a6d2e516420 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -393,8 +393,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
393 */ 393 */
394 for_each_sched_rt_entity(rt_se) 394 for_each_sched_rt_entity(rt_se)
395 enqueue_rt_entity(rt_se); 395 enqueue_rt_entity(rt_se);
396
397 inc_cpu_load(rq, p->se.load.weight);
398} 396}
399 397
400static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) 398static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -414,8 +412,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
414 if (rt_rq && rt_rq->rt_nr_running) 412 if (rt_rq && rt_rq->rt_nr_running)
415 enqueue_rt_entity(rt_se); 413 enqueue_rt_entity(rt_se);
416 } 414 }
417
418 dec_cpu_load(rq, p->se.load.weight);
419} 415}
420 416
421/* 417/*
@@ -1111,9 +1107,11 @@ static void prio_changed_rt(struct rq *rq, struct task_struct *p,
1111 pull_rt_task(rq); 1107 pull_rt_task(rq);
1112 /* 1108 /*
1113 * If there's a higher priority task waiting to run 1109 * If there's a higher priority task waiting to run
1114 * then reschedule. 1110 * then reschedule. Note, the above pull_rt_task
1111 * can release the rq lock and p could migrate.
1112 * Only reschedule if p is still on the same runqueue.
1115 */ 1113 */
1116 if (p->prio > rq->rt.highest_prio) 1114 if (p->prio > rq->rt.highest_prio && rq->curr == p)
1117 resched_task(p); 1115 resched_task(p);
1118#else 1116#else
1119 /* For UP simply resched on drop of prio */ 1117 /* For UP simply resched on drop of prio */
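
The added condition closes a small race: pull_rt_task() may drop and retake the runqueue lock, and while it is dropped p can be migrated or switched out, so rescheduling it on stale assumptions would be wrong. Revalidating rq->curr == p after the potentially lock-dropping call is the standard pattern; condensed as a fragment, with names taken from the hunk:

        /* rq->lock is held here, but pull_rt_task() may release and re-acquire
         * it, so anything learned before the call must be revalidated after. */
        pull_rt_task(rq);

        if (p->prio > rq->rt.highest_prio && rq->curr == p)
                resched_task(p);        /* p still runs here and is now outranked */
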
diff --git a/kernel/signal.c b/kernel/signal.c
index 84917fe507f7..6af1210092c3 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1623,7 +1623,6 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1623 /* Let the debugger run. */ 1623 /* Let the debugger run. */
1624 __set_current_state(TASK_TRACED); 1624 __set_current_state(TASK_TRACED);
1625 spin_unlock_irq(&current->sighand->siglock); 1625 spin_unlock_irq(&current->sighand->siglock);
1626 try_to_freeze();
1627 read_lock(&tasklist_lock); 1626 read_lock(&tasklist_lock);
1628 if (!unlikely(killed) && may_ptrace_stop()) { 1627 if (!unlikely(killed) && may_ptrace_stop()) {
1629 do_notify_parent_cldstop(current, CLD_TRAPPED); 1628 do_notify_parent_cldstop(current, CLD_TRAPPED);
@@ -1641,6 +1640,13 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1641 } 1640 }
1642 1641
1643 /* 1642 /*
1643 * While in TASK_TRACED, we were considered "frozen enough".
1644 * Now that we woke up, it's crucial if we're supposed to be
1645 * frozen that we freeze now before running anything substantial.
1646 */
1647 try_to_freeze();
1648
1649 /*
1644 * We are back. Now reacquire the siglock before touching 1650 * We are back. Now reacquire the siglock before touching
1645 * last_siginfo, so that we are sure to have synchronized with 1651 * last_siginfo, so that we are sure to have synchronized with
1646 * any signal-sending on another CPU that wants to examine it. 1652 * any signal-sending on another CPU that wants to examine it.
@@ -1757,9 +1763,15 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
1757 sigset_t *mask = &current->blocked; 1763 sigset_t *mask = &current->blocked;
1758 int signr = 0; 1764 int signr = 0;
1759 1765
1766relock:
1767 /*
1768 * We'll jump back here after any time we were stopped in TASK_STOPPED.
1769 * While in TASK_STOPPED, we were considered "frozen enough".
1770 * Now that we woke up, it's crucial if we're supposed to be
1771 * frozen that we freeze now before running anything substantial.
1772 */
1760 try_to_freeze(); 1773 try_to_freeze();
1761 1774
1762relock:
1763 spin_lock_irq(&current->sighand->siglock); 1775 spin_lock_irq(&current->sighand->siglock);
1764 for (;;) { 1776 for (;;) {
1765 struct k_sigaction *ka; 1777 struct k_sigaction *ka;
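
Both signal.c hunks move try_to_freeze() to the moment the task has just come out of TASK_TRACED or TASK_STOPPED: while it sat in those states the freezer treated it as frozen enough, so once it wakes it must freeze for real before running anything substantial, and placing the relock label above the call means a task that stops again inside the signal loop re-hits the freeze point every time around. A control-flow sketch only; handle_one_signal() and stopped_then_continued() are hypothetical placeholders, not kernel functions:

        relock:
                /* We may have just thawed out of TASK_STOPPED/TASK_TRACED; honour
                 * a pending freeze request before doing anything substantial. */
                try_to_freeze();

                spin_lock_irq(&current->sighand->siglock);
                for (;;) {
                        if (stopped_then_continued()) {
                                /* the stop path dropped siglock and slept; re-run
                                 * the freeze check before retaking the lock */
                                goto relock;
                        }
                        if (!handle_one_signal())
                                break;
                }
                spin_unlock_irq(&current->sighand->siglock);
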
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 5b3aea5f471e..31e9f2a47928 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -313,6 +313,7 @@ void irq_exit(void)
313 /* Make sure that timer wheel updates are propagated */ 313 /* Make sure that timer wheel updates are propagated */
314 if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched()) 314 if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched())
315 tick_nohz_stop_sched_tick(); 315 tick_nohz_stop_sched_tick();
316 rcu_irq_exit();
316#endif 317#endif
317 preempt_enable_no_resched(); 318 preempt_enable_no_resched();
318} 319}
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 7c2da88db4ed..01b6522fd92b 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -216,26 +216,27 @@ static int watchdog(void *__bind_cpu)
216 /* initialize timestamp */ 216 /* initialize timestamp */
217 touch_softlockup_watchdog(); 217 touch_softlockup_watchdog();
218 218
219 set_current_state(TASK_INTERRUPTIBLE);
219 /* 220 /*
220 * Run briefly once per second to reset the softlockup timestamp. 221 * Run briefly once per second to reset the softlockup timestamp.
221 * If this gets delayed for more than 60 seconds then the 222 * If this gets delayed for more than 60 seconds then the
222 * debug-printout triggers in softlockup_tick(). 223 * debug-printout triggers in softlockup_tick().
223 */ 224 */
224 while (!kthread_should_stop()) { 225 while (!kthread_should_stop()) {
225 set_current_state(TASK_INTERRUPTIBLE);
226 touch_softlockup_watchdog(); 226 touch_softlockup_watchdog();
227 schedule(); 227 schedule();
228 228
229 if (kthread_should_stop()) 229 if (kthread_should_stop())
230 break; 230 break;
231 231
232 if (this_cpu != check_cpu) 232 if (this_cpu == check_cpu) {
233 continue; 233 if (sysctl_hung_task_timeout_secs)
234 234 check_hung_uninterruptible_tasks(this_cpu);
235 if (sysctl_hung_task_timeout_secs) 235 }
236 check_hung_uninterruptible_tasks(this_cpu);
237 236
237 set_current_state(TASK_INTERRUPTIBLE);
238 } 238 }
239 __set_current_state(TASK_RUNNING);
239 240
240 return 0; 241 return 0;
241} 242}
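
The watchdog loop is reshaped into the canonical kthread sleep pattern: mark the task TASK_INTERRUPTIBLE before the loop condition is tested (and again at the bottom of each iteration, before the next test), so a wakeup racing with kthread_should_stop()/schedule() is not lost, and restore TASK_RUNNING on the way out. The same skeleton works for any periodic kernel thread; a sketch with a hypothetical body:

        #include <linux/kthread.h>
        #include <linux/sched.h>

        static int my_watchdog(void *arg)               /* hypothetical kthread */
        {
                set_current_state(TASK_INTERRUPTIBLE);
                while (!kthread_should_stop()) {
                        /* already INTERRUPTIBLE, so a wakeup arriving between the
                         * test above and schedule() just makes schedule() return
                         * immediately instead of being lost */
                        schedule();

                        if (kthread_should_stop())
                                break;

                        /* ... do one round of work ... */

                        set_current_state(TASK_INTERRUPTIBLE); /* before re-testing */
                }
                __set_current_state(TASK_RUNNING);
                return 0;
        }
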
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8b7e95411795..b2a2d6889bab 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -311,24 +311,6 @@ static struct ctl_table kern_table[] = {
311 .mode = 0644, 311 .mode = 0644,
312 .proc_handler = &proc_dointvec, 312 .proc_handler = &proc_dointvec,
313 }, 313 },
314#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
315 {
316 .ctl_name = CTL_UNNUMBERED,
317 .procname = "sched_min_bal_int_shares",
318 .data = &sysctl_sched_min_bal_int_shares,
319 .maxlen = sizeof(unsigned int),
320 .mode = 0644,
321 .proc_handler = &proc_dointvec,
322 },
323 {
324 .ctl_name = CTL_UNNUMBERED,
325 .procname = "sched_max_bal_int_shares",
326 .data = &sysctl_sched_max_bal_int_shares,
327 .maxlen = sizeof(unsigned int),
328 .mode = 0644,
329 .proc_handler = &proc_dointvec,
330 },
331#endif
332#endif 314#endif
333 { 315 {
334 .ctl_name = CTL_UNNUMBERED, 316 .ctl_name = CTL_UNNUMBERED,
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index c88b5910e7ab..5fd9b9469770 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -42,12 +42,13 @@ long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
42long time_freq; /* frequency offset (scaled ppm)*/ 42long time_freq; /* frequency offset (scaled ppm)*/
43static long time_reftime; /* time at last adjustment (s) */ 43static long time_reftime; /* time at last adjustment (s) */
44long time_adjust; 44long time_adjust;
45static long ntp_tick_adj;
45 46
46static void ntp_update_frequency(void) 47static void ntp_update_frequency(void)
47{ 48{
48 u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) 49 u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
49 << TICK_LENGTH_SHIFT; 50 << TICK_LENGTH_SHIFT;
50 second_length += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT; 51 second_length += (s64)ntp_tick_adj << TICK_LENGTH_SHIFT;
51 second_length += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); 52 second_length += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC);
52 53
53 tick_length_base = second_length; 54 tick_length_base = second_length;
@@ -342,14 +343,16 @@ int do_adjtimex(struct timex *txc)
342 freq_adj = shift_right(freq_adj, time_constant * 2 + 343 freq_adj = shift_right(freq_adj, time_constant * 2 +
343 (SHIFT_PLL + 2) * 2 - SHIFT_NSEC); 344 (SHIFT_PLL + 2) * 2 - SHIFT_NSEC);
344 if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) { 345 if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) {
346 u64 utemp64;
345 temp64 = time_offset << (SHIFT_NSEC - SHIFT_FLL); 347 temp64 = time_offset << (SHIFT_NSEC - SHIFT_FLL);
346 if (time_offset < 0) { 348 if (time_offset < 0) {
347 temp64 = -temp64; 349 utemp64 = -temp64;
348 do_div(temp64, mtemp); 350 do_div(utemp64, mtemp);
349 freq_adj -= temp64; 351 freq_adj -= utemp64;
350 } else { 352 } else {
351 do_div(temp64, mtemp); 353 utemp64 = temp64;
352 freq_adj += temp64; 354 do_div(utemp64, mtemp);
355 freq_adj += utemp64;
353 } 356 }
354 } 357 }
355 freq_adj += time_freq; 358 freq_adj += time_freq;
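
do_div() divides a 64-bit dividend in place by a 32-bit divisor and returns the remainder, and the dividend is expected to be unsigned; feeding it a negative s64 gives nonsense on 32-bit architectures. time_offset can be negative here, so the patch copies the magnitude into the unsigned temporary utemp64, divides that, and re-applies the sign to freq_adj afterwards. The sign-handling idiom in isolation (fragment; time_offset, mtemp and freq_adj are the do_adjtimex() variables from the hunk):

        s64 temp64 = time_offset << (SHIFT_NSEC - SHIFT_FLL);
        u64 utemp64;

        if (time_offset < 0) {
                utemp64 = -temp64;              /* divide the magnitude ...   */
                do_div(utemp64, mtemp);
                freq_adj -= utemp64;            /* ... and re-apply the sign  */
        } else {
                utemp64 = temp64;
                do_div(utemp64, mtemp);
                freq_adj += utemp64;
        }
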
@@ -400,3 +403,11 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
400 notify_cmos_timer(); 403 notify_cmos_timer();
401 return(result); 404 return(result);
402} 405}
406
407static int __init ntp_tick_adj_setup(char *str)
408{
409 ntp_tick_adj = simple_strtol(str, NULL, 0);
410 return 1;
411}
412
413__setup("ntp_tick_adj=", ntp_tick_adj_setup);
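
The compile-time CLOCK_TICK_ADJUST fudge is replaced by a runtime knob: ntp_tick_adj is folded into the tick length in ntp_update_frequency() and can now be set from the kernel command line, e.g. by booting with ntp_tick_adj=500. The __setup() hook is the standard way to wire up such an early parameter; a minimal analogous example with a hypothetical parameter name:

        #include <linux/init.h>
        #include <linux/kernel.h>

        static long my_knob;                            /* hypothetical tunable */

        static int __init my_knob_setup(char *str)
        {
                my_knob = simple_strtol(str, NULL, 0);  /* decimal, 0x... or 0... */
                return 1;                               /* 1 = option consumed */
        }
        __setup("my_knob=", my_knob_setup);
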
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index fa9bb73dbdb4..686da821d376 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -282,6 +282,7 @@ void tick_nohz_stop_sched_tick(void)
282 ts->idle_tick = ts->sched_timer.expires; 282 ts->idle_tick = ts->sched_timer.expires;
283 ts->tick_stopped = 1; 283 ts->tick_stopped = 1;
284 ts->idle_jiffies = last_jiffies; 284 ts->idle_jiffies = last_jiffies;
285 rcu_enter_nohz();
285 } 286 }
286 287
287 /* 288 /*
@@ -375,6 +376,8 @@ void tick_nohz_restart_sched_tick(void)
375 return; 376 return;
376 } 377 }
377 378
379 rcu_exit_nohz();
380
378 /* Update jiffies first */ 381 /* Update jiffies first */
379 select_nohz_load_balancer(0); 382 select_nohz_load_balancer(0);
380 now = ktime_get(); 383 now = ktime_get();
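
These two calls bracket the tickless-idle window for the new preemptible-RCU code: rcu_enter_nohz() tells RCU the CPU is about to stop taking scheduler ticks and must not be waited on for grace periods, and rcu_exit_nohz() undoes that as soon as the tick is restarted; the rcu_irq_exit() added in irq_exit() above appears to play the matching role when an interrupt returns into an already-tickless CPU. The essential point is that the calls pair cleanly around the no-tick region; sketched as an abbreviated fragment, with only the function names taken from this diff:

        /* entering nohz idle */
        ts->tick_stopped = 1;
        rcu_enter_nohz();       /* CPU stops participating in grace periods   */

        /* ... CPU idles with the periodic tick off ... */

        /* leaving nohz idle */
        rcu_exit_nohz();        /* account for this CPU's read sides again    */
        /* ... restart the tick, update jiffies ... */
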
@@ -637,7 +640,7 @@ void tick_cancel_sched_timer(int cpu)
637 640
638 if (ts->sched_timer.base) 641 if (ts->sched_timer.base)
639 hrtimer_cancel(&ts->sched_timer); 642 hrtimer_cancel(&ts->sched_timer);
640 ts->tick_stopped = 0; 643
641 ts->nohz_mode = NOHZ_MODE_INACTIVE; 644 ts->nohz_mode = NOHZ_MODE_INACTIVE;
642} 645}
643#endif /* HIGH_RES_TIMERS */ 646#endif /* HIGH_RES_TIMERS */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 1af9fb050fe2..671af612b768 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -187,8 +187,7 @@ static void change_clocksource(void)
187 187
188 clock->error = 0; 188 clock->error = 0;
189 clock->xtime_nsec = 0; 189 clock->xtime_nsec = 0;
190 clocksource_calculate_interval(clock, 190 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
191 (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT));
192 191
193 tick_clock_notify(); 192 tick_clock_notify();
194 193
@@ -245,8 +244,7 @@ void __init timekeeping_init(void)
245 ntp_clear(); 244 ntp_clear();
246 245
247 clock = clocksource_get_next(); 246 clock = clocksource_get_next();
248 clocksource_calculate_interval(clock, 247 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
249 (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT));
250 clock->cycle_last = clocksource_read(clock); 248 clock->cycle_last = clocksource_read(clock);
251 249
252 xtime.tv_sec = sec; 250 xtime.tv_sec = sec;