 Documentation/kernel-parameters.txt |   3
 include/linux/completion.h          |  18
 include/linux/sched.h               |  37
 init/Kconfig                        |   1
 kernel/profile.c                    |   5
 kernel/sched.c                      | 330
 kernel/sched_fair.c                 |  48
 kernel/sched_idletask.c             |  18
 kernel/sched_rt.c                   |  32
 kernel/user.c                       |   5
 10 files changed, 292 insertions(+), 205 deletions(-)
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index a13d69b2217d..8ae5fac08dfa 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1444,7 +1444,8 @@ and is between 256 and 4096 characters. It is defined in the file
 		Param: "schedule" - profile schedule points.
 		Param: <number> - step/bucket size as a power of 2 for
 			statistical time based profiling.
-		Param: "sleep" - profile D-state sleeping (millisecs)
+		Param: "sleep" - profile D-state sleeping (millisecs).
+			Requires CONFIG_SCHEDSTATS
 		Param: "kvm" - profile VM exits.
 
 processor.max_cstate=	[HW,ACPI]
diff --git a/include/linux/completion.h b/include/linux/completion.h
index 268c5a4a2bd4..33d6aaf94447 100644
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -42,15 +42,15 @@ static inline void init_completion(struct completion *x)
 	init_waitqueue_head(&x->wait);
 }
 
-extern void FASTCALL(wait_for_completion(struct completion *));
-extern int FASTCALL(wait_for_completion_interruptible(struct completion *x));
-extern unsigned long FASTCALL(wait_for_completion_timeout(struct completion *x,
-						   unsigned long timeout));
-extern unsigned long FASTCALL(wait_for_completion_interruptible_timeout(
-			struct completion *x, unsigned long timeout));
+extern void wait_for_completion(struct completion *);
+extern int wait_for_completion_interruptible(struct completion *x);
+extern unsigned long wait_for_completion_timeout(struct completion *x,
+						   unsigned long timeout);
+extern unsigned long wait_for_completion_interruptible_timeout(
+			struct completion *x, unsigned long timeout);
 
-extern void FASTCALL(complete(struct completion *));
-extern void FASTCALL(complete_all(struct completion *));
+extern void complete(struct completion *);
+extern void complete_all(struct completion *);
 
 #define INIT_COMPLETION(x)	((x).done = 0)
 
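
For context, dropping FASTCALL() changes only the calling-convention annotation; the completion API is used exactly as before. A minimal sketch of the usual pairing (hypothetical driver code, not part of this patch):

	#include <linux/completion.h>
	#include <linux/kthread.h>

	static DECLARE_COMPLETION(setup_done);

	static int worker_fn(void *unused)
	{
		/* ... one-time setup work ... */
		complete(&setup_done);	/* wakes the waiter below */
		return 0;
	}

	static int __init example_init(void)
	{
		kthread_run(worker_fn, NULL, "example-worker");
		/* sleeps uninterruptibly until complete() runs */
		wait_for_completion(&setup_done);
		return 0;
	}
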
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 13df99fb2769..24e08d1d900d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -828,12 +828,17 @@ struct sched_class {
 	struct task_struct * (*pick_next_task) (struct rq *rq);
 	void (*put_prev_task) (struct rq *rq, struct task_struct *p);
 
+#ifdef CONFIG_SMP
 	unsigned long (*load_balance) (struct rq *this_rq, int this_cpu,
-			struct rq *busiest,
-			unsigned long max_nr_move, unsigned long max_load_move,
+			struct rq *busiest, unsigned long max_load_move,
 			struct sched_domain *sd, enum cpu_idle_type idle,
 			int *all_pinned, int *this_best_prio);
 
+	int (*move_one_task) (struct rq *this_rq, int this_cpu,
+			      struct rq *busiest, struct sched_domain *sd,
+			      enum cpu_idle_type idle);
+#endif
+
 	void (*set_curr_task) (struct rq *rq);
 	void (*task_tick) (struct rq *rq, struct task_struct *p);
 	void (*task_new) (struct rq *rq, struct task_struct *p);
@@ -1196,7 +1201,7 @@ static inline int rt_prio(int prio)
 	return 0;
 }
 
-static inline int rt_task(struct task_struct *p)
+static inline int rt_task(const struct task_struct *p)
 {
 	return rt_prio(p->prio);
 }
@@ -1211,22 +1216,22 @@ static inline void set_task_pgrp(struct task_struct *tsk, pid_t pgrp)
 	tsk->signal->__pgrp = pgrp;
 }
 
-static inline struct pid *task_pid(struct task_struct *task)
+static inline struct pid *task_pid(const struct task_struct *task)
 {
 	return task->pids[PIDTYPE_PID].pid;
 }
 
-static inline struct pid *task_tgid(struct task_struct *task)
+static inline struct pid *task_tgid(const struct task_struct *task)
 {
 	return task->group_leader->pids[PIDTYPE_PID].pid;
 }
 
-static inline struct pid *task_pgrp(struct task_struct *task)
+static inline struct pid *task_pgrp(const struct task_struct *task)
 {
 	return task->group_leader->pids[PIDTYPE_PGID].pid;
 }
 
-static inline struct pid *task_session(struct task_struct *task)
+static inline struct pid *task_session(const struct task_struct *task)
 {
 	return task->group_leader->pids[PIDTYPE_SID].pid;
 }
@@ -1255,7 +1260,7 @@ struct pid_namespace;
  * see also pid_nr() etc in include/linux/pid.h
  */
 
-static inline pid_t task_pid_nr(struct task_struct *tsk)
+static inline pid_t task_pid_nr(const struct task_struct *tsk)
 {
 	return tsk->pid;
 }
@@ -1268,7 +1273,7 @@ static inline pid_t task_pid_vnr(struct task_struct *tsk)
 }
 
 
-static inline pid_t task_tgid_nr(struct task_struct *tsk)
+static inline pid_t task_tgid_nr(const struct task_struct *tsk)
 {
 	return tsk->tgid;
 }
@@ -1281,7 +1286,7 @@ static inline pid_t task_tgid_vnr(struct task_struct *tsk)
 }
 
 
-static inline pid_t task_pgrp_nr(struct task_struct *tsk)
+static inline pid_t task_pgrp_nr(const struct task_struct *tsk)
 {
 	return tsk->signal->__pgrp;
 }
@@ -1294,7 +1299,7 @@ static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
 }
 
 
-static inline pid_t task_session_nr(struct task_struct *tsk)
+static inline pid_t task_session_nr(const struct task_struct *tsk)
 {
 	return tsk->signal->__session;
 }
@@ -1321,7 +1326,7 @@ static inline pid_t task_ppid_nr_ns(struct task_struct *tsk,
  * If pid_alive fails, then pointers within the task structure
  * can be stale and must not be dereferenced.
  */
-static inline int pid_alive(struct task_struct *p)
+static inline int pid_alive(const struct task_struct *p)
 {
 	return p->pids[PIDTYPE_PID].pid != NULL;
 }
@@ -1332,7 +1337,7 @@ static inline int pid_alive(struct task_struct *p)
  *
  * Check if a task structure is the first user space task the kernel created.
  */
-static inline int is_global_init(struct task_struct *tsk)
+static inline int is_global_init(const struct task_struct *tsk)
 {
 	return tsk->pid == 1;
 }
@@ -1469,7 +1474,7 @@ extern int rt_mutex_getprio(struct task_struct *p);
 extern void rt_mutex_setprio(struct task_struct *p, int prio);
 extern void rt_mutex_adjust_pi(struct task_struct *p);
 #else
-static inline int rt_mutex_getprio(struct task_struct *p)
+static inline int rt_mutex_getprio(const struct task_struct *p)
 {
 	return p->normal_prio;
 }
@@ -1721,7 +1726,7 @@ extern void wait_task_inactive(struct task_struct * p);
  * all we care about is that we have a task with the appropriate
  * pid, we don't actually care if we have the right task.
  */
-static inline int has_group_leader_pid(struct task_struct *p)
+static inline int has_group_leader_pid(const struct task_struct *p)
 {
 	return p->pid == p->tgid;
 }
@@ -1738,7 +1743,7 @@ static inline struct task_struct *next_thread(const struct task_struct *p)
 			  struct task_struct, thread_group);
 }
 
-static inline int thread_group_empty(struct task_struct *p)
+static inline int thread_group_empty(const struct task_struct *p)
 {
 	return list_empty(&p->thread_group);
 }
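
The const qualifiers added above let read-only task inspectors accept const pointers themselves, so const-correctness can propagate up the call chain. A hypothetical caller (not from this patch) showing the effect:

	/* compiles only because rt_task() and task_pid_nr() take const */
	static void report_task(const struct task_struct *p)
	{
		printk(KERN_DEBUG "pid %d is %s\n", task_pid_nr(p),
		       rt_task(p) ? "realtime" : "normal");
	}
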
diff --git a/init/Kconfig b/init/Kconfig
index b7dffa837926..8b88d0bedcbd 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -322,7 +322,6 @@ config CPUSETS
 config FAIR_GROUP_SCHED
 	bool "Fair group CPU scheduler"
 	default y
-	depends on EXPERIMENTAL
 	help
 	  This feature lets CPU scheduler recognize task groups and control CPU
 	  bandwidth allocation to such task groups.
diff --git a/kernel/profile.c b/kernel/profile.c
index 631b75c25d7e..5e95330e5120 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -60,6 +60,7 @@ static int __init profile_setup(char * str)
 	int par;
 
 	if (!strncmp(str, sleepstr, strlen(sleepstr))) {
+#ifdef CONFIG_SCHEDSTATS
 		prof_on = SLEEP_PROFILING;
 		if (str[strlen(sleepstr)] == ',')
 			str += strlen(sleepstr) + 1;
@@ -68,6 +69,10 @@ static int __init profile_setup(char * str)
 		printk(KERN_INFO
 			"kernel sleep profiling enabled (shift: %ld)\n",
 			prof_shift);
+#else
+		printk(KERN_WARNING
+			"kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
+#endif /* CONFIG_SCHEDSTATS */
 	} else if (!strncmp(str, schedstr, strlen(schedstr))) {
 		prof_on = SCHED_PROFILING;
 		if (str[strlen(schedstr)] == ',')
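
The net effect is that sleep profiling is honored only on kernels built with CONFIG_SCHEDSTATS, and is loudly refused otherwise instead of silently producing bogus data. A sketch of the intended usage (the shift value 2 is an arbitrary example):

	# on the kernel command line:
	profile=sleep,2

	# after boot, read the accumulated profile, e.g.:
	readprofile -m /boot/System.map | sort -nr | head
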
diff --git a/kernel/sched.c b/kernel/sched.c
index 2810e562a991..b4fbbc440453 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -66,6 +66,7 @@
 #include <linux/pagemap.h>
 
 #include <asm/tlb.h>
+#include <asm/irq_regs.h>
 
 /*
  * Scheduler clock - returns current time in nanosec units.
@@ -837,11 +838,18 @@ struct rq_iterator {
 	struct task_struct *(*next)(void *);
 };
 
-static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
-		      unsigned long max_nr_move, unsigned long max_load_move,
-		      struct sched_domain *sd, enum cpu_idle_type idle,
-		      int *all_pinned, unsigned long *load_moved,
-		      int *this_best_prio, struct rq_iterator *iterator);
+#ifdef CONFIG_SMP
+static unsigned long
+balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+	      unsigned long max_load_move, struct sched_domain *sd,
+	      enum cpu_idle_type idle, int *all_pinned,
+	      int *this_best_prio, struct rq_iterator *iterator);
+
+static int
+iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		   struct sched_domain *sd, enum cpu_idle_type idle,
+		   struct rq_iterator *iterator);
+#endif
 
 #include "sched_stats.h"
 #include "sched_idletask.c"
@@ -2223,17 +2231,17 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
 	return 1;
 }
 
-static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
-		      unsigned long max_nr_move, unsigned long max_load_move,
-		      struct sched_domain *sd, enum cpu_idle_type idle,
-		      int *all_pinned, unsigned long *load_moved,
-		      int *this_best_prio, struct rq_iterator *iterator)
+static unsigned long
+balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+	      unsigned long max_load_move, struct sched_domain *sd,
+	      enum cpu_idle_type idle, int *all_pinned,
+	      int *this_best_prio, struct rq_iterator *iterator)
 {
 	int pulled = 0, pinned = 0, skip_for_load;
 	struct task_struct *p;
 	long rem_load_move = max_load_move;
 
-	if (max_nr_move == 0 || max_load_move == 0)
+	if (max_load_move == 0)
 		goto out;
 
 	pinned = 1;
@@ -2266,7 +2274,7 @@ next:
 	 * We only want to steal up to the prescribed number of tasks
 	 * and the prescribed amount of weighted load.
 	 */
-	if (pulled < max_nr_move && rem_load_move > 0) {
+	if (rem_load_move > 0) {
 		if (p->prio < *this_best_prio)
 			*this_best_prio = p->prio;
 		p = iterator->next(iterator->arg);
@@ -2274,7 +2282,7 @@ next:
 	}
 out:
 	/*
-	 * Right now, this is the only place pull_task() is called,
+	 * Right now, this is one of only two places pull_task() is called,
 	 * so we can safely collect pull_task() stats here rather than
 	 * inside pull_task().
 	 */
@@ -2282,8 +2290,8 @@ out:
 
 	if (all_pinned)
 		*all_pinned = pinned;
-	*load_moved = max_load_move - rem_load_move;
-	return pulled;
+
+	return max_load_move - rem_load_move;
 }
 
 /*
@@ -2305,7 +2313,7 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	do {
 		total_load_moved +=
 			class->load_balance(this_rq, this_cpu, busiest,
-				ULONG_MAX, max_load_move - total_load_moved,
+				max_load_move - total_load_moved,
 				sd, idle, all_pinned, &this_best_prio);
 		class = class->next;
 	} while (class && max_load_move > total_load_moved);
@@ -2313,6 +2321,32 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	return total_load_moved > 0;
 }
 
+static int
+iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		   struct sched_domain *sd, enum cpu_idle_type idle,
+		   struct rq_iterator *iterator)
+{
+	struct task_struct *p = iterator->start(iterator->arg);
+	int pinned = 0;
+
+	while (p) {
+		if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
+			pull_task(busiest, p, this_rq, this_cpu);
+			/*
+			 * Right now, this is only the second place pull_task()
+			 * is called, so we can safely collect pull_task()
+			 * stats here rather than inside pull_task().
+			 */
+			schedstat_inc(sd, lb_gained[idle]);
+
+			return 1;
+		}
+		p = iterator->next(iterator->arg);
+	}
+
+	return 0;
+}
+
 /*
  * move_one_task tries to move exactly one task from busiest to this_rq, as
  * part of active balancing operations within "domain".
@@ -2324,12 +2358,9 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
 			 struct sched_domain *sd, enum cpu_idle_type idle)
 {
 	const struct sched_class *class;
-	int this_best_prio = MAX_PRIO;
 
 	for (class = sched_class_highest; class; class = class->next)
-		if (class->load_balance(this_rq, this_cpu, busiest,
-					1, ULONG_MAX, sd, idle, NULL,
-					&this_best_prio))
+		if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
 			return 1;
 
 	return 0;
@@ -3266,18 +3297,6 @@ static inline void idle_balance(int cpu, struct rq *rq)
 {
 }
 
-/* Avoid "used but not defined" warning on UP */
-static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
-		      unsigned long max_nr_move, unsigned long max_load_move,
-		      struct sched_domain *sd, enum cpu_idle_type idle,
-		      int *all_pinned, unsigned long *load_moved,
-		      int *this_best_prio, struct rq_iterator *iterator)
-{
-	*load_moved = 0;
-
-	return 0;
-}
-
 #endif
 
 DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -3507,12 +3526,19 @@ EXPORT_SYMBOL(sub_preempt_count);
  */
 static noinline void __schedule_bug(struct task_struct *prev)
 {
-	printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
-		prev->comm, preempt_count(), task_pid_nr(prev));
+	struct pt_regs *regs = get_irq_regs();
+
+	printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
+		prev->comm, prev->pid, preempt_count());
+
 	debug_show_held_locks(prev);
 	if (irqs_disabled())
 		print_irqtrace_events(prev);
-	dump_stack();
+
+	if (regs)
+		show_regs(regs);
+	else
+		dump_stack();
 }
 
 /*
@@ -3820,7 +3846,7 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
 }
 EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
 
-void fastcall complete(struct completion *x)
+void complete(struct completion *x)
 {
 	unsigned long flags;
 
@@ -3832,7 +3858,7 @@ void fastcall complete(struct completion *x)
 }
 EXPORT_SYMBOL(complete);
 
-void fastcall complete_all(struct completion *x)
+void complete_all(struct completion *x)
 {
 	unsigned long flags;
 
@@ -3884,13 +3910,13 @@ wait_for_common(struct completion *x, long timeout, int state)
 	return timeout;
 }
 
-void fastcall __sched wait_for_completion(struct completion *x)
+void __sched wait_for_completion(struct completion *x)
 {
 	wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(wait_for_completion);
 
-unsigned long fastcall __sched
+unsigned long __sched
 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
 {
 	return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
@@ -3906,7 +3932,7 @@ int __sched wait_for_completion_interruptible(struct completion *x)
 }
 EXPORT_SYMBOL(wait_for_completion_interruptible);
 
-unsigned long fastcall __sched
+unsigned long __sched
 wait_for_completion_interruptible_timeout(struct completion *x,
 					  unsigned long timeout)
 {
@@ -5461,11 +5487,12 @@ static void register_sched_domain_sysctl(void)
 	struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
 	char buf[32];
 
+	WARN_ON(sd_ctl_dir[0].child);
+	sd_ctl_dir[0].child = entry;
+
 	if (entry == NULL)
 		return;
 
-	sd_ctl_dir[0].child = entry;
-
 	for_each_online_cpu(i) {
 		snprintf(buf, 32, "cpu%d", i);
 		entry->procname = kstrdup(buf, GFP_KERNEL);
@@ -5473,14 +5500,19 @@ static void register_sched_domain_sysctl(void)
 		entry->child = sd_alloc_ctl_cpu_table(i);
 		entry++;
 	}
+
+	WARN_ON(sd_sysctl_header);
 	sd_sysctl_header = register_sysctl_table(sd_ctl_root);
 }
 
+/* may be called multiple times per register */
 static void unregister_sched_domain_sysctl(void)
 {
-	unregister_sysctl_table(sd_sysctl_header);
+	if (sd_sysctl_header)
+		unregister_sysctl_table(sd_sysctl_header);
 	sd_sysctl_header = NULL;
-	sd_free_ctl_entry(&sd_ctl_dir[0].child);
+	if (sd_ctl_dir[0].child)
+		sd_free_ctl_entry(&sd_ctl_dir[0].child);
 }
 #else
 static void register_sched_domain_sysctl(void)
@@ -5611,101 +5643,101 @@ int nr_cpu_ids __read_mostly = NR_CPUS;
 EXPORT_SYMBOL(nr_cpu_ids);
 
 #ifdef CONFIG_SCHED_DEBUG
-static void sched_domain_debug(struct sched_domain *sd, int cpu)
+
+static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level)
 {
-	int level = 0;
+	struct sched_group *group = sd->groups;
+	cpumask_t groupmask;
+	char str[NR_CPUS];
 
-	if (!sd) {
-		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
-		return;
+	cpumask_scnprintf(str, NR_CPUS, sd->span);
+	cpus_clear(groupmask);
+
+	printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
+
+	if (!(sd->flags & SD_LOAD_BALANCE)) {
+		printk("does not load-balance\n");
+		if (sd->parent)
+			printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
+					" has parent");
+		return -1;
 	}
 
-	printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
+	printk(KERN_CONT "span %s\n", str);
+
+	if (!cpu_isset(cpu, sd->span)) {
+		printk(KERN_ERR "ERROR: domain->span does not contain "
+				"CPU%d\n", cpu);
+	}
+	if (!cpu_isset(cpu, group->cpumask)) {
+		printk(KERN_ERR "ERROR: domain->groups does not contain"
+				" CPU%d\n", cpu);
+	}
 
+	printk(KERN_DEBUG "%*s groups:", level + 1, "");
 	do {
-		int i;
-		char str[NR_CPUS];
-		struct sched_group *group = sd->groups;
-		cpumask_t groupmask;
-
-		cpumask_scnprintf(str, NR_CPUS, sd->span);
-		cpus_clear(groupmask);
-
-		printk(KERN_DEBUG);
-		for (i = 0; i < level + 1; i++)
-			printk(" ");
-		printk("domain %d: ", level);
-
-		if (!(sd->flags & SD_LOAD_BALANCE)) {
-			printk("does not load-balance\n");
-			if (sd->parent)
-				printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
-						" has parent");
+		if (!group) {
+			printk("\n");
+			printk(KERN_ERR "ERROR: group is NULL\n");
 			break;
 		}
 
-		printk("span %s\n", str);
+		if (!group->__cpu_power) {
+			printk(KERN_CONT "\n");
+			printk(KERN_ERR "ERROR: domain->cpu_power not "
+					"set\n");
+			break;
+		}
 
-		if (!cpu_isset(cpu, sd->span))
-			printk(KERN_ERR "ERROR: domain->span does not contain "
-				"CPU%d\n", cpu);
-		if (!cpu_isset(cpu, group->cpumask))
-			printk(KERN_ERR "ERROR: domain->groups does not contain"
-				" CPU%d\n", cpu);
+		if (!cpus_weight(group->cpumask)) {
+			printk(KERN_CONT "\n");
+			printk(KERN_ERR "ERROR: empty group\n");
+			break;
+		}
 
-		printk(KERN_DEBUG);
-		for (i = 0; i < level + 2; i++)
-			printk(" ");
-		printk("groups:");
-		do {
-			if (!group) {
-				printk("\n");
-				printk(KERN_ERR "ERROR: group is NULL\n");
-				break;
-			}
+		if (cpus_intersects(groupmask, group->cpumask)) {
+			printk(KERN_CONT "\n");
+			printk(KERN_ERR "ERROR: repeated CPUs\n");
+			break;
+		}
 
-			if (!group->__cpu_power) {
-				printk(KERN_CONT "\n");
-				printk(KERN_ERR "ERROR: domain->cpu_power not "
-						"set\n");
-				break;
-			}
+		cpus_or(groupmask, groupmask, group->cpumask);
 
-			if (!cpus_weight(group->cpumask)) {
-				printk(KERN_CONT "\n");
-				printk(KERN_ERR "ERROR: empty group\n");
-				break;
-			}
+		cpumask_scnprintf(str, NR_CPUS, group->cpumask);
+		printk(KERN_CONT " %s", str);
 
-			if (cpus_intersects(groupmask, group->cpumask)) {
-				printk(KERN_CONT "\n");
-				printk(KERN_ERR "ERROR: repeated CPUs\n");
-				break;
-			}
+		group = group->next;
+	} while (group != sd->groups);
+	printk(KERN_CONT "\n");
 
-			cpus_or(groupmask, groupmask, group->cpumask);
+	if (!cpus_equal(sd->span, groupmask))
+		printk(KERN_ERR "ERROR: groups don't span domain->span\n");
 
-			cpumask_scnprintf(str, NR_CPUS, group->cpumask);
-			printk(KERN_CONT " %s", str);
+	if (sd->parent && !cpus_subset(groupmask, sd->parent->span))
+		printk(KERN_ERR "ERROR: parent span is not a superset "
+			"of domain->span\n");
+	return 0;
+}
 
-			group = group->next;
-		} while (group != sd->groups);
-		printk(KERN_CONT "\n");
+static void sched_domain_debug(struct sched_domain *sd, int cpu)
+{
+	int level = 0;
 
-		if (!cpus_equal(sd->span, groupmask))
-			printk(KERN_ERR "ERROR: groups don't span "
-					"domain->span\n");
+	if (!sd) {
+		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
+		return;
+	}
+
+	printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
 
+	for (;;) {
+		if (sched_domain_debug_one(sd, cpu, level))
+			break;
 		level++;
 		sd = sd->parent;
 		if (!sd)
-			continue;
-
-		if (!cpus_subset(groupmask, sd->span))
-			printk(KERN_ERR "ERROR: parent span is not a superset "
-				"of domain->span\n");
-
-	} while (sd);
+			break;
+	}
 }
 #else
 # define sched_domain_debug(sd, cpu) do { } while (0)
@@ -6424,13 +6456,17 @@ static cpumask_t fallback_doms;
  */
 static int arch_init_sched_domains(const cpumask_t *cpu_map)
 {
+	int err;
+
 	ndoms_cur = 1;
 	doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
 	if (!doms_cur)
 		doms_cur = &fallback_doms;
 	cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
+	err = build_sched_domains(doms_cur);
 	register_sched_domain_sysctl();
-	return build_sched_domains(doms_cur);
+
+	return err;
 }
 
 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
@@ -6479,6 +6515,9 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new)
 {
 	int i, j;
 
+	/* always unregister in case we don't destroy any domains */
+	unregister_sched_domain_sysctl();
+
 	if (doms_new == NULL) {
 		ndoms_new = 1;
 		doms_new = &fallback_doms;
@@ -6514,6 +6553,8 @@ match2:
 	kfree(doms_cur);
 	doms_cur = doms_new;
 	ndoms_cur = ndoms_new;
+
+	register_sched_domain_sysctl();
 }
 
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -7101,25 +7142,25 @@ unsigned long sched_group_shares(struct task_group *tg)
 #ifdef CONFIG_FAIR_CGROUP_SCHED
 
 /* return corresponding task_group object of a cgroup */
-static inline struct task_group *cgroup_tg(struct cgroup *cont)
+static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
 {
-	return container_of(cgroup_subsys_state(cont, cpu_cgroup_subsys_id),
+	return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
 			    struct task_group, css);
 }
 
 static struct cgroup_subsys_state *
-cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
+cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
 	struct task_group *tg;
 
-	if (!cont->parent) {
+	if (!cgrp->parent) {
 		/* This is early initialization for the top cgroup */
-		init_task_group.css.cgroup = cont;
+		init_task_group.css.cgroup = cgrp;
 		return &init_task_group.css;
 	}
 
 	/* we support only 1-level deep hierarchical scheduler atm */
-	if (cont->parent->parent)
+	if (cgrp->parent->parent)
 		return ERR_PTR(-EINVAL);
 
 	tg = sched_create_group();
@@ -7127,21 +7168,21 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 		return ERR_PTR(-ENOMEM);
 
 	/* Bind the cgroup to task_group object we just created */
-	tg->css.cgroup = cont;
+	tg->css.cgroup = cgrp;
 
 	return &tg->css;
 }
 
 static void cpu_cgroup_destroy(struct cgroup_subsys *ss,
-			       struct cgroup *cont)
+			       struct cgroup *cgrp)
 {
-	struct task_group *tg = cgroup_tg(cont);
+	struct task_group *tg = cgroup_tg(cgrp);
 
 	sched_destroy_group(tg);
 }
 
 static int cpu_cgroup_can_attach(struct cgroup_subsys *ss,
-				 struct cgroup *cont, struct task_struct *tsk)
+				 struct cgroup *cgrp, struct task_struct *tsk)
 {
 	/* We don't support RT-tasks being in separate groups */
 	if (tsk->sched_class != &fair_sched_class)
@@ -7151,38 +7192,21 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys *ss,
 }
 
 static void
-cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cont,
+cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 			struct cgroup *old_cont, struct task_struct *tsk)
 {
 	sched_move_task(tsk);
 }
 
-static ssize_t cpu_shares_write(struct cgroup *cont, struct cftype *cftype,
-				struct file *file, const char __user *userbuf,
-				size_t nbytes, loff_t *ppos)
+static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype,
+				 u64 shareval)
 {
-	unsigned long shareval;
-	struct task_group *tg = cgroup_tg(cont);
-	char buffer[2*sizeof(unsigned long) + 1];
-	int rc;
-
-	if (nbytes > 2*sizeof(unsigned long))	/* safety check */
-		return -E2BIG;
-
-	if (copy_from_user(buffer, userbuf, nbytes))
-		return -EFAULT;
-
-	buffer[nbytes] = 0;	/* nul-terminate */
-	shareval = simple_strtoul(buffer, NULL, 10);
-
-	rc = sched_group_set_shares(tg, shareval);
-
-	return (rc < 0 ? rc : nbytes);
+	return sched_group_set_shares(cgroup_tg(cgrp), shareval);
 }
 
-static u64 cpu_shares_read_uint(struct cgroup *cont, struct cftype *cft)
+static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
 {
-	struct task_group *tg = cgroup_tg(cont);
+	struct task_group *tg = cgroup_tg(cgrp);
 
 	return (u64) tg->shares;
 }
@@ -7190,7 +7214,7 @@ static u64 cpu_shares_read_uint(struct cgroup *cont, struct cftype *cft)
 static struct cftype cpu_shares = {
 	.name = "shares",
 	.read_uint = cpu_shares_read_uint,
-	.write = cpu_shares_write,
+	.write_uint = cpu_shares_write_uint,
 };
 
 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
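
With the cftype switched from a raw write handler to write_uint, the cgroup core parses the value into a u64 before calling into the scheduler, which is why all the buffer-copying code above could be deleted. Assuming the usual cgroup filesystem layout (the mount point and group name here are hypothetical), the shares file is then driven like this:

	mount -t cgroup -o cpu none /dev/cgroup
	mkdir /dev/cgroup/browsers
	echo 2048 > /dev/cgroup/browsers/cpu.shares	# ~2x the default weight
	cat /dev/cgroup/browsers/cpu.shares
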
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 166ed6db600b..9971831b560e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -876,6 +876,7 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
 	}
 }
 
+#ifdef CONFIG_SMP
 /**************************************************
  * Fair scheduling class load-balancing methods:
  */
@@ -936,12 +937,11 @@ static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
 
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
-		  unsigned long max_nr_move, unsigned long max_load_move,
+		  unsigned long max_load_move,
 		  struct sched_domain *sd, enum cpu_idle_type idle,
 		  int *all_pinned, int *this_best_prio)
 {
 	struct cfs_rq *busy_cfs_rq;
-	unsigned long load_moved, total_nr_moved = 0, nr_moved;
 	long rem_load_move = max_load_move;
 	struct rq_iterator cfs_rq_iterator;
 
@@ -969,25 +969,48 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 #else
 # define maxload rem_load_move
 #endif
-		/* pass busy_cfs_rq argument into
+		/*
+		 * pass busy_cfs_rq argument into
 		 * load_balance_[start|next]_fair iterators
 		 */
 		cfs_rq_iterator.arg = busy_cfs_rq;
-		nr_moved = balance_tasks(this_rq, this_cpu, busiest,
-				max_nr_move, maxload, sd, idle, all_pinned,
-				&load_moved, this_best_prio, &cfs_rq_iterator);
-
-		total_nr_moved += nr_moved;
-		max_nr_move -= nr_moved;
-		rem_load_move -= load_moved;
+		rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
+					       maxload, sd, idle, all_pinned,
+					       this_best_prio,
+					       &cfs_rq_iterator);
 
-		if (max_nr_move <= 0 || rem_load_move <= 0)
+		if (rem_load_move <= 0)
 			break;
 	}
 
 	return max_load_move - rem_load_move;
 }
 
+static int
+move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		   struct sched_domain *sd, enum cpu_idle_type idle)
+{
+	struct cfs_rq *busy_cfs_rq;
+	struct rq_iterator cfs_rq_iterator;
+
+	cfs_rq_iterator.start = load_balance_start_fair;
+	cfs_rq_iterator.next = load_balance_next_fair;
+
+	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
+		/*
+		 * pass busy_cfs_rq argument into
+		 * load_balance_[start|next]_fair iterators
+		 */
+		cfs_rq_iterator.arg = busy_cfs_rq;
+		if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle,
+				       &cfs_rq_iterator))
+			return 1;
+	}
+
+	return 0;
+}
+#endif
+
 /*
  * scheduler tick hitting a task of our scheduling class:
  */
@@ -1063,7 +1086,10 @@ static const struct sched_class fair_sched_class = {
 	.pick_next_task		= pick_next_task_fair,
 	.put_prev_task		= put_prev_task_fair,
 
+#ifdef CONFIG_SMP
 	.load_balance		= load_balance_fair,
+	.move_one_task		= move_one_task_fair,
+#endif
 
 	.set_curr_task          = set_curr_task_fair,
 	.task_tick		= task_tick_fair,
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 6e2ead41516e..bf9c25c15b8b 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -37,15 +37,24 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
 {
 }
 
+#ifdef CONFIG_SMP
 static unsigned long
 load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
-		  unsigned long max_nr_move, unsigned long max_load_move,
+		  unsigned long max_load_move,
 		  struct sched_domain *sd, enum cpu_idle_type idle,
 		  int *all_pinned, int *this_best_prio)
 {
 	return 0;
 }
 
+static int
+move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		   struct sched_domain *sd, enum cpu_idle_type idle)
+{
+	return 0;
+}
+#endif
+
 static void task_tick_idle(struct rq *rq, struct task_struct *curr)
 {
 }
@@ -69,7 +78,10 @@ const struct sched_class idle_sched_class = {
 	.pick_next_task		= pick_next_task_idle,
 	.put_prev_task		= put_prev_task_idle,
 
+#ifdef CONFIG_SMP
 	.load_balance		= load_balance_idle,
+	.move_one_task		= move_one_task_idle,
+#endif
 
 	.set_curr_task          = set_curr_task_idle,
 	.task_tick		= task_tick_idle,
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index d0097a0634e5..8abd752a0ebd 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -98,6 +98,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 	p->se.exec_start = 0;
 }
 
+#ifdef CONFIG_SMP
 /*
  * Load-balancing iterator. Note: while the runqueue stays locked
  * during the whole iteration, the current task might be
@@ -172,13 +173,11 @@ static struct task_struct *load_balance_next_rt(void *arg)
 
 static unsigned long
 load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
-		unsigned long max_nr_move, unsigned long max_load_move,
+		unsigned long max_load_move,
 		struct sched_domain *sd, enum cpu_idle_type idle,
 		int *all_pinned, int *this_best_prio)
 {
-	int nr_moved;
 	struct rq_iterator rt_rq_iterator;
-	unsigned long load_moved;
 
 	rt_rq_iterator.start = load_balance_start_rt;
 	rt_rq_iterator.next = load_balance_next_rt;
@@ -187,12 +186,24 @@ load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	 */
 	rt_rq_iterator.arg = busiest;
 
-	nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move,
-			max_load_move, sd, idle, all_pinned, &load_moved,
-			this_best_prio, &rt_rq_iterator);
+	return balance_tasks(this_rq, this_cpu, busiest, max_load_move, sd,
+			     idle, all_pinned, this_best_prio, &rt_rq_iterator);
+}
+
+static int
+move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		 struct sched_domain *sd, enum cpu_idle_type idle)
+{
+	struct rq_iterator rt_rq_iterator;
+
+	rt_rq_iterator.start = load_balance_start_rt;
+	rt_rq_iterator.next = load_balance_next_rt;
+	rt_rq_iterator.arg = busiest;
 
-	return load_moved;
+	return iter_move_one_task(this_rq, this_cpu, busiest, sd, idle,
+				  &rt_rq_iterator);
 }
+#endif
 
 static void task_tick_rt(struct rq *rq, struct task_struct *p)
 {
@@ -236,7 +247,10 @@ const struct sched_class rt_sched_class = {
 	.pick_next_task		= pick_next_task_rt,
 	.put_prev_task		= put_prev_task_rt,
 
+#ifdef CONFIG_SMP
 	.load_balance		= load_balance_rt,
+	.move_one_task		= move_one_task_rt,
+#endif
 
 	.set_curr_task		= set_curr_task_rt,
 	.task_tick		= task_tick_rt,
diff --git a/kernel/user.c b/kernel/user.c
index e91331c457e2..0f3aa0234107 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -129,7 +129,7 @@ static inline void uids_mutex_unlock(void)
 }
 
 /* return cpu shares held by the user */
-ssize_t cpu_shares_show(struct kset *kset, char *buffer)
+static ssize_t cpu_shares_show(struct kset *kset, char *buffer)
 {
 	struct user_struct *up = container_of(kset, struct user_struct, kset);
 
@@ -137,7 +137,8 @@ ssize_t cpu_shares_show(struct kset *kset, char *buffer)
 }
 
 /* modify cpu shares held by the user */
-ssize_t cpu_shares_store(struct kset *kset, const char *buffer, size_t size)
+static ssize_t cpu_shares_store(struct kset *kset, const char *buffer,
+				size_t size)
 {
 	struct user_struct *up = container_of(kset, struct user_struct, kset);
 	unsigned long shares;
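
These two handlers back the per-user CPU-share attribute that this kset exposes through sysfs; making them static simply confines them to this file. Assuming a fair-user-scheduling kernel and the /sys/kernel/uids layout the kset is registered under (exact path is an assumption of this sketch), they are exercised from userspace roughly like this:

	cat /sys/kernel/uids/1000/cpu_share
	echo 2048 > /sys/kernel/uids/1000/cpu_share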