 Documentation/feature-removal-schedule.txt |  15
 Documentation/filesystems/proc.txt         |   3
 Documentation/kernel-parameters.txt        |   2
 arch/x86/kernel/process_64.c               |   5
 fs/proc/array.c                            |  11
 fs/proc/stat.c                             |  19
 include/linux/kernel_stat.h                |   1
 include/linux/sched.h                      |  12
 kernel/cpuset.c                            |  27
 kernel/kgdb.c                              |   2
 kernel/sched.c                             | 180
 kernel/sched_debug.c                       |   4
 kernel/sched_fair.c                        |  65
 kernel/sched_rt.c                          |  61
 14 files changed, 263 insertions(+), 144 deletions(-)
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index bc693fffabe0..f613df8ec7bf 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -6,6 +6,21 @@ be removed from this file.
 
 ---------------------------
 
+What:	USER_SCHED
+When:	2.6.34
+
+Why:	USER_SCHED was implemented as a proof of concept for group scheduling.
+	The effect of USER_SCHED can already be achieved from userspace with
+	the help of libcgroup. The removal of USER_SCHED will also simplify
+	the scheduler code with the removal of one major ifdef. There are also
+	issues USER_SCHED has with USER_NS. A decision was taken not to fix
+	those and instead remove USER_SCHED. Also new group scheduling
+	features will not be implemented for USER_SCHED.
+
+Who:	Dhaval Giani <dhaval@linux.vnet.ibm.com>
+
+---------------------------
+
 What:	PRISM54
 When:	2.6.34
 
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 2c48f945546b..4af0018533f2 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -1072,7 +1072,8 @@ second). The meanings of the columns are as follows, from left to right:
 - irq: servicing interrupts
 - softirq: servicing softirqs
 - steal: involuntary wait
-- guest: running a guest
+- guest: running a normal guest
+- guest_nice: running a niced guest
 
 The "intr" line gives counts of interrupts serviced since boot time, for each
 of the possible system interrupts. The first column is the total of all
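With this change the "cpu" lines in /proc/stat carry ten values, guest_nice being appended after guest. As a rough illustration only (not part of the patch; the field order is taken from the column list above), a user-space reader could pick up the new column like this:

#include <stdio.h>

int main(void)
{
	unsigned long long v[10];	/* user ... steal, guest, guest_nice */
	FILE *f = fopen("/proc/stat", "r");

	if (!f)
		return 1;
	/* first line aggregates all cpus; guest_nice is the 10th field */
	if (fscanf(f, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu",
		   &v[0], &v[1], &v[2], &v[3], &v[4],
		   &v[5], &v[6], &v[7], &v[8], &v[9]) == 10)
		printf("guest=%llu guest_nice=%llu\n", v[8], v[9]);
	fclose(f);
	return 0;
}

On kernels without the extra column the fscanf() above simply matches nine fields and prints nothing.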
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 9107b387e91f..f2a9507b27b2 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2182,6 +2182,8 @@ and is between 256 and 4096 characters. It is defined in the file
 
 	sbni=		[NET] Granch SBNI12 leased line adapter
 
+	sched_debug	[KNL] Enables verbose scheduler debug messages.
+
 	sc1200wdt=	[HW,WDT] SC1200 WDT (watchdog) driver
 			Format: <io>[,<timeout>[,<isapnp>]]
 
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index eb62cbcaa490..93c501dc2496 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -406,11 +406,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 * This won't pick up thread selector changes, but I guess that is ok.
 	 */
 	savesegment(es, prev->es);
-	if (unlikely(next->es | prev->es))
+	if (next->es | prev->es)
 		loadsegment(es, next->es);
-
 	savesegment(ds, prev->ds);
-	if (unlikely(next->ds | prev->ds))
+	if (next->ds | prev->ds)
 		loadsegment(ds, next->ds);
 
 
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 07f77a7945c3..e209f64ab27b 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -410,6 +410,16 @@ static void task_show_stack_usage(struct seq_file *m, struct task_struct *task)
 }
 #endif /* CONFIG_MMU */
 
+static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
+{
+	seq_printf(m, "Cpus_allowed:\t");
+	seq_cpumask(m, &task->cpus_allowed);
+	seq_printf(m, "\n");
+	seq_printf(m, "Cpus_allowed_list:\t");
+	seq_cpumask_list(m, &task->cpus_allowed);
+	seq_printf(m, "\n");
+}
+
 int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 			struct pid *pid, struct task_struct *task)
 {
@@ -424,6 +434,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 	}
 	task_sig(m, task);
 	task_cap(m, task);
+	task_cpus_allowed(m, task);
 	cpuset_task_status_allowed(m, task);
 #if defined(CONFIG_S390)
 	task_show_regs(m, task);
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 7cc726c6d70a..b9b7aad2003d 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -27,7 +27,7 @@ static int show_stat(struct seq_file *p, void *v)
 	int i, j;
 	unsigned long jif;
 	cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
-	cputime64_t guest;
+	cputime64_t guest, guest_nice;
 	u64 sum = 0;
 	u64 sum_softirq = 0;
 	unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
@@ -36,7 +36,7 @@ static int show_stat(struct seq_file *p, void *v)
 
 	user = nice = system = idle = iowait =
 		irq = softirq = steal = cputime64_zero;
-	guest = cputime64_zero;
+	guest = guest_nice = cputime64_zero;
 	getboottime(&boottime);
 	jif = boottime.tv_sec;
 
@@ -51,6 +51,8 @@ static int show_stat(struct seq_file *p, void *v)
 		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
 		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
 		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
+		guest_nice = cputime64_add(guest_nice,
+			kstat_cpu(i).cpustat.guest_nice);
 		for_each_irq_nr(j) {
 			sum += kstat_irqs_cpu(j, i);
 		}
@@ -65,7 +67,8 @@ static int show_stat(struct seq_file *p, void *v)
 	}
 	sum += arch_irq_stat();
 
-	seq_printf(p, "cpu  %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+	seq_printf(p, "cpu  %llu %llu %llu %llu %llu %llu %llu %llu %llu "
+		"%llu\n",
 		(unsigned long long)cputime64_to_clock_t(user),
 		(unsigned long long)cputime64_to_clock_t(nice),
 		(unsigned long long)cputime64_to_clock_t(system),
@@ -74,7 +77,8 @@ static int show_stat(struct seq_file *p, void *v)
 		(unsigned long long)cputime64_to_clock_t(irq),
 		(unsigned long long)cputime64_to_clock_t(softirq),
 		(unsigned long long)cputime64_to_clock_t(steal),
-		(unsigned long long)cputime64_to_clock_t(guest));
+		(unsigned long long)cputime64_to_clock_t(guest),
+		(unsigned long long)cputime64_to_clock_t(guest_nice));
 	for_each_online_cpu(i) {
 
 		/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
@@ -88,8 +92,10 @@ static int show_stat(struct seq_file *p, void *v)
 		softirq = kstat_cpu(i).cpustat.softirq;
 		steal = kstat_cpu(i).cpustat.steal;
 		guest = kstat_cpu(i).cpustat.guest;
+		guest_nice = kstat_cpu(i).cpustat.guest_nice;
 		seq_printf(p,
-			"cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+			"cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu "
+			"%llu\n",
 			i,
 			(unsigned long long)cputime64_to_clock_t(user),
 			(unsigned long long)cputime64_to_clock_t(nice),
@@ -99,7 +105,8 @@ static int show_stat(struct seq_file *p, void *v)
 			(unsigned long long)cputime64_to_clock_t(irq),
 			(unsigned long long)cputime64_to_clock_t(softirq),
 			(unsigned long long)cputime64_to_clock_t(steal),
-			(unsigned long long)cputime64_to_clock_t(guest));
+			(unsigned long long)cputime64_to_clock_t(guest),
+			(unsigned long long)cputime64_to_clock_t(guest_nice));
 	}
 	seq_printf(p, "intr %llu", (unsigned long long)sum);
 
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 348fa8874b52..c059044bc6dc 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -25,6 +25,7 @@ struct cpu_usage_stat {
 	cputime64_t iowait;
 	cputime64_t steal;
 	cputime64_t guest;
+	cputime64_t guest_nice;
 };
 
 struct kernel_stat {
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 75e6e60bf583..78ba664474f3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -145,7 +145,6 @@ extern unsigned long this_cpu_load(void);
 
 
 extern void calc_global_load(void);
-extern u64 cpu_nr_migrations(int cpu);
 
 extern unsigned long get_parent_ip(unsigned long addr);
 
@@ -171,8 +170,6 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 }
 #endif
 
-extern unsigned long long time_sync_thresh;
-
 /*
  * Task state bitmask. NOTE! These bits are also
  * encoded in fs/proc/array.c: get_task_state().
@@ -349,7 +346,6 @@ extern signed long schedule_timeout(signed long timeout);
 extern signed long schedule_timeout_interruptible(signed long timeout);
 extern signed long schedule_timeout_killable(signed long timeout);
 extern signed long schedule_timeout_uninterruptible(signed long timeout);
-asmlinkage void __schedule(void);
 asmlinkage void schedule(void);
 extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner);
 
@@ -1013,9 +1009,13 @@ static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
 	return to_cpumask(sd->span);
 }
 
-extern void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
+extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 				    struct sched_domain_attr *dattr_new);
 
+/* Allocate an array of sched domains, for partition_sched_domains(). */
+cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
+void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
+
 /* Test a flag in parent sched domain */
 static inline int test_sd_parent(struct sched_domain *sd, int flag)
 {
@@ -1033,7 +1033,7 @@ unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu);
 struct sched_domain_attr;
 
 static inline void
-partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
+partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 			struct sched_domain_attr *dattr_new)
 {
 }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b5cb469d2545..3cf2183b472d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -537,8 +537,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
  * element of the partition (one sched domain) to be passed to
  * partition_sched_domains().
  */
-/* FIXME: see the FIXME in partition_sched_domains() */
-static int generate_sched_domains(struct cpumask **domains,
+static int generate_sched_domains(cpumask_var_t **domains,
 			struct sched_domain_attr **attributes)
 {
 	LIST_HEAD(q);		/* queue of cpusets to be scanned */
@@ -546,7 +545,7 @@ static int generate_sched_domains(struct cpumask **domains,
 	struct cpuset **csa;	/* array of all cpuset ptrs */
 	int csn;		/* how many cpuset ptrs in csa so far */
 	int i, j, k;		/* indices for partition finding loops */
-	struct cpumask *doms;	/* resulting partition; i.e. sched domains */
+	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
 	struct sched_domain_attr *dattr;  /* attributes for custom domains */
 	int ndoms = 0;		/* number of sched domains in result */
 	int nslot;		/* next empty doms[] struct cpumask slot */
@@ -557,7 +556,8 @@ static int generate_sched_domains(struct cpumask **domains,
 
 	/* Special case for the 99% of systems with one, full, sched domain */
 	if (is_sched_load_balance(&top_cpuset)) {
-		doms = kmalloc(cpumask_size(), GFP_KERNEL);
+		ndoms = 1;
+		doms = alloc_sched_domains(ndoms);
 		if (!doms)
 			goto done;
 
@@ -566,9 +566,8 @@ static int generate_sched_domains(struct cpumask **domains,
 			*dattr = SD_ATTR_INIT;
 			update_domain_attr_tree(dattr, &top_cpuset);
 		}
-		cpumask_copy(doms, top_cpuset.cpus_allowed);
+		cpumask_copy(doms[0], top_cpuset.cpus_allowed);
 
-		ndoms = 1;
 		goto done;
 	}
 
@@ -636,7 +635,7 @@ restart:
 	 * Now we know how many domains to create.
 	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
 	 */
-	doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL);
+	doms = alloc_sched_domains(ndoms);
 	if (!doms)
 		goto done;
 
@@ -656,7 +655,7 @@ restart:
 			continue;
 		}
 
-		dp = doms + nslot;
+		dp = doms[nslot];
 
 		if (nslot == ndoms) {
 			static int warnings = 10;
@@ -718,7 +717,7 @@ done:
 static void do_rebuild_sched_domains(struct work_struct *unused)
 {
 	struct sched_domain_attr *attr;
-	struct cpumask *doms;
+	cpumask_var_t *doms;
 	int ndoms;
 
 	get_online_cpus();
@@ -2052,7 +2051,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
 				unsigned long phase, void *unused_cpu)
 {
 	struct sched_domain_attr *attr;
-	struct cpumask *doms;
+	cpumask_var_t *doms;
 	int ndoms;
 
 	switch (phase) {
@@ -2537,15 +2536,9 @@ const struct file_operations proc_cpuset_operations = {
 };
 #endif /* CONFIG_PROC_PID_CPUSET */
 
-/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */
+/* Display task mems_allowed in /proc/<pid>/status file. */
 void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
 {
-	seq_printf(m, "Cpus_allowed:\t");
-	seq_cpumask(m, &task->cpus_allowed);
-	seq_printf(m, "\n");
-	seq_printf(m, "Cpus_allowed_list:\t");
-	seq_cpumask_list(m, &task->cpus_allowed);
-	seq_printf(m, "\n");
 	seq_printf(m, "Mems_allowed:\t");
 	seq_nodemask(m, &task->mems_allowed);
 	seq_printf(m, "\n");
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 9147a3190c9d..7d7014634022 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -870,7 +870,7 @@ static void gdb_cmd_getregs(struct kgdb_state *ks)
 
 	/*
 	 * All threads that don't have debuggerinfo should be
-	 * in __schedule() sleeping, since all other CPUs
+	 * in schedule() sleeping, since all other CPUs
 	 * are in kgdb_wait, and thus have debuggerinfo.
 	 */
 	if (local_debuggerinfo) {
diff --git a/kernel/sched.c b/kernel/sched.c
index 3c91f110fc62..315ba4059f93 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -535,14 +535,12 @@ struct rq {
 	#define CPU_LOAD_IDX_MAX 5
 	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
 #ifdef CONFIG_NO_HZ
-	unsigned long last_tick_seen;
 	unsigned char in_nohz_recently;
 #endif
 	/* capture load from *all* tasks on this cpu: */
 	struct load_weight load;
 	unsigned long nr_load_updates;
 	u64 nr_switches;
-	u64 nr_migrations_in;
 
 	struct cfs_rq cfs;
 	struct rt_rq rt;
@@ -591,6 +589,8 @@ struct rq {
 
 	u64 rt_avg;
 	u64 age_stamp;
+	u64 idle_stamp;
+	u64 avg_idle;
 #endif
 
 	/* calc_load related fields */
@@ -772,7 +772,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
 	if (!sched_feat_names[i])
 		return -EINVAL;
 
-	filp->f_pos += cnt;
+	*ppos += cnt;
 
 	return cnt;
 }
@@ -2079,7 +2079,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 #endif
 	if (old_cpu != new_cpu) {
 		p->se.nr_migrations++;
-		new_rq->nr_migrations_in++;
 #ifdef CONFIG_SCHEDSTATS
 		if (task_hot(p, old_rq->clock, NULL))
 			schedstat_inc(p, se.nr_forced2_migrations);
@@ -2443,6 +2442,17 @@ out_running:
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
 		p->sched_class->task_wake_up(rq, p);
+
+	if (unlikely(rq->idle_stamp)) {
+		u64 delta = rq->clock - rq->idle_stamp;
+		u64 max = 2*sysctl_sched_migration_cost;
+
+		if (delta > max)
+			rq->avg_idle = max;
+		else
+			update_avg(&rq->avg_idle, delta);
+		rq->idle_stamp = 0;
+	}
 #endif
 out:
 	task_rq_unlock(rq, &flags);
@@ -2855,14 +2865,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	 */
 	arch_start_context_switch(prev);
 
-	if (unlikely(!mm)) {
+	if (likely(!mm)) {
 		next->active_mm = oldmm;
 		atomic_inc(&oldmm->mm_count);
 		enter_lazy_tlb(oldmm, next);
 	} else
 		switch_mm(oldmm, mm, next);
 
-	if (unlikely(!prev->mm)) {
+	if (likely(!prev->mm)) {
 		prev->active_mm = NULL;
 		rq->prev_mm = oldmm;
 	}
@@ -3025,15 +3035,6 @@ static void calc_load_account_active(struct rq *this_rq)
 }
 
 /*
- * Externally visible per-cpu scheduler statistics:
- * cpu_nr_migrations(cpu) - number of migrations into that cpu
- */
-u64 cpu_nr_migrations(int cpu)
-{
-	return cpu_rq(cpu)->nr_migrations_in;
-}
-
-/*
  * Update rq->cpu_load[] statistics. This function is usually called every
  * scheduler tick (TICK_NSEC).
  */
@@ -4133,7 +4134,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	unsigned long flags;
 	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
 
-	cpumask_setall(cpus);
+	cpumask_copy(cpus, cpu_online_mask);
 
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
@@ -4296,7 +4297,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
 	int all_pinned = 0;
 	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
 
-	cpumask_setall(cpus);
+	cpumask_copy(cpus, cpu_online_mask);
 
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
@@ -4436,6 +4437,11 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 	int pulled_task = 0;
 	unsigned long next_balance = jiffies + HZ;
 
+	this_rq->idle_stamp = this_rq->clock;
+
+	if (this_rq->avg_idle < sysctl_sched_migration_cost)
+		return;
+
 	for_each_domain(this_cpu, sd) {
 		unsigned long interval;
 
@@ -4450,8 +4456,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 		interval = msecs_to_jiffies(sd->balance_interval);
 		if (time_after(next_balance, sd->last_balance + interval))
 			next_balance = sd->last_balance + interval;
-		if (pulled_task)
+		if (pulled_task) {
+			this_rq->idle_stamp = 0;
 			break;
+		}
 	}
 	if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
 		/*
@@ -5053,8 +5061,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
 	p->gtime = cputime_add(p->gtime, cputime);
 
 	/* Add guest time to cpustat. */
-	cpustat->user = cputime64_add(cpustat->user, tmp);
-	cpustat->guest = cputime64_add(cpustat->guest, tmp);
+	if (TASK_NICE(p) > 0) {
+		cpustat->nice = cputime64_add(cpustat->nice, tmp);
+		cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
+	} else {
+		cpustat->user = cputime64_add(cpustat->user, tmp);
+		cpustat->guest = cputime64_add(cpustat->guest, tmp);
+	}
 }
 
 /*
@@ -5179,41 +5192,45 @@ cputime_t task_stime(struct task_struct *p)
 	return p->stime;
 }
 #else
+
+#ifndef nsecs_to_cputime
+# define nsecs_to_cputime(__nsecs) \
+	msecs_to_cputime(div_u64((__nsecs), NSEC_PER_MSEC))
+#endif
+
 cputime_t task_utime(struct task_struct *p)
 {
-	clock_t utime = cputime_to_clock_t(p->utime),
-		total = utime + cputime_to_clock_t(p->stime);
+	cputime_t utime = p->utime, total = utime + p->stime;
 	u64 temp;
 
 	/*
 	 * Use CFS's precise accounting:
 	 */
-	temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
+	temp = (u64)nsecs_to_cputime(p->se.sum_exec_runtime);
 
 	if (total) {
 		temp *= utime;
 		do_div(temp, total);
 	}
-	utime = (clock_t)temp;
+	utime = (cputime_t)temp;
 
-	p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
+	p->prev_utime = max(p->prev_utime, utime);
 	return p->prev_utime;
 }
 
 cputime_t task_stime(struct task_struct *p)
 {
-	clock_t stime;
+	cputime_t stime;
 
 	/*
 	 * Use CFS's precise accounting. (we subtract utime from
 	 * the total, to make sure the total observed by userspace
 	 * grows monotonically - apps rely on that):
 	 */
-	stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
-			cputime_to_clock_t(task_utime(p));
+	stime = nsecs_to_cputime(p->se.sum_exec_runtime) - task_utime(p);
 
 	if (stime >= 0)
-		p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
+		p->prev_stime = max(p->prev_stime, stime);
 
 	return p->prev_stime;
 }
@@ -6182,22 +6199,14 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
 	BUG_ON(p->se.on_rq);
 
 	p->policy = policy;
-	switch (p->policy) {
-	case SCHED_NORMAL:
-	case SCHED_BATCH:
-	case SCHED_IDLE:
-		p->sched_class = &fair_sched_class;
-		break;
-	case SCHED_FIFO:
-	case SCHED_RR:
-		p->sched_class = &rt_sched_class;
-		break;
-	}
-
 	p->rt_priority = prio;
 	p->normal_prio = normal_prio(p);
 	/* we are holding p->pi_lock already */
 	p->prio = rt_mutex_getprio(p);
+	if (rt_prio(p->prio))
+		p->sched_class = &rt_sched_class;
+	else
+		p->sched_class = &fair_sched_class;
 	set_load_weight(p);
 }
 
@@ -6942,7 +6951,7 @@ void show_state_filter(unsigned long state_filter)
 	/*
 	 * Only show locks if all tasks are dumped:
 	 */
-	if (state_filter == -1)
+	if (!state_filter)
 		debug_show_all_locks();
 }
 
@@ -7747,6 +7756,16 @@ early_initcall(migration_init);
 
 #ifdef CONFIG_SCHED_DEBUG
 
+static __read_mostly int sched_domain_debug_enabled;
+
+static int __init sched_domain_debug_setup(char *str)
+{
+	sched_domain_debug_enabled = 1;
+
+	return 0;
+}
+early_param("sched_debug", sched_domain_debug_setup);
+
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 				  struct cpumask *groupmask)
 {
@@ -7833,6 +7852,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 	cpumask_var_t groupmask;
 	int level = 0;
 
+	if (!sched_domain_debug_enabled)
+		return;
+
 	if (!sd) {
 		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
 		return;
@@ -8890,7 +8912,7 @@ static int build_sched_domains(const struct cpumask *cpu_map)
 	return __build_sched_domains(cpu_map, NULL);
 }
 
-static struct cpumask *doms_cur;	/* current sched domains */
+static cpumask_var_t *doms_cur;	/* current sched domains */
 static int ndoms_cur;		/* number of sched domains in 'doms_cur' */
 static struct sched_domain_attr *dattr_cur;
 				/* attribues of custom domains in 'doms_cur' */
@@ -8912,6 +8934,31 @@ int __attribute__((weak)) arch_update_cpu_topology(void)
 	return 0;
 }
 
+cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
+{
+	int i;
+	cpumask_var_t *doms;
+
+	doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
+	if (!doms)
+		return NULL;
+	for (i = 0; i < ndoms; i++) {
+		if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
+			free_sched_domains(doms, i);
+			return NULL;
+		}
+	}
+	return doms;
+}
+
+void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
+{
+	unsigned int i;
+	for (i = 0; i < ndoms; i++)
+		free_cpumask_var(doms[i]);
+	kfree(doms);
+}
+
 /*
  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
  * For now this just excludes isolated cpus, but could be used to
@@ -8923,12 +8970,12 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
 
 	arch_update_cpu_topology();
 	ndoms_cur = 1;
-	doms_cur = kmalloc(cpumask_size(), GFP_KERNEL);
+	doms_cur = alloc_sched_domains(ndoms_cur);
 	if (!doms_cur)
-		doms_cur = fallback_doms;
-	cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);
+		doms_cur = &fallback_doms;
+	cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
 	dattr_cur = NULL;
-	err = build_sched_domains(doms_cur);
+	err = build_sched_domains(doms_cur[0]);
 	register_sched_domain_sysctl();
 
 	return err;
@@ -8978,19 +9025,19 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
  * doms_new[] to the current sched domain partitioning, doms_cur[].
  * It destroys each deleted domain and builds each new domain.
  *
- * 'doms_new' is an array of cpumask's of length 'ndoms_new'.
+ * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
  * The masks don't intersect (don't overlap.) We should setup one
  * sched domain for each mask. CPUs not in any of the cpumasks will
  * not be load balanced. If the same cpumask appears both in the
  * current 'doms_cur' domains and in the new 'doms_new', we can leave
  * it as it is.
  *
- * The passed in 'doms_new' should be kmalloc'd. This routine takes
- * ownership of it and will kfree it when done with it. If the caller
- * failed the kmalloc call, then it can pass in doms_new == NULL &&
- * ndoms_new == 1, and partition_sched_domains() will fallback to
- * the single partition 'fallback_doms', it also forces the domains
- * to be rebuilt.
+ * The passed in 'doms_new' should be allocated using
+ * alloc_sched_domains.  This routine takes ownership of it and will
+ * free_sched_domains it when done with it. If the caller failed the
+ * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
+ * and partition_sched_domains() will fallback to the single partition
+ * 'fallback_doms', it also forces the domains to be rebuilt.
  *
  * If doms_new == NULL it will be replaced with cpu_online_mask.
  * ndoms_new == 0 is a special case for destroying existing domains,
@@ -8998,8 +9045,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
  *
  * Call with hotplug lock held
  */
-/* FIXME: Change to struct cpumask *doms_new[] */
-void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 			     struct sched_domain_attr *dattr_new)
 {
 	int i, j, n;
@@ -9018,40 +9064,40 @@ void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
 	/* Destroy deleted domains */
 	for (i = 0; i < ndoms_cur; i++) {
 		for (j = 0; j < n && !new_topology; j++) {
-			if (cpumask_equal(&doms_cur[i], &doms_new[j])
+			if (cpumask_equal(doms_cur[i], doms_new[j])
 			    && dattrs_equal(dattr_cur, i, dattr_new, j))
 				goto match1;
 		}
 		/* no match - a current sched domain not in new doms_new[] */
-		detach_destroy_domains(doms_cur + i);
+		detach_destroy_domains(doms_cur[i]);
 match1:
 		;
 	}
 
 	if (doms_new == NULL) {
 		ndoms_cur = 0;
-		doms_new = fallback_doms;
-		cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);
+		doms_new = &fallback_doms;
+		cpumask_andnot(doms_new[0], cpu_online_mask, cpu_isolated_map);
 		WARN_ON_ONCE(dattr_new);
 	}
 
 	/* Build new domains */
 	for (i = 0; i < ndoms_new; i++) {
 		for (j = 0; j < ndoms_cur && !new_topology; j++) {
-			if (cpumask_equal(&doms_new[i], &doms_cur[j])
+			if (cpumask_equal(doms_new[i], doms_cur[j])
 			    && dattrs_equal(dattr_new, i, dattr_cur, j))
 				goto match2;
 		}
 		/* no match - add a new doms_new */
-		__build_sched_domains(doms_new + i,
+		__build_sched_domains(doms_new[i],
 				       dattr_new ? dattr_new + i : NULL);
 match2:
 		;
 	}
 
 	/* Remember the new sched domains */
-	if (doms_cur != fallback_doms)
-		kfree(doms_cur);
+	if (doms_cur != &fallback_doms)
+		free_sched_domains(doms_cur, ndoms_cur);
 	kfree(dattr_cur);	/* kfree(NULL) is safe */
 	doms_cur = doms_new;
 	dattr_cur = dattr_new;
@@ -9373,10 +9419,6 @@ void __init sched_init(void)
 #ifdef CONFIG_CPUMASK_OFFSTACK
 	alloc_size += num_possible_cpus() * cpumask_size();
 #endif
-	/*
-	 * As sched_init() is called before page_alloc is setup,
-	 * we use alloc_bootmem().
-	 */
 	if (alloc_size) {
 		ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
 
@@ -9531,6 +9573,8 @@ void __init sched_init(void)
 		rq->cpu = i;
 		rq->online = 0;
 		rq->migration_thread = NULL;
+		rq->idle_stamp = 0;
+		rq->avg_idle = 2*sysctl_sched_migration_cost;
 		INIT_LIST_HEAD(&rq->migration_queue);
 		rq_attach_root(rq, &def_root_domain);
 #endif
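For reference, the alloc_sched_domains()/free_sched_domains() helpers added above are meant to be paired with the new partition_sched_domains() signature the way kernel/cpuset.c now does it. A minimal sketch of that ownership contract follows (illustration only, not part of the patch; example_repartition() is a made-up caller, and the hotplug-lock requirement comes from the comment block above):

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/sched.h>

static void example_repartition(void)
{
	int ndoms = 1;
	cpumask_var_t *doms = alloc_sched_domains(ndoms);

	get_online_cpus();
	if (!doms) {
		/* doms_new == NULL && ndoms_new == 1 falls back to fallback_doms */
		partition_sched_domains(1, NULL, NULL);
	} else {
		cpumask_copy(doms[0], cpu_online_mask);
		/* the scheduler takes ownership and will free_sched_domains() it */
		partition_sched_domains(ndoms, doms, NULL);
	}
	put_online_cpus();
}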
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index efb84409bc43..6988cf08f705 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -285,12 +285,16 @@ static void print_cpu(struct seq_file *m, int cpu)
 
 #ifdef CONFIG_SCHEDSTATS
 #define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n);
+#define P64(n) SEQ_printf(m, "  .%-30s: %Ld\n", #n, rq->n);
 
 	P(yld_count);
 
 	P(sched_switch);
 	P(sched_count);
 	P(sched_goidle);
+#ifdef CONFIG_SMP
+	P64(avg_idle);
+#endif
 
 	P(ttwu_count);
 	P(ttwu_local);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 37087a7fac22..f61837ad336d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1345,6 +1345,37 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 }
 
 /*
+ * Try and locate an idle CPU in the sched_domain.
+ */
+static int
+select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
+{
+	int cpu = smp_processor_id();
+	int prev_cpu = task_cpu(p);
+	int i;
+
+	/*
+	 * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE
+	 * test in select_task_rq_fair) and the prev_cpu is idle then that's
+	 * always a better target than the current cpu.
+	 */
+	if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running)
+		return prev_cpu;
+
+	/*
+	 * Otherwise, iterate the domain and find an elegible idle cpu.
+	 */
+	for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
+		if (!cpu_rq(i)->cfs.nr_running) {
+			target = i;
+			break;
+		}
+	}
+
+	return target;
+}
+
+/*
  * sched_balance_self: balance the current task (running on cpu) in domains
  * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
  * SD_BALANCE_EXEC.
@@ -1398,11 +1429,35 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 			want_sd = 0;
 		}
 
-		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
-		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+		/*
+		 * While iterating the domains looking for a spanning
+		 * WAKE_AFFINE domain, adjust the affine target to any idle cpu
+		 * in cache sharing domains along the way.
+		 */
+		if (want_affine) {
+			int target = -1;
 
-			affine_sd = tmp;
-			want_affine = 0;
+			/*
+			 * If both cpu and prev_cpu are part of this domain,
+			 * cpu is a valid SD_WAKE_AFFINE target.
+			 */
+			if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
+				target = cpu;
+
+			/*
+			 * If there's an idle sibling in this domain, make that
+			 * the wake_affine target instead of the current cpu.
+			 */
+			if (tmp->flags & SD_PREFER_SIBLING)
+				target = select_idle_sibling(p, tmp, target);
+
+			if (target >= 0) {
+				if (tmp->flags & SD_WAKE_AFFINE) {
+					affine_sd = tmp;
+					want_affine = 0;
+				}
+				cpu = target;
+			}
 		}
 
 	if (!want_sd && !want_affine)
@@ -1679,7 +1734,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
 	struct cfs_rq *cfs_rq = &rq->cfs;
 	struct sched_entity *se;
 
-	if (unlikely(!cfs_rq->nr_running))
+	if (!cfs_rq->nr_running)
 		return NULL;
 
 	do {
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index a4d790cddb19..5c5fef378415 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1153,29 +1153,12 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
 
 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
 
-static inline int pick_optimal_cpu(int this_cpu,
-				   const struct cpumask *mask)
-{
-	int first;
-
-	/* "this_cpu" is cheaper to preempt than a remote processor */
-	if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask))
-		return this_cpu;
-
-	first = cpumask_first(mask);
-	if (first < nr_cpu_ids)
-		return first;
-
-	return -1;
-}
-
 static int find_lowest_rq(struct task_struct *task)
 {
 	struct sched_domain *sd;
 	struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
 	int this_cpu = smp_processor_id();
 	int cpu = task_cpu(task);
-	cpumask_var_t domain_mask;
 
 	if (task->rt.nr_cpus_allowed == 1)
 		return -1; /* No other targets possible */
@@ -1198,28 +1181,26 @@ static int find_lowest_rq(struct task_struct *task)
 	 * Otherwise, we consult the sched_domains span maps to figure
 	 * out which cpu is logically closest to our hot cache data.
 	 */
-	if (this_cpu == cpu)
-		this_cpu = -1;	/* Skip this_cpu opt if the same */
-
-	if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) {
-		for_each_domain(cpu, sd) {
-			if (sd->flags & SD_WAKE_AFFINE) {
-				int best_cpu;
-
-				cpumask_and(domain_mask,
-					    sched_domain_span(sd),
-					    lowest_mask);
-
-				best_cpu = pick_optimal_cpu(this_cpu,
-							    domain_mask);
-
-				if (best_cpu != -1) {
-					free_cpumask_var(domain_mask);
-					return best_cpu;
-				}
-			}
+	if (!cpumask_test_cpu(this_cpu, lowest_mask))
+		this_cpu = -1; /* Skip this_cpu opt if not among lowest */
+
+	for_each_domain(cpu, sd) {
+		if (sd->flags & SD_WAKE_AFFINE) {
+			int best_cpu;
+
+			/*
+			 * "this_cpu" is cheaper to preempt than a
+			 * remote processor.
+			 */
+			if (this_cpu != -1 &&
+			    cpumask_test_cpu(this_cpu, sched_domain_span(sd)))
+				return this_cpu;
+
+			best_cpu = cpumask_first_and(lowest_mask,
+						     sched_domain_span(sd));
+			if (best_cpu < nr_cpu_ids)
+				return best_cpu;
 		}
-		free_cpumask_var(domain_mask);
 	}
 
 	/*
@@ -1227,7 +1208,13 @@ static int find_lowest_rq(struct task_struct *task)
 	 * just give the caller *something* to work with from the compatible
 	 * locations.
 	 */
-	return pick_optimal_cpu(this_cpu, lowest_mask);
+	if (this_cpu != -1)
+		return this_cpu;
+
+	cpu = cpumask_any(lowest_mask);
+	if (cpu < nr_cpu_ids)
+		return cpu;
+	return -1;
 }
 
 /* Will lock the rq it finds */