path: root/kernel/sched.c
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	823
1 file changed, 503 insertions(+), 320 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 1b59e265273..da1edc8277d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -309,8 +309,8 @@ void set_tg_uid(struct user_struct *user)
 
 /*
  * Root task group.
  * Every UID task group (including init_task_group aka UID-0) will
  * be a child to this group.
  */
 struct task_group root_task_group;
 
@@ -318,7 +318,7 @@ struct task_group root_task_group;
 /* Default task group's sched entity on each cpu */
 static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
 /* Default task group's cfs_rq on each cpu */
-static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(struct cfs_rq, init_tg_cfs_rq) ____cacheline_aligned_in_smp;
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -616,6 +616,7 @@ struct rq {
 
 	unsigned char idle_at_tick;
 	/* For active balancing */
+	int post_schedule;
 	int active_balance;
 	int push_cpu;
 	/* cpu of this runqueue: */
@@ -693,6 +694,7 @@ static inline int cpu_of(struct rq *rq)
 #define this_rq()		(&__get_cpu_var(runqueues))
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
+#define raw_rq()		(&__raw_get_cpu_var(runqueues))
 
 inline void update_rq_clock(struct rq *rq)
 {
@@ -1513,28 +1515,35 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
+struct update_shares_data {
+	unsigned long rq_weight[NR_CPUS];
+};
+
+static DEFINE_PER_CPU(struct update_shares_data, update_shares_data);
+
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 
 /*
  * Calculate and set the cpu's group shares.
  */
-static void
-update_group_shares_cpu(struct task_group *tg, int cpu,
-			unsigned long sd_shares, unsigned long sd_rq_weight)
+static void update_group_shares_cpu(struct task_group *tg, int cpu,
+				    unsigned long sd_shares,
+				    unsigned long sd_rq_weight,
+				    struct update_shares_data *usd)
 {
-	unsigned long shares;
-	unsigned long rq_weight;
-
-	if (!tg->se[cpu])
-		return;
+	unsigned long shares, rq_weight;
+	int boost = 0;
 
-	rq_weight = tg->cfs_rq[cpu]->rq_weight;
+	rq_weight = usd->rq_weight[cpu];
+	if (!rq_weight) {
+		boost = 1;
+		rq_weight = NICE_0_LOAD;
+	}
 
 	/*
-	 *           \Sum shares * rq_weight
-	 * shares =  -----------------------
-	 *               \Sum rq_weight
-	 *
+	 *             \Sum_j shares_j * rq_weight_i
+	 * shares_i =  -----------------------------
+	 *                  \Sum_j rq_weight_j
 	 */
 	shares = (sd_shares * rq_weight) / sd_rq_weight;
 	shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
@@ -1545,8 +1554,8 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
 		unsigned long flags;
 
 		spin_lock_irqsave(&rq->lock, flags);
-		tg->cfs_rq[cpu]->shares = shares;
-
+		tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
+		tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
 		__set_se_shares(tg->se[cpu], shares);
 		spin_unlock_irqrestore(&rq->lock, flags);
 	}
@@ -1559,22 +1568,30 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
  */
 static int tg_shares_up(struct task_group *tg, void *data)
 {
-	unsigned long weight, rq_weight = 0;
-	unsigned long shares = 0;
+	unsigned long weight, rq_weight = 0, shares = 0;
+	struct update_shares_data *usd;
 	struct sched_domain *sd = data;
+	unsigned long flags;
 	int i;
 
+	if (!tg->se[0])
+		return 0;
+
+	local_irq_save(flags);
+	usd = &__get_cpu_var(update_shares_data);
+
 	for_each_cpu(i, sched_domain_span(sd)) {
+		weight = tg->cfs_rq[i]->load.weight;
+		usd->rq_weight[i] = weight;
+
 		/*
 		 * If there are currently no tasks on the cpu pretend there
 		 * is one of average load so that when a new task gets to
 		 * run here it will not get delayed by group starvation.
 		 */
-		weight = tg->cfs_rq[i]->load.weight;
 		if (!weight)
 			weight = NICE_0_LOAD;
 
-		tg->cfs_rq[i]->rq_weight = weight;
 		rq_weight += weight;
 		shares += tg->cfs_rq[i]->shares;
 	}
@@ -1586,7 +1603,9 @@ static int tg_shares_up(struct task_group *tg, void *data)
 		shares = tg->shares;
 
 	for_each_cpu(i, sched_domain_span(sd))
-		update_group_shares_cpu(tg, i, shares, rq_weight);
+		update_group_shares_cpu(tg, i, shares, rq_weight, usd);
+
+	local_irq_restore(flags);
 
 	return 0;
 }
@@ -1616,8 +1635,14 @@ static int tg_load_down(struct task_group *tg, void *data)
 
 static void update_shares(struct sched_domain *sd)
 {
-	u64 now = cpu_clock(raw_smp_processor_id());
-	s64 elapsed = now - sd->last_update;
+	s64 elapsed;
+	u64 now;
+
+	if (root_task_group_empty())
+		return;
+
+	now = cpu_clock(raw_smp_processor_id());
+	elapsed = now - sd->last_update;
 
 	if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
 		sd->last_update = now;
@@ -1627,6 +1652,9 @@ static void update_shares(struct sched_domain *sd)
 
 static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 {
+	if (root_task_group_empty())
+		return;
+
 	spin_unlock(&rq->lock);
 	update_shares(sd);
 	spin_lock(&rq->lock);
@@ -1634,6 +1662,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 
 static void update_h_load(long cpu)
 {
+	if (root_task_group_empty())
+		return;
+
 	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
 }
 
@@ -2637,9 +2668,32 @@ void sched_fork(struct task_struct *p, int clone_flags)
 	set_task_cpu(p, cpu);
 
 	/*
-	 * Make sure we do not leak PI boosting priority to the child:
+	 * Make sure we do not leak PI boosting priority to the child.
 	 */
 	p->prio = current->normal_prio;
+
+	/*
+	 * Revert to default priority/policy on fork if requested.
+	 */
+	if (unlikely(p->sched_reset_on_fork)) {
+		if (p->policy == SCHED_FIFO || p->policy == SCHED_RR)
+			p->policy = SCHED_NORMAL;
+
+		if (p->normal_prio < DEFAULT_PRIO)
+			p->prio = DEFAULT_PRIO;
+
+		if (PRIO_TO_NICE(p->static_prio) < 0) {
+			p->static_prio = NICE_TO_PRIO(0);
+			set_load_weight(p);
+		}
+
+		/*
+		 * We don't need the reset flag anymore after the fork. It has
+		 * fulfilled its duty:
+		 */
+		p->sched_reset_on_fork = 0;
+	}
+
 	if (!rt_prio(p->prio))
 		p->sched_class = &fair_sched_class;
 
@@ -2796,12 +2850,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 {
 	struct mm_struct *mm = rq->prev_mm;
 	long prev_state;
-#ifdef CONFIG_SMP
-	int post_schedule = 0;
-
-	if (current->sched_class->needs_post_schedule)
-		post_schedule = current->sched_class->needs_post_schedule(rq);
-#endif
 
 	rq->prev_mm = NULL;
 
@@ -2820,10 +2868,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	finish_arch_switch(prev);
 	perf_counter_task_sched_in(current, cpu_of(rq));
 	finish_lock_switch(rq, prev);
-#ifdef CONFIG_SMP
-	if (post_schedule)
-		current->sched_class->post_schedule(rq);
-#endif
 
 	fire_sched_in_preempt_notifiers(current);
 	if (mm)
@@ -2838,6 +2882,42 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	}
 }
 
+#ifdef CONFIG_SMP
+
+/* assumes rq->lock is held */
+static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
+{
+	if (prev->sched_class->pre_schedule)
+		prev->sched_class->pre_schedule(rq, prev);
+}
+
+/* rq->lock is NOT held, but preemption is disabled */
+static inline void post_schedule(struct rq *rq)
+{
+	if (rq->post_schedule) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&rq->lock, flags);
+		if (rq->curr->sched_class->post_schedule)
+			rq->curr->sched_class->post_schedule(rq);
+		spin_unlock_irqrestore(&rq->lock, flags);
+
+		rq->post_schedule = 0;
+	}
+}
+
+#else
+
+static inline void pre_schedule(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void post_schedule(struct rq *rq)
+{
+}
+
+#endif
+
 /**
  * schedule_tail - first thing a freshly forked thread must call.
  * @prev: the thread we just switched away from.
@@ -2848,6 +2928,13 @@ asmlinkage void schedule_tail(struct task_struct *prev)
 	struct rq *rq = this_rq();
 
 	finish_task_switch(rq, prev);
+
+	/*
+	 * FIXME: do we need to worry about rq being invalidated by the
+	 * task_switch?
+	 */
+	post_schedule(rq);
+
 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
 	/* In this case, finish_task_switch does not reenable preemption */
 	preempt_enable();
@@ -3379,9 +3466,10 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
 {
 	const struct sched_class *class;
 
-	for (class = sched_class_highest; class; class = class->next)
+	for_each_class(class) {
 		if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
 			return 1;
+	}
 
 	return 0;
 }
@@ -5349,10 +5437,7 @@ need_resched_nonpreemptible:
 		switch_count = &prev->nvcsw;
 	}
 
-#ifdef CONFIG_SMP
-	if (prev->sched_class->pre_schedule)
-		prev->sched_class->pre_schedule(rq, prev);
-#endif
+	pre_schedule(rq, prev);
 
 	if (unlikely(!rq->nr_running))
 		idle_balance(cpu, rq);
@@ -5378,6 +5463,8 @@ need_resched_nonpreemptible:
 	} else
 		spin_unlock_irq(&rq->lock);
 
+	post_schedule(rq);
+
 	if (unlikely(reacquire_kernel_lock(current) < 0))
 		goto need_resched_nonpreemptible;
 
@@ -6123,17 +6210,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
 	unsigned long flags;
 	const struct sched_class *prev_class = p->sched_class;
 	struct rq *rq;
+	int reset_on_fork;
 
 	/* may grab non-irq protected spin_locks */
 	BUG_ON(in_interrupt());
 recheck:
 	/* double check policy once rq lock held */
-	if (policy < 0)
+	if (policy < 0) {
+		reset_on_fork = p->sched_reset_on_fork;
 		policy = oldpolicy = p->policy;
-	else if (policy != SCHED_FIFO && policy != SCHED_RR &&
-			policy != SCHED_NORMAL && policy != SCHED_BATCH &&
-			policy != SCHED_IDLE)
-		return -EINVAL;
+	} else {
+		reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
+		policy &= ~SCHED_RESET_ON_FORK;
+
+		if (policy != SCHED_FIFO && policy != SCHED_RR &&
+				policy != SCHED_NORMAL && policy != SCHED_BATCH &&
+				policy != SCHED_IDLE)
+			return -EINVAL;
+	}
+
 	/*
 	 * Valid priorities for SCHED_FIFO and SCHED_RR are
 	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
@@ -6177,6 +6272,10 @@ recheck:
 		/* can't change other user's priorities */
 		if (!check_same_owner(p))
 			return -EPERM;
+
+		/* Normal users shall not reset the sched_reset_on_fork flag */
+		if (p->sched_reset_on_fork && !reset_on_fork)
+			return -EPERM;
 	}
 
 	if (user) {
@@ -6220,6 +6319,8 @@ recheck:
 	if (running)
 		p->sched_class->put_prev_task(rq, p);
 
+	p->sched_reset_on_fork = reset_on_fork;
+
 	oldprio = p->prio;
 	__setscheduler(rq, p, policy, param->sched_priority);
 
@@ -6336,14 +6437,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
 	if (p) {
 		retval = security_task_getscheduler(p);
 		if (!retval)
-			retval = p->policy;
+			retval = p->policy
+				| (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
 	}
 	read_unlock(&tasklist_lock);
 	return retval;
 }
 
 /**
- * sys_sched_getscheduler - get the RT priority of a thread
+ * sys_sched_getparam - get the RT priority of a thread
  * @pid: the pid in question.
  * @param: structure containing the RT priority.
  */
@@ -6571,19 +6673,9 @@ static inline int should_resched(void)
 
 static void __cond_resched(void)
 {
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
-	__might_sleep(__FILE__, __LINE__);
-#endif
-	/*
-	 * The BKS might be reacquired before we have dropped
-	 * PREEMPT_ACTIVE, which could trigger a second
-	 * cond_resched() call.
-	 */
-	do {
-		add_preempt_count(PREEMPT_ACTIVE);
-		schedule();
-		sub_preempt_count(PREEMPT_ACTIVE);
-	} while (need_resched());
+	add_preempt_count(PREEMPT_ACTIVE);
+	schedule();
+	sub_preempt_count(PREEMPT_ACTIVE);
 }
 
 int __sched _cond_resched(void)
@@ -6597,14 +6689,14 @@ int __sched _cond_resched(void)
 EXPORT_SYMBOL(_cond_resched);
 
 /*
- * cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
  * call schedule, and on return reacquire the lock.
  *
  * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
 * operations here to prevent schedule() from being called twice (once via
 * spin_unlock(), once by hand).
 */
-int cond_resched_lock(spinlock_t *lock)
+int __cond_resched_lock(spinlock_t *lock)
 {
 	int resched = should_resched();
 	int ret = 0;
@@ -6620,9 +6712,9 @@ int cond_resched_lock(spinlock_t *lock)
 	}
 	return ret;
 }
-EXPORT_SYMBOL(cond_resched_lock);
+EXPORT_SYMBOL(__cond_resched_lock);
 
-int __sched cond_resched_softirq(void)
+int __sched __cond_resched_softirq(void)
 {
 	BUG_ON(!in_softirq());
 
@@ -6634,7 +6726,7 @@ int __sched cond_resched_softirq(void)
 	}
 	return 0;
 }
-EXPORT_SYMBOL(cond_resched_softirq);
+EXPORT_SYMBOL(__cond_resched_softirq);
 
 /**
  * yield - yield the current processor to other threads.
@@ -6658,11 +6750,13 @@ EXPORT_SYMBOL(yield);
  */
 void __sched io_schedule(void)
 {
-	struct rq *rq = &__raw_get_cpu_var(runqueues);
+	struct rq *rq = raw_rq();
 
 	delayacct_blkio_start();
 	atomic_inc(&rq->nr_iowait);
+	current->in_iowait = 1;
 	schedule();
+	current->in_iowait = 0;
 	atomic_dec(&rq->nr_iowait);
 	delayacct_blkio_end();
 }
@@ -6670,12 +6764,14 @@ EXPORT_SYMBOL(io_schedule);
 
 long __sched io_schedule_timeout(long timeout)
 {
-	struct rq *rq = &__raw_get_cpu_var(runqueues);
+	struct rq *rq = raw_rq();
 	long ret;
 
 	delayacct_blkio_start();
 	atomic_inc(&rq->nr_iowait);
+	current->in_iowait = 1;
 	ret = schedule_timeout(timeout);
+	current->in_iowait = 0;
 	atomic_dec(&rq->nr_iowait);
 	delayacct_blkio_end();
 	return ret;
@@ -6992,8 +7088,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 
 	if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
 		/* Need help from migration thread: drop lock and wait. */
+		struct task_struct *mt = rq->migration_thread;
+
+		get_task_struct(mt);
 		task_rq_unlock(rq, &flags);
 		wake_up_process(rq->migration_thread);
+		put_task_struct(mt);
 		wait_for_completion(&req.done);
 		tlb_migrate_finish(p->mm);
 		return 0;
@@ -7625,7 +7725,7 @@ static int __init migration_init(void)
 	migration_call(&migration_notifier, CPU_ONLINE, cpu);
 	register_cpu_notifier(&migration_notifier);
 
-	return err;
+	return 0;
 }
 early_initcall(migration_init);
 #endif
@@ -7841,7 +7941,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 	rq->rd = rd;
 
 	cpumask_set_cpu(rq->cpu, rd->span);
-	if (cpumask_test_cpu(rq->cpu, cpu_online_mask))
+	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
 		set_rq_online(rq);
 
 	spin_unlock_irqrestore(&rq->lock, flags);
@@ -8091,6 +8191,39 @@ struct static_sched_domain {
 	DECLARE_BITMAP(span, CONFIG_NR_CPUS);
 };
 
+struct s_data {
+#ifdef CONFIG_NUMA
+	int sd_allnodes;
+	cpumask_var_t domainspan;
+	cpumask_var_t covered;
+	cpumask_var_t notcovered;
+#endif
+	cpumask_var_t nodemask;
+	cpumask_var_t this_sibling_map;
+	cpumask_var_t this_core_map;
+	cpumask_var_t send_covered;
+	cpumask_var_t tmpmask;
+	struct sched_group **sched_group_nodes;
+	struct root_domain *rd;
+};
+
+enum s_alloc {
+	sa_sched_groups = 0,
+	sa_rootdomain,
+	sa_tmpmask,
+	sa_send_covered,
+	sa_this_core_map,
+	sa_this_sibling_map,
+	sa_nodemask,
+	sa_sched_group_nodes,
+#ifdef CONFIG_NUMA
+	sa_notcovered,
+	sa_covered,
+	sa_domainspan,
+#endif
+	sa_none,
+};
+
 /*
  * SMT sched-domains:
  */
@@ -8213,6 +8346,71 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
 		sg = sg->next;
 	} while (sg != group_head);
 }
+
+static int build_numa_sched_groups(struct s_data *d,
+				   const struct cpumask *cpu_map, int num)
+{
+	struct sched_domain *sd;
+	struct sched_group *sg, *prev;
+	int n, j;
+
+	cpumask_clear(d->covered);
+	cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
+	if (cpumask_empty(d->nodemask)) {
+		d->sched_group_nodes[num] = NULL;
+		goto out;
+	}
+
+	sched_domain_node_span(num, d->domainspan);
+	cpumask_and(d->domainspan, d->domainspan, cpu_map);
+
+	sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
+			  GFP_KERNEL, num);
+	if (!sg) {
+		printk(KERN_WARNING "Can not alloc domain group for node %d\n",
+		       num);
+		return -ENOMEM;
+	}
+	d->sched_group_nodes[num] = sg;
+
+	for_each_cpu(j, d->nodemask) {
+		sd = &per_cpu(node_domains, j).sd;
+		sd->groups = sg;
+	}
+
+	sg->__cpu_power = 0;
+	cpumask_copy(sched_group_cpus(sg), d->nodemask);
+	sg->next = sg;
+	cpumask_or(d->covered, d->covered, d->nodemask);
+
+	prev = sg;
+	for (j = 0; j < nr_node_ids; j++) {
+		n = (num + j) % nr_node_ids;
+		cpumask_complement(d->notcovered, d->covered);
+		cpumask_and(d->tmpmask, d->notcovered, cpu_map);
+		cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
+		if (cpumask_empty(d->tmpmask))
+			break;
+		cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
+		if (cpumask_empty(d->tmpmask))
+			continue;
+		sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
+				  GFP_KERNEL, num);
+		if (!sg) {
+			printk(KERN_WARNING
+			       "Can not alloc domain group for node %d\n", j);
+			return -ENOMEM;
+		}
+		sg->__cpu_power = 0;
+		cpumask_copy(sched_group_cpus(sg), d->tmpmask);
+		sg->next = prev->next;
+		cpumask_or(d->covered, d->covered, d->tmpmask);
+		prev->next = sg;
+		prev = sg;
+	}
+out:
+	return 0;
+}
 #endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_NUMA
@@ -8378,280 +8576,285 @@ static void set_domain_attribute(struct sched_domain *sd,
 	}
 }
 
-/*
- * Build sched domains for a given set of cpus and attach the sched domains
- * to the individual cpus
- */
-static int __build_sched_domains(const struct cpumask *cpu_map,
-				 struct sched_domain_attr *attr)
-{
-	int i, err = -ENOMEM;
-	struct root_domain *rd;
-	cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
-		tmpmask;
+static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
+				 const struct cpumask *cpu_map)
+{
+	switch (what) {
+	case sa_sched_groups:
+		free_sched_groups(cpu_map, d->tmpmask); /* fall through */
+		d->sched_group_nodes = NULL;
+	case sa_rootdomain:
+		free_rootdomain(d->rd); /* fall through */
+	case sa_tmpmask:
+		free_cpumask_var(d->tmpmask); /* fall through */
+	case sa_send_covered:
+		free_cpumask_var(d->send_covered); /* fall through */
+	case sa_this_core_map:
+		free_cpumask_var(d->this_core_map); /* fall through */
+	case sa_this_sibling_map:
+		free_cpumask_var(d->this_sibling_map); /* fall through */
+	case sa_nodemask:
+		free_cpumask_var(d->nodemask); /* fall through */
+	case sa_sched_group_nodes:
 #ifdef CONFIG_NUMA
-	cpumask_var_t domainspan, covered, notcovered;
-	struct sched_group **sched_group_nodes = NULL;
-	int sd_allnodes = 0;
-
-	if (!alloc_cpumask_var(&domainspan, GFP_KERNEL))
-		goto out;
-	if (!alloc_cpumask_var(&covered, GFP_KERNEL))
-		goto free_domainspan;
-	if (!alloc_cpumask_var(&notcovered, GFP_KERNEL))
-		goto free_covered;
-#endif
-
-	if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
-		goto free_notcovered;
-	if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
-		goto free_nodemask;
-	if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
-		goto free_this_sibling_map;
-	if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
-		goto free_this_core_map;
-	if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
-		goto free_send_covered;
+	kfree(d->sched_group_nodes); /* fall through */
+	case sa_notcovered:
+		free_cpumask_var(d->notcovered); /* fall through */
+	case sa_covered:
+		free_cpumask_var(d->covered); /* fall through */
+	case sa_domainspan:
+		free_cpumask_var(d->domainspan); /* fall through */
+#endif
+	case sa_none:
+		break;
+	}
+}
 
+static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
+						   const struct cpumask *cpu_map)
+{
 #ifdef CONFIG_NUMA
-	/*
-	 * Allocate the per-node list of sched groups
-	 */
-	sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
-				    GFP_KERNEL);
-	if (!sched_group_nodes) {
+	if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
+		return sa_none;
+	if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
+		return sa_domainspan;
+	if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
+		return sa_covered;
+	/* Allocate the per-node list of sched groups */
+	d->sched_group_nodes = kcalloc(nr_node_ids,
+				       sizeof(struct sched_group *), GFP_KERNEL);
+	if (!d->sched_group_nodes) {
 		printk(KERN_WARNING "Can not alloc sched group node list\n");
-		goto free_tmpmask;
+		return sa_notcovered;
 	}
-#endif
-
-	rd = alloc_rootdomain();
-	if (!rd) {
+	sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
+#endif
+	if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
+		return sa_sched_group_nodes;
+	if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
+		return sa_nodemask;
+	if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
+		return sa_this_sibling_map;
+	if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
+		return sa_this_core_map;
+	if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
+		return sa_send_covered;
+	d->rd = alloc_rootdomain();
+	if (!d->rd) {
 		printk(KERN_WARNING "Cannot alloc root domain\n");
-		goto free_sched_groups;
+		return sa_tmpmask;
 	}
+	return sa_rootdomain;
+}
 
+static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
+	const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
+{
+	struct sched_domain *sd = NULL;
 #ifdef CONFIG_NUMA
-	sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes;
-#endif
-
-	/*
-	 * Set up domains for cpus specified by the cpu_map.
-	 */
-	for_each_cpu(i, cpu_map) {
-		struct sched_domain *sd = NULL, *p;
-
-		cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);
-
-#ifdef CONFIG_NUMA
-		if (cpumask_weight(cpu_map) >
-				SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
-			sd = &per_cpu(allnodes_domains, i).sd;
-			SD_INIT(sd, ALLNODES);
-			set_domain_attribute(sd, attr);
-			cpumask_copy(sched_domain_span(sd), cpu_map);
-			cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
-			p = sd;
-			sd_allnodes = 1;
-		} else
-			p = NULL;
+	struct sched_domain *parent;
 
-		sd = &per_cpu(node_domains, i).sd;
-		SD_INIT(sd, NODE);
+	d->sd_allnodes = 0;
+	if (cpumask_weight(cpu_map) >
+	    SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
+		sd = &per_cpu(allnodes_domains, i).sd;
+		SD_INIT(sd, ALLNODES);
 		set_domain_attribute(sd, attr);
-		sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
-		sd->parent = p;
-		if (p)
-			p->child = sd;
-		cpumask_and(sched_domain_span(sd),
-			    sched_domain_span(sd), cpu_map);
+		cpumask_copy(sched_domain_span(sd), cpu_map);
+		cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
+		d->sd_allnodes = 1;
+	}
+	parent = sd;
+
+	sd = &per_cpu(node_domains, i).sd;
+	SD_INIT(sd, NODE);
+	set_domain_attribute(sd, attr);
+	sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
+	sd->parent = parent;
+	if (parent)
+		parent->child = sd;
+	cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
 #endif
+	return sd;
+}
 
-		p = sd;
-		sd = &per_cpu(phys_domains, i).sd;
-		SD_INIT(sd, CPU);
-		set_domain_attribute(sd, attr);
-		cpumask_copy(sched_domain_span(sd), nodemask);
-		sd->parent = p;
-		if (p)
-			p->child = sd;
-		cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask);
+static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
+	const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+	struct sched_domain *parent, int i)
+{
+	struct sched_domain *sd;
+	sd = &per_cpu(phys_domains, i).sd;
+	SD_INIT(sd, CPU);
+	set_domain_attribute(sd, attr);
+	cpumask_copy(sched_domain_span(sd), d->nodemask);
+	sd->parent = parent;
+	if (parent)
+		parent->child = sd;
+	cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
+	return sd;
+}
 
+static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
+	const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+	struct sched_domain *parent, int i)
+{
+	struct sched_domain *sd = parent;
 #ifdef CONFIG_SCHED_MC
-		p = sd;
-		sd = &per_cpu(core_domains, i).sd;
-		SD_INIT(sd, MC);
-		set_domain_attribute(sd, attr);
-		cpumask_and(sched_domain_span(sd), cpu_map,
-			    cpu_coregroup_mask(i));
-		sd->parent = p;
-		p->child = sd;
-		cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
+	sd = &per_cpu(core_domains, i).sd;
+	SD_INIT(sd, MC);
+	set_domain_attribute(sd, attr);
+	cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
+	sd->parent = parent;
+	parent->child = sd;
+	cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
 #endif
+	return sd;
+}
 
+static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
+	const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+	struct sched_domain *parent, int i)
+{
+	struct sched_domain *sd = parent;
 #ifdef CONFIG_SCHED_SMT
-		p = sd;
-		sd = &per_cpu(cpu_domains, i).sd;
-		SD_INIT(sd, SIBLING);
-		set_domain_attribute(sd, attr);
-		cpumask_and(sched_domain_span(sd),
-			    topology_thread_cpumask(i), cpu_map);
-		sd->parent = p;
-		p->child = sd;
-		cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
+	sd = &per_cpu(cpu_domains, i).sd;
+	SD_INIT(sd, SIBLING);
+	set_domain_attribute(sd, attr);
+	cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
+	sd->parent = parent;
+	parent->child = sd;
+	cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
 #endif
-	}
+	return sd;
+}
 
+static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
+			       const struct cpumask *cpu_map, int cpu)
+{
+	switch (l) {
 #ifdef CONFIG_SCHED_SMT
-	/* Set up CPU (sibling) groups */
-	for_each_cpu(i, cpu_map) {
-		cpumask_and(this_sibling_map,
-			    topology_thread_cpumask(i), cpu_map);
-		if (i != cpumask_first(this_sibling_map))
-			continue;
-
-		init_sched_build_groups(this_sibling_map, cpu_map,
-					&cpu_to_cpu_group,
-					send_covered, tmpmask);
-	}
+	case SD_LV_SIBLING: /* set up CPU (sibling) groups */
+		cpumask_and(d->this_sibling_map, cpu_map,
+			    topology_thread_cpumask(cpu));
+		if (cpu == cpumask_first(d->this_sibling_map))
+			init_sched_build_groups(d->this_sibling_map, cpu_map,
+						&cpu_to_cpu_group,
+						d->send_covered, d->tmpmask);
+		break;
 #endif
-
 #ifdef CONFIG_SCHED_MC
-	/* Set up multi-core groups */
-	for_each_cpu(i, cpu_map) {
-		cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map);
-		if (i != cpumask_first(this_core_map))
-			continue;
-
-		init_sched_build_groups(this_core_map, cpu_map,
-					&cpu_to_core_group,
-					send_covered, tmpmask);
-	}
+	case SD_LV_MC: /* set up multi-core groups */
+		cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
+		if (cpu == cpumask_first(d->this_core_map))
+			init_sched_build_groups(d->this_core_map, cpu_map,
+						&cpu_to_core_group,
+						d->send_covered, d->tmpmask);
+		break;
 #endif
-
-	/* Set up physical groups */
-	for (i = 0; i < nr_node_ids; i++) {
-		cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
-		if (cpumask_empty(nodemask))
-			continue;
-
-		init_sched_build_groups(nodemask, cpu_map,
-					&cpu_to_phys_group,
-					send_covered, tmpmask);
-	}
-
+	case SD_LV_CPU: /* set up physical groups */
+		cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
+		if (!cpumask_empty(d->nodemask))
+			init_sched_build_groups(d->nodemask, cpu_map,
+						&cpu_to_phys_group,
+						d->send_covered, d->tmpmask);
+		break;
 #ifdef CONFIG_NUMA
-	/* Set up node groups */
-	if (sd_allnodes) {
-		init_sched_build_groups(cpu_map, cpu_map,
-					&cpu_to_allnodes_group,
-					send_covered, tmpmask);
+	case SD_LV_ALLNODES:
+		init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
+					d->send_covered, d->tmpmask);
+		break;
+#endif
+	default:
+		break;
 	}
+}
 
-	for (i = 0; i < nr_node_ids; i++) {
-		/* Set up node groups */
-		struct sched_group *sg, *prev;
-		int j;
-
-		cpumask_clear(covered);
-		cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
-		if (cpumask_empty(nodemask)) {
-			sched_group_nodes[i] = NULL;
-			continue;
-		}
+/*
 * Build sched domains for a given set of cpus and attach the sched domains
 * to the individual cpus
 */
+static int __build_sched_domains(const struct cpumask *cpu_map,
+				 struct sched_domain_attr *attr)
+{
+	enum s_alloc alloc_state = sa_none;
+	struct s_data d;
+	struct sched_domain *sd;
+	int i;
+#ifdef CONFIG_NUMA
+	d.sd_allnodes = 0;
+#endif
 
-		sched_domain_node_span(i, domainspan);
-		cpumask_and(domainspan, domainspan, cpu_map);
+	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
+	if (alloc_state != sa_rootdomain)
+		goto error;
+	alloc_state = sa_sched_groups;
 
-		sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
-				  GFP_KERNEL, i);
-		if (!sg) {
-			printk(KERN_WARNING "Can not alloc domain group for "
-			       "node %d\n", i);
-			goto error;
-		}
-		sched_group_nodes[i] = sg;
-		for_each_cpu(j, nodemask) {
-			struct sched_domain *sd;
+	/*
+	 * Set up domains for cpus specified by the cpu_map.
+	 */
+	for_each_cpu(i, cpu_map) {
+		cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
+			    cpu_map);
 
-			sd = &per_cpu(node_domains, j).sd;
-			sd->groups = sg;
-		}
-		sg->__cpu_power = 0;
-		cpumask_copy(sched_group_cpus(sg), nodemask);
-		sg->next = sg;
-		cpumask_or(covered, covered, nodemask);
-		prev = sg;
+		sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
+		sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
+		sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
+		sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
+	}
 
-		for (j = 0; j < nr_node_ids; j++) {
-			int n = (i + j) % nr_node_ids;
+	for_each_cpu(i, cpu_map) {
+		build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
+		build_sched_groups(&d, SD_LV_MC, cpu_map, i);
+	}
 
-			cpumask_complement(notcovered, covered);
-			cpumask_and(tmpmask, notcovered, cpu_map);
-			cpumask_and(tmpmask, tmpmask, domainspan);
-			if (cpumask_empty(tmpmask))
-				break;
+	/* Set up physical groups */
+	for (i = 0; i < nr_node_ids; i++)
+		build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
 
-			cpumask_and(tmpmask, tmpmask, cpumask_of_node(n));
-			if (cpumask_empty(tmpmask))
-				continue;
+#ifdef CONFIG_NUMA
+	/* Set up node groups */
+	if (d.sd_allnodes)
+		build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
 
-			sg = kmalloc_node(sizeof(struct sched_group) +
-					  cpumask_size(),
-					  GFP_KERNEL, i);
-			if (!sg) {
-				printk(KERN_WARNING
-				       "Can not alloc domain group for node %d\n", j);
-				goto error;
-			}
-			sg->__cpu_power = 0;
-			cpumask_copy(sched_group_cpus(sg), tmpmask);
-			sg->next = prev->next;
-			cpumask_or(covered, covered, tmpmask);
-			prev->next = sg;
-			prev = sg;
-		}
-	}
+	for (i = 0; i < nr_node_ids; i++)
+		if (build_numa_sched_groups(&d, cpu_map, i))
+			goto error;
 #endif
 
 	/* Calculate CPU power for physical packages and nodes */
 #ifdef CONFIG_SCHED_SMT
 	for_each_cpu(i, cpu_map) {
-		struct sched_domain *sd = &per_cpu(cpu_domains, i).sd;
-
+		sd = &per_cpu(cpu_domains, i).sd;
 		init_sched_groups_power(i, sd);
 	}
 #endif
 #ifdef CONFIG_SCHED_MC
 	for_each_cpu(i, cpu_map) {
-		struct sched_domain *sd = &per_cpu(core_domains, i).sd;
-
+		sd = &per_cpu(core_domains, i).sd;
 		init_sched_groups_power(i, sd);
 	}
 #endif
 
 	for_each_cpu(i, cpu_map) {
-		struct sched_domain *sd = &per_cpu(phys_domains, i).sd;
-
+		sd = &per_cpu(phys_domains, i).sd;
 		init_sched_groups_power(i, sd);
 	}
 
 #ifdef CONFIG_NUMA
 	for (i = 0; i < nr_node_ids; i++)
-		init_numa_sched_groups_power(sched_group_nodes[i]);
+		init_numa_sched_groups_power(d.sched_group_nodes[i]);
 
-	if (sd_allnodes) {
+	if (d.sd_allnodes) {
 		struct sched_group *sg;
 
 		cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
-				       tmpmask);
+				       d.tmpmask);
 		init_numa_sched_groups_power(sg);
 	}
 #endif
 
 	/* Attach the domains */
 	for_each_cpu(i, cpu_map) {
-		struct sched_domain *sd;
 #ifdef CONFIG_SCHED_SMT
 		sd = &per_cpu(cpu_domains, i).sd;
 #elif defined(CONFIG_SCHED_MC)
@@ -8659,44 +8862,16 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 #else
 		sd = &per_cpu(phys_domains, i).sd;
 #endif
-		cpu_attach_domain(sd, rd, i);
+		cpu_attach_domain(sd, d.rd, i);
 	}
 
-	err = 0;
-
-free_tmpmask:
-	free_cpumask_var(tmpmask);
-free_send_covered:
-	free_cpumask_var(send_covered);
-free_this_core_map:
-	free_cpumask_var(this_core_map);
-free_this_sibling_map:
-	free_cpumask_var(this_sibling_map);
-free_nodemask:
-	free_cpumask_var(nodemask);
-free_notcovered:
-#ifdef CONFIG_NUMA
-	free_cpumask_var(notcovered);
-free_covered:
-	free_cpumask_var(covered);
-free_domainspan:
-	free_cpumask_var(domainspan);
-out:
-#endif
-	return err;
-
-free_sched_groups:
-#ifdef CONFIG_NUMA
-	kfree(sched_group_nodes);
-#endif
-	goto free_tmpmask;
+	d.sched_group_nodes = NULL; /* don't free this we still need it */
+	__free_domain_allocs(&d, sa_tmpmask, cpu_map);
+	return 0;
 
-#ifdef CONFIG_NUMA
 error:
-	free_sched_groups(cpu_map, tmpmask);
-	free_rootdomain(rd);
-	goto free_tmpmask;
-#endif
+	__free_domain_allocs(&d, alloc_state, cpu_map);
+	return -ENOMEM;
 }
 
 static int build_sched_domains(const struct cpumask *cpu_map)
@@ -9304,11 +9479,11 @@ void __init sched_init(void)
 		 * system cpu resource, based on the weight assigned to root
 		 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
 		 * by letting tasks of init_task_group sit in a separate cfs_rq
-		 * (init_cfs_rq) and having one entity represent this group of
+		 * (init_tg_cfs_rq) and having one entity represent this group of
 		 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
 		 */
 		init_tg_cfs_entry(&init_task_group,
				&per_cpu(init_cfs_rq, i),
-				&per_cpu(init_cfs_rq, i),
+				&per_cpu(init_tg_cfs_rq, i),
 				&per_cpu(init_sched_entity, i), i, 1,
 				root_task_group.se[i]);
 
@@ -9334,6 +9509,7 @@ void __init sched_init(void)
 #ifdef CONFIG_SMP
 		rq->sd = NULL;
 		rq->rd = NULL;
+		rq->post_schedule = 0;
 		rq->active_balance = 0;
 		rq->next_balance = jiffies;
 		rq->push_cpu = 0;
@@ -9398,13 +9574,20 @@ void __init sched_init(void)
 }
 
 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
-void __might_sleep(char *file, int line)
+static inline int preempt_count_equals(int preempt_offset)
+{
+	int nested = preempt_count() & ~PREEMPT_ACTIVE;
+
+	return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
+}
+
+void __might_sleep(char *file, int line, int preempt_offset)
 {
 #ifdef in_atomic
 	static unsigned long prev_jiffy;	/* ratelimiting */
 
-	if ((!in_atomic() && !irqs_disabled()) ||
+	if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
 	    system_state != SYSTEM_RUNNING || oops_in_progress)
 		return;
 	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
 		return;