Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	231
1 files changed, 101 insertions, 130 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index ad10d0aae1d7..338340a3fb89 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -397,9 +397,9 @@ struct cfs_rq {
	 * 'curr' points to currently running entity on this cfs_rq.
	 * It is set to NULL otherwise (i.e when none are currently running).
	 */
-	struct sched_entity *curr, *next;
+	struct sched_entity *curr, *next, *last;
 
-	unsigned long nr_spread_over;
+	unsigned int nr_spread_over;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */
@@ -703,45 +703,18 @@ static __read_mostly char *sched_feat_names[] = {
 
 #undef SCHED_FEAT
 
-static int sched_feat_open(struct inode *inode, struct file *filp)
-{
-	filp->private_data = inode->i_private;
-	return 0;
-}
-
-static ssize_t
-sched_feat_read(struct file *filp, char __user *ubuf,
-		size_t cnt, loff_t *ppos)
+static int sched_feat_show(struct seq_file *m, void *v)
 {
-	char *buf;
-	int r = 0;
-	int len = 0;
	int i;
 
	for (i = 0; sched_feat_names[i]; i++) {
-		len += strlen(sched_feat_names[i]);
-		len += 4;
+		if (!(sysctl_sched_features & (1UL << i)))
+			seq_puts(m, "NO_");
+		seq_printf(m, "%s ", sched_feat_names[i]);
	}
+	seq_puts(m, "\n");
 
-	buf = kmalloc(len + 2, GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
-
-	for (i = 0; sched_feat_names[i]; i++) {
-		if (sysctl_sched_features & (1UL << i))
-			r += sprintf(buf + r, "%s ", sched_feat_names[i]);
-		else
-			r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
-	}
-
-	r += sprintf(buf + r, "\n");
-	WARN_ON(r >= len + 2);
-
-	r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-
-	kfree(buf);
-
-	return r;
+	return 0;
 }
 
 static ssize_t
@@ -786,10 +759,17 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
	return cnt;
 }
 
+static int sched_feat_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, sched_feat_show, NULL);
+}
+
 static struct file_operations sched_feat_fops = {
	.open		= sched_feat_open,
-	.read		= sched_feat_read,
-	.write		= sched_feat_write,
+	.write		= sched_feat_write,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
 };
 
 static __init int sched_init_debug(void)
@@ -969,6 +949,14 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
	}
 }
 
+void task_rq_unlock_wait(struct task_struct *p)
+{
+	struct rq *rq = task_rq(p);
+
+	smp_mb(); /* spin-unlock-wait is not a full memory barrier */
+	spin_unlock_wait(&rq->lock);
+}
+
 static void __task_rq_unlock(struct rq *rq)
	__releases(rq->lock)
 {
@@ -1448,6 +1436,8 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 
	if (rq->nr_running)
		rq->avg_load_per_task = rq->load.weight / rq->nr_running;
+	else
+		rq->avg_load_per_task = 0;
 
	return rq->avg_load_per_task;
 }
@@ -1463,27 +1453,13 @@ static void
 update_group_shares_cpu(struct task_group *tg, int cpu,
			unsigned long sd_shares, unsigned long sd_rq_weight)
 {
-	int boost = 0;
	unsigned long shares;
	unsigned long rq_weight;
 
	if (!tg->se[cpu])
		return;
 
-	rq_weight = tg->cfs_rq[cpu]->load.weight;
-
-	/*
-	 * If there are currently no tasks on the cpu pretend there is one of
-	 * average load so that when a new task gets to run here it will not
-	 * get delayed by group starvation.
-	 */
-	if (!rq_weight) {
-		boost = 1;
-		rq_weight = NICE_0_LOAD;
-	}
-
-	if (unlikely(rq_weight > sd_rq_weight))
-		rq_weight = sd_rq_weight;
+	rq_weight = tg->cfs_rq[cpu]->rq_weight;
 
	/*
	 *		\Sum shares * rq_weight
@@ -1491,7 +1467,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
	 *		\Sum rq_weight
	 *
	 */
-	shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+	shares = (sd_shares * rq_weight) / sd_rq_weight;
	shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
 
	if (abs(shares - tg->se[cpu]->load.weight) >
@@ -1500,11 +1476,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
		unsigned long flags;
 
		spin_lock_irqsave(&rq->lock, flags);
-		/*
-		 * record the actual number of shares, not the boosted amount.
-		 */
-		tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-		tg->cfs_rq[cpu]->rq_weight = rq_weight;
+		tg->cfs_rq[cpu]->shares = shares;
 
		__set_se_shares(tg->se[cpu], shares);
		spin_unlock_irqrestore(&rq->lock, flags);
@@ -1518,13 +1490,23 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
  */
 static int tg_shares_up(struct task_group *tg, void *data)
 {
-	unsigned long rq_weight = 0;
+	unsigned long weight, rq_weight = 0;
	unsigned long shares = 0;
	struct sched_domain *sd = data;
	int i;
 
	for_each_cpu_mask(i, sd->span) {
-		rq_weight += tg->cfs_rq[i]->load.weight;
+		/*
+		 * If there are currently no tasks on the cpu pretend there
+		 * is one of average load so that when a new task gets to
+		 * run here it will not get delayed by group starvation.
+		 */
+		weight = tg->cfs_rq[i]->load.weight;
+		if (!weight)
+			weight = NICE_0_LOAD;
+
+		tg->cfs_rq[i]->rq_weight = weight;
+		rq_weight += weight;
		shares += tg->cfs_rq[i]->shares;
	}
 
@@ -1534,9 +1516,6 @@ static int tg_shares_up(struct task_group *tg, void *data)
	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
		shares = tg->shares;
 
-	if (!rq_weight)
-		rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
-
	for_each_cpu_mask(i, sd->span)
		update_group_shares_cpu(tg, i, shares, rq_weight);
 
@@ -1805,7 +1784,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
	/*
	 * Buddy candidates are cache hot:
	 */
-	if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next))
+	if (sched_feat(CACHE_HOT_BUDDY) &&
+			(&p->se == cfs_rq_of(&p->se)->next ||
+			 &p->se == cfs_rq_of(&p->se)->last))
		return 1;
 
	if (p->sched_class != &fair_sched_class)
@@ -5858,6 +5839,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;
 
+	spin_lock_irqsave(&rq->lock, flags);
+
	__sched_fork(idle);
	idle->se.exec_start = sched_clock();
 
@@ -5865,7 +5848,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
	idle->cpus_allowed = cpumask_of_cpu(cpu);
	__set_task_cpu(idle, cpu);
 
-	spin_lock_irqsave(&rq->lock, flags);
	rq->curr = rq->idle = idle;
 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
	idle->oncpu = 1;
@@ -6112,7 +6094,6 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
 
 /*
  * Figure out where task on dead CPU should go, use force if necessary.
- * NOTE: interrupts should be disabled by the caller
  */
 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 {
@@ -6622,28 +6603,6 @@ early_initcall(migration_init);
 
 #ifdef CONFIG_SCHED_DEBUG
 
-static inline const char *sd_level_to_string(enum sched_domain_level lvl)
-{
-	switch (lvl) {
-	case SD_LV_NONE:
-			return "NONE";
-	case SD_LV_SIBLING:
-			return "SIBLING";
-	case SD_LV_MC:
-			return "MC";
-	case SD_LV_CPU:
-			return "CPU";
-	case SD_LV_NODE:
-			return "NODE";
-	case SD_LV_ALLNODES:
-			return "ALLNODES";
-	case SD_LV_MAX:
-			return "MAX";
-
-	}
-	return "MAX";
-}
-
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
				  cpumask_t *groupmask)
 {
@@ -6663,8 +6622,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
		return -1;
	}
 
-	printk(KERN_CONT "span %s level %s\n",
-		str, sd_level_to_string(sd->level));
+	printk(KERN_CONT "span %s level %s\n", str, sd->name);
 
	if (!cpu_isset(cpu, sd->span)) {
		printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -6875,15 +6833,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
	struct sched_domain *tmp;
 
	/* Remove the sched domains which do not contribute to scheduling. */
-	for (tmp = sd; tmp; tmp = tmp->parent) {
+	for (tmp = sd; tmp; ) {
		struct sched_domain *parent = tmp->parent;
		if (!parent)
			break;
+
		if (sd_parent_degenerate(tmp, parent)) {
			tmp->parent = parent->parent;
			if (parent->parent)
				parent->parent->child = tmp;
-		}
+		} else
+			tmp = tmp->parent;
	}
 
	if (sd && sd_degenerate(sd)) {
@@ -7318,13 +7278,21 @@ struct allmasks {
 };
 
 #if NR_CPUS > 128
-#define SCHED_CPUMASK_ALLOC		1
-#define SCHED_CPUMASK_FREE(v)		kfree(v)
-#define SCHED_CPUMASK_DECLARE(v)	struct allmasks *v
+#define SCHED_CPUMASK_DECLARE(v)	struct allmasks *v
+static inline void sched_cpumask_alloc(struct allmasks **masks)
+{
+	*masks = kmalloc(sizeof(**masks), GFP_KERNEL);
+}
+static inline void sched_cpumask_free(struct allmasks *masks)
+{
+	kfree(masks);
+}
 #else
-#define SCHED_CPUMASK_ALLOC		0
-#define SCHED_CPUMASK_FREE(v)
-#define SCHED_CPUMASK_DECLARE(v)	struct allmasks _v, *v = &_v
+#define SCHED_CPUMASK_DECLARE(v)	struct allmasks _v, *v = &_v
+static inline void sched_cpumask_alloc(struct allmasks **masks)
+{ }
+static inline void sched_cpumask_free(struct allmasks *masks)
+{ }
 #endif
 
 #define SCHED_CPUMASK_VAR(v, a)		cpumask_t *v = (cpumask_t *) \
@@ -7400,9 +7368,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
		return -ENOMEM;
	}
 
-#if SCHED_CPUMASK_ALLOC
	/* get space for all scratch cpumask variables */
-	allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
+	sched_cpumask_alloc(&allmasks);
	if (!allmasks) {
		printk(KERN_WARNING "Cannot alloc cpumask array\n");
		kfree(rd);
@@ -7411,7 +7378,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #endif
		return -ENOMEM;
	}
-#endif
+
	tmpmask = (cpumask_t *)allmasks;
 
 
@@ -7665,13 +7632,14 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
		cpu_attach_domain(sd, rd, i);
	}
 
-	SCHED_CPUMASK_FREE((void *)allmasks);
+	sched_cpumask_free(allmasks);
	return 0;
 
 #ifdef CONFIG_NUMA
 error:
	free_sched_groups(cpu_map, tmpmask);
-	SCHED_CPUMASK_FREE((void *)allmasks);
+	sched_cpumask_free(allmasks);
+	kfree(rd);
	return -ENOMEM;
 #endif
 }
@@ -7734,8 +7702,6 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
	cpumask_t tmpmask;
	int i;
 
-	unregister_sched_domain_sysctl();
-
	for_each_cpu_mask_nr(i, *cpu_map)
		cpu_attach_domain(NULL, &def_root_domain, i);
	synchronize_sched();
@@ -7773,13 +7739,14 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
  *
  * The passed in 'doms_new' should be kmalloc'd. This routine takes
  * ownership of it and will kfree it when done with it. If the caller
- * failed the kmalloc call, then it can pass in doms_new == NULL,
- * and partition_sched_domains() will fallback to the single partition
- * 'fallback_doms', it also forces the domains to be rebuilt.
+ * failed the kmalloc call, then it can pass in doms_new == NULL &&
+ * ndoms_new == 1, and partition_sched_domains() will fallback to
+ * the single partition 'fallback_doms', it also forces the domains
+ * to be rebuilt.
  *
- * If doms_new==NULL it will be replaced with cpu_online_map.
- * ndoms_new==0 is a special case for destroying existing domains.
- * It will not create the default domain.
+ * If doms_new == NULL it will be replaced with cpu_online_map.
+ * ndoms_new == 0 is a special case for destroying existing domains,
+ * and it will not create the default domain.
  *
  * Call with hotplug lock held
  */
@@ -7812,7 +7779,7 @@ match1:
		ndoms_cur = 0;
		doms_new = &fallback_doms;
		cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
-		dattr_new = NULL;
+		WARN_ON_ONCE(dattr_new);
	}
 
	/* Build new domains */
@@ -8472,7 +8439,7 @@ static
 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
	struct cfs_rq *cfs_rq;
-	struct sched_entity *se, *parent_se;
+	struct sched_entity *se;
	struct rq *rq;
	int i;
 
@@ -8488,18 +8455,17 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
	for_each_possible_cpu(i) {
		rq = cpu_rq(i);
 
-		cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
-				      GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
+				      GFP_KERNEL, cpu_to_node(i));
		if (!cfs_rq)
			goto err;
 
-		se = kmalloc_node(sizeof(struct sched_entity),
-				  GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+		se = kzalloc_node(sizeof(struct sched_entity),
+				  GFP_KERNEL, cpu_to_node(i));
		if (!se)
			goto err;
 
-		parent_se = parent ? parent->se[i] : NULL;
-		init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
+		init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
	}
 
	return 1;
@@ -8560,7 +8526,7 @@ static
 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 {
	struct rt_rq *rt_rq;
-	struct sched_rt_entity *rt_se, *parent_se;
+	struct sched_rt_entity *rt_se;
	struct rq *rq;
	int i;
 
@@ -8577,18 +8543,17 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
	for_each_possible_cpu(i) {
		rq = cpu_rq(i);
 
-		rt_rq = kmalloc_node(sizeof(struct rt_rq),
-				GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+		rt_rq = kzalloc_node(sizeof(struct rt_rq),
+				GFP_KERNEL, cpu_to_node(i));
		if (!rt_rq)
			goto err;
 
-		rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
-				GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+		rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
+				GFP_KERNEL, cpu_to_node(i));
		if (!rt_se)
			goto err;
 
-		parent_se = parent ? parent->rt_se[i] : NULL;
-		init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
+		init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
	}
 
	return 1;
@@ -9231,11 +9196,12 @@ struct cgroup_subsys cpu_cgroup_subsys = {
  * (balbir@in.ibm.com).
  */
 
-/* track cpu usage of a group of tasks */
+/* track cpu usage of a group of tasks and its child groups */
 struct cpuacct {
	struct cgroup_subsys_state css;
	/* cpuusage holds pointer to a u64-type object on every cpu */
	u64 *cpuusage;
+	struct cpuacct *parent;
 };
 
 struct cgroup_subsys cpuacct_subsys;
@@ -9269,6 +9235,9 @@ static struct cgroup_subsys_state *cpuacct_create(
		return ERR_PTR(-ENOMEM);
	}
 
+	if (cgrp->parent)
+		ca->parent = cgroup_ca(cgrp->parent);
+
	return &ca->css;
 }
 
@@ -9348,14 +9317,16 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
 static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 {
	struct cpuacct *ca;
+	int cpu;
 
	if (!cpuacct_subsys.active)
		return;
 
+	cpu = task_cpu(tsk);
	ca = task_ca(tsk);
-	if (ca) {
-		u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
 
+	for (; ca; ca = ca->parent) {
+		u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
		*cpuusage += cputime;
	}
 }