-rw-r--r--  kernel/sched.c      | 51
-rw-r--r--  kernel/sched_fair.c | 51
-rw-r--r--  kernel/sched_rt.c   | 15
3 files changed, 103 insertions, 14 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 62830eaec52f..1b7399dfa361 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -273,6 +273,7 @@ struct task_group {
 	struct list_head list;
 };
 
+#ifdef CONFIG_USER_SCHED
 #ifdef CONFIG_FAIR_GROUP_SCHED
 /* Default task group's sched entity on each cpu */
 static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
@@ -284,6 +285,7 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
 static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
 static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
 #endif
+#endif
 
 /* task_group_lock serializes add/remove of task groups and also changes to
  * a task group's cpu shares.
@@ -7447,6 +7449,10 @@ static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg,
 	list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
 
 	tg->se[cpu] = se;
+	/* se could be NULL for init_task_group */
+	if (!se)
+		return;
+
 	se->cfs_rq = &rq->cfs;
 	se->my_q = cfs_rq;
 	se->load.weight = tg->shares;
@@ -7469,6 +7475,9 @@ static void init_tg_rt_entry(struct rq *rq, struct task_group *tg,
 	list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
 
 	tg->rt_se[cpu] = rt_se;
+	if (!rt_se)
+		return;
+
 	rt_se->rt_rq = &rq->rt;
 	rt_se->my_q = rt_rq;
 	rt_se->parent = NULL;
@@ -7539,18 +7548,56 @@ void __init sched_init(void)
 #ifdef CONFIG_FAIR_GROUP_SCHED
 		init_task_group.shares = init_task_group_load;
 		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
+#ifdef CONFIG_CGROUP_SCHED
+		/*
+		 * How much cpu bandwidth does init_task_group get?
+		 *
+		 * In case of task-groups formed thr' the cgroup filesystem, it
+		 * gets 100% of the cpu resources in the system. This overall
+		 * system cpu resource is divided among the tasks of
+		 * init_task_group and its child task-groups in a fair manner,
+		 * based on each entity's (task or task-group's) weight
+		 * (se->load.weight).
+		 *
+		 * In other words, if init_task_group has 10 tasks of weight
+		 * 1024) and two child groups A0 and A1 (of weight 1024 each),
+		 * then A0's share of the cpu resource is:
+		 *
+		 *	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
+		 *
+		 * We achieve this by letting init_task_group's tasks sit
+		 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
+		 */
+		init_tg_cfs_entry(rq, &init_task_group, &rq->cfs, NULL, i, 1);
+#elif defined CONFIG_USER_SCHED
+		/*
+		 * In case of task-groups formed thr' the user id of tasks,
+		 * init_task_group represents tasks belonging to root user.
+		 * Hence it forms a sibling of all subsequent groups formed.
+		 * In this case, init_task_group gets only a fraction of overall
+		 * system cpu resource, based on the weight assigned to root
+		 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
+		 * by letting tasks of init_task_group sit in a separate cfs_rq
+		 * (init_cfs_rq) and having one entity represent this group of
+		 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
+		 */
 		init_tg_cfs_entry(rq, &init_task_group,
 				&per_cpu(init_cfs_rq, i),
 				&per_cpu(init_sched_entity, i), i, 1);
 
 #endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+		rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
 #ifdef CONFIG_RT_GROUP_SCHED
 		INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
+#ifdef CONFIG_CGROUP_SCHED
+		init_tg_rt_entry(rq, &init_task_group, &rq->rt, NULL, i, 1);
+#elif defined CONFIG_USER_SCHED
 		init_tg_rt_entry(rq, &init_task_group,
 				&per_cpu(init_rt_rq, i),
 				&per_cpu(init_sched_rt_entity, i), i, 1);
-#else
-		rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
 #endif
+#endif
 
 		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
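The comment block added to sched_init() above works through the bandwidth arithmetic for init_task_group under CONFIG_CGROUP_SCHED. As a quick illustration, here is a stand-alone user-space sketch of that arithmetic (ordinary C, not kernel code; the task count and weights are the example values from the comment): with 10 nice-0 tasks sitting directly in rq->cfs and two child groups A0 and A1 of weight 1024 each, A0 receives 1024 / (10*1024 + 1024 + 1024), roughly 8.33%, of the cpu.

/*
 * Stand-alone illustration (not kernel code) of the share arithmetic in the
 * comment above: with CONFIG_CGROUP_SCHED, init_task_group's tasks sit
 * directly in rq->cfs, so they compete with child groups purely by weight.
 */
#include <stdio.h>

int main(void)
{
	const double nice0_weight = 1024.0;	/* weight of a nice-0 task */
	const int ntasks = 10;			/* tasks directly in init_task_group */
	const double a0 = nice0_weight;		/* child group A0's weight */
	const double a1 = nice0_weight;		/* child group A1's weight */

	double total = ntasks * nice0_weight + a0 + a1;

	/* Matches the example: 1024 / (10*1024 + 1024 + 1024) ~= 8.33% */
	printf("A0's bandwidth = %.2f%%\n", 100.0 * a0 / total);
	return 0;
}

The same arithmetic is why, under CONFIG_USER_SCHED, init_task_group instead gets its own entity in rq->cfs: the root user is then capped at a fraction determined by INIT_TASK_GROUP_LOAD rather than competing task-by-task.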
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 022e036f2c3e..3dde0f0ec93a 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1133,6 +1133,17 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
 	return 0;
 }
 
+/* return depth at which a sched entity is present in the hierarchy */
+static inline int depth_se(struct sched_entity *se)
+{
+	int depth = 0;
+
+	for_each_sched_entity(se)
+		depth++;
+
+	return depth;
+}
+
 /*
  * Preempt the current task with a newly woken task if needed:
  */
@@ -1141,6 +1152,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 	struct task_struct *curr = rq->curr;
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 	struct sched_entity *se = &curr->se, *pse = &p->se;
+	int se_depth, pse_depth;
 
 	if (unlikely(rt_prio(p->prio))) {
 		update_rq_clock(rq);
@@ -1165,6 +1177,27 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 	if (!sched_feat(WAKEUP_PREEMPT))
 		return;
 
+	/*
+	 * preemption test can be made between sibling entities who are in the
+	 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
+	 * both tasks until we find their ancestors who are siblings of common
+	 * parent.
+	 */
+
+	/* First walk up until both entities are at same depth */
+	se_depth = depth_se(se);
+	pse_depth = depth_se(pse);
+
+	while (se_depth > pse_depth) {
+		se_depth--;
+		se = parent_entity(se);
+	}
+
+	while (pse_depth > se_depth) {
+		pse_depth--;
+		pse = parent_entity(pse);
+	}
+
 	while (!is_same_group(se, pse)) {
 		se = parent_entity(se);
 		pse = parent_entity(pse);
@@ -1223,13 +1256,22 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
 static struct task_struct *
 __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr)
 {
-	struct task_struct *p;
+	struct task_struct *p = NULL;
+	struct sched_entity *se;
 
 	if (!curr)
 		return NULL;
 
-	p = rb_entry(curr, struct task_struct, se.run_node);
-	cfs_rq->rb_load_balance_curr = rb_next(curr);
+	/* Skip over entities that are not tasks */
+	do {
+		se = rb_entry(curr, struct sched_entity, run_node);
+		curr = rb_next(curr);
+	} while (curr && !entity_is_task(se));
+
+	cfs_rq->rb_load_balance_curr = curr;
+
+	if (entity_is_task(se))
+		p = task_of(se);
 
 	return p;
 }
@@ -1489,9 +1531,6 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
 {
 	struct cfs_rq *cfs_rq;
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-	print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs);
-#endif
 	rcu_read_lock();
 	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
 		print_cfs_rq(m, cpu, cfs_rq);
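Two of the hunks above rely on walking the sched-entity parent chain: depth_se() measures an entity's depth, and check_preempt_wakeup() first brings both entities to the same depth, then steps them up in lock-step until they are siblings under a common parent and can be compared in the same cfs_rq. The sketch below shows that walk on a plain parent-pointer tree; the types and names (struct node, depth(), equalize()) are hypothetical, and it assumes both chains meet at a common root, so treat it as an illustration of the technique rather than the kernel implementation.

/*
 * Generic user-space sketch (hypothetical types, not the kernel's) of the
 * ancestor walk used by check_preempt_wakeup(): equalize depths, then walk
 * both entities up together until they share a parent.
 */
#include <stdio.h>
#include <stddef.h>

struct node {
	struct node *parent;
	const char *name;
};

static int depth(const struct node *n)
{
	int d = 0;

	for (; n; n = n->parent)
		d++;
	return d;
}

/*
 * Rewrite *a and *b to the ancestors that are siblings under a common
 * parent. Assumes both chains reach a common root.
 */
static void equalize(struct node **a, struct node **b)
{
	int da = depth(*a), db = depth(*b);

	while (da > db) {
		*a = (*a)->parent;
		da--;
	}
	while (db > da) {
		*b = (*b)->parent;
		db--;
	}
	while ((*a)->parent != (*b)->parent) {
		*a = (*a)->parent;
		*b = (*b)->parent;
	}
}

int main(void)
{
	struct node root = { NULL, "root" };
	struct node g1 = { &root, "g1" }, g2 = { &root, "g2" };
	struct node t1 = { &g1, "t1" }, deep = { &g2, "deep" }, t2 = { &deep, "t2" };
	struct node *a = &t1, *b = &t2;

	equalize(&a, &b);
	printf("comparable ancestors: %s vs %s\n", a->name, b->name); /* g1 vs g2 */
	return 0;
}

Running it prints "comparable ancestors: g1 vs g2", the two group-level ancestors that share root as their parent, which is exactly the point at which the kernel's wakeup_preempt_entity() comparison becomes meaningful.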
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 8ff824565e06..201a69382a42 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -374,11 +374,15 @@ static void update_curr_rt(struct rq *rq)
 	curr->se.exec_start = rq->clock;
 	cpuacct_charge(curr, delta_exec);
 
-	spin_lock(&rt_rq->rt_runtime_lock);
-	rt_rq->rt_time += delta_exec;
-	if (sched_rt_runtime_exceeded(rt_rq))
-		resched_task(curr);
-	spin_unlock(&rt_rq->rt_runtime_lock);
+	for_each_sched_rt_entity(rt_se) {
+		rt_rq = rt_rq_of_se(rt_se);
+
+		spin_lock(&rt_rq->rt_runtime_lock);
+		rt_rq->rt_time += delta_exec;
+		if (sched_rt_runtime_exceeded(rt_rq))
+			resched_task(curr);
+		spin_unlock(&rt_rq->rt_runtime_lock);
+	}
 }
 
 static inline
@@ -477,7 +481,6 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
  * entries, we must remove entries top - down.
  *
  * XXX: O(1/2 h^2) because we can only walk up, not down the chain.
- *       doesn't matter much for now, as h=2 for GROUP_SCHED.
 */
 static void dequeue_rt_stack(struct task_struct *p)
 {
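The update_curr_rt() change above charges the elapsed exec time to every rt_rq along the task's group hierarchy, so a group with a small rt_runtime budget can throttle its tasks even while the top level still has headroom. A minimal user-space sketch of that idea (hypothetical struct and field names, no locking or per-period refresh) is:

/*
 * User-space sketch (hypothetical types) of charging exec time at every
 * level of an rt group hierarchy and checking each level's own budget.
 */
#include <stdio.h>
#include <stdbool.h>

struct fake_rt_rq {
	struct fake_rt_rq *parent;
	unsigned long long rt_time;	/* time consumed in this period (us) */
	unsigned long long rt_runtime;	/* budget for this period (us) */
};

/* Charge delta_exec at every level; return true if any level exceeds its budget. */
static bool charge_hierarchy(struct fake_rt_rq *rt_rq, unsigned long long delta_exec)
{
	bool resched = false;

	for (; rt_rq; rt_rq = rt_rq->parent) {
		rt_rq->rt_time += delta_exec;
		if (rt_rq->rt_time > rt_rq->rt_runtime)
			resched = true;
	}
	return resched;
}

int main(void)
{
	struct fake_rt_rq top = { NULL, 0, 950000 };	/* echoes the 950000us default */
	struct fake_rt_rq group = { &top, 0, 300000 };	/* smaller child budget */

	/* The child group's budget trips first, even though the top level is fine. */
	printf("resched: %d\n", charge_hierarchy(&group, 400000));
	return 0;
}

Here a 400000us charge exceeds the child group's 300000us budget but not the top-level 950000us one, so the walk still reports that a reschedule is needed, mirroring how sched_rt_runtime_exceeded() is now consulted per level.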