aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDhaval Giani <dhaval@linux.vnet.ibm.com>2008-04-19 13:44:59 -0400
committerIngo Molnar <mingo@elte.hu>2008-04-19 13:44:59 -0400
commit354d60c2ff72d86627dfe2089d186824abf4bb8e (patch)
tree10cea61ce7036448ed7246820c5575df2a61bb3b
parentea736ed5d353d7a3aa1cf8ce4cf8d947bc353fb2 (diff)
sched: mix tasks and groups
This patch allows tasks and groups to exist in the same cfs_rq. With this change the CFS group scheduling follows a 1/(M+N) model from a 1/(1+N) fairness model where M tasks and N groups exist at the cfs_rq level. [a.p.zijlstra@chello.nl: rt bits and assorted fixes] Signed-off-by: Dhaval Giani <dhaval@linux.vnet.ibm.com> Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--kernel/sched.c51
-rw-r--r--kernel/sched_fair.c51
-rw-r--r--kernel/sched_rt.c15
3 files changed, 103 insertions, 14 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 62830eaec52f..1b7399dfa361 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -273,6 +273,7 @@ struct task_group {
273 struct list_head list; 273 struct list_head list;
274}; 274};
275 275
276#ifdef CONFIG_USER_SCHED
276#ifdef CONFIG_FAIR_GROUP_SCHED 277#ifdef CONFIG_FAIR_GROUP_SCHED
277/* Default task group's sched entity on each cpu */ 278/* Default task group's sched entity on each cpu */
278static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 279static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
@@ -284,6 +285,7 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
284static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 285static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
285static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 286static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
286#endif 287#endif
288#endif
287 289
288/* task_group_lock serializes add/remove of task groups and also changes to 290/* task_group_lock serializes add/remove of task groups and also changes to
289 * a task group's cpu shares. 291 * a task group's cpu shares.
@@ -7447,6 +7449,10 @@ static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg,
7447 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); 7449 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7448 7450
7449 tg->se[cpu] = se; 7451 tg->se[cpu] = se;
7452 /* se could be NULL for init_task_group */
7453 if (!se)
7454 return;
7455
7450 se->cfs_rq = &rq->cfs; 7456 se->cfs_rq = &rq->cfs;
7451 se->my_q = cfs_rq; 7457 se->my_q = cfs_rq;
7452 se->load.weight = tg->shares; 7458 se->load.weight = tg->shares;
@@ -7469,6 +7475,9 @@ static void init_tg_rt_entry(struct rq *rq, struct task_group *tg,
7469 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); 7475 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7470 7476
7471 tg->rt_se[cpu] = rt_se; 7477 tg->rt_se[cpu] = rt_se;
7478 if (!rt_se)
7479 return;
7480
7472 rt_se->rt_rq = &rq->rt; 7481 rt_se->rt_rq = &rq->rt;
7473 rt_se->my_q = rt_rq; 7482 rt_se->my_q = rt_rq;
7474 rt_se->parent = NULL; 7483 rt_se->parent = NULL;
@@ -7539,18 +7548,56 @@ void __init sched_init(void)
7539#ifdef CONFIG_FAIR_GROUP_SCHED 7548#ifdef CONFIG_FAIR_GROUP_SCHED
7540 init_task_group.shares = init_task_group_load; 7549 init_task_group.shares = init_task_group_load;
7541 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 7550 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7551#ifdef CONFIG_CGROUP_SCHED
7552 /*
7553 * How much cpu bandwidth does init_task_group get?
7554 *
7555 * In case of task-groups formed thr' the cgroup filesystem, it
7556 * gets 100% of the cpu resources in the system. This overall
7557 * system cpu resource is divided among the tasks of
7558 * init_task_group and its child task-groups in a fair manner,
7559 * based on each entity's (task or task-group's) weight
7560 * (se->load.weight).
7561 *
7562 * In other words, if init_task_group has 10 tasks of weight
7563 * 1024) and two child groups A0 and A1 (of weight 1024 each),
7564 * then A0's share of the cpu resource is:
7565 *
7566 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
7567 *
7568 * We achieve this by letting init_task_group's tasks sit
7569 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
7570 */
7571 init_tg_cfs_entry(rq, &init_task_group, &rq->cfs, NULL, i, 1);
7572#elif defined CONFIG_USER_SCHED
7573 /*
7574 * In case of task-groups formed thr' the user id of tasks,
7575 * init_task_group represents tasks belonging to root user.
7576 * Hence it forms a sibling of all subsequent groups formed.
7577 * In this case, init_task_group gets only a fraction of overall
7578 * system cpu resource, based on the weight assigned to root
7579 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
7580 * by letting tasks of init_task_group sit in a separate cfs_rq
7581 * (init_cfs_rq) and having one entity represent this group of
7582 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
7583 */
7542 init_tg_cfs_entry(rq, &init_task_group, 7584 init_tg_cfs_entry(rq, &init_task_group,
7543 &per_cpu(init_cfs_rq, i), 7585 &per_cpu(init_cfs_rq, i),
7544 &per_cpu(init_sched_entity, i), i, 1); 7586 &per_cpu(init_sched_entity, i), i, 1);
7545 7587
7546#endif 7588#endif
7589#endif /* CONFIG_FAIR_GROUP_SCHED */
7590
7591 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
7547#ifdef CONFIG_RT_GROUP_SCHED 7592#ifdef CONFIG_RT_GROUP_SCHED
7548 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7593 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
7594#ifdef CONFIG_CGROUP_SCHED
7595 init_tg_rt_entry(rq, &init_task_group, &rq->rt, NULL, i, 1);
7596#elif defined CONFIG_USER_SCHED
7549 init_tg_rt_entry(rq, &init_task_group, 7597 init_tg_rt_entry(rq, &init_task_group,
7550 &per_cpu(init_rt_rq, i), 7598 &per_cpu(init_rt_rq, i),
7551 &per_cpu(init_sched_rt_entity, i), i, 1); 7599 &per_cpu(init_sched_rt_entity, i), i, 1);
7552#else 7600#endif
7553 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
7554#endif 7601#endif
7555 7602
7556 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7603 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 022e036f2c3e..3dde0f0ec93a 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1133,6 +1133,17 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
1133 return 0; 1133 return 0;
1134} 1134}
1135 1135
1136/* return depth at which a sched entity is present in the hierarchy */
1137static inline int depth_se(struct sched_entity *se)
1138{
1139 int depth = 0;
1140
1141 for_each_sched_entity(se)
1142 depth++;
1143
1144 return depth;
1145}
1146
1136/* 1147/*
1137 * Preempt the current task with a newly woken task if needed: 1148 * Preempt the current task with a newly woken task if needed:
1138 */ 1149 */
@@ -1141,6 +1152,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
1141 struct task_struct *curr = rq->curr; 1152 struct task_struct *curr = rq->curr;
1142 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1153 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1143 struct sched_entity *se = &curr->se, *pse = &p->se; 1154 struct sched_entity *se = &curr->se, *pse = &p->se;
1155 int se_depth, pse_depth;
1144 1156
1145 if (unlikely(rt_prio(p->prio))) { 1157 if (unlikely(rt_prio(p->prio))) {
1146 update_rq_clock(rq); 1158 update_rq_clock(rq);
@@ -1165,6 +1177,27 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
1165 if (!sched_feat(WAKEUP_PREEMPT)) 1177 if (!sched_feat(WAKEUP_PREEMPT))
1166 return; 1178 return;
1167 1179
1180 /*
1181 * preemption test can be made between sibling entities who are in the
1182 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
1183 * both tasks until we find their ancestors who are siblings of common
1184 * parent.
1185 */
1186
1187 /* First walk up until both entities are at same depth */
1188 se_depth = depth_se(se);
1189 pse_depth = depth_se(pse);
1190
1191 while (se_depth > pse_depth) {
1192 se_depth--;
1193 se = parent_entity(se);
1194 }
1195
1196 while (pse_depth > se_depth) {
1197 pse_depth--;
1198 pse = parent_entity(pse);
1199 }
1200
1168 while (!is_same_group(se, pse)) { 1201 while (!is_same_group(se, pse)) {
1169 se = parent_entity(se); 1202 se = parent_entity(se);
1170 pse = parent_entity(pse); 1203 pse = parent_entity(pse);
@@ -1223,13 +1256,22 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1223static struct task_struct * 1256static struct task_struct *
1224__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) 1257__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr)
1225{ 1258{
1226 struct task_struct *p; 1259 struct task_struct *p = NULL;
1260 struct sched_entity *se;
1227 1261
1228 if (!curr) 1262 if (!curr)
1229 return NULL; 1263 return NULL;
1230 1264
1231 p = rb_entry(curr, struct task_struct, se.run_node); 1265 /* Skip over entities that are not tasks */
1232 cfs_rq->rb_load_balance_curr = rb_next(curr); 1266 do {
1267 se = rb_entry(curr, struct sched_entity, run_node);
1268 curr = rb_next(curr);
1269 } while (curr && !entity_is_task(se));
1270
1271 cfs_rq->rb_load_balance_curr = curr;
1272
1273 if (entity_is_task(se))
1274 p = task_of(se);
1233 1275
1234 return p; 1276 return p;
1235} 1277}
@@ -1489,9 +1531,6 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
1489{ 1531{
1490 struct cfs_rq *cfs_rq; 1532 struct cfs_rq *cfs_rq;
1491 1533
1492#ifdef CONFIG_FAIR_GROUP_SCHED
1493 print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs);
1494#endif
1495 rcu_read_lock(); 1534 rcu_read_lock();
1496 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) 1535 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
1497 print_cfs_rq(m, cpu, cfs_rq); 1536 print_cfs_rq(m, cpu, cfs_rq);
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 8ff824565e06..201a69382a42 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -374,11 +374,15 @@ static void update_curr_rt(struct rq *rq)
374 curr->se.exec_start = rq->clock; 374 curr->se.exec_start = rq->clock;
375 cpuacct_charge(curr, delta_exec); 375 cpuacct_charge(curr, delta_exec);
376 376
377 spin_lock(&rt_rq->rt_runtime_lock); 377 for_each_sched_rt_entity(rt_se) {
378 rt_rq->rt_time += delta_exec; 378 rt_rq = rt_rq_of_se(rt_se);
379 if (sched_rt_runtime_exceeded(rt_rq)) 379
380 resched_task(curr); 380 spin_lock(&rt_rq->rt_runtime_lock);
381 spin_unlock(&rt_rq->rt_runtime_lock); 381 rt_rq->rt_time += delta_exec;
382 if (sched_rt_runtime_exceeded(rt_rq))
383 resched_task(curr);
384 spin_unlock(&rt_rq->rt_runtime_lock);
385 }
382} 386}
383 387
384static inline 388static inline
@@ -477,7 +481,6 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
477 * entries, we must remove entries top - down. 481 * entries, we must remove entries top - down.
478 * 482 *
479 * XXX: O(1/2 h^2) because we can only walk up, not down the chain. 483 * XXX: O(1/2 h^2) because we can only walk up, not down the chain.
480 * doesn't matter much for now, as h=2 for GROUP_SCHED.
481 */ 484 */
482static void dequeue_rt_stack(struct task_struct *p) 485static void dequeue_rt_stack(struct task_struct *p)
483{ 486{