diff options
author | Dhaval Giani <dhaval@linux.vnet.ibm.com> | 2008-04-19 13:44:59 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2008-04-19 13:44:59 -0400 |
commit | 354d60c2ff72d86627dfe2089d186824abf4bb8e (patch) | |
tree | 10cea61ce7036448ed7246820c5575df2a61bb3b | |
parent | ea736ed5d353d7a3aa1cf8ce4cf8d947bc353fb2 (diff) |
sched: mix tasks and groups
This patch allows tasks and groups to exist in the same cfs_rq. With this
change the CFS group scheduling follows a 1/(M+N) model from a 1/(1+N)
fairness model where M tasks and N groups exist at the cfs_rq level.
[a.p.zijlstra@chello.nl: rt bits and assorted fixes]
Signed-off-by: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r-- | kernel/sched.c | 51 | ||||
-rw-r--r-- | kernel/sched_fair.c | 51 | ||||
-rw-r--r-- | kernel/sched_rt.c | 15 |
3 files changed, 103 insertions, 14 deletions
diff --git a/kernel/sched.c b/kernel/sched.c index 62830eaec52f..1b7399dfa361 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -273,6 +273,7 @@ struct task_group { | |||
273 | struct list_head list; | 273 | struct list_head list; |
274 | }; | 274 | }; |
275 | 275 | ||
276 | #ifdef CONFIG_USER_SCHED | ||
276 | #ifdef CONFIG_FAIR_GROUP_SCHED | 277 | #ifdef CONFIG_FAIR_GROUP_SCHED |
277 | /* Default task group's sched entity on each cpu */ | 278 | /* Default task group's sched entity on each cpu */ |
278 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | 279 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); |
@@ -284,6 +285,7 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | |||
284 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); | 285 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); |
285 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; | 286 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; |
286 | #endif | 287 | #endif |
288 | #endif | ||
287 | 289 | ||
288 | /* task_group_lock serializes add/remove of task groups and also changes to | 290 | /* task_group_lock serializes add/remove of task groups and also changes to |
289 | * a task group's cpu shares. | 291 | * a task group's cpu shares. |
@@ -7447,6 +7449,10 @@ static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, | |||
7447 | list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | 7449 | list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); |
7448 | 7450 | ||
7449 | tg->se[cpu] = se; | 7451 | tg->se[cpu] = se; |
7452 | /* se could be NULL for init_task_group */ | ||
7453 | if (!se) | ||
7454 | return; | ||
7455 | |||
7450 | se->cfs_rq = &rq->cfs; | 7456 | se->cfs_rq = &rq->cfs; |
7451 | se->my_q = cfs_rq; | 7457 | se->my_q = cfs_rq; |
7452 | se->load.weight = tg->shares; | 7458 | se->load.weight = tg->shares; |
@@ -7469,6 +7475,9 @@ static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, | |||
7469 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); | 7475 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); |
7470 | 7476 | ||
7471 | tg->rt_se[cpu] = rt_se; | 7477 | tg->rt_se[cpu] = rt_se; |
7478 | if (!rt_se) | ||
7479 | return; | ||
7480 | |||
7472 | rt_se->rt_rq = &rq->rt; | 7481 | rt_se->rt_rq = &rq->rt; |
7473 | rt_se->my_q = rt_rq; | 7482 | rt_se->my_q = rt_rq; |
7474 | rt_se->parent = NULL; | 7483 | rt_se->parent = NULL; |
@@ -7539,18 +7548,56 @@ void __init sched_init(void) | |||
7539 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7548 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7540 | init_task_group.shares = init_task_group_load; | 7549 | init_task_group.shares = init_task_group_load; |
7541 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 7550 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
7551 | #ifdef CONFIG_CGROUP_SCHED | ||
7552 | /* | ||
7553 | * How much cpu bandwidth does init_task_group get? | ||
7554 | * | ||
7555 | * In case of task-groups formed thr' the cgroup filesystem, it | ||
7556 | * gets 100% of the cpu resources in the system. This overall | ||
7557 | * system cpu resource is divided among the tasks of | ||
7558 | * init_task_group and its child task-groups in a fair manner, | ||
7559 | * based on each entity's (task or task-group's) weight | ||
7560 | * (se->load.weight). | ||
7561 | * | ||
7562 | * In other words, if init_task_group has 10 tasks of weight | ||
7563 | * 1024) and two child groups A0 and A1 (of weight 1024 each), | ||
7564 | * then A0's share of the cpu resource is: | ||
7565 | * | ||
7566 | * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% | ||
7567 | * | ||
7568 | * We achieve this by letting init_task_group's tasks sit | ||
7569 | * directly in rq->cfs (i.e init_task_group->se[] = NULL). | ||
7570 | */ | ||
7571 | init_tg_cfs_entry(rq, &init_task_group, &rq->cfs, NULL, i, 1); | ||
7572 | #elif defined CONFIG_USER_SCHED | ||
7573 | /* | ||
7574 | * In case of task-groups formed thr' the user id of tasks, | ||
7575 | * init_task_group represents tasks belonging to root user. | ||
7576 | * Hence it forms a sibling of all subsequent groups formed. | ||
7577 | * In this case, init_task_group gets only a fraction of overall | ||
7578 | * system cpu resource, based on the weight assigned to root | ||
7579 | * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished | ||
7580 | * by letting tasks of init_task_group sit in a separate cfs_rq | ||
7581 | * (init_cfs_rq) and having one entity represent this group of | ||
7582 | * tasks in rq->cfs (i.e init_task_group->se[] != NULL). | ||
7583 | */ | ||
7542 | init_tg_cfs_entry(rq, &init_task_group, | 7584 | init_tg_cfs_entry(rq, &init_task_group, |
7543 | &per_cpu(init_cfs_rq, i), | 7585 | &per_cpu(init_cfs_rq, i), |
7544 | &per_cpu(init_sched_entity, i), i, 1); | 7586 | &per_cpu(init_sched_entity, i), i, 1); |
7545 | 7587 | ||
7546 | #endif | 7588 | #endif |
7589 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
7590 | |||
7591 | rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; | ||
7547 | #ifdef CONFIG_RT_GROUP_SCHED | 7592 | #ifdef CONFIG_RT_GROUP_SCHED |
7548 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); | 7593 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); |
7594 | #ifdef CONFIG_CGROUP_SCHED | ||
7595 | init_tg_rt_entry(rq, &init_task_group, &rq->rt, NULL, i, 1); | ||
7596 | #elif defined CONFIG_USER_SCHED | ||
7549 | init_tg_rt_entry(rq, &init_task_group, | 7597 | init_tg_rt_entry(rq, &init_task_group, |
7550 | &per_cpu(init_rt_rq, i), | 7598 | &per_cpu(init_rt_rq, i), |
7551 | &per_cpu(init_sched_rt_entity, i), i, 1); | 7599 | &per_cpu(init_sched_rt_entity, i), i, 1); |
7552 | #else | 7600 | #endif |
7553 | rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; | ||
7554 | #endif | 7601 | #endif |
7555 | 7602 | ||
7556 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | 7603 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 022e036f2c3e..3dde0f0ec93a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -1133,6 +1133,17 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) | |||
1133 | return 0; | 1133 | return 0; |
1134 | } | 1134 | } |
1135 | 1135 | ||
1136 | /* return depth at which a sched entity is present in the hierarchy */ | ||
1137 | static inline int depth_se(struct sched_entity *se) | ||
1138 | { | ||
1139 | int depth = 0; | ||
1140 | |||
1141 | for_each_sched_entity(se) | ||
1142 | depth++; | ||
1143 | |||
1144 | return depth; | ||
1145 | } | ||
1146 | |||
1136 | /* | 1147 | /* |
1137 | * Preempt the current task with a newly woken task if needed: | 1148 | * Preempt the current task with a newly woken task if needed: |
1138 | */ | 1149 | */ |
@@ -1141,6 +1152,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | |||
1141 | struct task_struct *curr = rq->curr; | 1152 | struct task_struct *curr = rq->curr; |
1142 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | 1153 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); |
1143 | struct sched_entity *se = &curr->se, *pse = &p->se; | 1154 | struct sched_entity *se = &curr->se, *pse = &p->se; |
1155 | int se_depth, pse_depth; | ||
1144 | 1156 | ||
1145 | if (unlikely(rt_prio(p->prio))) { | 1157 | if (unlikely(rt_prio(p->prio))) { |
1146 | update_rq_clock(rq); | 1158 | update_rq_clock(rq); |
@@ -1165,6 +1177,27 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | |||
1165 | if (!sched_feat(WAKEUP_PREEMPT)) | 1177 | if (!sched_feat(WAKEUP_PREEMPT)) |
1166 | return; | 1178 | return; |
1167 | 1179 | ||
1180 | /* | ||
1181 | * preemption test can be made between sibling entities who are in the | ||
1182 | * same cfs_rq i.e who have a common parent. Walk up the hierarchy of | ||
1183 | * both tasks until we find their ancestors who are siblings of common | ||
1184 | * parent. | ||
1185 | */ | ||
1186 | |||
1187 | /* First walk up until both entities are at same depth */ | ||
1188 | se_depth = depth_se(se); | ||
1189 | pse_depth = depth_se(pse); | ||
1190 | |||
1191 | while (se_depth > pse_depth) { | ||
1192 | se_depth--; | ||
1193 | se = parent_entity(se); | ||
1194 | } | ||
1195 | |||
1196 | while (pse_depth > se_depth) { | ||
1197 | pse_depth--; | ||
1198 | pse = parent_entity(pse); | ||
1199 | } | ||
1200 | |||
1168 | while (!is_same_group(se, pse)) { | 1201 | while (!is_same_group(se, pse)) { |
1169 | se = parent_entity(se); | 1202 | se = parent_entity(se); |
1170 | pse = parent_entity(pse); | 1203 | pse = parent_entity(pse); |
@@ -1223,13 +1256,22 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) | |||
1223 | static struct task_struct * | 1256 | static struct task_struct * |
1224 | __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) | 1257 | __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) |
1225 | { | 1258 | { |
1226 | struct task_struct *p; | 1259 | struct task_struct *p = NULL; |
1260 | struct sched_entity *se; | ||
1227 | 1261 | ||
1228 | if (!curr) | 1262 | if (!curr) |
1229 | return NULL; | 1263 | return NULL; |
1230 | 1264 | ||
1231 | p = rb_entry(curr, struct task_struct, se.run_node); | 1265 | /* Skip over entities that are not tasks */ |
1232 | cfs_rq->rb_load_balance_curr = rb_next(curr); | 1266 | do { |
1267 | se = rb_entry(curr, struct sched_entity, run_node); | ||
1268 | curr = rb_next(curr); | ||
1269 | } while (curr && !entity_is_task(se)); | ||
1270 | |||
1271 | cfs_rq->rb_load_balance_curr = curr; | ||
1272 | |||
1273 | if (entity_is_task(se)) | ||
1274 | p = task_of(se); | ||
1233 | 1275 | ||
1234 | return p; | 1276 | return p; |
1235 | } | 1277 | } |
@@ -1489,9 +1531,6 @@ static void print_cfs_stats(struct seq_file *m, int cpu) | |||
1489 | { | 1531 | { |
1490 | struct cfs_rq *cfs_rq; | 1532 | struct cfs_rq *cfs_rq; |
1491 | 1533 | ||
1492 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1493 | print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); | ||
1494 | #endif | ||
1495 | rcu_read_lock(); | 1534 | rcu_read_lock(); |
1496 | for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) | 1535 | for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) |
1497 | print_cfs_rq(m, cpu, cfs_rq); | 1536 | print_cfs_rq(m, cpu, cfs_rq); |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 8ff824565e06..201a69382a42 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -374,11 +374,15 @@ static void update_curr_rt(struct rq *rq) | |||
374 | curr->se.exec_start = rq->clock; | 374 | curr->se.exec_start = rq->clock; |
375 | cpuacct_charge(curr, delta_exec); | 375 | cpuacct_charge(curr, delta_exec); |
376 | 376 | ||
377 | spin_lock(&rt_rq->rt_runtime_lock); | 377 | for_each_sched_rt_entity(rt_se) { |
378 | rt_rq->rt_time += delta_exec; | 378 | rt_rq = rt_rq_of_se(rt_se); |
379 | if (sched_rt_runtime_exceeded(rt_rq)) | 379 | |
380 | resched_task(curr); | 380 | spin_lock(&rt_rq->rt_runtime_lock); |
381 | spin_unlock(&rt_rq->rt_runtime_lock); | 381 | rt_rq->rt_time += delta_exec; |
382 | if (sched_rt_runtime_exceeded(rt_rq)) | ||
383 | resched_task(curr); | ||
384 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
385 | } | ||
382 | } | 386 | } |
383 | 387 | ||
384 | static inline | 388 | static inline |
@@ -477,7 +481,6 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) | |||
477 | * entries, we must remove entries top - down. | 481 | * entries, we must remove entries top - down. |
478 | * | 482 | * |
479 | * XXX: O(1/2 h^2) because we can only walk up, not down the chain. | 483 | * XXX: O(1/2 h^2) because we can only walk up, not down the chain. |
480 | * doesn't matter much for now, as h=2 for GROUP_SCHED. | ||
481 | */ | 484 | */ |
482 | static void dequeue_rt_stack(struct task_struct *p) | 485 | static void dequeue_rt_stack(struct task_struct *p) |
483 | { | 486 | { |