diff options
| author | Dhaval Giani <dhaval@linux.vnet.ibm.com> | 2008-04-19 13:44:59 -0400 |
|---|---|---|
| committer | Ingo Molnar <mingo@elte.hu> | 2008-04-19 13:44:59 -0400 |
| commit | 354d60c2ff72d86627dfe2089d186824abf4bb8e (patch) | |
| tree | 10cea61ce7036448ed7246820c5575df2a61bb3b /kernel | |
| parent | ea736ed5d353d7a3aa1cf8ce4cf8d947bc353fb2 (diff) | |
sched: mix tasks and groups
This patch allows tasks and groups to exist in the same cfs_rq. With this
change the CFS group scheduling follows a 1/(M+N) model from a 1/(1+N)
fairness model where M tasks and N groups exist at the cfs_rq level.
[a.p.zijlstra@chello.nl: rt bits and assorted fixes]
Signed-off-by: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/sched.c | 51 | ||||
| -rw-r--r-- | kernel/sched_fair.c | 51 | ||||
| -rw-r--r-- | kernel/sched_rt.c | 15 |
3 files changed, 103 insertions, 14 deletions
diff --git a/kernel/sched.c b/kernel/sched.c index 62830eaec5..1b7399dfa3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
| @@ -273,6 +273,7 @@ struct task_group { | |||
| 273 | struct list_head list; | 273 | struct list_head list; |
| 274 | }; | 274 | }; |
| 275 | 275 | ||
| 276 | #ifdef CONFIG_USER_SCHED | ||
| 276 | #ifdef CONFIG_FAIR_GROUP_SCHED | 277 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 277 | /* Default task group's sched entity on each cpu */ | 278 | /* Default task group's sched entity on each cpu */ |
| 278 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | 279 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); |
| @@ -284,6 +285,7 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | |||
| 284 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); | 285 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); |
| 285 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; | 286 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; |
| 286 | #endif | 287 | #endif |
| 288 | #endif | ||
| 287 | 289 | ||
| 288 | /* task_group_lock serializes add/remove of task groups and also changes to | 290 | /* task_group_lock serializes add/remove of task groups and also changes to |
| 289 | * a task group's cpu shares. | 291 | * a task group's cpu shares. |
| @@ -7447,6 +7449,10 @@ static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, | |||
| 7447 | list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | 7449 | list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); |
| 7448 | 7450 | ||
| 7449 | tg->se[cpu] = se; | 7451 | tg->se[cpu] = se; |
| 7452 | /* se could be NULL for init_task_group */ | ||
| 7453 | if (!se) | ||
| 7454 | return; | ||
| 7455 | |||
| 7450 | se->cfs_rq = &rq->cfs; | 7456 | se->cfs_rq = &rq->cfs; |
| 7451 | se->my_q = cfs_rq; | 7457 | se->my_q = cfs_rq; |
| 7452 | se->load.weight = tg->shares; | 7458 | se->load.weight = tg->shares; |
| @@ -7469,6 +7475,9 @@ static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, | |||
| 7469 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); | 7475 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); |
| 7470 | 7476 | ||
| 7471 | tg->rt_se[cpu] = rt_se; | 7477 | tg->rt_se[cpu] = rt_se; |
| 7478 | if (!rt_se) | ||
| 7479 | return; | ||
| 7480 | |||
| 7472 | rt_se->rt_rq = &rq->rt; | 7481 | rt_se->rt_rq = &rq->rt; |
| 7473 | rt_se->my_q = rt_rq; | 7482 | rt_se->my_q = rt_rq; |
| 7474 | rt_se->parent = NULL; | 7483 | rt_se->parent = NULL; |
| @@ -7539,18 +7548,56 @@ void __init sched_init(void) | |||
| 7539 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7548 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7540 | init_task_group.shares = init_task_group_load; | 7549 | init_task_group.shares = init_task_group_load; |
| 7541 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 7550 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
| 7551 | #ifdef CONFIG_CGROUP_SCHED | ||
| 7552 | /* | ||
| 7553 | * How much cpu bandwidth does init_task_group get? | ||
| 7554 | * | ||
| 7555 | * In case of task-groups formed thr' the cgroup filesystem, it | ||
| 7556 | * gets 100% of the cpu resources in the system. This overall | ||
| 7557 | * system cpu resource is divided among the tasks of | ||
| 7558 | * init_task_group and its child task-groups in a fair manner, | ||
| 7559 | * based on each entity's (task or task-group's) weight | ||
| 7560 | * (se->load.weight). | ||
| 7561 | * | ||
| 7562 | * In other words, if init_task_group has 10 tasks of weight | ||
| 7563 | * 1024) and two child groups A0 and A1 (of weight 1024 each), | ||
| 7564 | * then A0's share of the cpu resource is: | ||
| 7565 | * | ||
| 7566 | * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% | ||
| 7567 | * | ||
| 7568 | * We achieve this by letting init_task_group's tasks sit | ||
| 7569 | * directly in rq->cfs (i.e init_task_group->se[] = NULL). | ||
| 7570 | */ | ||
| 7571 | init_tg_cfs_entry(rq, &init_task_group, &rq->cfs, NULL, i, 1); | ||
| 7572 | #elif defined CONFIG_USER_SCHED | ||
| 7573 | /* | ||
| 7574 | * In case of task-groups formed thr' the user id of tasks, | ||
| 7575 | * init_task_group represents tasks belonging to root user. | ||
| 7576 | * Hence it forms a sibling of all subsequent groups formed. | ||
| 7577 | * In this case, init_task_group gets only a fraction of overall | ||
| 7578 | * system cpu resource, based on the weight assigned to root | ||
| 7579 | * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished | ||
| 7580 | * by letting tasks of init_task_group sit in a separate cfs_rq | ||
| 7581 | * (init_cfs_rq) and having one entity represent this group of | ||
| 7582 | * tasks in rq->cfs (i.e init_task_group->se[] != NULL). | ||
| 7583 | */ | ||
| 7542 | init_tg_cfs_entry(rq, &init_task_group, | 7584 | init_tg_cfs_entry(rq, &init_task_group, |
| 7543 | &per_cpu(init_cfs_rq, i), | 7585 | &per_cpu(init_cfs_rq, i), |
| 7544 | &per_cpu(init_sched_entity, i), i, 1); | 7586 | &per_cpu(init_sched_entity, i), i, 1); |
| 7545 | 7587 | ||
| 7546 | #endif | 7588 | #endif |
| 7589 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
| 7590 | |||
| 7591 | rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; | ||
| 7547 | #ifdef CONFIG_RT_GROUP_SCHED | 7592 | #ifdef CONFIG_RT_GROUP_SCHED |
| 7548 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); | 7593 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); |
| 7594 | #ifdef CONFIG_CGROUP_SCHED | ||
| 7595 | init_tg_rt_entry(rq, &init_task_group, &rq->rt, NULL, i, 1); | ||
| 7596 | #elif defined CONFIG_USER_SCHED | ||
| 7549 | init_tg_rt_entry(rq, &init_task_group, | 7597 | init_tg_rt_entry(rq, &init_task_group, |
| 7550 | &per_cpu(init_rt_rq, i), | 7598 | &per_cpu(init_rt_rq, i), |
| 7551 | &per_cpu(init_sched_rt_entity, i), i, 1); | 7599 | &per_cpu(init_sched_rt_entity, i), i, 1); |
| 7552 | #else | 7600 | #endif |
| 7553 | rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; | ||
| 7554 | #endif | 7601 | #endif |
| 7555 | 7602 | ||
| 7556 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | 7603 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 022e036f2c..3dde0f0ec9 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
| @@ -1133,6 +1133,17 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) | |||
| 1133 | return 0; | 1133 | return 0; |
| 1134 | } | 1134 | } |
| 1135 | 1135 | ||
| 1136 | /* return depth at which a sched entity is present in the hierarchy */ | ||
| 1137 | static inline int depth_se(struct sched_entity *se) | ||
| 1138 | { | ||
| 1139 | int depth = 0; | ||
| 1140 | |||
| 1141 | for_each_sched_entity(se) | ||
| 1142 | depth++; | ||
| 1143 | |||
| 1144 | return depth; | ||
| 1145 | } | ||
| 1146 | |||
| 1136 | /* | 1147 | /* |
| 1137 | * Preempt the current task with a newly woken task if needed: | 1148 | * Preempt the current task with a newly woken task if needed: |
| 1138 | */ | 1149 | */ |
| @@ -1141,6 +1152,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | |||
| 1141 | struct task_struct *curr = rq->curr; | 1152 | struct task_struct *curr = rq->curr; |
| 1142 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | 1153 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); |
| 1143 | struct sched_entity *se = &curr->se, *pse = &p->se; | 1154 | struct sched_entity *se = &curr->se, *pse = &p->se; |
| 1155 | int se_depth, pse_depth; | ||
| 1144 | 1156 | ||
| 1145 | if (unlikely(rt_prio(p->prio))) { | 1157 | if (unlikely(rt_prio(p->prio))) { |
| 1146 | update_rq_clock(rq); | 1158 | update_rq_clock(rq); |
| @@ -1165,6 +1177,27 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | |||
| 1165 | if (!sched_feat(WAKEUP_PREEMPT)) | 1177 | if (!sched_feat(WAKEUP_PREEMPT)) |
| 1166 | return; | 1178 | return; |
| 1167 | 1179 | ||
| 1180 | /* | ||
| 1181 | * preemption test can be made between sibling entities who are in the | ||
| 1182 | * same cfs_rq i.e who have a common parent. Walk up the hierarchy of | ||
| 1183 | * both tasks until we find their ancestors who are siblings of common | ||
| 1184 | * parent. | ||
| 1185 | */ | ||
| 1186 | |||
| 1187 | /* First walk up until both entities are at same depth */ | ||
| 1188 | se_depth = depth_se(se); | ||
| 1189 | pse_depth = depth_se(pse); | ||
| 1190 | |||
| 1191 | while (se_depth > pse_depth) { | ||
| 1192 | se_depth--; | ||
| 1193 | se = parent_entity(se); | ||
| 1194 | } | ||
| 1195 | |||
| 1196 | while (pse_depth > se_depth) { | ||
| 1197 | pse_depth--; | ||
| 1198 | pse = parent_entity(pse); | ||
| 1199 | } | ||
| 1200 | |||
| 1168 | while (!is_same_group(se, pse)) { | 1201 | while (!is_same_group(se, pse)) { |
| 1169 | se = parent_entity(se); | 1202 | se = parent_entity(se); |
| 1170 | pse = parent_entity(pse); | 1203 | pse = parent_entity(pse); |
| @@ -1223,13 +1256,22 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) | |||
| 1223 | static struct task_struct * | 1256 | static struct task_struct * |
| 1224 | __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) | 1257 | __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) |
| 1225 | { | 1258 | { |
| 1226 | struct task_struct *p; | 1259 | struct task_struct *p = NULL; |
| 1260 | struct sched_entity *se; | ||
| 1227 | 1261 | ||
| 1228 | if (!curr) | 1262 | if (!curr) |
| 1229 | return NULL; | 1263 | return NULL; |
| 1230 | 1264 | ||
| 1231 | p = rb_entry(curr, struct task_struct, se.run_node); | 1265 | /* Skip over entities that are not tasks */ |
| 1232 | cfs_rq->rb_load_balance_curr = rb_next(curr); | 1266 | do { |
| 1267 | se = rb_entry(curr, struct sched_entity, run_node); | ||
| 1268 | curr = rb_next(curr); | ||
| 1269 | } while (curr && !entity_is_task(se)); | ||
| 1270 | |||
| 1271 | cfs_rq->rb_load_balance_curr = curr; | ||
| 1272 | |||
| 1273 | if (entity_is_task(se)) | ||
| 1274 | p = task_of(se); | ||
| 1233 | 1275 | ||
| 1234 | return p; | 1276 | return p; |
| 1235 | } | 1277 | } |
| @@ -1489,9 +1531,6 @@ static void print_cfs_stats(struct seq_file *m, int cpu) | |||
| 1489 | { | 1531 | { |
| 1490 | struct cfs_rq *cfs_rq; | 1532 | struct cfs_rq *cfs_rq; |
| 1491 | 1533 | ||
| 1492 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 1493 | print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); | ||
| 1494 | #endif | ||
| 1495 | rcu_read_lock(); | 1534 | rcu_read_lock(); |
| 1496 | for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) | 1535 | for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) |
| 1497 | print_cfs_rq(m, cpu, cfs_rq); | 1536 | print_cfs_rq(m, cpu, cfs_rq); |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 8ff824565e..201a69382a 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
| @@ -374,11 +374,15 @@ static void update_curr_rt(struct rq *rq) | |||
| 374 | curr->se.exec_start = rq->clock; | 374 | curr->se.exec_start = rq->clock; |
| 375 | cpuacct_charge(curr, delta_exec); | 375 | cpuacct_charge(curr, delta_exec); |
| 376 | 376 | ||
| 377 | spin_lock(&rt_rq->rt_runtime_lock); | 377 | for_each_sched_rt_entity(rt_se) { |
| 378 | rt_rq->rt_time += delta_exec; | 378 | rt_rq = rt_rq_of_se(rt_se); |
| 379 | if (sched_rt_runtime_exceeded(rt_rq)) | 379 | |
| 380 | resched_task(curr); | 380 | spin_lock(&rt_rq->rt_runtime_lock); |
| 381 | spin_unlock(&rt_rq->rt_runtime_lock); | 381 | rt_rq->rt_time += delta_exec; |
| 382 | if (sched_rt_runtime_exceeded(rt_rq)) | ||
| 383 | resched_task(curr); | ||
| 384 | spin_unlock(&rt_rq->rt_runtime_lock); | ||
| 385 | } | ||
| 382 | } | 386 | } |
| 383 | 387 | ||
| 384 | static inline | 388 | static inline |
| @@ -477,7 +481,6 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) | |||
| 477 | * entries, we must remove entries top - down. | 481 | * entries, we must remove entries top - down. |
| 478 | * | 482 | * |
| 479 | * XXX: O(1/2 h^2) because we can only walk up, not down the chain. | 483 | * XXX: O(1/2 h^2) because we can only walk up, not down the chain. |
| 480 | * doesn't matter much for now, as h=2 for GROUP_SCHED. | ||
| 481 | */ | 484 | */ |
| 482 | static void dequeue_rt_stack(struct task_struct *p) | 485 | static void dequeue_rt_stack(struct task_struct *p) |
| 483 | { | 486 | { |
