author     Peter Zijlstra <a.p.zijlstra@chello.nl>   2008-04-19 13:45:00 -0400
committer  Ingo Molnar <mingo@elte.hu>               2008-04-19 13:45:00 -0400
commit     18d95a2832c1392a2d63227a7a6d433cb9f2037e (patch)
tree       fa85b700aa3caac5b1309edd8e31d9b957957a83 /kernel/sched_fair.c
parent     1d3504fcf5606579d60b649d19f44b3871c1ddae (diff)
sched: fair-group: SMP-nice for group scheduling
Implement SMP nice support for the full group hierarchy.
On each load-balance action, compile a sched_domain-wide view of the full
task_group tree. We compute the domain-wide view when walking down the
hierarchy, and readjust the weights when walking back up.
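For illustration only, here is a rough userspace sketch of what such a sched_domain-wide view could look like, assuming each group tracks per-CPU weights much like tg->cfs_rq[cpu] does in the diff below; the struct layout and the helper name are hypothetical and are not the aggregate machinery this patch actually adds to kernel/sched.c:

#define NR_CPUS 4

/* Hypothetical, simplified stand-in for a task_group with per-CPU queues. */
struct group {
	long cpu_task_weight[NR_CPUS];	/* weight of the tasks queued per CPU  */
	long cpu_rq_weight[NR_CPUS];	/* per-CPU runqueue weight             */
	long agg_task_weight;		/* sched_domain-wide sums ("the view") */
	long agg_rq_weight;
	struct group *children;		/* first child                         */
	struct group *sibling;		/* next child of the same parent       */
};

/*
 * Walking down the hierarchy: for every group, sum its per-CPU weights over
 * the CPUs of the sched_domain, giving the domain-wide view the balancer
 * works with. (The patch also readjusts the group shares from this view on
 * the way back up; that step is left out of this sketch.)
 */
void compile_domain_view(struct group *g, const int *cpus, int nr)
{
	g->agg_task_weight = 0;
	g->agg_rq_weight = 0;
	for (int i = 0; i < nr; i++) {
		g->agg_task_weight += g->cpu_task_weight[cpus[i]];
		g->agg_rq_weight += g->cpu_rq_weight[cpus[i]];
	}

	for (struct group *c = g->children; c; c = c->sibling)
		compile_domain_view(c, cpus, nr);
}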
After collecting and readjusting the domain-wide view, we try to balance the
tasks within the task_groups. The current approach is to naively balance each
task group until we've moved the targeted amount of load.
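As a rough sketch of this naive per-group balancing pass, the fragment below mirrors the arithmetic of the new load_balance_fair() loop in the diff: rescale the remaining load into a group's local weight units, move tasks, then rescale what was moved back into CPU-load units. The struct agg fields follow aggregate(tg, sd) in the hunk; move_tasks() and the sample numbers in main() are hypothetical stand-ins:

#include <stdio.h>

/* Hypothetical domain-wide view of one task_group, as in aggregate(tg, sd). */
struct agg {
	long task_weight;	/* weight of the group's tasks in this domain */
	long rq_weight;		/* sum of the group's per-CPU queue weights   */
	long load;		/* the group's contribution to the CPU load   */
};

/* Stand-in for actually migrating tasks; returns the weight it moved. */
static long move_tasks(long max_load) { return max_load / 2; }

static long balance_groups(struct agg *groups, int nr, long max_load_move)
{
	long rem_load_move = max_load_move;

	for (int i = 0; i < nr; i++) {
		struct agg *a = &groups[i];
		long rem_load, moved_load;

		if (!a->task_weight)		/* empty group */
			continue;

		/* convert the remaining load into this group's weight units */
		rem_load = rem_load_move * a->rq_weight / (a->load + 1);

		/* the patch also caps this by the inter-CPU imbalance; skipped here */
		moved_load = move_tasks(rem_load);
		if (!moved_load)
			continue;

		/* convert what was actually moved back into CPU-load units */
		moved_load = moved_load * a->load / (a->rq_weight + 1);

		rem_load_move -= moved_load;
		if (rem_load_move < 0)
			break;
	}

	return max_load_move - rem_load_move;
}

int main(void)
{
	struct agg g[] = { { 2048, 2048, 1024 }, { 1024, 1024, 512 } };

	printf("moved %ld\n", balance_groups(g, 2, 1024));
	return 0;
}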
Inspired by Srivatsa Vaddagiri's previous code and Abhishek Chandra's H-SMP
paper.
XXX: there will be some numerical issues due to the limited nature of
SCHED_LOAD_SCALE wrt representing a task_group's influence on the
total weight. When the tree is deep enough, or the task weight small
enough, we'll run out of bits.
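To make the concern concrete, a tiny worked example, assuming the usual SCHED_LOAD_SCALE of 1024 (a 10-bit fixed-point unit); the nesting depth, group fraction and task weight are made-up numbers:

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024L	/* assumed 10-bit fixed-point unit */

int main(void)
{
	/*
	 * A group's influence on the total weight is a fraction carried in
	 * SCHED_LOAD_SCALE units. Every extra level of nesting multiplies
	 * fractions, so with integer math the result quickly rounds to 0.
	 */
	long weight = 15;				/* a very light task (roughly nice 19) */
	long group_fraction = SCHED_LOAD_SCALE / 8;	/* group holds 1/8 of its parent */

	for (int depth = 1; depth <= 5; depth++) {
		weight = weight * group_fraction / SCHED_LOAD_SCALE;
		printf("depth %d: effective weight %ld\n", depth, weight);
	}
	/* by depth 2 the effective weight is already 0 -> "run out of bits" */
	return 0;
}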
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Abhishek Chandra <chandra@cs.umn.edu>
CC: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/sched_fair.c')
 kernel/sched_fair.c | 124
 1 file changed, 80 insertions(+), 44 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index b43748efaa7f..b89fec93a237 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -492,10 +492,27 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+static void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+	cfs_rq->task_weight += weight;
+}
+#else
+static inline void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+}
+#endif
+
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_add(&cfs_rq->load, se->load.weight);
+	if (!parent_entity(se))
+		inc_cpu_load(rq_of(cfs_rq), se->load.weight);
+	if (entity_is_task(se))
+		add_cfs_task_weight(cfs_rq, se->load.weight);
 	cfs_rq->nr_running++;
 	se->on_rq = 1;
 }
@@ -504,6 +521,10 @@ static void
 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_sub(&cfs_rq->load, se->load.weight);
+	if (!parent_entity(se))
+		dec_cpu_load(rq_of(cfs_rq), se->load.weight);
+	if (entity_is_task(se))
+		add_cfs_task_weight(cfs_rq, -se->load.weight);
 	cfs_rq->nr_running--;
 	se->on_rq = 0;
 }
@@ -1286,75 +1307,90 @@ static struct task_struct *load_balance_next_fair(void *arg)
 	return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
+static unsigned long
+__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		unsigned long max_load_move, struct sched_domain *sd,
+		enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
+		struct cfs_rq *cfs_rq)
 {
-	struct sched_entity *curr;
-	struct task_struct *p;
-
-	if (!cfs_rq->nr_running || !first_fair(cfs_rq))
-		return MAX_PRIO;
-
-	curr = cfs_rq->curr;
-	if (!curr)
-		curr = __pick_next_entity(cfs_rq);
+	struct rq_iterator cfs_rq_iterator;
 
-	p = task_of(curr);
+	cfs_rq_iterator.start = load_balance_start_fair;
+	cfs_rq_iterator.next = load_balance_next_fair;
+	cfs_rq_iterator.arg = cfs_rq;
 
-	return p->prio;
+	return balance_tasks(this_rq, this_cpu, busiest,
+			max_load_move, sd, idle, all_pinned,
+			this_best_prio, &cfs_rq_iterator);
 }
-#endif
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
 		  struct sched_domain *sd, enum cpu_idle_type idle,
 		  int *all_pinned, int *this_best_prio)
 {
-	struct cfs_rq *busy_cfs_rq;
 	long rem_load_move = max_load_move;
-	struct rq_iterator cfs_rq_iterator;
+	int busiest_cpu = cpu_of(busiest);
+	struct task_group *tg;
 
-	cfs_rq_iterator.start = load_balance_start_fair;
-	cfs_rq_iterator.next = load_balance_next_fair;
-
-	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
-#ifdef CONFIG_FAIR_GROUP_SCHED
-		struct cfs_rq *this_cfs_rq;
+	rcu_read_lock();
+	list_for_each_entry(tg, &task_groups, list) {
 		long imbalance;
-		unsigned long maxload;
+		unsigned long this_weight, busiest_weight;
+		long rem_load, max_load, moved_load;
+
+		/*
+		 * empty group
+		 */
+		if (!aggregate(tg, sd)->task_weight)
+			continue;
+
+		rem_load = rem_load_move * aggregate(tg, sd)->rq_weight;
+		rem_load /= aggregate(tg, sd)->load + 1;
 
-		this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
+		this_weight = tg->cfs_rq[this_cpu]->task_weight;
+		busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
 
-		imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
-		/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
-		if (imbalance <= 0)
+		imbalance = (busiest_weight - this_weight) / 2;
+
+		if (imbalance < 0)
+			imbalance = busiest_weight;
+
+		max_load = max(rem_load, imbalance);
+		moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
+				max_load, sd, idle, all_pinned, this_best_prio,
+				tg->cfs_rq[busiest_cpu]);
+
+		if (!moved_load)
 			continue;
 
-		/* Don't pull more than imbalance/2 */
-		imbalance /= 2;
-		maxload = min(rem_load_move, imbalance);
+		move_group_shares(tg, sd, busiest_cpu, this_cpu);
 
-		*this_best_prio = cfs_rq_best_prio(this_cfs_rq);
-#else
-# define maxload rem_load_move
-#endif
-		/*
-		 * pass busy_cfs_rq argument into
-		 * load_balance_[start|next]_fair iterators
-		 */
-		cfs_rq_iterator.arg = busy_cfs_rq;
-		rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
-					       maxload, sd, idle, all_pinned,
-					       this_best_prio,
-					       &cfs_rq_iterator);
+		moved_load *= aggregate(tg, sd)->load;
+		moved_load /= aggregate(tg, sd)->rq_weight + 1;
 
-		if (rem_load_move <= 0)
+		rem_load_move -= moved_load;
+		if (rem_load_move < 0)
 			break;
 	}
+	rcu_read_unlock();
 
 	return max_load_move - rem_load_move;
 }
+#else
+static unsigned long
+load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		unsigned long max_load_move,
+		struct sched_domain *sd, enum cpu_idle_type idle,
+		int *all_pinned, int *this_best_prio)
+{
+	return __load_balance_fair(this_rq, this_cpu, busiest,
+			max_load_move, sd, idle, all_pinned,
+			this_best_prio, &busiest->cfs);
+}
+#endif
 
 static int
 move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,