author     Ken Chen <kenchen@google.com>        2008-11-19 01:41:57 -0500
committer  Ingo Molnar <mingo@elte.hu>          2008-11-19 12:39:37 -0500
commit     ec4e0e2fe018992d980910db901637c814575914 (patch)
tree       58f5df8581387afc90774ee2d1923302ae209b3c /kernel/sched.c
parent     3ac3ba0b396fd99550e08034b0e4c27fdf39c252 (diff)
sched: fix inconsistency when redistributing per-cpu tg->cfs_rq shares
Impact: make load-balancing more consistent
In the update_shares() path leading to tg_shares_up(), the calculation of
per-cpu cfs_rq shares is rather erratic even under a moderate task wake-up
rate. The problem is that the per-cpu tg->cfs_rq load weight used in the
sd_rq_weight aggregation and the actual redistribution of cfs_rq->shares
are collected at different times. Under moderate system load, we've seen
quite a bit of variation in cfs_rq->shares, which in turn wildly affects
the sched_entity's load weight.
This patch caches each CPU's load weight while doing the sum calculation
and then passes the cached value down to update_group_shares_cpu() for
redistributing the per-cpu cfs_rq shares. This keeps the total cfs_rq
shares consistent across all CPUs. It also simplifies the rounding and the
zero-load-weight check.
Signed-off-by: Ken Chen <kenchen@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
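
For reference, a minimal user-space sketch of the arithmetic the commit
message describes; it is illustration only, not kernel code. The array
names, CPU count and weight values are invented, and the
MIN_SHARES/MAX_SHARES clamping and the NICE_0_LOAD boost for idle CPUs are
left out for brevity. It shows how re-reading the live per-cpu weights at
redistribution time (the old scheme) lets the per-cpu shares drift away
from tg->shares, while reusing the weights cached during the sum pass keeps
the total consistent.

/*
 * Illustration only -- a stand-alone user-space model, not kernel code.
 * All names and numbers below are made up.
 */
#include <stdio.h>

#define NR_CPUS   4
#define TG_SHARES 1024UL

int main(void)
{
        /* per-cpu cfs_rq load weight at the time the sum is taken */
        unsigned long weight_at_sum[NR_CPUS]  = { 1024, 1024, 2048, 0 };
        /* the same weights a moment later, after a wakeup on cpu 3 */
        unsigned long weight_at_dist[NR_CPUS] = { 1024, 1024, 2048, 1024 };
        unsigned long sum = 0, old_total = 0, new_total = 0;
        int i;

        for (i = 0; i < NR_CPUS; i++)
                sum += weight_at_sum[i];

        for (i = 0; i < NR_CPUS; i++) {
                /* old scheme: re-reads the live weight, which has changed */
                old_total += TG_SHARES * weight_at_dist[i] / (sum + 1);
                /* patched scheme: reuses the weight cached during the sum */
                new_total += TG_SHARES * weight_at_sum[i] / sum;
        }

        printf("tg->shares=%lu  live-weight total=%lu  cached-weight total=%lu\n",
               TG_SHARES, old_total, new_total);
        return 0;
}

With these made-up numbers the live-weight total comes out to 1276 against
a tg->shares of 1024, while the cached-weight total is exactly 1024: the
per-cpu shares sum back to the group's shares because they were derived
from the same snapshot as the denominator.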
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--   kernel/sched.c   41
1 file changed, 15 insertions, 26 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index a4c156d9a4a5..93bfb086e60f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1453,27 +1453,13 @@ static void
 update_group_shares_cpu(struct task_group *tg, int cpu,
                         unsigned long sd_shares, unsigned long sd_rq_weight)
 {
-        int boost = 0;
         unsigned long shares;
         unsigned long rq_weight;
 
         if (!tg->se[cpu])
                 return;
 
-        rq_weight = tg->cfs_rq[cpu]->load.weight;
-
-        /*
-         * If there are currently no tasks on the cpu pretend there is one of
-         * average load so that when a new task gets to run here it will not
-         * get delayed by group starvation.
-         */
-        if (!rq_weight) {
-                boost = 1;
-                rq_weight = NICE_0_LOAD;
-        }
-
-        if (unlikely(rq_weight > sd_rq_weight))
-                rq_weight = sd_rq_weight;
+        rq_weight = tg->cfs_rq[cpu]->rq_weight;
 
         /*
          *           \Sum shares * rq_weight
@@ -1481,7 +1467,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
          *               \Sum rq_weight
          *
          */
-        shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+        shares = (sd_shares * rq_weight) / sd_rq_weight;
         shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
 
         if (abs(shares - tg->se[cpu]->load.weight) >
@@ -1490,11 +1476,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
                 unsigned long flags;
 
                 spin_lock_irqsave(&rq->lock, flags);
-                /*
-                 * record the actual number of shares, not the boosted amount.
-                 */
-                tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-                tg->cfs_rq[cpu]->rq_weight = rq_weight;
+                tg->cfs_rq[cpu]->shares = shares;
 
                 __set_se_shares(tg->se[cpu], shares);
                 spin_unlock_irqrestore(&rq->lock, flags);
@@ -1508,13 +1490,23 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
  */
 static int tg_shares_up(struct task_group *tg, void *data)
 {
-        unsigned long rq_weight = 0;
+        unsigned long weight, rq_weight = 0;
         unsigned long shares = 0;
         struct sched_domain *sd = data;
         int i;
 
         for_each_cpu_mask(i, sd->span) {
-                rq_weight += tg->cfs_rq[i]->load.weight;
+                /*
+                 * If there are currently no tasks on the cpu pretend there
+                 * is one of average load so that when a new task gets to
+                 * run here it will not get delayed by group starvation.
+                 */
+                weight = tg->cfs_rq[i]->load.weight;
+                if (!weight)
+                        weight = NICE_0_LOAD;
+
+                tg->cfs_rq[i]->rq_weight = weight;
+                rq_weight += weight;
                 shares += tg->cfs_rq[i]->shares;
         }
 
@@ -1524,9 +1516,6 @@ static int tg_shares_up(struct task_group *tg, void *data)
         if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
                 shares = tg->shares;
 
-        if (!rq_weight)
-                rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
-
         for_each_cpu_mask(i, sd->span)
                 update_group_shares_cpu(tg, i, shares, rq_weight);
 
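
To make the patched two-pass flow concrete, a hedged stand-alone mock of it
follows: the first pass caches each cpu's weight (substituting NICE_0_LOAD
for an idle cpu) while building the sum, and the second pass hands exactly
those cached values to the redistribution step. struct mock_cfs_rq, the
weight values and clamp_shares() are invented stand-ins; the real code
walks the sched-domain span, takes rq->lock and calls __set_se_shares().

/* Illustration only -- a user-space mock of the patched flow. */
#include <stdio.h>

#define NR_CPUS     4
#define NICE_0_LOAD 1024UL
#define MIN_SHARES  2UL
#define MAX_SHARES  (1UL << 18)

struct mock_cfs_rq {
        unsigned long load_weight;      /* stands in for cfs_rq->load.weight */
        unsigned long rq_weight;        /* cached copy, as the patch adds */
        unsigned long shares;
};

static unsigned long clamp_shares(unsigned long s)
{
        if (s < MIN_SHARES)
                return MIN_SHARES;
        if (s > MAX_SHARES)
                return MAX_SHARES;
        return s;
}

int main(void)
{
        struct mock_cfs_rq cfs_rq[NR_CPUS] = {
                { 1024, 0, 0 }, { 3072, 0, 0 }, { 0, 0, 0 }, { 2048, 0, 0 },
        };
        unsigned long tg_shares = 1024, rq_weight = 0;
        int i;

        /* pass 1 (tg_shares_up in the patch): cache the weight, let idle
         * cpus pretend NICE_0_LOAD, and accumulate the sum from exactly
         * those cached values */
        for (i = 0; i < NR_CPUS; i++) {
                unsigned long weight = cfs_rq[i].load_weight;

                if (!weight)
                        weight = NICE_0_LOAD;

                cfs_rq[i].rq_weight = weight;
                rq_weight += weight;
        }

        /* pass 2 (update_group_shares_cpu): redistribute tg_shares in
         * proportion to the same cached weights, so the per-cpu shares
         * stay consistent with the sum they were derived from */
        for (i = 0; i < NR_CPUS; i++) {
                cfs_rq[i].shares = clamp_shares(tg_shares *
                                cfs_rq[i].rq_weight / rq_weight);
                printf("cpu%d: rq_weight=%lu shares=%lu\n",
                       i, cfs_rq[i].rq_weight, cfs_rq[i].shares);
        }
        return 0;
}

Note that, unlike the pre-patch code, nothing re-reads load.weight between
the two passes; that is the whole point of caching cfs_rq->rq_weight.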