aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Ken Chen <kenchen@google.com> 2008-11-19 01:41:57 -0500
committer Ingo Molnar <mingo@elte.hu> 2008-11-19 12:39:37 -0500
commit ec4e0e2fe018992d980910db901637c814575914 (patch)
tree 58f5df8581387afc90774ee2d1923302ae209b3c
parent 3ac3ba0b396fd99550e08034b0e4c27fdf39c252 (diff)
sched: fix inconsistency when redistributing per-cpu tg->cfs_rq shares
Impact: make load-balancing more consistent

In the update_shares() path leading to tg_shares_up(), the calculation of per-cpu cfs_rq shares is rather erratic even under a moderate task wake-up rate. The problem is that the per-cpu tg->cfs_rq load weight used in the sd_rq_weight aggregation and the one used in the actual redistribution of cfs_rq->shares are collected at different times. Under moderate system load, we've seen quite a bit of variation in cfs_rq->shares, which ultimately has a wild effect on the sched_entity's load weight.

This patch caches the initial per-cpu load weight when doing the sum calculation, and then passes it down to update_group_shares_cpu() for redistributing per-cpu cfs_rq shares. This allows consistent total cfs_rq shares across all CPUs. It also simplifies the rounding and the zero load weight check.

Signed-off-by: Ken Chen <kenchen@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--kernel/sched.c41
1 files changed, 15 insertions, 26 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index a4c156d9a4a5..93bfb086e60f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1453,27 +1453,13 @@ static void
1453update_group_shares_cpu(struct task_group *tg, int cpu, 1453update_group_shares_cpu(struct task_group *tg, int cpu,
1454 unsigned long sd_shares, unsigned long sd_rq_weight) 1454 unsigned long sd_shares, unsigned long sd_rq_weight)
1455{ 1455{
1456 int boost = 0;
1457 unsigned long shares; 1456 unsigned long shares;
1458 unsigned long rq_weight; 1457 unsigned long rq_weight;
1459 1458
1460 if (!tg->se[cpu]) 1459 if (!tg->se[cpu])
1461 return; 1460 return;
1462 1461
1463 rq_weight = tg->cfs_rq[cpu]->load.weight; 1462 rq_weight = tg->cfs_rq[cpu]->rq_weight;
1464
1465 /*
1466 * If there are currently no tasks on the cpu pretend there is one of
1467 * average load so that when a new task gets to run here it will not
1468 * get delayed by group starvation.
1469 */
1470 if (!rq_weight) {
1471 boost = 1;
1472 rq_weight = NICE_0_LOAD;
1473 }
1474
1475 if (unlikely(rq_weight > sd_rq_weight))
1476 rq_weight = sd_rq_weight;
1477 1463
1478 /* 1464 /*
1479 * \Sum shares * rq_weight 1465 * \Sum shares * rq_weight
@@ -1481,7 +1467,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1481 * \Sum rq_weight 1467 * \Sum rq_weight
1482 * 1468 *
1483 */ 1469 */
1484 shares = (sd_shares * rq_weight) / (sd_rq_weight + 1); 1470 shares = (sd_shares * rq_weight) / sd_rq_weight;
1485 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); 1471 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1486 1472
1487 if (abs(shares - tg->se[cpu]->load.weight) > 1473 if (abs(shares - tg->se[cpu]->load.weight) >
@@ -1490,11 +1476,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1490 unsigned long flags; 1476 unsigned long flags;
1491 1477
1492 spin_lock_irqsave(&rq->lock, flags); 1478 spin_lock_irqsave(&rq->lock, flags);
1493 /* 1479 tg->cfs_rq[cpu]->shares = shares;
1494 * record the actual number of shares, not the boosted amount.
1495 */
1496 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1497 tg->cfs_rq[cpu]->rq_weight = rq_weight;
1498 1480
1499 __set_se_shares(tg->se[cpu], shares); 1481 __set_se_shares(tg->se[cpu], shares);
1500 spin_unlock_irqrestore(&rq->lock, flags); 1482 spin_unlock_irqrestore(&rq->lock, flags);
@@ -1508,13 +1490,23 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1508 */ 1490 */
1509static int tg_shares_up(struct task_group *tg, void *data) 1491static int tg_shares_up(struct task_group *tg, void *data)
1510{ 1492{
1511 unsigned long rq_weight = 0; 1493 unsigned long weight, rq_weight = 0;
1512 unsigned long shares = 0; 1494 unsigned long shares = 0;
1513 struct sched_domain *sd = data; 1495 struct sched_domain *sd = data;
1514 int i; 1496 int i;
1515 1497
1516 for_each_cpu_mask(i, sd->span) { 1498 for_each_cpu_mask(i, sd->span) {
1517 rq_weight += tg->cfs_rq[i]->load.weight; 1499 /*
1500 * If there are currently no tasks on the cpu pretend there
1501 * is one of average load so that when a new task gets to
1502 * run here it will not get delayed by group starvation.
1503 */
1504 weight = tg->cfs_rq[i]->load.weight;
1505 if (!weight)
1506 weight = NICE_0_LOAD;
1507
1508 tg->cfs_rq[i]->rq_weight = weight;
1509 rq_weight += weight;
1518 shares += tg->cfs_rq[i]->shares; 1510 shares += tg->cfs_rq[i]->shares;
1519 } 1511 }
1520 1512
@@ -1524,9 +1516,6 @@ static int tg_shares_up(struct task_group *tg, void *data)
1524 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) 1516 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1525 shares = tg->shares; 1517 shares = tg->shares;
1526 1518
1527 if (!rq_weight)
1528 rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
1529
1530 for_each_cpu_mask(i, sd->span) 1519 for_each_cpu_mask(i, sd->span)
1531 update_group_shares_cpu(tg, i, shares, rq_weight); 1520 update_group_shares_cpu(tg, i, shares, rq_weight);
1532 1521