sched: optimize group load balancer

I noticed that tg_shares_up() unconditionally takes rq-locks for all cpus in the sched_domain. This hurts. We need the rq-locks whenever we change the weight of the per-cpu group sched entities. To alleviate this a little, only change the weight when the new weight is at least shares_thresh away from the old value. This avoids the rq-lock for the top level entries, since those will never be re-weighted, and fuzzes the lower level entries a little to gain performance in semi-stable situations. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Ingo Molnar <mingo@elte.hu>
author: Peter Zijlstra <a.p.zijlstra@chello.nl> 2008-10-17 13:27:02 -0400
committer: Ingo Molnar <mingo@elte.hu> 2008-10-20 08:05:02 -0400
commit: ffda12a17a324103e9900fa1035309811eecbfe5 (patch)
tree: 79fe8aae79a41b467f2cdd055036b3017642a9f6
parent: b0aa51b999c449e5e3f9faa1ee406e052d407fe7 (diff)
3 files changed, 36 insertions, 20 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6eda6ad735dc..4f59c8e8597d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1621,6 +1621,7 @@ extern unsigned int sysctl_sched_features;
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
 extern unsigned int sysctl_sched_shares_ratelimit;
+extern unsigned int sysctl_sched_shares_thresh;
 int sched_nr_latency_handler(struct ctl_table *table, int write,
                struct file *file, void __user *buffer, size_t *length,
diff --git a/kernel/sched.c b/kernel/sched.c
index c530b84c7f80..11ca39017835 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -818,6 +818,13 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
 unsigned int sysctl_sched_shares_ratelimit = 250000;
 /*
+ * Inject some fuzziness into changing the per-cpu group shares;
+ * this avoids remote rq-locks at the expense of fairness.
+ * default: 4
+ */
+unsigned int sysctl_sched_shares_thresh = 4;
+/*
 * period over which we measure -rt task cpu usage in us.
 * default: 1s
 */
@@ -1453,8 +1460,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 * Calculate and set the cpu's group shares.
 */
 static void
-__update_group_shares_cpu(struct task_group *tg, int cpu,
+update_group_shares_cpu(struct task_group *tg, int cpu,
-                          unsigned long sd_shares, unsigned long sd_rq_weight)
+                        unsigned long sd_shares, unsigned long sd_rq_weight)
 {
        int boost = 0;
        unsigned long shares;
@@ -1485,19 +1492,23 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
         *
         */
        shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+        shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
-        /*
+        if (abs(shares - tg->se[cpu]->load.weight) >
-         * record the actual number of shares, not the boosted amount.
+                        sysctl_sched_shares_thresh) {
-         */
+                struct rq *rq = cpu_rq(cpu);
-        tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
+                unsigned long flags;
-        tg->cfs_rq[cpu]->rq_weight = rq_weight;
-        if (shares < MIN_SHARES)
+                spin_lock_irqsave(&rq->lock, flags);
-                shares = MIN_SHARES;
+                /*
-        else if (shares > MAX_SHARES)
+                 * record the actual number of shares, not the boosted amount.
-                shares = MAX_SHARES;
+                 */
+                tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
+                tg->cfs_rq[cpu]->rq_weight = rq_weight;
-        __set_se_shares(tg->se[cpu], shares);
+                __set_se_shares(tg->se[cpu], shares);
+                spin_unlock_irqrestore(&rq->lock, flags);
+        }
 }
 /*
@@ -1526,14 +1537,8 @@ static int tg_shares_up(struct task_group *tg, void *data)
        if (!rq_weight)
                rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
-        for_each_cpu_mask(i, sd->span) {
+        for_each_cpu_mask(i, sd->span)
-                struct rq *rq = cpu_rq(i);
+                update_group_shares_cpu(tg, i, shares, rq_weight);
-                unsigned long flags;
-                spin_lock_irqsave(&rq->lock, flags);
-                __update_group_shares_cpu(tg, i, shares, rq_weight);
-                spin_unlock_irqrestore(&rq->lock, flags);
-        }
        return 0;
 }
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 617d41e4d6a0..3d804f41e649 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -276,6 +276,16 @@ static struct ctl_table kern_table[] = {
        },
        {
                .ctl_name       = CTL_UNNUMBERED,
+                .procname       = "sched_shares_thresh",
+                .data           = &sysctl_sched_shares_thresh,
+                .maxlen         = sizeof(unsigned int),
+                .mode           = 0644,
+                .proc_handler   = &proc_dointvec_minmax,
+                .strategy       = &sysctl_intvec,
+                .extra1         = &zero,
+        },
+        {
+                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "sched_child_runs_first",
                .data           = &sysctl_sched_child_runs_first,
                .maxlen         = sizeof(unsigned int),
author	Peter Zijlstra <a.p.zijlstra@chello.nl>	2008-10-17 13:27:02 -0400
committer	Ingo Molnar <mingo@elte.hu>	2008-10-20 08:05:02 -0400
commit	ffda12a17a324103e9900fa1035309811eecbfe5 (patch)
tree	79fe8aae79a41b467f2cdd055036b3017642a9f6
parent	b0aa51b999c449e5e3f9faa1ee406e052d407fe7 (diff)