author	Vladimir Davydov <vdavydov@parallels.com>	2013-07-15 09:49:19 -0400
committer	Ingo Molnar <mingo@kernel.org>	2013-07-23 06:18:41 -0400
commit	685207963be973fbb73550db6edaf920a283e1a7 (patch)
tree	95ce6c29454a7c72fe4c9410119bc6129d95f193 /kernel
parent	3bd5a5fc8c6b9fe769777abf74b0ab5fbd7930b4 (diff)
sched: Move h_load calculation to task_h_load()
The bad thing about update_h_load(), which computes the hierarchical load factor for task groups, is that it is called for each task group in the system before every load balancer run, and since rebalance can be triggered very often, this function can eat a lot of CPU time if there are many cpu cgroups in the system.

Although the situation was improved significantly by commit a35b646 ('sched, cgroup: Reduce rq->lock hold times for large cgroup hierarchies'), the problem can still arise under some kinds of load, e.g. when cpus switch from idle to busy and back very frequently.

For instance, when I start 1000 processes that wake up every millisecond on my 8-cpu host, 'top' and 'perf top' show:

Cpu(s): 17.8%us, 24.3%sy,  0.0%ni, 57.9%id,  0.0%wa,  0.0%hi,  0.0%si
Events: 243K cycles
  7.57%  [kernel]        [k] __schedule
  7.08%  [kernel]        [k] timerqueue_add
  6.13%  libc-2.12.so    [.] usleep

Then if I create 10000 *idle* cpu cgroups (no processes in them), cpu usage increases significantly although the 'wakers' are still executing in the root cpu cgroup:

Cpu(s): 19.1%us, 48.7%sy,  0.0%ni, 31.6%id,  0.0%wa,  0.0%hi,  0.7%si
Events: 230K cycles
 24.56%  [kernel]        [k] tg_load_down
  5.76%  [kernel]        [k] __schedule

This happens because this particular kind of load triggers the 'new idle' rebalance very frequently, which requires calling update_h_load(), which, in turn, calls tg_load_down() for every *idle* cpu cgroup even though that is useless, because idle cpu cgroups have no tasks to pull.

This patch improves the situation by performing the h_load calculation only when h_load is really needed. To achieve this, it replaces update_h_load() with update_cfs_rq_h_load(), which computes h_load only for a given cfs_rq and all its ascendants, and makes the load balancer call this function whenever it considers whether a task should be pulled, i.e. it moves the h_load calculation directly into task_h_load().

So that h_load of the same cfs_rq is not updated multiple times (in case several tasks in the same cgroup are considered during the same balance run), the patch keeps the time of the last h_load update for each cfs_rq and stops the calculation when it finds h_load to be up to date.

The benefit is that h_load is computed only for those cfs_rq's that really need it; in particular, all idle task groups are skipped. Although this in fact moves the h_load calculation under the rq lock, it should not affect latency much, because the amount of work done under the rq lock while trying to pull tasks is limited by sched_nr_migrate.

With the patch applied and the setup described above (1000 wakers in the root cgroup and 10000 idle cgroups), I get:

Cpu(s): 16.9%us, 24.8%sy,  0.0%ni, 58.4%id,  0.0%wa,  0.0%hi,  0.0%si
Events: 242K cycles
  7.57%  [kernel]        [k] __schedule
  6.70%  [kernel]        [k] timerqueue_add
  5.93%  libc-2.12.so    [.] usleep

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1373896159-1278-1-git-send-email-vdavydov@parallels.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
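[Editor's note] For reference, the per-level scaling the patch performs is h_load(child) = h_load(parent) * se->avg.load_avg_contrib / (parent->runnable_load_avg + 1), seeded at the root with rq->avg.load_avg_contrib. Below is a minimal, self-contained sketch of that two-pass walk (up to the first up-to-date ancestor, then back down propagating h_load). The sk_-prefixed types and the explicit group_se/root_load_avg_contrib/now parameters are simplified stand-ins, not the kernel API; the real implementation is in the fair.c hunk further down.

#include <stddef.h>

/* Simplified stand-ins for the kernel's cfs_rq and sched_entity. */
struct sk_sched_entity;

struct sk_cfs_rq {
	unsigned long h_load;                 /* hierarchical load factor */
	unsigned long runnable_load_avg;
	unsigned long last_h_load_update;     /* "jiffies" of the last update */
	struct sk_sched_entity *h_load_next;  /* breadcrumb for the down-walk */
};

struct sk_sched_entity {
	unsigned long load_avg_contrib;
	struct sk_sched_entity *parent;       /* group entity one level up, NULL at the top */
	struct sk_cfs_rq *cfs_rq;             /* run-queue this entity is queued on */
	struct sk_cfs_rq *my_q;               /* run-queue owned by this group entity */
};

static void sk_update_cfs_rq_h_load(struct sk_cfs_rq *cfs_rq,
				    struct sk_sched_entity *group_se,
				    unsigned long root_load_avg_contrib,
				    unsigned long now)
{
	struct sk_sched_entity *se = group_se;  /* entity representing cfs_rq */
	unsigned long load;

	if (cfs_rq->last_h_load_update == now)
		return;                         /* already computed this tick */

	/* Pass 1: walk up, recording in each ancestor which child to revisit. */
	cfs_rq->h_load_next = NULL;
	for (; se; se = se->parent) {
		cfs_rq = se->cfs_rq;
		cfs_rq->h_load_next = se;
		if (cfs_rq->last_h_load_update == now)
			break;                  /* everything above is still fresh */
	}

	/* Reached the top: the root's h_load is the cpu's load contribution. */
	if (!se) {
		cfs_rq->h_load = root_load_avg_contrib;
		cfs_rq->last_h_load_update = now;
	}

	/* Pass 2: walk back down, scaling h_load by each level's share. */
	while ((se = cfs_rq->h_load_next) != NULL) {
		load = cfs_rq->h_load;
		load = load * se->load_avg_contrib /
		       (cfs_rq->runnable_load_avg + 1);
		cfs_rq = se->my_q;
		cfs_rq->h_load = load;
		cfs_rq->last_h_load_update = now;
	}
}

Keeping the breadcrumb in h_load_next lets the down-walk reuse the path found on the way up without recursion, and the last_h_load_update stamp is what skips the work entirely when several tasks of the same cgroup are examined in one balance run.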
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/sched/fair.c	58
-rw-r--r--	kernel/sched/sched.h	7
2 files changed, 30 insertions(+), 35 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bb456f44b7b1..765d87acdf05 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4171,47 +4171,48 @@ static void update_blocked_averages(int cpu)
 }
 
 /*
- * Compute the cpu's hierarchical load factor for each task group.
+ * Compute the hierarchical load factor for cfs_rq and all its ascendants.
  * This needs to be done in a top-down fashion because the load of a child
  * group is a fraction of its parents load.
  */
-static int tg_load_down(struct task_group *tg, void *data)
+static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
 {
-	unsigned long load;
-	long cpu = (long)data;
-
-	if (!tg->parent) {
-		load = cpu_rq(cpu)->avg.load_avg_contrib;
-	} else {
-		load = tg->parent->cfs_rq[cpu]->h_load;
-		load = div64_ul(load * tg->se[cpu]->avg.load_avg_contrib,
-				tg->parent->cfs_rq[cpu]->runnable_load_avg + 1);
-	}
-
-	tg->cfs_rq[cpu]->h_load = load;
-
-	return 0;
-}
-
-static void update_h_load(long cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
+	struct rq *rq = rq_of(cfs_rq);
+	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
 	unsigned long now = jiffies;
+	unsigned long load;
 
-	if (rq->h_load_throttle == now)
+	if (cfs_rq->last_h_load_update == now)
 		return;
 
-	rq->h_load_throttle = now;
+	cfs_rq->h_load_next = NULL;
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+		cfs_rq->h_load_next = se;
+		if (cfs_rq->last_h_load_update == now)
+			break;
+	}
 
-	rcu_read_lock();
-	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
-	rcu_read_unlock();
+	if (!se) {
+		cfs_rq->h_load = rq->avg.load_avg_contrib;
+		cfs_rq->last_h_load_update = now;
+	}
+
+	while ((se = cfs_rq->h_load_next) != NULL) {
+		load = cfs_rq->h_load;
+		load = div64_ul(load * se->avg.load_avg_contrib,
+				cfs_rq->runnable_load_avg + 1);
+		cfs_rq = group_cfs_rq(se);
+		cfs_rq->h_load = load;
+		cfs_rq->last_h_load_update = now;
+	}
 }
 
 static unsigned long task_h_load(struct task_struct *p)
 {
 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
 
+	update_cfs_rq_h_load(cfs_rq);
 	return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
 			cfs_rq->runnable_load_avg + 1);
 }
@@ -4220,10 +4221,6 @@ static inline void update_blocked_averages(int cpu)
 {
 }
 
-static inline void update_h_load(long cpu)
-{
-}
-
 static unsigned long task_h_load(struct task_struct *p)
 {
 	return p->se.avg.load_avg_contrib;
@@ -5108,7 +5105,6 @@ redo:
 		env.src_rq = busiest;
 		env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
 
-		update_h_load(env.src_cpu);
 more_balance:
 		local_irq_save(flags);
 		double_rq_lock(env.dst_rq, busiest);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ef0a7b2439dd..5e129efb84ce 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -285,7 +285,6 @@ struct cfs_rq {
 	/* Required to track per-cpu representation of a task_group */
 	u32 tg_runnable_contrib;
 	unsigned long tg_load_contrib;
-#endif /* CONFIG_FAIR_GROUP_SCHED */
 
 	/*
 	 * h_load = weight * f(tg)
@@ -294,6 +293,9 @@ struct cfs_rq {
 	 * this group.
 	 */
 	unsigned long h_load;
+	u64 last_h_load_update;
+	struct sched_entity *h_load_next;
+#endif /* CONFIG_FAIR_GROUP_SCHED */
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -429,9 +431,6 @@ struct rq {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* list of leaf cfs_rq on this cpu: */
 	struct list_head leaf_cfs_rq_list;
-#ifdef CONFIG_SMP
-	unsigned long h_load_throttle;
-#endif /* CONFIG_SMP */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_RT_GROUP_SCHED