aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorPeter Zijlstra <a.p.zijlstra@chello.nl>2012-08-08 15:46:40 -0400
committerThomas Gleixner <tglx@linutronix.de>2012-08-13 12:41:54 -0400
commita35b6466aabb051568b844e8c63f87a356d3d129 (patch)
treea5d38cce8290a60f6729f97591cfa25d545c6474 /kernel
parentb9403130a5350fca59a50ed11c198cb8c7e54119 (diff)
sched, cgroup: Reduce rq->lock hold times for large cgroup hierarchies
Peter Portante reported that for large cgroup hierarchies (and or on large CPU counts) we get immense lock contention on rq->lock and stuff stops working properly. His workload was a ton of processes, each in their own cgroup, everybody idling except for a sporadic wakeup once every so often. It was found that: schedule() idle_balance() load_balance() local_irq_save() double_rq_lock() update_h_load() walk_tg_tree(tg_load_down) tg_load_down() Results in an entire cgroup hierarchy walk under rq->lock for every new-idle balance and since new-idle balance isn't throttled this results in a lot of work while holding the rq->lock. This patch does two things, it removes the work from under rq->lock based on the good principle of race and pray which is widely employed in the load-balancer as a whole. And secondly it throttles the update_h_load() calculation to max once per jiffy. I considered excluding update_h_load() for new-idle balance all-together, but purely relying on regular balance passes to update this data might not work out under some rare circumstances where the new-idle busiest isn't the regular busiest for a while (unlikely, but a nightmare to debug if someone hits it and suffers). Cc: pjt@google.com Cc: Larry Woodman <lwoodman@redhat.com> Cc: Mike Galbraith <efault@gmx.de> Reported-by: Peter Portante <pportant@redhat.com> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/n/tip-aaarrzfpnaam7pqrekofu8a6@git.kernel.org Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/sched/fair.c11
-rw-r--r--kernel/sched/sched.h6
2 files changed, 14 insertions, 3 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d0cc03b3e70..c219bf8d704 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3387,6 +3387,14 @@ static int tg_load_down(struct task_group *tg, void *data)
3387 3387
3388static void update_h_load(long cpu) 3388static void update_h_load(long cpu)
3389{ 3389{
3390 struct rq *rq = cpu_rq(cpu);
3391 unsigned long now = jiffies;
3392
3393 if (rq->h_load_throttle == now)
3394 return;
3395
3396 rq->h_load_throttle = now;
3397
3390 rcu_read_lock(); 3398 rcu_read_lock();
3391 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 3399 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
3392 rcu_read_unlock(); 3400 rcu_read_unlock();
@@ -4293,11 +4301,10 @@ redo:
4293 env.src_rq = busiest; 4301 env.src_rq = busiest;
4294 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); 4302 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
4295 4303
4304 update_h_load(env.src_cpu);
4296more_balance: 4305more_balance:
4297 local_irq_save(flags); 4306 local_irq_save(flags);
4298 double_rq_lock(this_rq, busiest); 4307 double_rq_lock(this_rq, busiest);
4299 if (!env.loop)
4300 update_h_load(env.src_cpu);
4301 4308
4302 /* 4309 /*
4303 * cur_ld_moved - load moved in current iteration 4310 * cur_ld_moved - load moved in current iteration
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c35a1a7dd4d..531411b1044 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -374,7 +374,11 @@ struct rq {
374#ifdef CONFIG_FAIR_GROUP_SCHED 374#ifdef CONFIG_FAIR_GROUP_SCHED
375 /* list of leaf cfs_rq on this cpu: */ 375 /* list of leaf cfs_rq on this cpu: */
376 struct list_head leaf_cfs_rq_list; 376 struct list_head leaf_cfs_rq_list;
377#endif 377#ifdef CONFIG_SMP
378 unsigned long h_load_throttle;
379#endif /* CONFIG_SMP */
380#endif /* CONFIG_FAIR_GROUP_SCHED */
381
378#ifdef CONFIG_RT_GROUP_SCHED 382#ifdef CONFIG_RT_GROUP_SCHED
379 struct list_head leaf_rt_rq_list; 383 struct list_head leaf_rt_rq_list;
380#endif 384#endif