| author | Peter Zijlstra <a.p.zijlstra@chello.nl> | 2012-08-08 15:46:40 -0400 |
|---|---|---|
| committer | Thomas Gleixner <tglx@linutronix.de> | 2012-08-13 12:41:54 -0400 |
| commit | a35b6466aabb051568b844e8c63f87a356d3d129 | |
| tree | a5d38cce8290a60f6729f97591cfa25d545c6474 | |
| parent | b9403130a5350fca59a50ed11c198cb8c7e54119 | |
sched, cgroup: Reduce rq->lock hold times for large cgroup hierarchies
Peter Portante reported that for large cgroup hierarchies (and/or on
large CPU counts) we get immense lock contention on rq->lock and stuff
stops working properly.
His workload was a ton of processes, each in their own cgroup,
everybody idling except for a sporadic wakeup once every so often.
It was found that:
```
  schedule()
    idle_balance()
      load_balance()
        local_irq_save()
        double_rq_lock()
        update_h_load()
          walk_tg_tree(tg_load_down)
            tg_load_down()
```
Results in an entire cgroup hierarchy walk under rq->lock for every
new-idle balance, and since new-idle balance isn't throttled, this adds
up to a lot of work while holding the rq->lock.
This patch does two things: it removes the work from under rq->lock,
based on the good principle of race-and-pray which is widely employed
in the load-balancer as a whole; and secondly it throttles the
update_h_load() calculation to at most once per jiffy.
I considered excluding update_h_load() for new-idle balance
altogether, but purely relying on regular balance passes to update
this data might not work out under some rare circumstances where the
new-idle busiest isn't the regular busiest for a while (unlikely, but
a nightmare to debug if someone hits it and suffers).
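As an aside, the throttle itself is just a timestamp compare. The sketch
below is a stand-alone userspace rendition of the pattern, not the kernel
code itself: a plain counter stands in for jiffies, struct rq is trimmed
down to the one new field, and a printf models the walk_tg_tree() walk.
Repeated calls within the same jiffy collapse into a single hierarchy walk.

```c
/*
 * Userspace sketch of the per-jiffy throttle added to update_h_load().
 * "jiffies" is a plain counter here; in the kernel it is the global tick
 * count and h_load_throttle lives in the real struct rq.
 */
#include <stdio.h>

static unsigned long jiffies;           /* stand-in for the kernel's jiffies */

struct rq {
	unsigned long h_load_throttle;  /* jiffy of the last recomputation */
};

static void expensive_hierarchy_walk(void)
{
	/* models walk_tg_tree(tg_load_down, ...) over the cgroup hierarchy */
	printf("recomputing h_load at jiffy %lu\n", jiffies);
}

static void update_h_load(struct rq *rq)
{
	unsigned long now = jiffies;

	if (rq->h_load_throttle == now) /* already updated this jiffy */
		return;
	rq->h_load_throttle = now;

	expensive_hierarchy_walk();
}

int main(void)
{
	/* initialize to a value jiffies can't start at, so the first call runs */
	struct rq rq = { .h_load_throttle = ~0UL };

	update_h_load(&rq);     /* walks */
	update_h_load(&rq);     /* throttled */
	update_h_load(&rq);     /* throttled */

	jiffies++;              /* next tick */
	update_h_load(&rq);     /* walks again */
	return 0;
}
```

In the real patch the field only exists when both CONFIG_FAIR_GROUP_SCHED
and CONFIG_SMP are set, and the update_h_load() call is moved in front of
the rq->lock section, which is the other half of the fix.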
Cc: pjt@google.com
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Reported-by: Peter Portante <pportant@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/n/tip-aaarrzfpnaam7pqrekofu8a6@git.kernel.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
```
 kernel/sched/fair.c  | 11 +++++++++--
 kernel/sched/sched.h |  6 +++++-
 2 files changed, 14 insertions(+), 3 deletions(-)
```
```diff
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d0cc03b3e70b..c219bf8d704c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3387,6 +3387,14 @@ static int tg_load_down(struct task_group *tg, void *data)
 
 static void update_h_load(long cpu)
 {
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long now = jiffies;
+
+	if (rq->h_load_throttle == now)
+		return;
+
+	rq->h_load_throttle = now;
+
 	rcu_read_lock();
 	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
 	rcu_read_unlock();
@@ -4293,11 +4301,10 @@ redo:
 	env.src_rq = busiest;
 	env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
 
+	update_h_load(env.src_cpu);
 more_balance:
 	local_irq_save(flags);
 	double_rq_lock(this_rq, busiest);
-	if (!env.loop)
-		update_h_load(env.src_cpu);
 
 	/*
 	 * cur_ld_moved - load moved in current iteration
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c35a1a7dd4d6..531411b1044e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -374,7 +374,11 @@ struct rq {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* list of leaf cfs_rq on this cpu: */
 	struct list_head leaf_cfs_rq_list;
-#endif
+#ifdef CONFIG_SMP
+	unsigned long h_load_throttle;
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
 #ifdef CONFIG_RT_GROUP_SCHED
 	struct list_head leaf_rt_rq_list;
 #endif
```
