author		Peter Zijlstra <a.p.zijlstra@chello.nl>	2010-11-15 18:47:01 -0500
committer	Ingo Molnar <mingo@elte.hu>	2010-11-18 07:27:47 -0500
commit		3d4b47b4b040c9d77dd68104cfc1055d89a55afd (patch)
tree		a4b39b5d7c89a319b81543c1b26778d6220e772b /kernel/sched.c
parent		2069dd75c7d0f49355939e5586daf5a9ab216db7 (diff)
sched: Implement on-demand (active) cfs_rq list
Make certain load-balance actions scale per number of active cgroups
instead of the number of existing cgroups.
This makes wakeup/sleep paths more expensive, but is a win for systems
where the vast majority of existing cgroups are idle.
Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20101115234937.666535048@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
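The other half of this scheme lives in sched_fair.c and is outside the kernel/sched.c-only view below: the enqueue path registers a cfs_rq on its cpu's leaf list the first time the queue gains load, and the new on_list flag keeps that registration idempotent. A rough sketch of what that helper pair presumably looks like, reconstructed around the list_del_leaf_cfs_rq() call visible in this diff (treat everything beyond on_list and leaf_cfs_rq_list as an assumption):

/*
 * Sketch only: the sched_fair.c half of this change is not shown in the
 * kernel/sched.c diff below, so these bodies are an assumed reconstruction
 * built around the new on_list flag and the list_del_leaf_cfs_rq() call
 * that does appear in the diff.
 */
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
        /* enqueue path: pay a flag test (and a rare insertion) per wakeup */
        if (!cfs_rq->on_list) {
                list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
                             &rq_of(cfs_rq)->leaf_cfs_rq_list);
                cfs_rq->on_list = 1;
        }
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
        /* teardown/idle path: only queues that registered are ever walked */
        if (cfs_rq->on_list) {
                list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
                cfs_rq->on_list = 0;
        }
}

That is the trade the commit message describes: wakeup/sleep pays a small extra check, while load balancing now walks only the groups that actually became active.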
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	105
1 file changed, 28 insertions, 77 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index e2f1a3024a99..22436dd2e19f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -274,9 +274,7 @@ struct task_group {
 
 #define root_task_group init_task_group
 
-/* task_group_lock serializes add/remove of task groups and also changes to
- * a task group's cpu shares.
- */
+/* task_group_lock serializes the addition/removal of task groups */
 static DEFINE_SPINLOCK(task_group_lock);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -344,6 +342,7 @@ struct cfs_rq {
         * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
         * list is used during load balance.
         */
+       int on_list;
        struct list_head leaf_cfs_rq_list;
        struct task_group *tg;  /* group that "owns" this runqueue */
 
@@ -1547,7 +1546,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
-static void update_cfs_load(struct cfs_rq *cfs_rq);
+static void update_cfs_load(struct cfs_rq *cfs_rq, int lb);
 static void update_cfs_shares(struct cfs_rq *cfs_rq);
 
 /*
@@ -1570,7 +1569,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
        raw_spin_lock_irqsave(&rq->lock, flags);
 
        update_rq_clock(rq);
-       update_cfs_load(cfs_rq);
+       update_cfs_load(cfs_rq, 1);
 
        load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
        load_avg -= cfs_rq->load_contribution;
@@ -7688,15 +7687,13 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
-                               struct sched_entity *se, int cpu, int add,
+                               struct sched_entity *se, int cpu,
                                struct sched_entity *parent)
 {
        struct rq *rq = cpu_rq(cpu);
        tg->cfs_rq[cpu] = cfs_rq;
        init_cfs_rq(cfs_rq, rq);
        cfs_rq->tg = tg;
-       if (add)
-               list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
 
        tg->se[cpu] = se;
        /* se could be NULL for init_task_group */
@@ -7716,7 +7713,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 
 #ifdef CONFIG_RT_GROUP_SCHED
 static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
-               struct sched_rt_entity *rt_se, int cpu, int add,
+               struct sched_rt_entity *rt_se, int cpu,
                struct sched_rt_entity *parent)
 {
        struct rq *rq = cpu_rq(cpu);
@@ -7725,8 +7722,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
        init_rt_rq(rt_rq, rq);
        rt_rq->tg = tg;
        rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
-       if (add)
-               list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
 
        tg->rt_se[cpu] = rt_se;
        if (!rt_se)
@@ -7835,7 +7830,7 @@ void __init sched_init(void)
                 * We achieve this by letting init_task_group's tasks sit
                 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
                 */
-               init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
+               init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, NULL);
 #endif
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
@@ -7843,7 +7838,7 @@ void __init sched_init(void)
 #ifdef CONFIG_RT_GROUP_SCHED
                INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
 #ifdef CONFIG_CGROUP_SCHED
-               init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
+               init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, NULL);
 #endif
 #endif
 
@@ -8119,7 +8114,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
                if (!se)
                        goto err_free_rq;
 
-               init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
+               init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
        }
 
        return 1;
@@ -8130,15 +8125,22 @@ err:
        return 0;
 }
 
-static inline void register_fair_sched_group(struct task_group *tg, int cpu)
-{
-       list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
-                       &cpu_rq(cpu)->leaf_cfs_rq_list);
-}
-
 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
 {
-       list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
+       struct rq *rq = cpu_rq(cpu);
+       unsigned long flags;
+       int i;
+
+       /*
+        * Only empty task groups can be destroyed; so we can speculatively
+        * check on_list without danger of it being re-added.
+        */
+       if (!tg->cfs_rq[cpu]->on_list)
+               return;
+
+       raw_spin_lock_irqsave(&rq->lock, flags);
+       list_del_leaf_cfs_rq(tg->cfs_rq[i]);
+       raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 #else /* !CONFG_FAIR_GROUP_SCHED */
 static inline void free_fair_sched_group(struct task_group *tg)
@@ -8151,10 +8153,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
        return 1;
 }
 
-static inline void register_fair_sched_group(struct task_group *tg, int cpu)
-{
-}
-
 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
 {
 }
@@ -8209,7 +8207,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
                if (!rt_se)
                        goto err_free_rq;
 
-               init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
+               init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
        }
 
        return 1;
@@ -8219,17 +8217,6 @@ err_free_rq:
 err:
        return 0;
 }
-
-static inline void register_rt_sched_group(struct task_group *tg, int cpu)
-{
-       list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
-                       &cpu_rq(cpu)->leaf_rt_rq_list);
-}
-
-static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
-{
-       list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
-}
 #else /* !CONFIG_RT_GROUP_SCHED */
 static inline void free_rt_sched_group(struct task_group *tg)
 {
@@ -8240,14 +8227,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 {
        return 1;
 }
-
-static inline void register_rt_sched_group(struct task_group *tg, int cpu)
-{
-}
-
-static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
-{
-}
 #endif /* CONFIG_RT_GROUP_SCHED */
 
 #ifdef CONFIG_CGROUP_SCHED
@@ -8263,7 +8242,6 @@ struct task_group *sched_create_group(struct task_group *parent)
 {
        struct task_group *tg;
        unsigned long flags;
-       int i;
 
        tg = kzalloc(sizeof(*tg), GFP_KERNEL);
        if (!tg)
@@ -8276,10 +8254,6 @@ struct task_group *sched_create_group(struct task_group *parent)
                goto err;
 
        spin_lock_irqsave(&task_group_lock, flags);
-       for_each_possible_cpu(i) {
-               register_fair_sched_group(tg, i);
-               register_rt_sched_group(tg, i);
-       }
        list_add_rcu(&tg->list, &task_groups);
 
        WARN_ON(!parent);       /* root should already exist */
@@ -8309,11 +8283,11 @@ void sched_destroy_group(struct task_group *tg)
        unsigned long flags;
        int i;
 
-       spin_lock_irqsave(&task_group_lock, flags);
-       for_each_possible_cpu(i) {
+       /* end participation in shares distribution */
+       for_each_possible_cpu(i)
                unregister_fair_sched_group(tg, i);
-               unregister_rt_sched_group(tg, i);
-       }
+
+       spin_lock_irqsave(&task_group_lock, flags);
        list_del_rcu(&tg->list);
        list_del_rcu(&tg->siblings);
        spin_unlock_irqrestore(&task_group_lock, flags);
@@ -8391,7 +8365,6 @@ static DEFINE_MUTEX(shares_mutex);
 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 {
        int i;
-       unsigned long flags;
 
        /*
         * We can't change the weight of the root cgroup.
@@ -8408,19 +8381,6 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
        if (tg->shares == shares)
                goto done;
 
-       spin_lock_irqsave(&task_group_lock, flags);
-       for_each_possible_cpu(i)
-               unregister_fair_sched_group(tg, i);
-       list_del_rcu(&tg->siblings);
-       spin_unlock_irqrestore(&task_group_lock, flags);
-
-       /* wait for any ongoing reference to this group to finish */
-       synchronize_sched();
-
-       /*
-        * Now we are free to modify the group's share on each cpu
-        * w/o tripping rebalance_share or load_balance_fair.
-        */
        tg->shares = shares;
        for_each_possible_cpu(i) {
                /*
@@ -8429,15 +8389,6 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
                set_se_shares(tg->se[i], shares);
        }
 
-       /*
-        * Enable load balance activity on this group, by inserting it back on
-        * each cpu's rq->leaf_cfs_rq_list.
-        */
-       spin_lock_irqsave(&task_group_lock, flags);
-       for_each_possible_cpu(i)
-               register_fair_sched_group(tg, i);
-       list_add_rcu(&tg->siblings, &tg->parent->children);
-       spin_unlock_irqrestore(&task_group_lock, flags);
 done:
        mutex_unlock(&shares_mutex);
        return 0;
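To see the scaling claim in isolation, the same pattern can be modelled entirely in userspace. The toy below is self-contained and uses hypothetical names throughout (it is not kernel code): only groups that ever enqueue load appear on the list the balancer walks.

/*
 * Toy, self-contained model of the on-demand leaf list (not kernel code):
 * a queue joins the per-cpu list only when it first gains load, so the
 * balancer walk scales with the number of *active* groups.
 */
#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *prev, *next; };

static void list_init(struct list_head *h) { h->prev = h->next = h; }

static void list_add_tail(struct list_head *n, struct list_head *h)
{
        n->prev = h->prev;
        n->next = h;
        h->prev->next = n;
        h->prev = n;
}

static void list_del_entry(struct list_head *n)
{
        n->prev->next = n->next;
        n->next->prev = n->prev;
        n->prev = n->next = n;
}

struct cfs_rq {
        const char *name;
        unsigned long load;
        int on_list;                            /* already registered? */
        struct list_head leaf_cfs_rq_list;
};

static struct list_head leaf_cfs_rq_list;       /* the per-"cpu" active list */

static void enqueue(struct cfs_rq *cfs_rq, unsigned long load)
{
        cfs_rq->load += load;
        if (!cfs_rq->on_list) {                 /* register on first use only */
                list_add_tail(&cfs_rq->leaf_cfs_rq_list, &leaf_cfs_rq_list);
                cfs_rq->on_list = 1;
        }
}

static void unregister(struct cfs_rq *cfs_rq)
{
        if (cfs_rq->on_list) {
                list_del_entry(&cfs_rq->leaf_cfs_rq_list);
                cfs_rq->on_list = 0;
        }
}

int main(void)
{
        struct cfs_rq a = { "groupA" }, b = { "groupB" }, c = { "groupC" };
        struct list_head *pos;

        list_init(&leaf_cfs_rq_list);
        (void)b;        /* groupB exists but never runs anything */

        enqueue(&a, 1024);
        enqueue(&c, 2048);

        /* "load balance": walk only the registered (active) queues */
        for (pos = leaf_cfs_rq_list.next; pos != &leaf_cfs_rq_list; pos = pos->next) {
                struct cfs_rq *q = (struct cfs_rq *)((char *)pos -
                                offsetof(struct cfs_rq, leaf_cfs_rq_list));
                printf("balancing %s (load %lu)\n", q->name, q->load);
        }

        unregister(&a);
        unregister(&c);
        return 0;
}

Built with a plain cc invocation it prints groupA and groupC only; groupB never registers, which is exactly the property that makes this a win when most cgroups sit idle.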