diff options
author | Peter Zijlstra <a.p.zijlstra@chello.nl> | 2011-07-15 04:35:52 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2011-07-20 12:32:41 -0400 |
commit | e3589f6c81e4764d32a25d2a2a0afe54fa344f5c (patch) | |
tree | 414bf6bdbad3f04f629fa2a72254ea85acf723f4 | |
parent | 9c3f75cbd144014bea6af866a154cc2e73ab2287 (diff) |
sched: Allow for overlapping sched_domain spans
Allow for sched_domain spans that overlap by giving such domains their
own sched_group list instead of sharing the sched_groups amongst
each other.
This is needed for machines with more than 16 nodes, because
sched_domain_node_span() will generate a node mask from the
16 nearest nodes without regard to whether these masks overlap.
Currently sched_domains have a sched_group that maps to their child
sched_domain span, and since there is no overlap we share the
sched_group between the sched_domains of the various CPUs. If, however,
there is overlap, we would need to link the sched_group list in
different ways for each CPU, and hence sharing isn't possible.
In order to solve this, allocate private sched_groups for each CPU's
sched_domain but have the sched_groups share a sched_group_power
structure such that we can uniquely track the power.
Reported-and-tested-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/n/tip-08bxqw9wis3qti9u5inifh3y@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r-- | include/linux/sched.h | 2 | ||||
-rw-r--r-- | kernel/sched.c | 157 | ||||
-rw-r--r-- | kernel/sched_features.h | 2 |
3 files changed, 132 insertions, 29 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h index 2e5b3c8e2d3e..bde99d5358dc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -844,6 +844,7 @@ enum cpu_idle_type { | |||
844 | #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ | 844 | #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ |
845 | #define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ | 845 | #define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ |
846 | #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ | 846 | #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ |
847 | #define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ | ||
847 | 848 | ||
848 | enum powersavings_balance_level { | 849 | enum powersavings_balance_level { |
849 | POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */ | 850 | POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */ |
@@ -894,6 +895,7 @@ static inline int sd_power_saving_flags(void) | |||
894 | } | 895 | } |
895 | 896 | ||
896 | struct sched_group_power { | 897 | struct sched_group_power { |
898 | atomic_t ref; | ||
897 | /* | 899 | /* |
898 | * CPU power of this group, SCHED_LOAD_SCALE being max power for a | 900 | * CPU power of this group, SCHED_LOAD_SCALE being max power for a |
899 | * single CPU. | 901 | * single CPU. |
diff --git a/kernel/sched.c b/kernel/sched.c index 36c10d25d4cd..921adf6f6fad 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -6774,10 +6774,36 @@ static struct root_domain *alloc_rootdomain(void) | |||
6774 | return rd; | 6774 | return rd; |
6775 | } | 6775 | } |
6776 | 6776 | ||
6777 | static void free_sched_groups(struct sched_group *sg, int free_sgp) | ||
6778 | { | ||
6779 | struct sched_group *tmp, *first; | ||
6780 | |||
6781 | if (!sg) | ||
6782 | return; | ||
6783 | |||
6784 | first = sg; | ||
6785 | do { | ||
6786 | tmp = sg->next; | ||
6787 | |||
6788 | if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) | ||
6789 | kfree(sg->sgp); | ||
6790 | |||
6791 | kfree(sg); | ||
6792 | sg = tmp; | ||
6793 | } while (sg != first); | ||
6794 | } | ||
6795 | |||
6777 | static void free_sched_domain(struct rcu_head *rcu) | 6796 | static void free_sched_domain(struct rcu_head *rcu) |
6778 | { | 6797 | { |
6779 | struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); | 6798 | struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); |
6780 | if (atomic_dec_and_test(&sd->groups->ref)) { | 6799 | |
6800 | /* | ||
6801 | * If it's an overlapping domain it has private groups, iterate and | ||
6802 | * nuke them all. | ||
6803 | */ | ||
6804 | if (sd->flags & SD_OVERLAP) { | ||
6805 | free_sched_groups(sd->groups, 1); | ||
6806 | } else if (atomic_dec_and_test(&sd->groups->ref)) { | ||
6781 | kfree(sd->groups->sgp); | 6807 | kfree(sd->groups->sgp); |
6782 | kfree(sd->groups); | 6808 | kfree(sd->groups); |
6783 | } | 6809 | } |
@@ -6967,15 +6993,73 @@ struct sched_domain_topology_level; | |||
6967 | typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); | 6993 | typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); |
6968 | typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); | 6994 | typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); |
6969 | 6995 | ||
6996 | #define SDTL_OVERLAP 0x01 | ||
6997 | |||
6970 | struct sched_domain_topology_level { | 6998 | struct sched_domain_topology_level { |
6971 | sched_domain_init_f init; | 6999 | sched_domain_init_f init; |
6972 | sched_domain_mask_f mask; | 7000 | sched_domain_mask_f mask; |
7001 | int flags; | ||
6973 | struct sd_data data; | 7002 | struct sd_data data; |
6974 | }; | 7003 | }; |
6975 | 7004 | ||
6976 | /* | 7005 | static int |
6977 | * Assumes the sched_domain tree is fully constructed | 7006 | build_overlap_sched_groups(struct sched_domain *sd, int cpu) |
6978 | */ | 7007 | { |
7008 | struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; | ||
7009 | const struct cpumask *span = sched_domain_span(sd); | ||
7010 | struct cpumask *covered = sched_domains_tmpmask; | ||
7011 | struct sd_data *sdd = sd->private; | ||
7012 | struct sched_domain *child; | ||
7013 | int i; | ||
7014 | |||
7015 | cpumask_clear(covered); | ||
7016 | |||
7017 | for_each_cpu(i, span) { | ||
7018 | struct cpumask *sg_span; | ||
7019 | |||
7020 | if (cpumask_test_cpu(i, covered)) | ||
7021 | continue; | ||
7022 | |||
7023 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
7024 | GFP_KERNEL, cpu_to_node(i)); | ||
7025 | |||
7026 | if (!sg) | ||
7027 | goto fail; | ||
7028 | |||
7029 | sg_span = sched_group_cpus(sg); | ||
7030 | |||
7031 | child = *per_cpu_ptr(sdd->sd, i); | ||
7032 | if (child->child) { | ||
7033 | child = child->child; | ||
7034 | cpumask_copy(sg_span, sched_domain_span(child)); | ||
7035 | } else | ||
7036 | cpumask_set_cpu(i, sg_span); | ||
7037 | |||
7038 | cpumask_or(covered, covered, sg_span); | ||
7039 | |||
7040 | sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); | ||
7041 | atomic_inc(&sg->sgp->ref); | ||
7042 | |||
7043 | if (cpumask_test_cpu(cpu, sg_span)) | ||
7044 | groups = sg; | ||
7045 | |||
7046 | if (!first) | ||
7047 | first = sg; | ||
7048 | if (last) | ||
7049 | last->next = sg; | ||
7050 | last = sg; | ||
7051 | last->next = first; | ||
7052 | } | ||
7053 | sd->groups = groups; | ||
7054 | |||
7055 | return 0; | ||
7056 | |||
7057 | fail: | ||
7058 | free_sched_groups(first, 0); | ||
7059 | |||
7060 | return -ENOMEM; | ||
7061 | } | ||
7062 | |||
6979 | static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) | 7063 | static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) |
6980 | { | 7064 | { |
6981 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); | 7065 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); |
@@ -6987,23 +7071,21 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) | |||
6987 | if (sg) { | 7071 | if (sg) { |
6988 | *sg = *per_cpu_ptr(sdd->sg, cpu); | 7072 | *sg = *per_cpu_ptr(sdd->sg, cpu); |
6989 | (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); | 7073 | (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); |
7074 | atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ | ||
6990 | } | 7075 | } |
6991 | 7076 | ||
6992 | return cpu; | 7077 | return cpu; |
6993 | } | 7078 | } |
6994 | 7079 | ||
6995 | /* | 7080 | /* |
6996 | * build_sched_groups takes the cpumask we wish to span, and a pointer | ||
6997 | * to a function which identifies what group(along with sched group) a CPU | ||
6998 | * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids | ||
6999 | * (due to the fact that we keep track of groups covered with a struct cpumask). | ||
7000 | * | ||
7001 | * build_sched_groups will build a circular linked list of the groups | 7081 | * build_sched_groups will build a circular linked list of the groups |
7002 | * covered by the given span, and will set each group's ->cpumask correctly, | 7082 | * covered by the given span, and will set each group's ->cpumask correctly, |
7003 | * and ->cpu_power to 0. | 7083 | * and ->cpu_power to 0. |
7084 | * | ||
7085 | * Assumes the sched_domain tree is fully constructed | ||
7004 | */ | 7086 | */ |
7005 | static void | 7087 | static int |
7006 | build_sched_groups(struct sched_domain *sd) | 7088 | build_sched_groups(struct sched_domain *sd, int cpu) |
7007 | { | 7089 | { |
7008 | struct sched_group *first = NULL, *last = NULL; | 7090 | struct sched_group *first = NULL, *last = NULL; |
7009 | struct sd_data *sdd = sd->private; | 7091 | struct sd_data *sdd = sd->private; |
@@ -7011,6 +7093,12 @@ build_sched_groups(struct sched_domain *sd) | |||
7011 | struct cpumask *covered; | 7093 | struct cpumask *covered; |
7012 | int i; | 7094 | int i; |
7013 | 7095 | ||
7096 | get_group(cpu, sdd, &sd->groups); | ||
7097 | atomic_inc(&sd->groups->ref); | ||
7098 | |||
7099 | if (cpu != cpumask_first(sched_domain_span(sd))) | ||
7100 | return 0; | ||
7101 | |||
7014 | lockdep_assert_held(&sched_domains_mutex); | 7102 | lockdep_assert_held(&sched_domains_mutex); |
7015 | covered = sched_domains_tmpmask; | 7103 | covered = sched_domains_tmpmask; |
7016 | 7104 | ||
@@ -7042,6 +7130,8 @@ build_sched_groups(struct sched_domain *sd) | |||
7042 | last = sg; | 7130 | last = sg; |
7043 | } | 7131 | } |
7044 | last->next = first; | 7132 | last->next = first; |
7133 | |||
7134 | return 0; | ||
7045 | } | 7135 | } |
7046 | 7136 | ||
7047 | /* | 7137 | /* |
@@ -7056,12 +7146,17 @@ build_sched_groups(struct sched_domain *sd) | |||
7056 | */ | 7146 | */ |
7057 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) | 7147 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) |
7058 | { | 7148 | { |
7059 | WARN_ON(!sd || !sd->groups); | 7149 | struct sched_group *sg = sd->groups; |
7060 | 7150 | ||
7061 | if (cpu != group_first_cpu(sd->groups)) | 7151 | WARN_ON(!sd || !sg); |
7062 | return; | 7152 | |
7153 | do { | ||
7154 | sg->group_weight = cpumask_weight(sched_group_cpus(sg)); | ||
7155 | sg = sg->next; | ||
7156 | } while (sg != sd->groups); | ||
7063 | 7157 | ||
7064 | sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); | 7158 | if (cpu != group_first_cpu(sg)) |
7159 | return; | ||
7065 | 7160 | ||
7066 | update_group_power(sd, cpu); | 7161 | update_group_power(sd, cpu); |
7067 | } | 7162 | } |
@@ -7182,16 +7277,15 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, | |||
7182 | static void claim_allocations(int cpu, struct sched_domain *sd) | 7277 | static void claim_allocations(int cpu, struct sched_domain *sd) |
7183 | { | 7278 | { |
7184 | struct sd_data *sdd = sd->private; | 7279 | struct sd_data *sdd = sd->private; |
7185 | struct sched_group *sg = sd->groups; | ||
7186 | 7280 | ||
7187 | WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); | 7281 | WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); |
7188 | *per_cpu_ptr(sdd->sd, cpu) = NULL; | 7282 | *per_cpu_ptr(sdd->sd, cpu) = NULL; |
7189 | 7283 | ||
7190 | if (cpu == cpumask_first(sched_group_cpus(sg))) { | 7284 | if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) |
7191 | WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg); | ||
7192 | *per_cpu_ptr(sdd->sg, cpu) = NULL; | 7285 | *per_cpu_ptr(sdd->sg, cpu) = NULL; |
7286 | |||
7287 | if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) | ||
7193 | *per_cpu_ptr(sdd->sgp, cpu) = NULL; | 7288 | *per_cpu_ptr(sdd->sgp, cpu) = NULL; |
7194 | } | ||
7195 | } | 7289 | } |
7196 | 7290 | ||
7197 | #ifdef CONFIG_SCHED_SMT | 7291 | #ifdef CONFIG_SCHED_SMT |
@@ -7216,7 +7310,7 @@ static struct sched_domain_topology_level default_topology[] = { | |||
7216 | #endif | 7310 | #endif |
7217 | { sd_init_CPU, cpu_cpu_mask, }, | 7311 | { sd_init_CPU, cpu_cpu_mask, }, |
7218 | #ifdef CONFIG_NUMA | 7312 | #ifdef CONFIG_NUMA |
7219 | { sd_init_NODE, cpu_node_mask, }, | 7313 | { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, }, |
7220 | { sd_init_ALLNODES, cpu_allnodes_mask, }, | 7314 | { sd_init_ALLNODES, cpu_allnodes_mask, }, |
7221 | #endif | 7315 | #endif |
7222 | { NULL, }, | 7316 | { NULL, }, |
@@ -7284,7 +7378,9 @@ static void __sdt_free(const struct cpumask *cpu_map) | |||
7284 | struct sd_data *sdd = &tl->data; | 7378 | struct sd_data *sdd = &tl->data; |
7285 | 7379 | ||
7286 | for_each_cpu(j, cpu_map) { | 7380 | for_each_cpu(j, cpu_map) { |
7287 | kfree(*per_cpu_ptr(sdd->sd, j)); | 7381 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); |
7382 | if (sd && (sd->flags & SD_OVERLAP)) | ||
7383 | free_sched_groups(sd->groups, 0); | ||
7288 | kfree(*per_cpu_ptr(sdd->sg, j)); | 7384 | kfree(*per_cpu_ptr(sdd->sg, j)); |
7289 | kfree(*per_cpu_ptr(sdd->sgp, j)); | 7385 | kfree(*per_cpu_ptr(sdd->sgp, j)); |
7290 | } | 7386 | } |
@@ -7336,8 +7432,11 @@ static int build_sched_domains(const struct cpumask *cpu_map, | |||
7336 | struct sched_domain_topology_level *tl; | 7432 | struct sched_domain_topology_level *tl; |
7337 | 7433 | ||
7338 | sd = NULL; | 7434 | sd = NULL; |
7339 | for (tl = sched_domain_topology; tl->init; tl++) | 7435 | for (tl = sched_domain_topology; tl->init; tl++) { |
7340 | sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); | 7436 | sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); |
7437 | if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) | ||
7438 | sd->flags |= SD_OVERLAP; | ||
7439 | } | ||
7341 | 7440 | ||
7342 | while (sd->child) | 7441 | while (sd->child) |
7343 | sd = sd->child; | 7442 | sd = sd->child; |
@@ -7349,13 +7448,13 @@ static int build_sched_domains(const struct cpumask *cpu_map, | |||
7349 | for_each_cpu(i, cpu_map) { | 7448 | for_each_cpu(i, cpu_map) { |
7350 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { | 7449 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { |
7351 | sd->span_weight = cpumask_weight(sched_domain_span(sd)); | 7450 | sd->span_weight = cpumask_weight(sched_domain_span(sd)); |
7352 | get_group(i, sd->private, &sd->groups); | 7451 | if (sd->flags & SD_OVERLAP) { |
7353 | atomic_inc(&sd->groups->ref); | 7452 | if (build_overlap_sched_groups(sd, i)) |
7354 | 7453 | goto error; | |
7355 | if (i != cpumask_first(sched_domain_span(sd))) | 7454 | } else { |
7356 | continue; | 7455 | if (build_sched_groups(sd, i)) |
7357 | 7456 | goto error; | |
7358 | build_sched_groups(sd); | 7457 | } |
7359 | } | 7458 | } |
7360 | } | 7459 | } |
7361 | 7460 | ||
diff --git a/kernel/sched_features.h b/kernel/sched_features.h index be40f7371ee1..1e7066d76c26 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h | |||
@@ -70,3 +70,5 @@ SCHED_FEAT(NONIRQ_POWER, 1) | |||
70 | * using the scheduler IPI. Reduces rq->lock contention/bounces. | 70 | * using the scheduler IPI. Reduces rq->lock contention/bounces. |
71 | */ | 71 | */ |
72 | SCHED_FEAT(TTWU_QUEUE, 1) | 72 | SCHED_FEAT(TTWU_QUEUE, 1) |
73 | |||
74 | SCHED_FEAT(FORCE_SD_OVERLAP, 0) | ||