diff options
author | Peter Zijlstra <a.p.zijlstra@chello.nl> | 2011-04-07 08:09:45 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2011-04-11 06:58:17 -0400 |
commit | cd4ea6ae3982f6861da3b510e69cbc194f331d83 (patch) | |
tree | 7ca7e19bf9be55102768f7c1ab2bd76643e6524b /kernel/sched.c | |
parent | a06dadbec5c5df0bf3a35f33616f67d10ca9ba28 (diff) |
sched: Change NODE sched_domain group creation
The NODE sched_domain is 'special' in that it allocates sched_groups
per CPU, instead of sharing the sched_groups between all CPUs.
While this might have some benefits on large NUMA systems and avoid remote
memory accesses when iterating the sched_groups, it breaks
current code that assumes sched_groups are shared between all
sched_domains (an assumption made since the dynamic cpu_power patches).
So refactor the NODE groups to behave like all other groups.
(The ALLNODES domain, by contrast, already shared its groups across the
CPUs for some reason.)
If someone does measure a performance decrease due to this change, we
need to revisit this and come up with another way to have both dynamic
cpu_power and NUMA work nicely together.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/20110407122941.978111700@chello.nl
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/sched.c')
-rw-r--r-- | kernel/sched.c | 229 |
1 files changed, 32 insertions, 197 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index e3818f1b98fe..72d561fa67b7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6861,29 +6861,18 @@ struct static_sched_domain { | |||
6861 | struct s_data { | 6861 | struct s_data { |
6862 | #ifdef CONFIG_NUMA | 6862 | #ifdef CONFIG_NUMA |
6863 | int sd_allnodes; | 6863 | int sd_allnodes; |
6864 | cpumask_var_t domainspan; | ||
6865 | cpumask_var_t covered; | ||
6866 | cpumask_var_t notcovered; | ||
6867 | #endif | 6864 | #endif |
6868 | cpumask_var_t nodemask; | 6865 | cpumask_var_t nodemask; |
6869 | cpumask_var_t send_covered; | 6866 | cpumask_var_t send_covered; |
6870 | cpumask_var_t tmpmask; | 6867 | cpumask_var_t tmpmask; |
6871 | struct sched_group **sched_group_nodes; | ||
6872 | struct root_domain *rd; | 6868 | struct root_domain *rd; |
6873 | }; | 6869 | }; |
6874 | 6870 | ||
6875 | enum s_alloc { | 6871 | enum s_alloc { |
6876 | sa_sched_groups = 0, | ||
6877 | sa_rootdomain, | 6872 | sa_rootdomain, |
6878 | sa_tmpmask, | 6873 | sa_tmpmask, |
6879 | sa_send_covered, | 6874 | sa_send_covered, |
6880 | sa_nodemask, | 6875 | sa_nodemask, |
6881 | sa_sched_group_nodes, | ||
6882 | #ifdef CONFIG_NUMA | ||
6883 | sa_notcovered, | ||
6884 | sa_covered, | ||
6885 | sa_domainspan, | ||
6886 | #endif | ||
6887 | sa_none, | 6876 | sa_none, |
6888 | }; | 6877 | }; |
6889 | 6878 | ||
@@ -6979,18 +6968,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, | |||
6979 | } | 6968 | } |
6980 | 6969 | ||
6981 | #ifdef CONFIG_NUMA | 6970 | #ifdef CONFIG_NUMA |
6982 | /* | ||
6983 | * The init_sched_build_groups can't handle what we want to do with node | ||
6984 | * groups, so roll our own. Now each node has its own list of groups which | ||
6985 | * gets dynamically allocated. | ||
6986 | */ | ||
6987 | static DEFINE_PER_CPU(struct static_sched_domain, node_domains); | 6971 | static DEFINE_PER_CPU(struct static_sched_domain, node_domains); |
6988 | static struct sched_group ***sched_group_nodes_bycpu; | 6972 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_node); |
6989 | |||
6990 | static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); | ||
6991 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); | ||
6992 | 6973 | ||
6993 | static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, | 6974 | static int cpu_to_node_group(int cpu, const struct cpumask *cpu_map, |
6994 | struct sched_group **sg, | 6975 | struct sched_group **sg, |
6995 | struct cpumask *nodemask) | 6976 | struct cpumask *nodemask) |
6996 | { | 6977 | { |
@@ -7000,142 +6981,27 @@ static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, | |||
7000 | group = cpumask_first(nodemask); | 6981 | group = cpumask_first(nodemask); |
7001 | 6982 | ||
7002 | if (sg) | 6983 | if (sg) |
7003 | *sg = &per_cpu(sched_group_allnodes, group).sg; | 6984 | *sg = &per_cpu(sched_group_node, group).sg; |
7004 | return group; | 6985 | return group; |
7005 | } | 6986 | } |
7006 | 6987 | ||
7007 | static void init_numa_sched_groups_power(struct sched_group *group_head) | 6988 | static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); |
7008 | { | 6989 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); |
7009 | struct sched_group *sg = group_head; | ||
7010 | int j; | ||
7011 | |||
7012 | if (!sg) | ||
7013 | return; | ||
7014 | do { | ||
7015 | for_each_cpu(j, sched_group_cpus(sg)) { | ||
7016 | struct sched_domain *sd; | ||
7017 | |||
7018 | sd = &per_cpu(phys_domains, j).sd; | ||
7019 | if (j != group_first_cpu(sd->groups)) { | ||
7020 | /* | ||
7021 | * Only add "power" once for each | ||
7022 | * physical package. | ||
7023 | */ | ||
7024 | continue; | ||
7025 | } | ||
7026 | |||
7027 | sg->cpu_power += sd->groups->cpu_power; | ||
7028 | } | ||
7029 | sg = sg->next; | ||
7030 | } while (sg != group_head); | ||
7031 | } | ||
7032 | 6990 | ||
7033 | static int build_numa_sched_groups(struct s_data *d, | 6991 | static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, |
7034 | const struct cpumask *cpu_map, int num) | 6992 | struct sched_group **sg, |
6993 | struct cpumask *nodemask) | ||
7035 | { | 6994 | { |
7036 | struct sched_domain *sd; | 6995 | int group; |
7037 | struct sched_group *sg, *prev; | ||
7038 | int n, j; | ||
7039 | |||
7040 | cpumask_clear(d->covered); | ||
7041 | cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); | ||
7042 | if (cpumask_empty(d->nodemask)) { | ||
7043 | d->sched_group_nodes[num] = NULL; | ||
7044 | goto out; | ||
7045 | } | ||
7046 | |||
7047 | sched_domain_node_span(num, d->domainspan); | ||
7048 | cpumask_and(d->domainspan, d->domainspan, cpu_map); | ||
7049 | |||
7050 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
7051 | GFP_KERNEL, num); | ||
7052 | if (!sg) { | ||
7053 | printk(KERN_WARNING "Can not alloc domain group for node %d\n", | ||
7054 | num); | ||
7055 | return -ENOMEM; | ||
7056 | } | ||
7057 | d->sched_group_nodes[num] = sg; | ||
7058 | |||
7059 | for_each_cpu(j, d->nodemask) { | ||
7060 | sd = &per_cpu(node_domains, j).sd; | ||
7061 | sd->groups = sg; | ||
7062 | } | ||
7063 | 6996 | ||
7064 | sg->cpu_power = 0; | 6997 | cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); |
7065 | cpumask_copy(sched_group_cpus(sg), d->nodemask); | 6998 | group = cpumask_first(nodemask); |
7066 | sg->next = sg; | ||
7067 | cpumask_or(d->covered, d->covered, d->nodemask); | ||
7068 | 6999 | ||
7069 | prev = sg; | 7000 | if (sg) |
7070 | for (j = 0; j < nr_node_ids; j++) { | 7001 | *sg = &per_cpu(sched_group_allnodes, group).sg; |
7071 | n = (num + j) % nr_node_ids; | 7002 | return group; |
7072 | cpumask_complement(d->notcovered, d->covered); | ||
7073 | cpumask_and(d->tmpmask, d->notcovered, cpu_map); | ||
7074 | cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); | ||
7075 | if (cpumask_empty(d->tmpmask)) | ||
7076 | break; | ||
7077 | cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); | ||
7078 | if (cpumask_empty(d->tmpmask)) | ||
7079 | continue; | ||
7080 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
7081 | GFP_KERNEL, num); | ||
7082 | if (!sg) { | ||
7083 | printk(KERN_WARNING | ||
7084 | "Can not alloc domain group for node %d\n", j); | ||
7085 | return -ENOMEM; | ||
7086 | } | ||
7087 | sg->cpu_power = 0; | ||
7088 | cpumask_copy(sched_group_cpus(sg), d->tmpmask); | ||
7089 | sg->next = prev->next; | ||
7090 | cpumask_or(d->covered, d->covered, d->tmpmask); | ||
7091 | prev->next = sg; | ||
7092 | prev = sg; | ||
7093 | } | ||
7094 | out: | ||
7095 | return 0; | ||
7096 | } | 7003 | } |
7097 | #endif /* CONFIG_NUMA */ | ||
7098 | |||
7099 | #ifdef CONFIG_NUMA | ||
7100 | /* Free memory allocated for various sched_group structures */ | ||
7101 | static void free_sched_groups(const struct cpumask *cpu_map, | ||
7102 | struct cpumask *nodemask) | ||
7103 | { | ||
7104 | int cpu, i; | ||
7105 | 7004 | ||
7106 | for_each_cpu(cpu, cpu_map) { | ||
7107 | struct sched_group **sched_group_nodes | ||
7108 | = sched_group_nodes_bycpu[cpu]; | ||
7109 | |||
7110 | if (!sched_group_nodes) | ||
7111 | continue; | ||
7112 | |||
7113 | for (i = 0; i < nr_node_ids; i++) { | ||
7114 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | ||
7115 | |||
7116 | cpumask_and(nodemask, cpumask_of_node(i), cpu_map); | ||
7117 | if (cpumask_empty(nodemask)) | ||
7118 | continue; | ||
7119 | |||
7120 | if (sg == NULL) | ||
7121 | continue; | ||
7122 | sg = sg->next; | ||
7123 | next_sg: | ||
7124 | oldsg = sg; | ||
7125 | sg = sg->next; | ||
7126 | kfree(oldsg); | ||
7127 | if (oldsg != sched_group_nodes[i]) | ||
7128 | goto next_sg; | ||
7129 | } | ||
7130 | kfree(sched_group_nodes); | ||
7131 | sched_group_nodes_bycpu[cpu] = NULL; | ||
7132 | } | ||
7133 | } | ||
7134 | #else /* !CONFIG_NUMA */ | ||
7135 | static void free_sched_groups(const struct cpumask *cpu_map, | ||
7136 | struct cpumask *nodemask) | ||
7137 | { | ||
7138 | } | ||
7139 | #endif /* CONFIG_NUMA */ | 7005 | #endif /* CONFIG_NUMA */ |
7140 | 7006 | ||
7141 | /* | 7007 | /* |
@@ -7236,9 +7102,6 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | |||
7236 | const struct cpumask *cpu_map) | 7102 | const struct cpumask *cpu_map) |
7237 | { | 7103 | { |
7238 | switch (what) { | 7104 | switch (what) { |
7239 | case sa_sched_groups: | ||
7240 | free_sched_groups(cpu_map, d->tmpmask); /* fall through */ | ||
7241 | d->sched_group_nodes = NULL; | ||
7242 | case sa_rootdomain: | 7105 | case sa_rootdomain: |
7243 | free_rootdomain(d->rd); /* fall through */ | 7106 | free_rootdomain(d->rd); /* fall through */ |
7244 | case sa_tmpmask: | 7107 | case sa_tmpmask: |
@@ -7247,16 +7110,6 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | |||
7247 | free_cpumask_var(d->send_covered); /* fall through */ | 7110 | free_cpumask_var(d->send_covered); /* fall through */ |
7248 | case sa_nodemask: | 7111 | case sa_nodemask: |
7249 | free_cpumask_var(d->nodemask); /* fall through */ | 7112 | free_cpumask_var(d->nodemask); /* fall through */ |
7250 | case sa_sched_group_nodes: | ||
7251 | #ifdef CONFIG_NUMA | ||
7252 | kfree(d->sched_group_nodes); /* fall through */ | ||
7253 | case sa_notcovered: | ||
7254 | free_cpumask_var(d->notcovered); /* fall through */ | ||
7255 | case sa_covered: | ||
7256 | free_cpumask_var(d->covered); /* fall through */ | ||
7257 | case sa_domainspan: | ||
7258 | free_cpumask_var(d->domainspan); /* fall through */ | ||
7259 | #endif | ||
7260 | case sa_none: | 7113 | case sa_none: |
7261 | break; | 7114 | break; |
7262 | } | 7115 | } |
@@ -7265,24 +7118,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | |||
7265 | static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, | 7118 | static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, |
7266 | const struct cpumask *cpu_map) | 7119 | const struct cpumask *cpu_map) |
7267 | { | 7120 | { |
7268 | #ifdef CONFIG_NUMA | ||
7269 | if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) | ||
7270 | return sa_none; | ||
7271 | if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) | ||
7272 | return sa_domainspan; | ||
7273 | if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) | ||
7274 | return sa_covered; | ||
7275 | /* Allocate the per-node list of sched groups */ | ||
7276 | d->sched_group_nodes = kcalloc(nr_node_ids, | ||
7277 | sizeof(struct sched_group *), GFP_KERNEL); | ||
7278 | if (!d->sched_group_nodes) { | ||
7279 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | ||
7280 | return sa_notcovered; | ||
7281 | } | ||
7282 | sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; | ||
7283 | #endif | ||
7284 | if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) | 7121 | if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) |
7285 | return sa_sched_group_nodes; | 7122 | return sa_none; |
7286 | if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) | 7123 | if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) |
7287 | return sa_nodemask; | 7124 | return sa_nodemask; |
7288 | if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) | 7125 | if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) |
@@ -7322,6 +7159,7 @@ static struct sched_domain *__build_numa_sched_domains(struct s_data *d, | |||
7322 | if (parent) | 7159 | if (parent) |
7323 | parent->child = sd; | 7160 | parent->child = sd; |
7324 | cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); | 7161 | cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); |
7162 | cpu_to_node_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7325 | #endif | 7163 | #endif |
7326 | return sd; | 7164 | return sd; |
7327 | } | 7165 | } |
@@ -7434,6 +7272,13 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l, | |||
7434 | d->send_covered, d->tmpmask); | 7272 | d->send_covered, d->tmpmask); |
7435 | break; | 7273 | break; |
7436 | #ifdef CONFIG_NUMA | 7274 | #ifdef CONFIG_NUMA |
7275 | case SD_LV_NODE: | ||
7276 | sd = &per_cpu(node_domains, cpu).sd; | ||
7277 | if (cpu == cpumask_first(sched_domain_span(sd))) | ||
7278 | init_sched_build_groups(sched_domain_span(sd), cpu_map, | ||
7279 | &cpu_to_node_group, | ||
7280 | d->send_covered, d->tmpmask); | ||
7281 | |||
7437 | case SD_LV_ALLNODES: | 7282 | case SD_LV_ALLNODES: |
7438 | init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, | 7283 | init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, |
7439 | d->send_covered, d->tmpmask); | 7284 | d->send_covered, d->tmpmask); |
@@ -7462,7 +7307,6 @@ static int __build_sched_domains(const struct cpumask *cpu_map, | |||
7462 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); | 7307 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); |
7463 | if (alloc_state != sa_rootdomain) | 7308 | if (alloc_state != sa_rootdomain) |
7464 | goto error; | 7309 | goto error; |
7465 | alloc_state = sa_sched_groups; | ||
7466 | 7310 | ||
7467 | /* | 7311 | /* |
7468 | * Set up domains for cpus specified by the cpu_map. | 7312 | * Set up domains for cpus specified by the cpu_map. |
@@ -7486,16 +7330,13 @@ static int __build_sched_domains(const struct cpumask *cpu_map, | |||
7486 | build_sched_groups(&d, SD_LV_BOOK, cpu_map, i); | 7330 | build_sched_groups(&d, SD_LV_BOOK, cpu_map, i); |
7487 | build_sched_groups(&d, SD_LV_MC, cpu_map, i); | 7331 | build_sched_groups(&d, SD_LV_MC, cpu_map, i); |
7488 | build_sched_groups(&d, SD_LV_CPU, cpu_map, i); | 7332 | build_sched_groups(&d, SD_LV_CPU, cpu_map, i); |
7333 | build_sched_groups(&d, SD_LV_NODE, cpu_map, i); | ||
7489 | } | 7334 | } |
7490 | 7335 | ||
7491 | #ifdef CONFIG_NUMA | 7336 | #ifdef CONFIG_NUMA |
7492 | /* Set up node groups */ | 7337 | /* Set up node groups */ |
7493 | if (d.sd_allnodes) | 7338 | if (d.sd_allnodes) |
7494 | build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); | 7339 | build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); |
7495 | |||
7496 | for (i = 0; i < nr_node_ids; i++) | ||
7497 | if (build_numa_sched_groups(&d, cpu_map, i)) | ||
7498 | goto error; | ||
7499 | #endif | 7340 | #endif |
7500 | 7341 | ||
7501 | /* Calculate CPU power for physical packages and nodes */ | 7342 | /* Calculate CPU power for physical packages and nodes */ |
@@ -7524,15 +7365,16 @@ static int __build_sched_domains(const struct cpumask *cpu_map, | |||
7524 | } | 7365 | } |
7525 | 7366 | ||
7526 | #ifdef CONFIG_NUMA | 7367 | #ifdef CONFIG_NUMA |
7527 | for (i = 0; i < nr_node_ids; i++) | 7368 | for_each_cpu(i, cpu_map) { |
7528 | init_numa_sched_groups_power(d.sched_group_nodes[i]); | 7369 | sd = &per_cpu(node_domains, i).sd; |
7370 | init_sched_groups_power(i, sd); | ||
7371 | } | ||
7529 | 7372 | ||
7530 | if (d.sd_allnodes) { | 7373 | if (d.sd_allnodes) { |
7531 | struct sched_group *sg; | 7374 | for_each_cpu(i, cpu_map) { |
7532 | 7375 | sd = &per_cpu(allnodes_domains, i).sd; | |
7533 | cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, | 7376 | init_sched_groups_power(i, sd); |
7534 | d.tmpmask); | 7377 | } |
7535 | init_numa_sched_groups_power(sg); | ||
7536 | } | 7378 | } |
7537 | #endif | 7379 | #endif |
7538 | 7380 | ||
@@ -7550,7 +7392,6 @@ static int __build_sched_domains(const struct cpumask *cpu_map, | |||
7550 | cpu_attach_domain(sd, d.rd, i); | 7392 | cpu_attach_domain(sd, d.rd, i); |
7551 | } | 7393 | } |
7552 | 7394 | ||
7553 | d.sched_group_nodes = NULL; /* don't free this we still need it */ | ||
7554 | __free_domain_allocs(&d, sa_tmpmask, cpu_map); | 7395 | __free_domain_allocs(&d, sa_tmpmask, cpu_map); |
7555 | return 0; | 7396 | return 0; |
7556 | 7397 | ||
@@ -7636,7 +7477,6 @@ static int init_sched_domains(const struct cpumask *cpu_map) | |||
7636 | static void destroy_sched_domains(const struct cpumask *cpu_map, | 7477 | static void destroy_sched_domains(const struct cpumask *cpu_map, |
7637 | struct cpumask *tmpmask) | 7478 | struct cpumask *tmpmask) |
7638 | { | 7479 | { |
7639 | free_sched_groups(cpu_map, tmpmask); | ||
7640 | } | 7480 | } |
7641 | 7481 | ||
7642 | /* | 7482 | /* |
@@ -7913,11 +7753,6 @@ void __init sched_init_smp(void) | |||
7913 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); | 7753 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); |
7914 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); | 7754 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); |
7915 | 7755 | ||
7916 | #if defined(CONFIG_NUMA) | ||
7917 | sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), | ||
7918 | GFP_KERNEL); | ||
7919 | BUG_ON(sched_group_nodes_bycpu == NULL); | ||
7920 | #endif | ||
7921 | get_online_cpus(); | 7756 | get_online_cpus(); |
7922 | mutex_lock(&sched_domains_mutex); | 7757 | mutex_lock(&sched_domains_mutex); |
7923 | init_sched_domains(cpu_active_mask); | 7758 | init_sched_domains(cpu_active_mask); |