path: root/kernel/sched.c
author	Peter Zijlstra <a.p.zijlstra@chello.nl>	2011-04-07 08:09:45 -0400
committer	Ingo Molnar <mingo@elte.hu>	2011-04-11 06:58:17 -0400
commit	cd4ea6ae3982f6861da3b510e69cbc194f331d83 (patch)
tree	7ca7e19bf9be55102768f7c1ab2bd76643e6524b /kernel/sched.c
parent	a06dadbec5c5df0bf3a35f33616f67d10ca9ba28 (diff)
sched: Change NODE sched_domain group creation
The NODE sched_domain is 'special' in that it allocates sched_groups per CPU, instead of sharing the sched_groups between all CPUs.

While this might have some benefits on large NUMA systems and avoid remote memory accesses when iterating the sched_groups, it does break current code that assumes sched_groups are shared between all sched_domains (since the dynamic cpu_power patches).

So refactor the NODE groups to behave like all other groups.

(The ALLNODES domain, on the other hand, did share its groups across the CPUs for some reason.)

If someone does measure a performance decrease due to this change, we need to revisit this and come up with another way to have both dynamic cpu_power and NUMA work nicely together.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/20110407122941.978111700@chello.nl
Signed-off-by: Ingo Molnar <mingo@elte.hu>
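[Editor's note] The essence of the refactor, visible in cpu_to_node_group() in the diff below, is the lookup pattern the other domain levels already use: instead of kmalloc'ing a per-CPU chain of sched_groups for each node, every NODE-level domain now points at one statically allocated group, selected by the first CPU in the node's cpumask. The following userspace sketch only models that selection so the pattern is easy to see; all names in it (toy_group, NR_CPUS, the toy cpu_to_node(), etc.) are simplified stand-ins, not the kernel's actual types or APIs.

/*
 * Toy model of "one shared group per node, keyed by the node's first CPU",
 * the selection performed by cpu_to_node_group() in the patch below.
 * Everything here is an illustrative stand-in, not kernel code.
 */
#include <stdio.h>

#define NR_CPUS   8
#define NR_NODES  2

/* Pretend topology: CPUs 0-3 on node 0, CPUs 4-7 on node 1. */
static int cpu_to_node(int cpu)
{
	return cpu / (NR_CPUS / NR_NODES);
}

struct toy_group {
	int first_cpu;			/* CPU that "owns" the group */
	unsigned long cpu_power;
};

/*
 * One statically allocated group per CPU; only the entry belonging to a
 * node's first CPU is ever handed out, so all CPUs of a node share it.
 */
static struct toy_group per_cpu_node_group[NR_CPUS];

static struct toy_group *cpu_to_node_group(int cpu)
{
	int first = cpu_to_node(cpu) * (NR_CPUS / NR_NODES);	/* first CPU in the node */

	return &per_cpu_node_group[first];
}

int main(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		struct toy_group *sg = cpu_to_node_group(cpu);

		sg->first_cpu = cpu_to_node(cpu) * (NR_CPUS / NR_NODES);
		printf("cpu %d -> node %d -> group of cpu %d (%p)\n",
		       cpu, cpu_to_node(cpu), sg->first_cpu, (void *)sg);
	}
	return 0;
}

Every CPU on a node resolves to the same group object, which is what the shared dynamic cpu_power code expects; the per-CPU kmalloc'd group chains this replaces can be seen in the removed build_numa_sched_groups() and free_sched_groups() further down.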
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	229
1 file changed, 32 insertions(+), 197 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index e3818f1b98fe..72d561fa67b7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6861,29 +6861,18 @@ struct static_sched_domain {
 struct s_data {
 #ifdef CONFIG_NUMA
 	int sd_allnodes;
-	cpumask_var_t domainspan;
-	cpumask_var_t covered;
-	cpumask_var_t notcovered;
 #endif
 	cpumask_var_t nodemask;
 	cpumask_var_t send_covered;
 	cpumask_var_t tmpmask;
-	struct sched_group **sched_group_nodes;
 	struct root_domain *rd;
 };
 
 enum s_alloc {
-	sa_sched_groups = 0,
 	sa_rootdomain,
 	sa_tmpmask,
 	sa_send_covered,
 	sa_nodemask,
-	sa_sched_group_nodes,
-#ifdef CONFIG_NUMA
-	sa_notcovered,
-	sa_covered,
-	sa_domainspan,
-#endif
 	sa_none,
 };
 
@@ -6979,18 +6968,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
 }
 
 #ifdef CONFIG_NUMA
-/*
- * The init_sched_build_groups can't handle what we want to do with node
- * groups, so roll our own. Now each node has its own list of groups which
- * gets dynamically allocated.
- */
 static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
-static struct sched_group ***sched_group_nodes_bycpu;
-
-static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
+static DEFINE_PER_CPU(struct static_sched_group, sched_group_node);
 
-static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
+static int cpu_to_node_group(int cpu, const struct cpumask *cpu_map,
 				 struct sched_group **sg,
 				 struct cpumask *nodemask)
 {
@@ -7000,142 +6981,27 @@ static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
 	group = cpumask_first(nodemask);
 
 	if (sg)
-		*sg = &per_cpu(sched_group_allnodes, group).sg;
+		*sg = &per_cpu(sched_group_node, group).sg;
 	return group;
 }
 
-static void init_numa_sched_groups_power(struct sched_group *group_head)
-{
-	struct sched_group *sg = group_head;
-	int j;
-
-	if (!sg)
-		return;
-	do {
-		for_each_cpu(j, sched_group_cpus(sg)) {
-			struct sched_domain *sd;
-
-			sd = &per_cpu(phys_domains, j).sd;
-			if (j != group_first_cpu(sd->groups)) {
-				/*
-				 * Only add "power" once for each
-				 * physical package.
-				 */
-				continue;
-			}
-
-			sg->cpu_power += sd->groups->cpu_power;
-		}
-		sg = sg->next;
-	} while (sg != group_head);
-}
+static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
+static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
 
-static int build_numa_sched_groups(struct s_data *d,
-				   const struct cpumask *cpu_map, int num)
+static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
+				 struct sched_group **sg,
+				 struct cpumask *nodemask)
 {
-	struct sched_domain *sd;
-	struct sched_group *sg, *prev;
-	int n, j;
-
-	cpumask_clear(d->covered);
-	cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
-	if (cpumask_empty(d->nodemask)) {
-		d->sched_group_nodes[num] = NULL;
-		goto out;
-	}
-
-	sched_domain_node_span(num, d->domainspan);
-	cpumask_and(d->domainspan, d->domainspan, cpu_map);
-
-	sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
-			  GFP_KERNEL, num);
-	if (!sg) {
-		printk(KERN_WARNING "Can not alloc domain group for node %d\n",
-		       num);
-		return -ENOMEM;
-	}
-	d->sched_group_nodes[num] = sg;
-
-	for_each_cpu(j, d->nodemask) {
-		sd = &per_cpu(node_domains, j).sd;
-		sd->groups = sg;
-	}
+	int group;
 
-	sg->cpu_power = 0;
-	cpumask_copy(sched_group_cpus(sg), d->nodemask);
-	sg->next = sg;
-	cpumask_or(d->covered, d->covered, d->nodemask);
+	cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
+	group = cpumask_first(nodemask);
 
-	prev = sg;
-	for (j = 0; j < nr_node_ids; j++) {
-		n = (num + j) % nr_node_ids;
-		cpumask_complement(d->notcovered, d->covered);
-		cpumask_and(d->tmpmask, d->notcovered, cpu_map);
-		cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
-		if (cpumask_empty(d->tmpmask))
-			break;
-		cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
-		if (cpumask_empty(d->tmpmask))
-			continue;
-		sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
-				  GFP_KERNEL, num);
-		if (!sg) {
-			printk(KERN_WARNING
-			       "Can not alloc domain group for node %d\n", j);
-			return -ENOMEM;
-		}
-		sg->cpu_power = 0;
-		cpumask_copy(sched_group_cpus(sg), d->tmpmask);
-		sg->next = prev->next;
-		cpumask_or(d->covered, d->covered, d->tmpmask);
-		prev->next = sg;
-		prev = sg;
-	}
-out:
-	return 0;
+	if (sg)
+		*sg = &per_cpu(sched_group_allnodes, group).sg;
+	return group;
 }
-#endif /* CONFIG_NUMA */
-
-#ifdef CONFIG_NUMA
-/* Free memory allocated for various sched_group structures */
-static void free_sched_groups(const struct cpumask *cpu_map,
-			      struct cpumask *nodemask)
-{
-	int cpu, i;
 
-	for_each_cpu(cpu, cpu_map) {
-		struct sched_group **sched_group_nodes
-			= sched_group_nodes_bycpu[cpu];
-
-		if (!sched_group_nodes)
-			continue;
-
-		for (i = 0; i < nr_node_ids; i++) {
-			struct sched_group *oldsg, *sg = sched_group_nodes[i];
-
-			cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
-			if (cpumask_empty(nodemask))
-				continue;
-
-			if (sg == NULL)
-				continue;
-			sg = sg->next;
-next_sg:
-			oldsg = sg;
-			sg = sg->next;
-			kfree(oldsg);
-			if (oldsg != sched_group_nodes[i])
-				goto next_sg;
-		}
-		kfree(sched_group_nodes);
-		sched_group_nodes_bycpu[cpu] = NULL;
-	}
-}
-#else /* !CONFIG_NUMA */
-static void free_sched_groups(const struct cpumask *cpu_map,
-			      struct cpumask *nodemask)
-{
-}
 #endif /* CONFIG_NUMA */
 
 /*
@@ -7236,9 +7102,6 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
 				 const struct cpumask *cpu_map)
 {
 	switch (what) {
-	case sa_sched_groups:
-		free_sched_groups(cpu_map, d->tmpmask); /* fall through */
-		d->sched_group_nodes = NULL;
 	case sa_rootdomain:
 		free_rootdomain(d->rd); /* fall through */
 	case sa_tmpmask:
@@ -7247,16 +7110,6 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
 		free_cpumask_var(d->send_covered); /* fall through */
 	case sa_nodemask:
 		free_cpumask_var(d->nodemask); /* fall through */
-	case sa_sched_group_nodes:
-#ifdef CONFIG_NUMA
-		kfree(d->sched_group_nodes); /* fall through */
-	case sa_notcovered:
-		free_cpumask_var(d->notcovered); /* fall through */
-	case sa_covered:
-		free_cpumask_var(d->covered); /* fall through */
-	case sa_domainspan:
-		free_cpumask_var(d->domainspan); /* fall through */
-#endif
 	case sa_none:
 		break;
 	}
@@ -7265,24 +7118,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
 						   const struct cpumask *cpu_map)
 {
-#ifdef CONFIG_NUMA
-	if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
-		return sa_none;
-	if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
-		return sa_domainspan;
-	if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
-		return sa_covered;
-	/* Allocate the per-node list of sched groups */
-	d->sched_group_nodes = kcalloc(nr_node_ids,
-				       sizeof(struct sched_group *), GFP_KERNEL);
-	if (!d->sched_group_nodes) {
-		printk(KERN_WARNING "Can not alloc sched group node list\n");
-		return sa_notcovered;
-	}
-	sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
-#endif
 	if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
-		return sa_sched_group_nodes;
+		return sa_none;
 	if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
 		return sa_nodemask;
 	if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
@@ -7322,6 +7159,7 @@ static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
 	if (parent)
 		parent->child = sd;
 	cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
+	cpu_to_node_group(i, cpu_map, &sd->groups, d->tmpmask);
 #endif
 	return sd;
 }
@@ -7434,6 +7272,13 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
 					d->send_covered, d->tmpmask);
 		break;
 #ifdef CONFIG_NUMA
+	case SD_LV_NODE:
+		sd = &per_cpu(node_domains, cpu).sd;
+		if (cpu == cpumask_first(sched_domain_span(sd)))
+			init_sched_build_groups(sched_domain_span(sd), cpu_map,
+						&cpu_to_node_group,
+						d->send_covered, d->tmpmask);
+
 	case SD_LV_ALLNODES:
 		init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
 					d->send_covered, d->tmpmask);
@@ -7462,7 +7307,6 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
 	if (alloc_state != sa_rootdomain)
 		goto error;
-	alloc_state = sa_sched_groups;
 
 	/*
 	 * Set up domains for cpus specified by the cpu_map.
@@ -7486,16 +7330,13 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 		build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
 		build_sched_groups(&d, SD_LV_MC, cpu_map, i);
 		build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
+		build_sched_groups(&d, SD_LV_NODE, cpu_map, i);
 	}
 
 #ifdef CONFIG_NUMA
 	/* Set up node groups */
 	if (d.sd_allnodes)
 		build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
-
-	for (i = 0; i < nr_node_ids; i++)
-		if (build_numa_sched_groups(&d, cpu_map, i))
-			goto error;
 #endif
 
 	/* Calculate CPU power for physical packages and nodes */
@@ -7524,15 +7365,16 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 	}
 
 #ifdef CONFIG_NUMA
-	for (i = 0; i < nr_node_ids; i++)
-		init_numa_sched_groups_power(d.sched_group_nodes[i]);
+	for_each_cpu(i, cpu_map) {
+		sd = &per_cpu(node_domains, i).sd;
+		init_sched_groups_power(i, sd);
+	}
 
 	if (d.sd_allnodes) {
-		struct sched_group *sg;
-
-		cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
-				      d.tmpmask);
-		init_numa_sched_groups_power(sg);
+		for_each_cpu(i, cpu_map) {
+			sd = &per_cpu(allnodes_domains, i).sd;
+			init_sched_groups_power(i, sd);
+		}
 	}
 #endif
 
@@ -7550,7 +7392,6 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 		cpu_attach_domain(sd, d.rd, i);
 	}
 
-	d.sched_group_nodes = NULL; /* don't free this we still need it */
 	__free_domain_allocs(&d, sa_tmpmask, cpu_map);
 	return 0;
 
@@ -7636,7 +7477,6 @@ static int init_sched_domains(const struct cpumask *cpu_map)
 static void destroy_sched_domains(const struct cpumask *cpu_map,
 				  struct cpumask *tmpmask)
 {
-	free_sched_groups(cpu_map, tmpmask);
 }
 
 /*
@@ -7913,11 +7753,6 @@ void __init sched_init_smp(void)
 	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
 	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
 
-#if defined(CONFIG_NUMA)
-	sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
-					  GFP_KERNEL);
-	BUG_ON(sched_group_nodes_bycpu == NULL);
-#endif
 	get_online_cpus();
 	mutex_lock(&sched_domains_mutex);
 	init_sched_domains(cpu_active_mask);