author    Vincent Guittot <vincent.guittot@linaro.org>  2014-04-11 05:44:37 -0400
committer Ingo Molnar <mingo@kernel.org>                2014-05-07 07:33:49 -0400
commit    143e1e28cb40bed836b0a06567208bd7347c9672 (patch)
tree      7fee7ba69929c04534c17dca02bf39332e110c9a /kernel/sched
parent    107437febd495a50e2cd09c81bbaa84d30e57b07 (diff)
sched: Rework sched_domain topology definition
We replace the old way of configuring the scheduler topology with a new method that enables a platform to declare additional levels (if needed). We still have a default topology table definition that can be used by platforms that don't want more levels than the SMT, MC, CPU and NUMA ones. This table can be overwritten by an arch which either wants to add new levels where load balancing makes sense, like a BOOK or power-gating level, or wants to change the flags configuration of some levels.

For each level, we need a function pointer that returns the cpumask for each cpu, a function pointer that returns the flags for the level, and a name. Only flags that describe the topology can be set by an architecture. The current topology flags are:

  SD_SHARE_CPUPOWER
  SD_SHARE_PKG_RESOURCES
  SD_NUMA
  SD_ASYM_PACKING

Then, each level must be a subset of the next one. The build sequence of the sched_domains will take care of removing useless levels, like those with a single CPU and those with the same CPU span and no more relevant information for load balancing than their children.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Reviewed-by: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Hanjun Guo <hanjun.guo@linaro.org>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Jason Low <jason.low2@hp.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: linux390@de.ibm.com
Cc: linux-ia64@vger.kernel.org
Cc: linux-s390@vger.kernel.org
Link: http://lkml.kernel.org/r/1397209481-28542-2-git-send-email-vincent.guittot@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
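As an illustration of the new interface (not part of the patch below): an architecture that wants an extra level, say a power-gating level between SMT and MC, could declare its own table and install it with set_sched_topology(). This is a minimal sketch; cpu_corepower_mask(), cpu_corepower_flags() and the GMC name are hypothetical helpers the arch would have to provide, while cpu_smt_mask(), cpu_coregroup_mask(), cpu_cpu_mask(), SD_INIT_NAME() and set_sched_topology() are the pieces introduced or reused by this patch.

/* Hypothetical arch code, for illustration only. */
static const struct cpumask *cpu_corepower_mask(int cpu);	/* provided by the arch */

static int cpu_corepower_flags(void)
{
	/* Only flags in TOPOLOGY_SD_FLAGS are accepted by sd_init(). */
	return SD_SHARE_PKG_RESOURCES;
}

static struct sched_domain_topology_level arch_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
	/* extra, arch-specific level: cores sharing a power domain */
	{ cpu_corepower_mask, cpu_corepower_flags, SD_INIT_NAME(GMC) },
#ifdef CONFIG_SCHED_MC
	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
	{ NULL, },
};

static void __init arch_init_sched_topology(void)
{
	/* Each level's cpumask must be a subset of the next level's. */
	set_sched_topology(arch_topology);
}

Degenerate levels (a single CPU, or the same span as the child with no extra information) are still removed automatically when the sched_domains are built.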
Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/core.c  233
1 file changed, 120 insertions(+), 113 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 13584f1cccfc..7d332b7899cc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5566,17 +5566,6 @@ static int __init isolated_cpu_setup(char *str)
 
 __setup("isolcpus=", isolated_cpu_setup);
 
-static const struct cpumask *cpu_cpu_mask(int cpu)
-{
-	return cpumask_of_node(cpu_to_node(cpu));
-}
-
-struct sd_data {
-	struct sched_domain **__percpu sd;
-	struct sched_group **__percpu sg;
-	struct sched_group_power **__percpu sgp;
-};
-
 struct s_data {
 	struct sched_domain ** __percpu sd;
 	struct root_domain *rd;
@@ -5589,21 +5578,6 @@ enum s_alloc {
 	sa_none,
 };
 
-struct sched_domain_topology_level;
-
-typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
-typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
-
-#define SDTL_OVERLAP	0x01
-
-struct sched_domain_topology_level {
-	sched_domain_init_f init;
-	sched_domain_mask_f mask;
-	int flags;
-	int numa_level;
-	struct sd_data data;
-};
-
 /*
  * Build an iteration mask that can exclude certain CPUs from the upwards
  * domain traversal.
@@ -5832,34 +5806,6 @@ int __weak arch_sd_sibling_asym_packing(void)
  * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
  */
 
-#ifdef CONFIG_SCHED_DEBUG
-# define SD_INIT_NAME(sd, type)	sd->name = #type
-#else
-# define SD_INIT_NAME(sd, type)	do { } while (0)
-#endif
-
-#define SD_INIT_FUNC(type)	\
-static noinline struct sched_domain * \
-sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
-{ \
-	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
-	*sd = SD_##type##_INIT; \
-	SD_INIT_NAME(sd, type); \
-	sd->private = &tl->data; \
-	return sd; \
-}
-
-SD_INIT_FUNC(CPU)
-#ifdef CONFIG_SCHED_SMT
- SD_INIT_FUNC(SIBLING)
-#endif
-#ifdef CONFIG_SCHED_MC
- SD_INIT_FUNC(MC)
-#endif
-#ifdef CONFIG_SCHED_BOOK
- SD_INIT_FUNC(BOOK)
-#endif
-
 static int default_relax_domain_level = -1;
 int sched_domain_level_max;
 
@@ -5947,99 +5893,156 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
 	*per_cpu_ptr(sdd->sgp, cpu) = NULL;
 }
 
-#ifdef CONFIG_SCHED_SMT
-static const struct cpumask *cpu_smt_mask(int cpu)
-{
-	return topology_thread_cpumask(cpu);
-}
-#endif
-
-/*
- * Topology list, bottom-up.
- */
-static struct sched_domain_topology_level default_topology[] = {
-#ifdef CONFIG_SCHED_SMT
-	{ sd_init_SIBLING, cpu_smt_mask, },
-#endif
-#ifdef CONFIG_SCHED_MC
-	{ sd_init_MC, cpu_coregroup_mask, },
-#endif
-#ifdef CONFIG_SCHED_BOOK
-	{ sd_init_BOOK, cpu_book_mask, },
-#endif
-	{ sd_init_CPU, cpu_cpu_mask, },
-	{ NULL, },
-};
-
-static struct sched_domain_topology_level *sched_domain_topology = default_topology;
-
-#define for_each_sd_topology(tl)			\
-	for (tl = sched_domain_topology; tl->init; tl++)
-
 #ifdef CONFIG_NUMA
-
 static int sched_domains_numa_levels;
 static int *sched_domains_numa_distance;
 static struct cpumask ***sched_domains_numa_masks;
 static int sched_domains_curr_level;
+#endif
 
-static inline int sd_local_flags(int level)
-{
-	if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
-		return 0;
-
-	return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
-}
+/*
+ * SD_flags allowed in topology descriptions.
+ *
+ * SD_SHARE_CPUPOWER      - describes SMT topologies
+ * SD_SHARE_PKG_RESOURCES - describes shared caches
+ * SD_NUMA                - describes NUMA topologies
+ *
+ * Odd one out:
+ * SD_ASYM_PACKING        - describes SMT quirks
+ */
+#define TOPOLOGY_SD_FLAGS		\
+	(SD_SHARE_CPUPOWER |		\
+	 SD_SHARE_PKG_RESOURCES |	\
+	 SD_NUMA |			\
+	 SD_ASYM_PACKING)
 
 static struct sched_domain *
-sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
+sd_init(struct sched_domain_topology_level *tl, int cpu)
 {
 	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
-	int level = tl->numa_level;
-	int sd_weight = cpumask_weight(
-			sched_domains_numa_masks[level][cpu_to_node(cpu)]);
+	int sd_weight, sd_flags = 0;
+
+#ifdef CONFIG_NUMA
+	/*
+	 * Ugly hack to pass state to sd_numa_mask()...
+	 */
+	sched_domains_curr_level = tl->numa_level;
+#endif
+
+	sd_weight = cpumask_weight(tl->mask(cpu));
+
+	if (tl->sd_flags)
+		sd_flags = (*tl->sd_flags)();
+	if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
+			"wrong sd_flags in topology description\n"))
+		sd_flags &= ~TOPOLOGY_SD_FLAGS;
 
 	*sd = (struct sched_domain){
 		.min_interval		= sd_weight,
 		.max_interval		= 2*sd_weight,
 		.busy_factor		= 32,
 		.imbalance_pct		= 125,
-		.cache_nice_tries	= 2,
-		.busy_idx		= 3,
-		.idle_idx		= 2,
+
+		.cache_nice_tries	= 0,
+		.busy_idx		= 0,
+		.idle_idx		= 0,
 		.newidle_idx		= 0,
 		.wake_idx		= 0,
 		.forkexec_idx		= 0,
 
 		.flags			= 1*SD_LOAD_BALANCE
 					| 1*SD_BALANCE_NEWIDLE
-					| 0*SD_BALANCE_EXEC
-					| 0*SD_BALANCE_FORK
+					| 1*SD_BALANCE_EXEC
+					| 1*SD_BALANCE_FORK
 					| 0*SD_BALANCE_WAKE
-					| 0*SD_WAKE_AFFINE
+					| 1*SD_WAKE_AFFINE
 					| 0*SD_SHARE_CPUPOWER
 					| 0*SD_SHARE_PKG_RESOURCES
-					| 1*SD_SERIALIZE
+					| 0*SD_SERIALIZE
 					| 0*SD_PREFER_SIBLING
-					| 1*SD_NUMA
-					| sd_local_flags(level)
+					| 0*SD_NUMA
+					| sd_flags
 					,
+
 		.last_balance		= jiffies,
 		.balance_interval	= sd_weight,
+		.smt_gain		= 0,
 		.max_newidle_lb_cost	= 0,
 		.next_decay_max_lb_cost	= jiffies,
+#ifdef CONFIG_SCHED_DEBUG
+		.name			= tl->name,
+#endif
 	};
-	SD_INIT_NAME(sd, NUMA);
-	sd->private = &tl->data;
 
 	/*
-	 * Ugly hack to pass state to sd_numa_mask()...
+	 * Convert topological properties into behaviour.
 	 */
-	sched_domains_curr_level = tl->numa_level;
+
+	if (sd->flags & SD_SHARE_CPUPOWER) {
+		sd->imbalance_pct = 110;
+		sd->smt_gain = 1178; /* ~15% */
+		sd->flags |= arch_sd_sibling_asym_packing();
+
+	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+		sd->imbalance_pct = 117;
+		sd->cache_nice_tries = 1;
+		sd->busy_idx = 2;
+
+#ifdef CONFIG_NUMA
+	} else if (sd->flags & SD_NUMA) {
+		sd->cache_nice_tries = 2;
+		sd->busy_idx = 3;
+		sd->idle_idx = 2;
+
+		sd->flags |= SD_SERIALIZE;
+		if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+			sd->flags &= ~(SD_BALANCE_EXEC |
+				       SD_BALANCE_FORK |
+				       SD_WAKE_AFFINE);
+		}
+
+#endif
+	} else {
+		sd->flags |= SD_PREFER_SIBLING;
+		sd->cache_nice_tries = 1;
+		sd->busy_idx = 2;
+		sd->idle_idx = 1;
+	}
+
+	sd->private = &tl->data;
 
 	return sd;
 }
 
+/*
+ * Topology list, bottom-up.
+ */
+static struct sched_domain_topology_level default_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+#endif
+#ifdef CONFIG_SCHED_BOOK
+	{ cpu_book_mask, SD_INIT_NAME(BOOK) },
+#endif
+	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
+	{ NULL, },
+};
+
+struct sched_domain_topology_level *sched_domain_topology = default_topology;
+
+#define for_each_sd_topology(tl)			\
+	for (tl = sched_domain_topology; tl->mask; tl++)
+
+void set_sched_topology(struct sched_domain_topology_level *tl)
+{
+	sched_domain_topology = tl;
+}
+
+#ifdef CONFIG_NUMA
+
 static const struct cpumask *sd_numa_mask(int cpu)
 {
 	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
@@ -6183,7 +6186,10 @@ static void sched_init_numa(void)
 		}
 	}
 
-	tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
+	/* Compute default topology size */
+	for (i = 0; sched_domain_topology[i].mask; i++);
+
+	tl = kzalloc((i + level) *
 			sizeof(struct sched_domain_topology_level), GFP_KERNEL);
 	if (!tl)
 		return;
@@ -6191,18 +6197,19 @@ static void sched_init_numa(void)
 	/*
 	 * Copy the default topology bits..
 	 */
-	for (i = 0; default_topology[i].init; i++)
-		tl[i] = default_topology[i];
+	for (i = 0; sched_domain_topology[i].mask; i++)
+		tl[i] = sched_domain_topology[i];
 
 	/*
 	 * .. and append 'j' levels of NUMA goodness.
 	 */
 	for (j = 0; j < level; i++, j++) {
 		tl[i] = (struct sched_domain_topology_level){
-			.init = sd_numa_init,
 			.mask = sd_numa_mask,
+			.sd_flags = cpu_numa_flags,
 			.flags = SDTL_OVERLAP,
 			.numa_level = j,
+			SD_INIT_NAME(NUMA)
 		};
 	}
 
@@ -6360,7 +6367,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
 		const struct cpumask *cpu_map, struct sched_domain_attr *attr,
 		struct sched_domain *child, int cpu)
 {
-	struct sched_domain *sd = tl->init(tl, cpu);
+	struct sched_domain *sd = sd_init(tl, cpu);
 	if (!sd)
 		return child;
 