author     Ingo Molnar <mingo@kernel.org>	2017-02-01 07:10:18 -0500
committer  Ingo Molnar <mingo@kernel.org>	2017-02-07 04:58:12 -0500
commit     f2cb13609d5397cdd747f3ed6fb651233851717d (patch)
tree       0714785a7b04430b41346653178afc7b9a7bca70 /kernel
parent     004172bdad644327dc7a6543186b9d7b529ee944 (diff)
sched/topology: Split out scheduler topology code from core.c into topology.c
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/sched/Makefile	|    2
-rw-r--r--	kernel/sched/core.c	| 1659
-rw-r--r--	kernel/sched/sched.h	|   23
-rw-r--r--	kernel/sched/topology.c	| 1658
4 files changed, 1684 insertions, 1658 deletions
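
The mechanics of the split show up in the hunks below: helpers that were file-local to core.c (set_rq_online(), set_rq_offline(), rq_attach_root(), init_defrootdomain(), ...) lose their static qualifier, kernel/sched/sched.h gains matching extern declarations, and their definitions move verbatim into the new kernel/sched/topology.c. A minimal sketch of that pattern, not taken from the patch itself (struct rq is reduced to a one-field stub and attach_cpu() is a hypothetical caller standing in for core.c):

/* sched.h - shared header: declaration only */
struct rq { int online; };                      /* illustrative stub */
extern void set_rq_online(struct rq *rq);       /* was static in core.c */

/* topology.c - new home of the definition */
void set_rq_online(struct rq *rq)
{
	if (!rq->online)
		rq->online = 1;
}

/* core.c - existing callers are unchanged, now resolved across translation units */
int attach_cpu(struct rq *rq)                   /* hypothetical caller */
{
	set_rq_online(rq);
	return rq->online;
}
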
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 5e59b832ae2b..130ce8ac725b 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -18,7 +18,7 @@ endif
 obj-y += core.o loadavg.o clock.o cputime.o
 obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
 obj-y += wait.o swait.o completion.o idle.o
-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
+obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1cea6c61fb01..e4aa470ed454 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -31,7 +31,6 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
 
-DEFINE_MUTEX(sched_domains_mutex);
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
 /*
@@ -5446,7 +5445,7 @@ out:
 
 #ifdef CONFIG_SMP
 
-static bool sched_smp_initialized __read_mostly;
+bool sched_smp_initialized __read_mostly;
 
 #ifdef CONFIG_NUMA_BALANCING
 /* Migrate current task p to target_cpu */
@@ -5643,7 +5642,7 @@ static void migrate_tasks(struct rq *dead_rq)
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static void set_rq_online(struct rq *rq)
+void set_rq_online(struct rq *rq)
 {
 	if (!rq->online) {
 		const struct sched_class *class;
@@ -5658,7 +5657,7 @@ static void set_rq_online(struct rq *rq)
 	}
 }
 
-static void set_rq_offline(struct rq *rq)
+void set_rq_offline(struct rq *rq)
 {
 	if (rq->online) {
 		const struct sched_class *class;
@@ -5680,1658 +5679,6 @@ static void set_cpu_rq_start_time(unsigned int cpu)
 	rq->age_stamp = sched_clock_cpu(cpu);
 }
 
5683/* Protected by sched_domains_mutex: */
5684static cpumask_var_t sched_domains_tmpmask;
5685
5686#ifdef CONFIG_SCHED_DEBUG
5687
5688static __read_mostly int sched_debug_enabled;
5689
5690static int __init sched_debug_setup(char *str)
5691{
5692 sched_debug_enabled = 1;
5693
5694 return 0;
5695}
5696early_param("sched_debug", sched_debug_setup);
5697
5698static inline bool sched_debug(void)
5699{
5700 return sched_debug_enabled;
5701}
5702
5703static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5704 struct cpumask *groupmask)
5705{
5706 struct sched_group *group = sd->groups;
5707
5708 cpumask_clear(groupmask);
5709
5710 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
5711
5712 if (!(sd->flags & SD_LOAD_BALANCE)) {
5713 printk("does not load-balance\n");
5714 if (sd->parent)
5715 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5716 " has parent");
5717 return -1;
5718 }
5719
5720 printk(KERN_CONT "span %*pbl level %s\n",
5721 cpumask_pr_args(sched_domain_span(sd)), sd->name);
5722
5723 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
5724 printk(KERN_ERR "ERROR: domain->span does not contain "
5725 "CPU%d\n", cpu);
5726 }
5727 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
5728 printk(KERN_ERR "ERROR: domain->groups does not contain"
5729 " CPU%d\n", cpu);
5730 }
5731
5732 printk(KERN_DEBUG "%*s groups:", level + 1, "");
5733 do {
5734 if (!group) {
5735 printk("\n");
5736 printk(KERN_ERR "ERROR: group is NULL\n");
5737 break;
5738 }
5739
5740 if (!cpumask_weight(sched_group_cpus(group))) {
5741 printk(KERN_CONT "\n");
5742 printk(KERN_ERR "ERROR: empty group\n");
5743 break;
5744 }
5745
5746 if (!(sd->flags & SD_OVERLAP) &&
5747 cpumask_intersects(groupmask, sched_group_cpus(group))) {
5748 printk(KERN_CONT "\n");
5749 printk(KERN_ERR "ERROR: repeated CPUs\n");
5750 break;
5751 }
5752
5753 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
5754
5755 printk(KERN_CONT " %*pbl",
5756 cpumask_pr_args(sched_group_cpus(group)));
5757 if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
5758 printk(KERN_CONT " (cpu_capacity = %lu)",
5759 group->sgc->capacity);
5760 }
5761
5762 group = group->next;
5763 } while (group != sd->groups);
5764 printk(KERN_CONT "\n");
5765
5766 if (!cpumask_equal(sched_domain_span(sd), groupmask))
5767 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
5768
5769 if (sd->parent &&
5770 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
5771 printk(KERN_ERR "ERROR: parent span is not a superset "
5772 "of domain->span\n");
5773 return 0;
5774}
5775
5776static void sched_domain_debug(struct sched_domain *sd, int cpu)
5777{
5778 int level = 0;
5779
5780 if (!sched_debug_enabled)
5781 return;
5782
5783 if (!sd) {
5784 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5785 return;
5786 }
5787
5788 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5789
5790 for (;;) {
5791 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
5792 break;
5793 level++;
5794 sd = sd->parent;
5795 if (!sd)
5796 break;
5797 }
5798}
5799#else /* !CONFIG_SCHED_DEBUG */
5800
5801# define sched_debug_enabled 0
5802# define sched_domain_debug(sd, cpu) do { } while (0)
5803static inline bool sched_debug(void)
5804{
5805 return false;
5806}
5807#endif /* CONFIG_SCHED_DEBUG */
5808
5809static int sd_degenerate(struct sched_domain *sd)
5810{
5811 if (cpumask_weight(sched_domain_span(sd)) == 1)
5812 return 1;
5813
5814 /* Following flags need at least 2 groups */
5815 if (sd->flags & (SD_LOAD_BALANCE |
5816 SD_BALANCE_NEWIDLE |
5817 SD_BALANCE_FORK |
5818 SD_BALANCE_EXEC |
5819 SD_SHARE_CPUCAPACITY |
5820 SD_ASYM_CPUCAPACITY |
5821 SD_SHARE_PKG_RESOURCES |
5822 SD_SHARE_POWERDOMAIN)) {
5823 if (sd->groups != sd->groups->next)
5824 return 0;
5825 }
5826
5827 /* Following flags don't use groups */
5828 if (sd->flags & (SD_WAKE_AFFINE))
5829 return 0;
5830
5831 return 1;
5832}
5833
5834static int
5835sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5836{
5837 unsigned long cflags = sd->flags, pflags = parent->flags;
5838
5839 if (sd_degenerate(parent))
5840 return 1;
5841
5842 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
5843 return 0;
5844
5845 /* Flags needing groups don't count if only 1 group in parent */
5846 if (parent->groups == parent->groups->next) {
5847 pflags &= ~(SD_LOAD_BALANCE |
5848 SD_BALANCE_NEWIDLE |
5849 SD_BALANCE_FORK |
5850 SD_BALANCE_EXEC |
5851 SD_ASYM_CPUCAPACITY |
5852 SD_SHARE_CPUCAPACITY |
5853 SD_SHARE_PKG_RESOURCES |
5854 SD_PREFER_SIBLING |
5855 SD_SHARE_POWERDOMAIN);
5856 if (nr_node_ids == 1)
5857 pflags &= ~SD_SERIALIZE;
5858 }
5859 if (~cflags & pflags)
5860 return 0;
5861
5862 return 1;
5863}
5864
5865static void free_rootdomain(struct rcu_head *rcu)
5866{
5867 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
5868
5869 cpupri_cleanup(&rd->cpupri);
5870 cpudl_cleanup(&rd->cpudl);
5871 free_cpumask_var(rd->dlo_mask);
5872 free_cpumask_var(rd->rto_mask);
5873 free_cpumask_var(rd->online);
5874 free_cpumask_var(rd->span);
5875 kfree(rd);
5876}
5877
5878static void rq_attach_root(struct rq *rq, struct root_domain *rd)
5879{
5880 struct root_domain *old_rd = NULL;
5881 unsigned long flags;
5882
5883 raw_spin_lock_irqsave(&rq->lock, flags);
5884
5885 if (rq->rd) {
5886 old_rd = rq->rd;
5887
5888 if (cpumask_test_cpu(rq->cpu, old_rd->online))
5889 set_rq_offline(rq);
5890
5891 cpumask_clear_cpu(rq->cpu, old_rd->span);
5892
5893 /*
5894 * If we dont want to free the old_rd yet then
5895 * set old_rd to NULL to skip the freeing later
5896 * in this function:
5897 */
5898 if (!atomic_dec_and_test(&old_rd->refcount))
5899 old_rd = NULL;
5900 }
5901
5902 atomic_inc(&rd->refcount);
5903 rq->rd = rd;
5904
5905 cpumask_set_cpu(rq->cpu, rd->span);
5906 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
5907 set_rq_online(rq);
5908
5909 raw_spin_unlock_irqrestore(&rq->lock, flags);
5910
5911 if (old_rd)
5912 call_rcu_sched(&old_rd->rcu, free_rootdomain);
5913}
5914
5915static int init_rootdomain(struct root_domain *rd)
5916{
5917 memset(rd, 0, sizeof(*rd));
5918
5919 if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
5920 goto out;
5921 if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
5922 goto free_span;
5923 if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
5924 goto free_online;
5925 if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
5926 goto free_dlo_mask;
5927
5928 init_dl_bw(&rd->dl_bw);
5929 if (cpudl_init(&rd->cpudl) != 0)
5930 goto free_rto_mask;
5931
5932 if (cpupri_init(&rd->cpupri) != 0)
5933 goto free_cpudl;
5934 return 0;
5935
5936free_cpudl:
5937 cpudl_cleanup(&rd->cpudl);
5938free_rto_mask:
5939 free_cpumask_var(rd->rto_mask);
5940free_dlo_mask:
5941 free_cpumask_var(rd->dlo_mask);
5942free_online:
5943 free_cpumask_var(rd->online);
5944free_span:
5945 free_cpumask_var(rd->span);
5946out:
5947 return -ENOMEM;
5948}
5949
5950/*
5951 * By default the system creates a single root-domain with all CPUs as
5952 * members (mimicking the global state we have today).
5953 */
5954struct root_domain def_root_domain;
5955
5956static void init_defrootdomain(void)
5957{
5958 init_rootdomain(&def_root_domain);
5959
5960 atomic_set(&def_root_domain.refcount, 1);
5961}
5962
5963static struct root_domain *alloc_rootdomain(void)
5964{
5965 struct root_domain *rd;
5966
5967 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
5968 if (!rd)
5969 return NULL;
5970
5971 if (init_rootdomain(rd) != 0) {
5972 kfree(rd);
5973 return NULL;
5974 }
5975
5976 return rd;
5977}
5978
5979static void free_sched_groups(struct sched_group *sg, int free_sgc)
5980{
5981 struct sched_group *tmp, *first;
5982
5983 if (!sg)
5984 return;
5985
5986 first = sg;
5987 do {
5988 tmp = sg->next;
5989
5990 if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
5991 kfree(sg->sgc);
5992
5993 kfree(sg);
5994 sg = tmp;
5995 } while (sg != first);
5996}
5997
5998static void destroy_sched_domain(struct sched_domain *sd)
5999{
6000 /*
6001 * If its an overlapping domain it has private groups, iterate and
6002 * nuke them all.
6003 */
6004 if (sd->flags & SD_OVERLAP) {
6005 free_sched_groups(sd->groups, 1);
6006 } else if (atomic_dec_and_test(&sd->groups->ref)) {
6007 kfree(sd->groups->sgc);
6008 kfree(sd->groups);
6009 }
6010 if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
6011 kfree(sd->shared);
6012 kfree(sd);
6013}
6014
6015static void destroy_sched_domains_rcu(struct rcu_head *rcu)
6016{
6017 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
6018
6019 while (sd) {
6020 struct sched_domain *parent = sd->parent;
6021 destroy_sched_domain(sd);
6022 sd = parent;
6023 }
6024}
6025
6026static void destroy_sched_domains(struct sched_domain *sd)
6027{
6028 if (sd)
6029 call_rcu(&sd->rcu, destroy_sched_domains_rcu);
6030}
6031
6032/*
6033 * Keep a special pointer to the highest sched_domain that has
6034 * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
6035 * allows us to avoid some pointer chasing select_idle_sibling().
6036 *
6037 * Also keep a unique ID per domain (we use the first CPU number in
6038 * the cpumask of the domain), this allows us to quickly tell if
6039 * two CPUs are in the same cache domain, see cpus_share_cache().
6040 */
6041DEFINE_PER_CPU(struct sched_domain *, sd_llc);
6042DEFINE_PER_CPU(int, sd_llc_size);
6043DEFINE_PER_CPU(int, sd_llc_id);
6044DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
6045DEFINE_PER_CPU(struct sched_domain *, sd_numa);
6046DEFINE_PER_CPU(struct sched_domain *, sd_asym);
6047
6048static void update_top_cache_domain(int cpu)
6049{
6050 struct sched_domain_shared *sds = NULL;
6051 struct sched_domain *sd;
6052 int id = cpu;
6053 int size = 1;
6054
6055 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
6056 if (sd) {
6057 id = cpumask_first(sched_domain_span(sd));
6058 size = cpumask_weight(sched_domain_span(sd));
6059 sds = sd->shared;
6060 }
6061
6062 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
6063 per_cpu(sd_llc_size, cpu) = size;
6064 per_cpu(sd_llc_id, cpu) = id;
6065 rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
6066
6067 sd = lowest_flag_domain(cpu, SD_NUMA);
6068 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
6069
6070 sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
6071 rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
6072}
6073
6074/*
6075 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
6076 * hold the hotplug lock.
6077 */
6078static void
6079cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6080{
6081 struct rq *rq = cpu_rq(cpu);
6082 struct sched_domain *tmp;
6083
6084 /* Remove the sched domains which do not contribute to scheduling. */
6085 for (tmp = sd; tmp; ) {
6086 struct sched_domain *parent = tmp->parent;
6087 if (!parent)
6088 break;
6089
6090 if (sd_parent_degenerate(tmp, parent)) {
6091 tmp->parent = parent->parent;
6092 if (parent->parent)
6093 parent->parent->child = tmp;
6094 /*
6095 * Transfer SD_PREFER_SIBLING down in case of a
6096 * degenerate parent; the spans match for this
6097 * so the property transfers.
6098 */
6099 if (parent->flags & SD_PREFER_SIBLING)
6100 tmp->flags |= SD_PREFER_SIBLING;
6101 destroy_sched_domain(parent);
6102 } else
6103 tmp = tmp->parent;
6104 }
6105
6106 if (sd && sd_degenerate(sd)) {
6107 tmp = sd;
6108 sd = sd->parent;
6109 destroy_sched_domain(tmp);
6110 if (sd)
6111 sd->child = NULL;
6112 }
6113
6114 sched_domain_debug(sd, cpu);
6115
6116 rq_attach_root(rq, rd);
6117 tmp = rq->sd;
6118 rcu_assign_pointer(rq->sd, sd);
6119 destroy_sched_domains(tmp);
6120
6121 update_top_cache_domain(cpu);
6122}
6123
6124/* Setup the mask of CPUs configured for isolated domains */
6125static int __init isolated_cpu_setup(char *str)
6126{
6127 int ret;
6128
6129 alloc_bootmem_cpumask_var(&cpu_isolated_map);
6130 ret = cpulist_parse(str, cpu_isolated_map);
6131 if (ret) {
6132 pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids);
6133 return 0;
6134 }
6135 return 1;
6136}
6137__setup("isolcpus=", isolated_cpu_setup);
6138
6139struct s_data {
6140 struct sched_domain ** __percpu sd;
6141 struct root_domain *rd;
6142};
6143
6144enum s_alloc {
6145 sa_rootdomain,
6146 sa_sd,
6147 sa_sd_storage,
6148 sa_none,
6149};
6150
6151/*
6152 * Build an iteration mask that can exclude certain CPUs from the upwards
6153 * domain traversal.
6154 *
6155 * Asymmetric node setups can result in situations where the domain tree is of
6156 * unequal depth, make sure to skip domains that already cover the entire
6157 * range.
6158 *
6159 * In that case build_sched_domains() will have terminated the iteration early
6160 * and our sibling sd spans will be empty. Domains should always include the
6161 * CPU they're built on, so check that.
6162 */
6163static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
6164{
6165 const struct cpumask *span = sched_domain_span(sd);
6166 struct sd_data *sdd = sd->private;
6167 struct sched_domain *sibling;
6168 int i;
6169
6170 for_each_cpu(i, span) {
6171 sibling = *per_cpu_ptr(sdd->sd, i);
6172 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
6173 continue;
6174
6175 cpumask_set_cpu(i, sched_group_mask(sg));
6176 }
6177}
6178
6179/*
6180 * Return the canonical balance CPU for this group, this is the first CPU
6181 * of this group that's also in the iteration mask.
6182 */
6183int group_balance_cpu(struct sched_group *sg)
6184{
6185 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
6186}
6187
6188static int
6189build_overlap_sched_groups(struct sched_domain *sd, int cpu)
6190{
6191 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
6192 const struct cpumask *span = sched_domain_span(sd);
6193 struct cpumask *covered = sched_domains_tmpmask;
6194 struct sd_data *sdd = sd->private;
6195 struct sched_domain *sibling;
6196 int i;
6197
6198 cpumask_clear(covered);
6199
6200 for_each_cpu(i, span) {
6201 struct cpumask *sg_span;
6202
6203 if (cpumask_test_cpu(i, covered))
6204 continue;
6205
6206 sibling = *per_cpu_ptr(sdd->sd, i);
6207
6208 /* See the comment near build_group_mask(). */
6209 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
6210 continue;
6211
6212 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6213 GFP_KERNEL, cpu_to_node(cpu));
6214
6215 if (!sg)
6216 goto fail;
6217
6218 sg_span = sched_group_cpus(sg);
6219 if (sibling->child)
6220 cpumask_copy(sg_span, sched_domain_span(sibling->child));
6221 else
6222 cpumask_set_cpu(i, sg_span);
6223
6224 cpumask_or(covered, covered, sg_span);
6225
6226 sg->sgc = *per_cpu_ptr(sdd->sgc, i);
6227 if (atomic_inc_return(&sg->sgc->ref) == 1)
6228 build_group_mask(sd, sg);
6229
6230 /*
6231 * Initialize sgc->capacity such that even if we mess up the
6232 * domains and no possible iteration will get us here, we won't
6233 * die on a /0 trap.
6234 */
6235 sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
6236 sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
6237
6238 /*
6239 * Make sure the first group of this domain contains the
6240 * canonical balance CPU. Otherwise the sched_domain iteration
6241 * breaks. See update_sg_lb_stats().
6242 */
6243 if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
6244 group_balance_cpu(sg) == cpu)
6245 groups = sg;
6246
6247 if (!first)
6248 first = sg;
6249 if (last)
6250 last->next = sg;
6251 last = sg;
6252 last->next = first;
6253 }
6254 sd->groups = groups;
6255
6256 return 0;
6257
6258fail:
6259 free_sched_groups(first, 0);
6260
6261 return -ENOMEM;
6262}
6263
6264static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
6265{
6266 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
6267 struct sched_domain *child = sd->child;
6268
6269 if (child)
6270 cpu = cpumask_first(sched_domain_span(child));
6271
6272 if (sg) {
6273 *sg = *per_cpu_ptr(sdd->sg, cpu);
6274 (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);
6275
6276 /* For claim_allocations: */
6277 atomic_set(&(*sg)->sgc->ref, 1);
6278 }
6279
6280 return cpu;
6281}
6282
6283/*
6284 * build_sched_groups will build a circular linked list of the groups
6285 * covered by the given span, and will set each group's ->cpumask correctly,
6286 * and ->cpu_capacity to 0.
6287 *
6288 * Assumes the sched_domain tree is fully constructed
6289 */
6290static int
6291build_sched_groups(struct sched_domain *sd, int cpu)
6292{
6293 struct sched_group *first = NULL, *last = NULL;
6294 struct sd_data *sdd = sd->private;
6295 const struct cpumask *span = sched_domain_span(sd);
6296 struct cpumask *covered;
6297 int i;
6298
6299 get_group(cpu, sdd, &sd->groups);
6300 atomic_inc(&sd->groups->ref);
6301
6302 if (cpu != cpumask_first(span))
6303 return 0;
6304
6305 lockdep_assert_held(&sched_domains_mutex);
6306 covered = sched_domains_tmpmask;
6307
6308 cpumask_clear(covered);
6309
6310 for_each_cpu(i, span) {
6311 struct sched_group *sg;
6312 int group, j;
6313
6314 if (cpumask_test_cpu(i, covered))
6315 continue;
6316
6317 group = get_group(i, sdd, &sg);
6318 cpumask_setall(sched_group_mask(sg));
6319
6320 for_each_cpu(j, span) {
6321 if (get_group(j, sdd, NULL) != group)
6322 continue;
6323
6324 cpumask_set_cpu(j, covered);
6325 cpumask_set_cpu(j, sched_group_cpus(sg));
6326 }
6327
6328 if (!first)
6329 first = sg;
6330 if (last)
6331 last->next = sg;
6332 last = sg;
6333 }
6334 last->next = first;
6335
6336 return 0;
6337}
6338
6339/*
6340 * Initialize sched groups cpu_capacity.
6341 *
6342 * cpu_capacity indicates the capacity of sched group, which is used while
6343 * distributing the load between different sched groups in a sched domain.
6344 * Typically cpu_capacity for all the groups in a sched domain will be same
6345 * unless there are asymmetries in the topology. If there are asymmetries,
6346 * group having more cpu_capacity will pickup more load compared to the
6347 * group having less cpu_capacity.
6348 */
6349static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
6350{
6351 struct sched_group *sg = sd->groups;
6352
6353 WARN_ON(!sg);
6354
6355 do {
6356 int cpu, max_cpu = -1;
6357
6358 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
6359
6360 if (!(sd->flags & SD_ASYM_PACKING))
6361 goto next;
6362
6363 for_each_cpu(cpu, sched_group_cpus(sg)) {
6364 if (max_cpu < 0)
6365 max_cpu = cpu;
6366 else if (sched_asym_prefer(cpu, max_cpu))
6367 max_cpu = cpu;
6368 }
6369 sg->asym_prefer_cpu = max_cpu;
6370
6371next:
6372 sg = sg->next;
6373 } while (sg != sd->groups);
6374
6375 if (cpu != group_balance_cpu(sg))
6376 return;
6377
6378 update_group_capacity(sd, cpu);
6379}
6380
6381/*
6382 * Initializers for schedule domains
6383 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
6384 */
6385
6386static int default_relax_domain_level = -1;
6387int sched_domain_level_max;
6388
6389static int __init setup_relax_domain_level(char *str)
6390{
6391 if (kstrtoint(str, 0, &default_relax_domain_level))
6392 pr_warn("Unable to set relax_domain_level\n");
6393
6394 return 1;
6395}
6396__setup("relax_domain_level=", setup_relax_domain_level);
6397
6398static void set_domain_attribute(struct sched_domain *sd,
6399 struct sched_domain_attr *attr)
6400{
6401 int request;
6402
6403 if (!attr || attr->relax_domain_level < 0) {
6404 if (default_relax_domain_level < 0)
6405 return;
6406 else
6407 request = default_relax_domain_level;
6408 } else
6409 request = attr->relax_domain_level;
6410 if (request < sd->level) {
6411 /* Turn off idle balance on this domain: */
6412 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
6413 } else {
6414 /* Turn on idle balance on this domain: */
6415 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
6416 }
6417}
6418
6419static void __sdt_free(const struct cpumask *cpu_map);
6420static int __sdt_alloc(const struct cpumask *cpu_map);
6421
6422static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
6423 const struct cpumask *cpu_map)
6424{
6425 switch (what) {
6426 case sa_rootdomain:
6427 if (!atomic_read(&d->rd->refcount))
6428 free_rootdomain(&d->rd->rcu);
6429 /* Fall through */
6430 case sa_sd:
6431 free_percpu(d->sd);
6432 /* Fall through */
6433 case sa_sd_storage:
6434 __sdt_free(cpu_map);
6435 /* Fall through */
6436 case sa_none:
6437 break;
6438 }
6439}
6440
6441static enum s_alloc
6442__visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
6443{
6444 memset(d, 0, sizeof(*d));
6445
6446 if (__sdt_alloc(cpu_map))
6447 return sa_sd_storage;
6448 d->sd = alloc_percpu(struct sched_domain *);
6449 if (!d->sd)
6450 return sa_sd_storage;
6451 d->rd = alloc_rootdomain();
6452 if (!d->rd)
6453 return sa_sd;
6454 return sa_rootdomain;
6455}
6456
6457/*
6458 * NULL the sd_data elements we've used to build the sched_domain and
6459 * sched_group structure so that the subsequent __free_domain_allocs()
6460 * will not free the data we're using.
6461 */
6462static void claim_allocations(int cpu, struct sched_domain *sd)
6463{
6464 struct sd_data *sdd = sd->private;
6465
6466 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
6467 *per_cpu_ptr(sdd->sd, cpu) = NULL;
6468
6469 if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
6470 *per_cpu_ptr(sdd->sds, cpu) = NULL;
6471
6472 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
6473 *per_cpu_ptr(sdd->sg, cpu) = NULL;
6474
6475 if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
6476 *per_cpu_ptr(sdd->sgc, cpu) = NULL;
6477}
6478
6479#ifdef CONFIG_NUMA
6480static int sched_domains_numa_levels;
6481enum numa_topology_type sched_numa_topology_type;
6482static int *sched_domains_numa_distance;
6483int sched_max_numa_distance;
6484static struct cpumask ***sched_domains_numa_masks;
6485static int sched_domains_curr_level;
6486#endif
6487
6488/*
6489 * SD_flags allowed in topology descriptions.
6490 *
6491 * These flags are purely descriptive of the topology and do not prescribe
6492 * behaviour. Behaviour is artificial and mapped in the below sd_init()
6493 * function:
6494 *
6495 * SD_SHARE_CPUCAPACITY - describes SMT topologies
6496 * SD_SHARE_PKG_RESOURCES - describes shared caches
6497 * SD_NUMA - describes NUMA topologies
6498 * SD_SHARE_POWERDOMAIN - describes shared power domain
6499 * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies
6500 *
6501 * Odd one out, which beside describing the topology has a quirk also
6502 * prescribes the desired behaviour that goes along with it:
6503 *
6504 * SD_ASYM_PACKING - describes SMT quirks
6505 */
6506#define TOPOLOGY_SD_FLAGS \
6507 (SD_SHARE_CPUCAPACITY | \
6508 SD_SHARE_PKG_RESOURCES | \
6509 SD_NUMA | \
6510 SD_ASYM_PACKING | \
6511 SD_ASYM_CPUCAPACITY | \
6512 SD_SHARE_POWERDOMAIN)
6513
6514static struct sched_domain *
6515sd_init(struct sched_domain_topology_level *tl,
6516 const struct cpumask *cpu_map,
6517 struct sched_domain *child, int cpu)
6518{
6519 struct sd_data *sdd = &tl->data;
6520 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
6521 int sd_id, sd_weight, sd_flags = 0;
6522
6523#ifdef CONFIG_NUMA
6524 /*
6525 * Ugly hack to pass state to sd_numa_mask()...
6526 */
6527 sched_domains_curr_level = tl->numa_level;
6528#endif
6529
6530 sd_weight = cpumask_weight(tl->mask(cpu));
6531
6532 if (tl->sd_flags)
6533 sd_flags = (*tl->sd_flags)();
6534 if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
6535 "wrong sd_flags in topology description\n"))
6536 sd_flags &= ~TOPOLOGY_SD_FLAGS;
6537
6538 *sd = (struct sched_domain){
6539 .min_interval = sd_weight,
6540 .max_interval = 2*sd_weight,
6541 .busy_factor = 32,
6542 .imbalance_pct = 125,
6543
6544 .cache_nice_tries = 0,
6545 .busy_idx = 0,
6546 .idle_idx = 0,
6547 .newidle_idx = 0,
6548 .wake_idx = 0,
6549 .forkexec_idx = 0,
6550
6551 .flags = 1*SD_LOAD_BALANCE
6552 | 1*SD_BALANCE_NEWIDLE
6553 | 1*SD_BALANCE_EXEC
6554 | 1*SD_BALANCE_FORK
6555 | 0*SD_BALANCE_WAKE
6556 | 1*SD_WAKE_AFFINE
6557 | 0*SD_SHARE_CPUCAPACITY
6558 | 0*SD_SHARE_PKG_RESOURCES
6559 | 0*SD_SERIALIZE
6560 | 0*SD_PREFER_SIBLING
6561 | 0*SD_NUMA
6562 | sd_flags
6563 ,
6564
6565 .last_balance = jiffies,
6566 .balance_interval = sd_weight,
6567 .smt_gain = 0,
6568 .max_newidle_lb_cost = 0,
6569 .next_decay_max_lb_cost = jiffies,
6570 .child = child,
6571#ifdef CONFIG_SCHED_DEBUG
6572 .name = tl->name,
6573#endif
6574 };
6575
6576 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6577 sd_id = cpumask_first(sched_domain_span(sd));
6578
6579 /*
6580 * Convert topological properties into behaviour.
6581 */
6582
6583 if (sd->flags & SD_ASYM_CPUCAPACITY) {
6584 struct sched_domain *t = sd;
6585
6586 for_each_lower_domain(t)
6587 t->flags |= SD_BALANCE_WAKE;
6588 }
6589
6590 if (sd->flags & SD_SHARE_CPUCAPACITY) {
6591 sd->flags |= SD_PREFER_SIBLING;
6592 sd->imbalance_pct = 110;
6593 sd->smt_gain = 1178; /* ~15% */
6594
6595 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
6596 sd->imbalance_pct = 117;
6597 sd->cache_nice_tries = 1;
6598 sd->busy_idx = 2;
6599
6600#ifdef CONFIG_NUMA
6601 } else if (sd->flags & SD_NUMA) {
6602 sd->cache_nice_tries = 2;
6603 sd->busy_idx = 3;
6604 sd->idle_idx = 2;
6605
6606 sd->flags |= SD_SERIALIZE;
6607 if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
6608 sd->flags &= ~(SD_BALANCE_EXEC |
6609 SD_BALANCE_FORK |
6610 SD_WAKE_AFFINE);
6611 }
6612
6613#endif
6614 } else {
6615 sd->flags |= SD_PREFER_SIBLING;
6616 sd->cache_nice_tries = 1;
6617 sd->busy_idx = 2;
6618 sd->idle_idx = 1;
6619 }
6620
6621 /*
6622 * For all levels sharing cache; connect a sched_domain_shared
6623 * instance.
6624 */
6625 if (sd->flags & SD_SHARE_PKG_RESOURCES) {
6626 sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
6627 atomic_inc(&sd->shared->ref);
6628 atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
6629 }
6630
6631 sd->private = sdd;
6632
6633 return sd;
6634}
6635
6636/*
6637 * Topology list, bottom-up.
6638 */
6639static struct sched_domain_topology_level default_topology[] = {
6640#ifdef CONFIG_SCHED_SMT
6641 { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
6642#endif
6643#ifdef CONFIG_SCHED_MC
6644 { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
6645#endif
6646 { cpu_cpu_mask, SD_INIT_NAME(DIE) },
6647 { NULL, },
6648};
6649
6650static struct sched_domain_topology_level *sched_domain_topology =
6651 default_topology;
6652
6653#define for_each_sd_topology(tl) \
6654 for (tl = sched_domain_topology; tl->mask; tl++)
6655
6656void set_sched_topology(struct sched_domain_topology_level *tl)
6657{
6658 if (WARN_ON_ONCE(sched_smp_initialized))
6659 return;
6660
6661 sched_domain_topology = tl;
6662}
6663
6664#ifdef CONFIG_NUMA
6665
6666static const struct cpumask *sd_numa_mask(int cpu)
6667{
6668 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
6669}
6670
6671static void sched_numa_warn(const char *str)
6672{
6673 static int done = false;
6674 int i,j;
6675
6676 if (done)
6677 return;
6678
6679 done = true;
6680
6681 printk(KERN_WARNING "ERROR: %s\n\n", str);
6682
6683 for (i = 0; i < nr_node_ids; i++) {
6684 printk(KERN_WARNING " ");
6685 for (j = 0; j < nr_node_ids; j++)
6686 printk(KERN_CONT "%02d ", node_distance(i,j));
6687 printk(KERN_CONT "\n");
6688 }
6689 printk(KERN_WARNING "\n");
6690}
6691
6692bool find_numa_distance(int distance)
6693{
6694 int i;
6695
6696 if (distance == node_distance(0, 0))
6697 return true;
6698
6699 for (i = 0; i < sched_domains_numa_levels; i++) {
6700 if (sched_domains_numa_distance[i] == distance)
6701 return true;
6702 }
6703
6704 return false;
6705}
6706
6707/*
6708 * A system can have three types of NUMA topology:
6709 * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
6710 * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
6711 * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
6712 *
6713 * The difference between a glueless mesh topology and a backplane
6714 * topology lies in whether communication between not directly
6715 * connected nodes goes through intermediary nodes (where programs
6716 * could run), or through backplane controllers. This affects
6717 * placement of programs.
6718 *
6719 * The type of topology can be discerned with the following tests:
6720 * - If the maximum distance between any nodes is 1 hop, the system
6721 * is directly connected.
6722 * - If for two nodes A and B, located N > 1 hops away from each other,
6723 * there is an intermediary node C, which is < N hops away from both
6724 * nodes A and B, the system is a glueless mesh.
6725 */
6726static void init_numa_topology_type(void)
6727{
6728 int a, b, c, n;
6729
6730 n = sched_max_numa_distance;
6731
6732 if (sched_domains_numa_levels <= 1) {
6733 sched_numa_topology_type = NUMA_DIRECT;
6734 return;
6735 }
6736
6737 for_each_online_node(a) {
6738 for_each_online_node(b) {
6739 /* Find two nodes furthest removed from each other. */
6740 if (node_distance(a, b) < n)
6741 continue;
6742
6743 /* Is there an intermediary node between a and b? */
6744 for_each_online_node(c) {
6745 if (node_distance(a, c) < n &&
6746 node_distance(b, c) < n) {
6747 sched_numa_topology_type =
6748 NUMA_GLUELESS_MESH;
6749 return;
6750 }
6751 }
6752
6753 sched_numa_topology_type = NUMA_BACKPLANE;
6754 return;
6755 }
6756 }
6757}
6758
6759static void sched_init_numa(void)
6760{
6761 int next_distance, curr_distance = node_distance(0, 0);
6762 struct sched_domain_topology_level *tl;
6763 int level = 0;
6764 int i, j, k;
6765
6766 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
6767 if (!sched_domains_numa_distance)
6768 return;
6769
6770 /*
6771 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
6772 * unique distances in the node_distance() table.
6773 *
6774 * Assumes node_distance(0,j) includes all distances in
6775 * node_distance(i,j) in order to avoid cubic time.
6776 */
6777 next_distance = curr_distance;
6778 for (i = 0; i < nr_node_ids; i++) {
6779 for (j = 0; j < nr_node_ids; j++) {
6780 for (k = 0; k < nr_node_ids; k++) {
6781 int distance = node_distance(i, k);
6782
6783 if (distance > curr_distance &&
6784 (distance < next_distance ||
6785 next_distance == curr_distance))
6786 next_distance = distance;
6787
6788 /*
6789 * While not a strong assumption it would be nice to know
6790 * about cases where if node A is connected to B, B is not
6791 * equally connected to A.
6792 */
6793 if (sched_debug() && node_distance(k, i) != distance)
6794 sched_numa_warn("Node-distance not symmetric");
6795
6796 if (sched_debug() && i && !find_numa_distance(distance))
6797 sched_numa_warn("Node-0 not representative");
6798 }
6799 if (next_distance != curr_distance) {
6800 sched_domains_numa_distance[level++] = next_distance;
6801 sched_domains_numa_levels = level;
6802 curr_distance = next_distance;
6803 } else break;
6804 }
6805
6806 /*
6807 * In case of sched_debug() we verify the above assumption.
6808 */
6809 if (!sched_debug())
6810 break;
6811 }
6812
6813 if (!level)
6814 return;
6815
6816 /*
6817 * 'level' contains the number of unique distances, excluding the
6818 * identity distance node_distance(i,i).
6819 *
6820 * The sched_domains_numa_distance[] array includes the actual distance
6821 * numbers.
6822 */
6823
6824 /*
6825 * Here, we should temporarily reset sched_domains_numa_levels to 0.
6826 * If it fails to allocate memory for array sched_domains_numa_masks[][],
6827 * the array will contain less then 'level' members. This could be
6828 * dangerous when we use it to iterate array sched_domains_numa_masks[][]
6829 * in other functions.
6830 *
6831 * We reset it to 'level' at the end of this function.
6832 */
6833 sched_domains_numa_levels = 0;
6834
6835 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
6836 if (!sched_domains_numa_masks)
6837 return;
6838
6839 /*
6840 * Now for each level, construct a mask per node which contains all
6841 * CPUs of nodes that are that many hops away from us.
6842 */
6843 for (i = 0; i < level; i++) {
6844 sched_domains_numa_masks[i] =
6845 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
6846 if (!sched_domains_numa_masks[i])
6847 return;
6848
6849 for (j = 0; j < nr_node_ids; j++) {
6850 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
6851 if (!mask)
6852 return;
6853
6854 sched_domains_numa_masks[i][j] = mask;
6855
6856 for_each_node(k) {
6857 if (node_distance(j, k) > sched_domains_numa_distance[i])
6858 continue;
6859
6860 cpumask_or(mask, mask, cpumask_of_node(k));
6861 }
6862 }
6863 }
6864
6865 /* Compute default topology size */
6866 for (i = 0; sched_domain_topology[i].mask; i++);
6867
6868 tl = kzalloc((i + level + 1) *
6869 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
6870 if (!tl)
6871 return;
6872
6873 /*
6874 * Copy the default topology bits..
6875 */
6876 for (i = 0; sched_domain_topology[i].mask; i++)
6877 tl[i] = sched_domain_topology[i];
6878
6879 /*
6880 * .. and append 'j' levels of NUMA goodness.
6881 */
6882 for (j = 0; j < level; i++, j++) {
6883 tl[i] = (struct sched_domain_topology_level){
6884 .mask = sd_numa_mask,
6885 .sd_flags = cpu_numa_flags,
6886 .flags = SDTL_OVERLAP,
6887 .numa_level = j,
6888 SD_INIT_NAME(NUMA)
6889 };
6890 }
6891
6892 sched_domain_topology = tl;
6893
6894 sched_domains_numa_levels = level;
6895 sched_max_numa_distance = sched_domains_numa_distance[level - 1];
6896
6897 init_numa_topology_type();
6898}
6899
6900static void sched_domains_numa_masks_set(unsigned int cpu)
6901{
6902 int node = cpu_to_node(cpu);
6903 int i, j;
6904
6905 for (i = 0; i < sched_domains_numa_levels; i++) {
6906 for (j = 0; j < nr_node_ids; j++) {
6907 if (node_distance(j, node) <= sched_domains_numa_distance[i])
6908 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
6909 }
6910 }
6911}
6912
6913static void sched_domains_numa_masks_clear(unsigned int cpu)
6914{
6915 int i, j;
6916
6917 for (i = 0; i < sched_domains_numa_levels; i++) {
6918 for (j = 0; j < nr_node_ids; j++)
6919 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
6920 }
6921}
6922
6923#else
6924static inline void sched_init_numa(void) { }
6925static void sched_domains_numa_masks_set(unsigned int cpu) { }
6926static void sched_domains_numa_masks_clear(unsigned int cpu) { }
6927#endif /* CONFIG_NUMA */
6928
6929static int __sdt_alloc(const struct cpumask *cpu_map)
6930{
6931 struct sched_domain_topology_level *tl;
6932 int j;
6933
6934 for_each_sd_topology(tl) {
6935 struct sd_data *sdd = &tl->data;
6936
6937 sdd->sd = alloc_percpu(struct sched_domain *);
6938 if (!sdd->sd)
6939 return -ENOMEM;
6940
6941 sdd->sds = alloc_percpu(struct sched_domain_shared *);
6942 if (!sdd->sds)
6943 return -ENOMEM;
6944
6945 sdd->sg = alloc_percpu(struct sched_group *);
6946 if (!sdd->sg)
6947 return -ENOMEM;
6948
6949 sdd->sgc = alloc_percpu(struct sched_group_capacity *);
6950 if (!sdd->sgc)
6951 return -ENOMEM;
6952
6953 for_each_cpu(j, cpu_map) {
6954 struct sched_domain *sd;
6955 struct sched_domain_shared *sds;
6956 struct sched_group *sg;
6957 struct sched_group_capacity *sgc;
6958
6959 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
6960 GFP_KERNEL, cpu_to_node(j));
6961 if (!sd)
6962 return -ENOMEM;
6963
6964 *per_cpu_ptr(sdd->sd, j) = sd;
6965
6966 sds = kzalloc_node(sizeof(struct sched_domain_shared),
6967 GFP_KERNEL, cpu_to_node(j));
6968 if (!sds)
6969 return -ENOMEM;
6970
6971 *per_cpu_ptr(sdd->sds, j) = sds;
6972
6973 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6974 GFP_KERNEL, cpu_to_node(j));
6975 if (!sg)
6976 return -ENOMEM;
6977
6978 sg->next = sg;
6979
6980 *per_cpu_ptr(sdd->sg, j) = sg;
6981
6982 sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
6983 GFP_KERNEL, cpu_to_node(j));
6984 if (!sgc)
6985 return -ENOMEM;
6986
6987 *per_cpu_ptr(sdd->sgc, j) = sgc;
6988 }
6989 }
6990
6991 return 0;
6992}
6993
6994static void __sdt_free(const struct cpumask *cpu_map)
6995{
6996 struct sched_domain_topology_level *tl;
6997 int j;
6998
6999 for_each_sd_topology(tl) {
7000 struct sd_data *sdd = &tl->data;
7001
7002 for_each_cpu(j, cpu_map) {
7003 struct sched_domain *sd;
7004
7005 if (sdd->sd) {
7006 sd = *per_cpu_ptr(sdd->sd, j);
7007 if (sd && (sd->flags & SD_OVERLAP))
7008 free_sched_groups(sd->groups, 0);
7009 kfree(*per_cpu_ptr(sdd->sd, j));
7010 }
7011
7012 if (sdd->sds)
7013 kfree(*per_cpu_ptr(sdd->sds, j));
7014 if (sdd->sg)
7015 kfree(*per_cpu_ptr(sdd->sg, j));
7016 if (sdd->sgc)
7017 kfree(*per_cpu_ptr(sdd->sgc, j));
7018 }
7019 free_percpu(sdd->sd);
7020 sdd->sd = NULL;
7021 free_percpu(sdd->sds);
7022 sdd->sds = NULL;
7023 free_percpu(sdd->sg);
7024 sdd->sg = NULL;
7025 free_percpu(sdd->sgc);
7026 sdd->sgc = NULL;
7027 }
7028}
7029
7030struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
7031 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7032 struct sched_domain *child, int cpu)
7033{
7034 struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
7035
7036 if (child) {
7037 sd->level = child->level + 1;
7038 sched_domain_level_max = max(sched_domain_level_max, sd->level);
7039 child->parent = sd;
7040
7041 if (!cpumask_subset(sched_domain_span(child),
7042 sched_domain_span(sd))) {
7043 pr_err("BUG: arch topology borken\n");
7044#ifdef CONFIG_SCHED_DEBUG
7045 pr_err(" the %s domain not a subset of the %s domain\n",
7046 child->name, sd->name);
7047#endif
7048 /* Fixup, ensure @sd has at least @child cpus. */
7049 cpumask_or(sched_domain_span(sd),
7050 sched_domain_span(sd),
7051 sched_domain_span(child));
7052 }
7053
7054 }
7055 set_domain_attribute(sd, attr);
7056
7057 return sd;
7058}
7059
7060/*
7061 * Build sched domains for a given set of CPUs and attach the sched domains
7062 * to the individual CPUs
7063 */
7064static int
7065build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
7066{
7067 enum s_alloc alloc_state;
7068 struct sched_domain *sd;
7069 struct s_data d;
7070 struct rq *rq = NULL;
7071 int i, ret = -ENOMEM;
7072
7073 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
7074 if (alloc_state != sa_rootdomain)
7075 goto error;
7076
7077 /* Set up domains for CPUs specified by the cpu_map: */
7078 for_each_cpu(i, cpu_map) {
7079 struct sched_domain_topology_level *tl;
7080
7081 sd = NULL;
7082 for_each_sd_topology(tl) {
7083 sd = build_sched_domain(tl, cpu_map, attr, sd, i);
7084 if (tl == sched_domain_topology)
7085 *per_cpu_ptr(d.sd, i) = sd;
7086 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
7087 sd->flags |= SD_OVERLAP;
7088 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
7089 break;
7090 }
7091 }
7092
7093 /* Build the groups for the domains */
7094 for_each_cpu(i, cpu_map) {
7095 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7096 sd->span_weight = cpumask_weight(sched_domain_span(sd));
7097 if (sd->flags & SD_OVERLAP) {
7098 if (build_overlap_sched_groups(sd, i))
7099 goto error;
7100 } else {
7101 if (build_sched_groups(sd, i))
7102 goto error;
7103 }
7104 }
7105 }
7106
7107 /* Calculate CPU capacity for physical packages and nodes */
7108 for (i = nr_cpumask_bits-1; i >= 0; i--) {
7109 if (!cpumask_test_cpu(i, cpu_map))
7110 continue;
7111
7112 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7113 claim_allocations(i, sd);
7114 init_sched_groups_capacity(i, sd);
7115 }
7116 }
7117
7118 /* Attach the domains */
7119 rcu_read_lock();
7120 for_each_cpu(i, cpu_map) {
7121 rq = cpu_rq(i);
7122 sd = *per_cpu_ptr(d.sd, i);
7123
7124 /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
7125 if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
7126 WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
7127
7128 cpu_attach_domain(sd, d.rd, i);
7129 }
7130 rcu_read_unlock();
7131
7132 if (rq && sched_debug_enabled) {
7133 pr_info("span: %*pbl (max cpu_capacity = %lu)\n",
7134 cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
7135 }
7136
7137 ret = 0;
7138error:
7139 __free_domain_allocs(&d, alloc_state, cpu_map);
7140 return ret;
7141}
7142
7143/* Current sched domains: */
7144static cpumask_var_t *doms_cur;
7145
7146/* Number of sched domains in 'doms_cur': */
7147static int ndoms_cur;
7148
7149/* Attribues of custom domains in 'doms_cur' */
7150static struct sched_domain_attr *dattr_cur;
7151
7152/*
7153 * Special case: If a kmalloc() of a doms_cur partition (array of
7154 * cpumask) fails, then fallback to a single sched domain,
7155 * as determined by the single cpumask fallback_doms.
7156 */
7157static cpumask_var_t fallback_doms;
7158
7159/*
7160 * arch_update_cpu_topology lets virtualized architectures update the
7161 * CPU core maps. It is supposed to return 1 if the topology changed
7162 * or 0 if it stayed the same.
7163 */
7164int __weak arch_update_cpu_topology(void)
7165{
7166 return 0;
7167}
7168
7169cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
7170{
7171 int i;
7172 cpumask_var_t *doms;
7173
7174 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
7175 if (!doms)
7176 return NULL;
7177 for (i = 0; i < ndoms; i++) {
7178 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
7179 free_sched_domains(doms, i);
7180 return NULL;
7181 }
7182 }
7183 return doms;
7184}
7185
7186void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
7187{
7188 unsigned int i;
7189 for (i = 0; i < ndoms; i++)
7190 free_cpumask_var(doms[i]);
7191 kfree(doms);
7192}
7193
7194/*
7195 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
7196 * For now this just excludes isolated CPUs, but could be used to
7197 * exclude other special cases in the future.
7198 */
7199static int init_sched_domains(const struct cpumask *cpu_map)
7200{
7201 int err;
7202
7203 arch_update_cpu_topology();
7204 ndoms_cur = 1;
7205 doms_cur = alloc_sched_domains(ndoms_cur);
7206 if (!doms_cur)
7207 doms_cur = &fallback_doms;
7208 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
7209 err = build_sched_domains(doms_cur[0], NULL);
7210 register_sched_domain_sysctl();
7211
7212 return err;
7213}
7214
7215/*
7216 * Detach sched domains from a group of CPUs specified in cpu_map
7217 * These CPUs will now be attached to the NULL domain
7218 */
7219static void detach_destroy_domains(const struct cpumask *cpu_map)
7220{
7221 int i;
7222
7223 rcu_read_lock();
7224 for_each_cpu(i, cpu_map)
7225 cpu_attach_domain(NULL, &def_root_domain, i);
7226 rcu_read_unlock();
7227}
7228
7229/* handle null as "default" */
7230static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7231 struct sched_domain_attr *new, int idx_new)
7232{
7233 struct sched_domain_attr tmp;
7234
7235 /* Fast path: */
7236 if (!new && !cur)
7237 return 1;
7238
7239 tmp = SD_ATTR_INIT;
7240 return !memcmp(cur ? (cur + idx_cur) : &tmp,
7241 new ? (new + idx_new) : &tmp,
7242 sizeof(struct sched_domain_attr));
7243}
7244
7245/*
7246 * Partition sched domains as specified by the 'ndoms_new'
7247 * cpumasks in the array doms_new[] of cpumasks. This compares
7248 * doms_new[] to the current sched domain partitioning, doms_cur[].
7249 * It destroys each deleted domain and builds each new domain.
7250 *
7251 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
7252 * The masks don't intersect (don't overlap.) We should setup one
7253 * sched domain for each mask. CPUs not in any of the cpumasks will
7254 * not be load balanced. If the same cpumask appears both in the
7255 * current 'doms_cur' domains and in the new 'doms_new', we can leave
7256 * it as it is.
7257 *
7258 * The passed in 'doms_new' should be allocated using
7259 * alloc_sched_domains. This routine takes ownership of it and will
7260 * free_sched_domains it when done with it. If the caller failed the
7261 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
7262 * and partition_sched_domains() will fallback to the single partition
7263 * 'fallback_doms', it also forces the domains to be rebuilt.
7264 *
7265 * If doms_new == NULL it will be replaced with cpu_online_mask.
7266 * ndoms_new == 0 is a special case for destroying existing domains,
7267 * and it will not create the default domain.
7268 *
7269 * Call with hotplug lock held
7270 */
7271void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
7272 struct sched_domain_attr *dattr_new)
7273{
7274 int i, j, n;
7275 int new_topology;
7276
7277 mutex_lock(&sched_domains_mutex);
7278
7279 /* Always unregister in case we don't destroy any domains: */
7280 unregister_sched_domain_sysctl();
7281
7282 /* Let the architecture update CPU core mappings: */
7283 new_topology = arch_update_cpu_topology();
7284
7285 n = doms_new ? ndoms_new : 0;
7286
7287 /* Destroy deleted domains: */
7288 for (i = 0; i < ndoms_cur; i++) {
7289 for (j = 0; j < n && !new_topology; j++) {
7290 if (cpumask_equal(doms_cur[i], doms_new[j])
7291 && dattrs_equal(dattr_cur, i, dattr_new, j))
7292 goto match1;
7293 }
7294 /* No match - a current sched domain not in new doms_new[] */
7295 detach_destroy_domains(doms_cur[i]);
7296match1:
7297 ;
7298 }
7299
7300 n = ndoms_cur;
7301 if (doms_new == NULL) {
7302 n = 0;
7303 doms_new = &fallback_doms;
7304 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
7305 WARN_ON_ONCE(dattr_new);
7306 }
7307
7308 /* Build new domains: */
7309 for (i = 0; i < ndoms_new; i++) {
7310 for (j = 0; j < n && !new_topology; j++) {
7311 if (cpumask_equal(doms_new[i], doms_cur[j])
7312 && dattrs_equal(dattr_new, i, dattr_cur, j))
7313 goto match2;
7314 }
7315 /* No match - add a new doms_new */
7316 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
7317match2:
7318 ;
7319 }
7320
7321 /* Remember the new sched domains: */
7322 if (doms_cur != &fallback_doms)
7323 free_sched_domains(doms_cur, ndoms_cur);
7324
7325 kfree(dattr_cur);
7326 doms_cur = doms_new;
7327 dattr_cur = dattr_new;
7328 ndoms_cur = ndoms_new;
7329
7330 register_sched_domain_sysctl();
7331
7332 mutex_unlock(&sched_domains_mutex);
7333}
7334
 /*
  * used to mark begin/end of suspend/resume:
  */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8ff5cc539e8a..17ed94b9b413 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -223,7 +223,7 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
 	       dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
 }
 
-extern struct mutex sched_domains_mutex;
+extern void init_dl_bw(struct dl_bw *dl_b);
 
 #ifdef CONFIG_CGROUP_SCHED
 
@@ -584,6 +584,13 @@ struct root_domain {
 };
 
 extern struct root_domain def_root_domain;
+extern struct mutex sched_domains_mutex;
+extern cpumask_var_t fallback_doms;
+extern cpumask_var_t sched_domains_tmpmask;
+
+extern void init_defrootdomain(void);
+extern int init_sched_domains(const struct cpumask *cpu_map);
+extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
 
 #endif /* CONFIG_SMP */
 
@@ -886,6 +893,16 @@ extern int sched_max_numa_distance;
 extern bool find_numa_distance(int distance);
 #endif
 
+#ifdef CONFIG_NUMA
+extern void sched_init_numa(void);
+extern void sched_domains_numa_masks_set(unsigned int cpu);
+extern void sched_domains_numa_masks_clear(unsigned int cpu);
+#else
+static inline void sched_init_numa(void) { }
+static inline void sched_domains_numa_masks_set(unsigned int cpu) { }
+static inline void sched_domains_numa_masks_clear(unsigned int cpu) { }
+#endif
+
 #ifdef CONFIG_NUMA_BALANCING
 /* The regions in numa_faults array from task_struct */
 enum numa_faults_stats {
@@ -1752,6 +1769,10 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
 	__release(rq2->lock);
 }
 
+extern void set_rq_online (struct rq *rq);
+extern void set_rq_offline(struct rq *rq);
+extern bool sched_smp_initialized;
+
 #else /* CONFIG_SMP */
 
 /*
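
The CONFIG_NUMA hunk above relies on the usual header idiom for optional features: with the option enabled the header only declares the functions (their definitions now live in topology.c), otherwise it provides empty static inline stubs so call sites need no #ifdef of their own. A minimal sketch of that idiom, reusing sched_init_numa() from the hunk; the setup_topology() caller is hypothetical and only illustrates that the stub compiles away:

/* header fragment, sched.h style */
#ifdef CONFIG_NUMA
extern void sched_init_numa(void);              /* real definition in topology.c */
#else
static inline void sched_init_numa(void) { }    /* no-op when !CONFIG_NUMA */
#endif

/* hypothetical caller: builds the same way with or without CONFIG_NUMA */
void setup_topology(void)
{
	sched_init_numa();      /* no #ifdef needed at the call site */
}
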
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
new file mode 100644
index 000000000000..1b0b4fb12837
--- /dev/null
+++ b/kernel/sched/topology.c
@@ -0,0 +1,1658 @@
1/*
2 * Scheduler topology setup/handling methods
3 */
4#include <linux/sched.h>
5#include <linux/mutex.h>
6
7#include "sched.h"
8
9DEFINE_MUTEX(sched_domains_mutex);
10
11/* Protected by sched_domains_mutex: */
12cpumask_var_t sched_domains_tmpmask;
13
14#ifdef CONFIG_SCHED_DEBUG
15
16static __read_mostly int sched_debug_enabled;
17
18static int __init sched_debug_setup(char *str)
19{
20 sched_debug_enabled = 1;
21
22 return 0;
23}
24early_param("sched_debug", sched_debug_setup);
25
26static inline bool sched_debug(void)
27{
28 return sched_debug_enabled;
29}
30
31static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
32 struct cpumask *groupmask)
33{
34 struct sched_group *group = sd->groups;
35
36 cpumask_clear(groupmask);
37
38 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
39
40 if (!(sd->flags & SD_LOAD_BALANCE)) {
41 printk("does not load-balance\n");
42 if (sd->parent)
43 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
44 " has parent");
45 return -1;
46 }
47
48 printk(KERN_CONT "span %*pbl level %s\n",
49 cpumask_pr_args(sched_domain_span(sd)), sd->name);
50
51 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
52 printk(KERN_ERR "ERROR: domain->span does not contain "
53 "CPU%d\n", cpu);
54 }
55 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
56 printk(KERN_ERR "ERROR: domain->groups does not contain"
57 " CPU%d\n", cpu);
58 }
59
60 printk(KERN_DEBUG "%*s groups:", level + 1, "");
61 do {
62 if (!group) {
63 printk("\n");
64 printk(KERN_ERR "ERROR: group is NULL\n");
65 break;
66 }
67
68 if (!cpumask_weight(sched_group_cpus(group))) {
69 printk(KERN_CONT "\n");
70 printk(KERN_ERR "ERROR: empty group\n");
71 break;
72 }
73
74 if (!(sd->flags & SD_OVERLAP) &&
75 cpumask_intersects(groupmask, sched_group_cpus(group))) {
76 printk(KERN_CONT "\n");
77 printk(KERN_ERR "ERROR: repeated CPUs\n");
78 break;
79 }
80
81 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
82
83 printk(KERN_CONT " %*pbl",
84 cpumask_pr_args(sched_group_cpus(group)));
85 if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
86 printk(KERN_CONT " (cpu_capacity = %lu)",
87 group->sgc->capacity);
88 }
89
90 group = group->next;
91 } while (group != sd->groups);
92 printk(KERN_CONT "\n");
93
94 if (!cpumask_equal(sched_domain_span(sd), groupmask))
95 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
96
97 if (sd->parent &&
98 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
99 printk(KERN_ERR "ERROR: parent span is not a superset "
100 "of domain->span\n");
101 return 0;
102}
103
104static void sched_domain_debug(struct sched_domain *sd, int cpu)
105{
106 int level = 0;
107
108 if (!sched_debug_enabled)
109 return;
110
111 if (!sd) {
112 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
113 return;
114 }
115
116 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
117
118 for (;;) {
119 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
120 break;
121 level++;
122 sd = sd->parent;
123 if (!sd)
124 break;
125 }
126}
127#else /* !CONFIG_SCHED_DEBUG */
128
129# define sched_debug_enabled 0
130# define sched_domain_debug(sd, cpu) do { } while (0)
131static inline bool sched_debug(void)
132{
133 return false;
134}
135#endif /* CONFIG_SCHED_DEBUG */
136
137static int sd_degenerate(struct sched_domain *sd)
138{
139 if (cpumask_weight(sched_domain_span(sd)) == 1)
140 return 1;
141
142 /* Following flags need at least 2 groups */
143 if (sd->flags & (SD_LOAD_BALANCE |
144 SD_BALANCE_NEWIDLE |
145 SD_BALANCE_FORK |
146 SD_BALANCE_EXEC |
147 SD_SHARE_CPUCAPACITY |
148 SD_ASYM_CPUCAPACITY |
149 SD_SHARE_PKG_RESOURCES |
150 SD_SHARE_POWERDOMAIN)) {
151 if (sd->groups != sd->groups->next)
152 return 0;
153 }
154
155 /* Following flags don't use groups */
156 if (sd->flags & (SD_WAKE_AFFINE))
157 return 0;
158
159 return 1;
160}
161
162static int
163sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
164{
165 unsigned long cflags = sd->flags, pflags = parent->flags;
166
167 if (sd_degenerate(parent))
168 return 1;
169
170 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
171 return 0;
172
173 /* Flags needing groups don't count if only 1 group in parent */
174 if (parent->groups == parent->groups->next) {
175 pflags &= ~(SD_LOAD_BALANCE |
176 SD_BALANCE_NEWIDLE |
177 SD_BALANCE_FORK |
178 SD_BALANCE_EXEC |
179 SD_ASYM_CPUCAPACITY |
180 SD_SHARE_CPUCAPACITY |
181 SD_SHARE_PKG_RESOURCES |
182 SD_PREFER_SIBLING |
183 SD_SHARE_POWERDOMAIN);
184 if (nr_node_ids == 1)
185 pflags &= ~SD_SERIALIZE;
186 }
187 if (~cflags & pflags)
188 return 0;
189
190 return 1;
191}
192
193static void free_rootdomain(struct rcu_head *rcu)
194{
195 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
196
197 cpupri_cleanup(&rd->cpupri);
198 cpudl_cleanup(&rd->cpudl);
199 free_cpumask_var(rd->dlo_mask);
200 free_cpumask_var(rd->rto_mask);
201 free_cpumask_var(rd->online);
202 free_cpumask_var(rd->span);
203 kfree(rd);
204}
205
206void rq_attach_root(struct rq *rq, struct root_domain *rd)
207{
208 struct root_domain *old_rd = NULL;
209 unsigned long flags;
210
211 raw_spin_lock_irqsave(&rq->lock, flags);
212
213 if (rq->rd) {
214 old_rd = rq->rd;
215
216 if (cpumask_test_cpu(rq->cpu, old_rd->online))
217 set_rq_offline(rq);
218
219 cpumask_clear_cpu(rq->cpu, old_rd->span);
220
221 /*
222 	 * If we don't want to free the old_rd yet, then
223 * set old_rd to NULL to skip the freeing later
224 * in this function:
225 */
226 if (!atomic_dec_and_test(&old_rd->refcount))
227 old_rd = NULL;
228 }
229
230 atomic_inc(&rd->refcount);
231 rq->rd = rd;
232
233 cpumask_set_cpu(rq->cpu, rd->span);
234 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
235 set_rq_online(rq);
236
237 raw_spin_unlock_irqrestore(&rq->lock, flags);
238
239 if (old_rd)
240 call_rcu_sched(&old_rd->rcu, free_rootdomain);
241}
242
243static int init_rootdomain(struct root_domain *rd)
244{
245 memset(rd, 0, sizeof(*rd));
246
247 if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
248 goto out;
249 if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
250 goto free_span;
251 if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
252 goto free_online;
253 if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
254 goto free_dlo_mask;
255
256 init_dl_bw(&rd->dl_bw);
257 if (cpudl_init(&rd->cpudl) != 0)
258 goto free_rto_mask;
259
260 if (cpupri_init(&rd->cpupri) != 0)
261 goto free_cpudl;
262 return 0;
263
264free_cpudl:
265 cpudl_cleanup(&rd->cpudl);
266free_rto_mask:
267 free_cpumask_var(rd->rto_mask);
268free_dlo_mask:
269 free_cpumask_var(rd->dlo_mask);
270free_online:
271 free_cpumask_var(rd->online);
272free_span:
273 free_cpumask_var(rd->span);
274out:
275 return -ENOMEM;
276}
277
278/*
279 * By default the system creates a single root-domain with all CPUs as
280 * members (mimicking the global state we have today).
281 */
282struct root_domain def_root_domain;
283
284void init_defrootdomain(void)
285{
286 init_rootdomain(&def_root_domain);
287
288 atomic_set(&def_root_domain.refcount, 1);
289}
290
291static struct root_domain *alloc_rootdomain(void)
292{
293 struct root_domain *rd;
294
295 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
296 if (!rd)
297 return NULL;
298
299 if (init_rootdomain(rd) != 0) {
300 kfree(rd);
301 return NULL;
302 }
303
304 return rd;
305}
306
307static void free_sched_groups(struct sched_group *sg, int free_sgc)
308{
309 struct sched_group *tmp, *first;
310
311 if (!sg)
312 return;
313
314 first = sg;
315 do {
316 tmp = sg->next;
317
318 if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
319 kfree(sg->sgc);
320
321 kfree(sg);
322 sg = tmp;
323 } while (sg != first);
324}
325
326static void destroy_sched_domain(struct sched_domain *sd)
327{
328 /*
329 	 * If it's an overlapping domain it has private groups, iterate and
330 * nuke them all.
331 */
332 if (sd->flags & SD_OVERLAP) {
333 free_sched_groups(sd->groups, 1);
334 } else if (atomic_dec_and_test(&sd->groups->ref)) {
335 kfree(sd->groups->sgc);
336 kfree(sd->groups);
337 }
338 if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
339 kfree(sd->shared);
340 kfree(sd);
341}
342
343static void destroy_sched_domains_rcu(struct rcu_head *rcu)
344{
345 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
346
347 while (sd) {
348 struct sched_domain *parent = sd->parent;
349 destroy_sched_domain(sd);
350 sd = parent;
351 }
352}
353
354static void destroy_sched_domains(struct sched_domain *sd)
355{
356 if (sd)
357 call_rcu(&sd->rcu, destroy_sched_domains_rcu);
358}
359
360/*
361 * Keep a special pointer to the highest sched_domain that has
362  * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this
363  * allows us to avoid some pointer chasing in select_idle_sibling().
364 *
365 * Also keep a unique ID per domain (we use the first CPU number in
366 * the cpumask of the domain), this allows us to quickly tell if
367 * two CPUs are in the same cache domain, see cpus_share_cache().
368 */
369DEFINE_PER_CPU(struct sched_domain *, sd_llc);
370DEFINE_PER_CPU(int, sd_llc_size);
371DEFINE_PER_CPU(int, sd_llc_id);
372DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
373DEFINE_PER_CPU(struct sched_domain *, sd_numa);
374DEFINE_PER_CPU(struct sched_domain *, sd_asym);
375
376static void update_top_cache_domain(int cpu)
377{
378 struct sched_domain_shared *sds = NULL;
379 struct sched_domain *sd;
380 int id = cpu;
381 int size = 1;
382
383 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
384 if (sd) {
385 id = cpumask_first(sched_domain_span(sd));
386 size = cpumask_weight(sched_domain_span(sd));
387 sds = sd->shared;
388 }
389
390 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
391 per_cpu(sd_llc_size, cpu) = size;
392 per_cpu(sd_llc_id, cpu) = id;
393 rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
394
395 sd = lowest_flag_domain(cpu, SD_NUMA);
396 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
397
398 sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
399 rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
400}
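/*
 * Editor's sketch (not part of this file): the per-CPU LLC id cached above is
 * what makes cache-domain queries O(1).  core.c's cpus_share_cache() is
 * essentially a comparison of the two cached ids, along these lines:
 */
#if 0	/* illustration only */
bool cpus_share_cache(int this_cpu, int that_cpu)
{
	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
#endif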
401
402/*
403 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
404 * hold the hotplug lock.
405 */
406static void
407cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
408{
409 struct rq *rq = cpu_rq(cpu);
410 struct sched_domain *tmp;
411
412 /* Remove the sched domains which do not contribute to scheduling. */
413 for (tmp = sd; tmp; ) {
414 struct sched_domain *parent = tmp->parent;
415 if (!parent)
416 break;
417
418 if (sd_parent_degenerate(tmp, parent)) {
419 tmp->parent = parent->parent;
420 if (parent->parent)
421 parent->parent->child = tmp;
422 /*
423 * Transfer SD_PREFER_SIBLING down in case of a
424 * degenerate parent; the spans match for this
425 * so the property transfers.
426 */
427 if (parent->flags & SD_PREFER_SIBLING)
428 tmp->flags |= SD_PREFER_SIBLING;
429 destroy_sched_domain(parent);
430 } else
431 tmp = tmp->parent;
432 }
433
434 if (sd && sd_degenerate(sd)) {
435 tmp = sd;
436 sd = sd->parent;
437 destroy_sched_domain(tmp);
438 if (sd)
439 sd->child = NULL;
440 }
441
442 sched_domain_debug(sd, cpu);
443
444 rq_attach_root(rq, rd);
445 tmp = rq->sd;
446 rcu_assign_pointer(rq->sd, sd);
447 destroy_sched_domains(tmp);
448
449 update_top_cache_domain(cpu);
450}
451
452/* Setup the mask of CPUs configured for isolated domains */
453static int __init isolated_cpu_setup(char *str)
454{
455 int ret;
456
457 alloc_bootmem_cpumask_var(&cpu_isolated_map);
458 ret = cpulist_parse(str, cpu_isolated_map);
459 if (ret) {
460 pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids);
461 return 0;
462 }
463 return 1;
464}
465__setup("isolcpus=", isolated_cpu_setup);
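/*
 * Editor's note: cpulist_parse() accepts the usual range syntax, so a boot
 * line such as "isolcpus=1-3,7" (values made up for illustration) keeps CPUs
 * 1, 2, 3 and 7 out of the domains built below; tasks can still be placed
 * there with an explicit affinity mask, but the scheduler will not
 * load-balance onto them.
 */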
466
467struct s_data {
468 struct sched_domain ** __percpu sd;
469 struct root_domain *rd;
470};
471
472enum s_alloc {
473 sa_rootdomain,
474 sa_sd,
475 sa_sd_storage,
476 sa_none,
477};
478
479/*
480 * Build an iteration mask that can exclude certain CPUs from the upwards
481 * domain traversal.
482 *
483 * Asymmetric node setups can result in situations where the domain tree is of
484  * unequal depth; make sure to skip domains that already cover the entire
485 * range.
486 *
487 * In that case build_sched_domains() will have terminated the iteration early
488 * and our sibling sd spans will be empty. Domains should always include the
489 * CPU they're built on, so check that.
490 */
491static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
492{
493 const struct cpumask *span = sched_domain_span(sd);
494 struct sd_data *sdd = sd->private;
495 struct sched_domain *sibling;
496 int i;
497
498 for_each_cpu(i, span) {
499 sibling = *per_cpu_ptr(sdd->sd, i);
500 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
501 continue;
502
503 cpumask_set_cpu(i, sched_group_mask(sg));
504 }
505}
506
507/*
508 * Return the canonical balance CPU for this group, this is the first CPU
509 * of this group that's also in the iteration mask.
510 */
511int group_balance_cpu(struct sched_group *sg)
512{
513 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
514}
515
516static int
517build_overlap_sched_groups(struct sched_domain *sd, int cpu)
518{
519 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
520 const struct cpumask *span = sched_domain_span(sd);
521 struct cpumask *covered = sched_domains_tmpmask;
522 struct sd_data *sdd = sd->private;
523 struct sched_domain *sibling;
524 int i;
525
526 cpumask_clear(covered);
527
528 for_each_cpu(i, span) {
529 struct cpumask *sg_span;
530
531 if (cpumask_test_cpu(i, covered))
532 continue;
533
534 sibling = *per_cpu_ptr(sdd->sd, i);
535
536 /* See the comment near build_group_mask(). */
537 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
538 continue;
539
540 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
541 GFP_KERNEL, cpu_to_node(cpu));
542
543 if (!sg)
544 goto fail;
545
546 sg_span = sched_group_cpus(sg);
547 if (sibling->child)
548 cpumask_copy(sg_span, sched_domain_span(sibling->child));
549 else
550 cpumask_set_cpu(i, sg_span);
551
552 cpumask_or(covered, covered, sg_span);
553
554 sg->sgc = *per_cpu_ptr(sdd->sgc, i);
555 if (atomic_inc_return(&sg->sgc->ref) == 1)
556 build_group_mask(sd, sg);
557
558 /*
559 * Initialize sgc->capacity such that even if we mess up the
560 * domains and no possible iteration will get us here, we won't
561 * die on a /0 trap.
562 */
563 sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
564 sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
565
566 /*
567 * Make sure the first group of this domain contains the
568 * canonical balance CPU. Otherwise the sched_domain iteration
569 * breaks. See update_sg_lb_stats().
570 */
571 if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
572 group_balance_cpu(sg) == cpu)
573 groups = sg;
574
575 if (!first)
576 first = sg;
577 if (last)
578 last->next = sg;
579 last = sg;
580 last->next = first;
581 }
582 sd->groups = groups;
583
584 return 0;
585
586fail:
587 free_sched_groups(first, 0);
588
589 return -ENOMEM;
590}
591
592static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
593{
594 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
595 struct sched_domain *child = sd->child;
596
597 if (child)
598 cpu = cpumask_first(sched_domain_span(child));
599
600 if (sg) {
601 *sg = *per_cpu_ptr(sdd->sg, cpu);
602 (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);
603
604 /* For claim_allocations: */
605 atomic_set(&(*sg)->sgc->ref, 1);
606 }
607
608 return cpu;
609}
610
611/*
612 * build_sched_groups will build a circular linked list of the groups
613  * covered by the given span, set each group's ->cpumask correctly, and
614  * set each group's ->cpu_capacity to 0.
615 *
616 * Assumes the sched_domain tree is fully constructed
617 */
618static int
619build_sched_groups(struct sched_domain *sd, int cpu)
620{
621 struct sched_group *first = NULL, *last = NULL;
622 struct sd_data *sdd = sd->private;
623 const struct cpumask *span = sched_domain_span(sd);
624 struct cpumask *covered;
625 int i;
626
627 get_group(cpu, sdd, &sd->groups);
628 atomic_inc(&sd->groups->ref);
629
630 if (cpu != cpumask_first(span))
631 return 0;
632
633 lockdep_assert_held(&sched_domains_mutex);
634 covered = sched_domains_tmpmask;
635
636 cpumask_clear(covered);
637
638 for_each_cpu(i, span) {
639 struct sched_group *sg;
640 int group, j;
641
642 if (cpumask_test_cpu(i, covered))
643 continue;
644
645 group = get_group(i, sdd, &sg);
646 cpumask_setall(sched_group_mask(sg));
647
648 for_each_cpu(j, span) {
649 if (get_group(j, sdd, NULL) != group)
650 continue;
651
652 cpumask_set_cpu(j, covered);
653 cpumask_set_cpu(j, sched_group_cpus(sg));
654 }
655
656 if (!first)
657 first = sg;
658 if (last)
659 last->next = sg;
660 last = sg;
661 }
662 last->next = first;
663
664 return 0;
665}
666
667/*
668 * Initialize sched groups cpu_capacity.
669 *
670  * cpu_capacity indicates the capacity of a sched group, which is used while
671  * distributing the load between different sched groups in a sched domain.
672  * Typically the cpu_capacity of all the groups in a sched domain will be the
673  * same unless there are asymmetries in the topology. If there are asymmetries,
674  * the group with more cpu_capacity will pick up more load than the
675  * group with less cpu_capacity.
676 */
677static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
678{
679 struct sched_group *sg = sd->groups;
680
681 WARN_ON(!sg);
682
683 do {
684 int cpu, max_cpu = -1;
685
686 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
687
688 if (!(sd->flags & SD_ASYM_PACKING))
689 goto next;
690
691 for_each_cpu(cpu, sched_group_cpus(sg)) {
692 if (max_cpu < 0)
693 max_cpu = cpu;
694 else if (sched_asym_prefer(cpu, max_cpu))
695 max_cpu = cpu;
696 }
697 sg->asym_prefer_cpu = max_cpu;
698
699next:
700 sg = sg->next;
701 } while (sg != sd->groups);
702
703 if (cpu != group_balance_cpu(sg))
704 return;
705
706 update_group_capacity(sd, cpu);
707}
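/*
 * Editor's example (assumed numbers, not from the original file): with
 * SCHED_CAPACITY_SCALE == 1024 and no capacity or RT-pressure asymmetry,
 * update_group_capacity() ends up with roughly 1024 per CPU, so an MC-level
 * group of four CPUs reports about 4096 while a two-CPU sibling reports about
 * 2048, and the load balancer expects the former to carry roughly twice the
 * load.
 */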
708
709/*
710  * Initializers for scheduler domains
711 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
712 */
713
714static int default_relax_domain_level = -1;
715int sched_domain_level_max;
716
717static int __init setup_relax_domain_level(char *str)
718{
719 if (kstrtoint(str, 0, &default_relax_domain_level))
720 pr_warn("Unable to set relax_domain_level\n");
721
722 return 1;
723}
724__setup("relax_domain_level=", setup_relax_domain_level);
725
726static void set_domain_attribute(struct sched_domain *sd,
727 struct sched_domain_attr *attr)
728{
729 int request;
730
731 if (!attr || attr->relax_domain_level < 0) {
732 if (default_relax_domain_level < 0)
733 return;
734 else
735 request = default_relax_domain_level;
736 } else
737 request = attr->relax_domain_level;
738 if (request < sd->level) {
739 /* Turn off idle balance on this domain: */
740 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
741 } else {
742 /* Turn on idle balance on this domain: */
743 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
744 }
745}
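/*
 * Editor's note on the handling above: a boot parameter such as
 * "relax_domain_level=1" (value assumed for illustration) keeps
 * SD_BALANCE_WAKE/SD_BALANCE_NEWIDLE only on domains whose level is <= 1,
 * i.e. on a default SMT/MC/DIE topology it confines wake and newidle
 * balancing to the SMT and MC levels and clears it from DIE upwards.
 */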
746
747static void __sdt_free(const struct cpumask *cpu_map);
748static int __sdt_alloc(const struct cpumask *cpu_map);
749
750static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
751 const struct cpumask *cpu_map)
752{
753 switch (what) {
754 case sa_rootdomain:
755 if (!atomic_read(&d->rd->refcount))
756 free_rootdomain(&d->rd->rcu);
757 /* Fall through */
758 case sa_sd:
759 free_percpu(d->sd);
760 /* Fall through */
761 case sa_sd_storage:
762 __sdt_free(cpu_map);
763 /* Fall through */
764 case sa_none:
765 break;
766 }
767}
768
769static enum s_alloc
770__visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
771{
772 memset(d, 0, sizeof(*d));
773
774 if (__sdt_alloc(cpu_map))
775 return sa_sd_storage;
776 d->sd = alloc_percpu(struct sched_domain *);
777 if (!d->sd)
778 return sa_sd_storage;
779 d->rd = alloc_rootdomain();
780 if (!d->rd)
781 return sa_sd;
782 return sa_rootdomain;
783}
784
785/*
786 * NULL the sd_data elements we've used to build the sched_domain and
787 * sched_group structure so that the subsequent __free_domain_allocs()
788 * will not free the data we're using.
789 */
790static void claim_allocations(int cpu, struct sched_domain *sd)
791{
792 struct sd_data *sdd = sd->private;
793
794 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
795 *per_cpu_ptr(sdd->sd, cpu) = NULL;
796
797 if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
798 *per_cpu_ptr(sdd->sds, cpu) = NULL;
799
800 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
801 *per_cpu_ptr(sdd->sg, cpu) = NULL;
802
803 if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
804 *per_cpu_ptr(sdd->sgc, cpu) = NULL;
805}
806
807#ifdef CONFIG_NUMA
808static int sched_domains_numa_levels;
809enum numa_topology_type sched_numa_topology_type;
810static int *sched_domains_numa_distance;
811int sched_max_numa_distance;
812static struct cpumask ***sched_domains_numa_masks;
813static int sched_domains_curr_level;
814#endif
815
816/*
817 * SD_flags allowed in topology descriptions.
818 *
819 * These flags are purely descriptive of the topology and do not prescribe
820  * behaviour. Behaviour is artificial and mapped in the sd_init()
821  * function below:
822 *
823 * SD_SHARE_CPUCAPACITY - describes SMT topologies
824 * SD_SHARE_PKG_RESOURCES - describes shared caches
825 * SD_NUMA - describes NUMA topologies
826 * SD_SHARE_POWERDOMAIN - describes shared power domain
827 * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies
828 *
829  * Odd one out, which besides describing the topology also
830 * prescribes the desired behaviour that goes along with it:
831 *
832 * SD_ASYM_PACKING - describes SMT quirks
833 */
834#define TOPOLOGY_SD_FLAGS \
835 (SD_SHARE_CPUCAPACITY | \
836 SD_SHARE_PKG_RESOURCES | \
837 SD_NUMA | \
838 SD_ASYM_PACKING | \
839 SD_ASYM_CPUCAPACITY | \
840 SD_SHARE_POWERDOMAIN)
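/*
 * Editor's sketch: a tl->sd_flags callback simply returns a mask drawn from
 * the set above.  A hypothetical SMT level could be described like this
 * (example name, illustration only):
 */
#if 0
static int example_smt_flags(void)
{
	return SD_SHARE_CPUCAPACITY;	/* hardware threads share one core */
}
#endif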
841
842static struct sched_domain *
843sd_init(struct sched_domain_topology_level *tl,
844 const struct cpumask *cpu_map,
845 struct sched_domain *child, int cpu)
846{
847 struct sd_data *sdd = &tl->data;
848 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
849 int sd_id, sd_weight, sd_flags = 0;
850
851#ifdef CONFIG_NUMA
852 /*
853 * Ugly hack to pass state to sd_numa_mask()...
854 */
855 sched_domains_curr_level = tl->numa_level;
856#endif
857
858 sd_weight = cpumask_weight(tl->mask(cpu));
859
860 if (tl->sd_flags)
861 sd_flags = (*tl->sd_flags)();
862 if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
863 "wrong sd_flags in topology description\n"))
864 sd_flags &= ~TOPOLOGY_SD_FLAGS;
865
866 *sd = (struct sched_domain){
867 .min_interval = sd_weight,
868 .max_interval = 2*sd_weight,
869 .busy_factor = 32,
870 .imbalance_pct = 125,
871
872 .cache_nice_tries = 0,
873 .busy_idx = 0,
874 .idle_idx = 0,
875 .newidle_idx = 0,
876 .wake_idx = 0,
877 .forkexec_idx = 0,
878
879 .flags = 1*SD_LOAD_BALANCE
880 | 1*SD_BALANCE_NEWIDLE
881 | 1*SD_BALANCE_EXEC
882 | 1*SD_BALANCE_FORK
883 | 0*SD_BALANCE_WAKE
884 | 1*SD_WAKE_AFFINE
885 | 0*SD_SHARE_CPUCAPACITY
886 | 0*SD_SHARE_PKG_RESOURCES
887 | 0*SD_SERIALIZE
888 | 0*SD_PREFER_SIBLING
889 | 0*SD_NUMA
890 | sd_flags
891 ,
892
893 .last_balance = jiffies,
894 .balance_interval = sd_weight,
895 .smt_gain = 0,
896 .max_newidle_lb_cost = 0,
897 .next_decay_max_lb_cost = jiffies,
898 .child = child,
899#ifdef CONFIG_SCHED_DEBUG
900 .name = tl->name,
901#endif
902 };
903
904 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
905 sd_id = cpumask_first(sched_domain_span(sd));
906
907 /*
908 * Convert topological properties into behaviour.
909 */
910
911 if (sd->flags & SD_ASYM_CPUCAPACITY) {
912 struct sched_domain *t = sd;
913
914 for_each_lower_domain(t)
915 t->flags |= SD_BALANCE_WAKE;
916 }
917
918 if (sd->flags & SD_SHARE_CPUCAPACITY) {
919 sd->flags |= SD_PREFER_SIBLING;
920 sd->imbalance_pct = 110;
921 sd->smt_gain = 1178; /* ~15% */
922
923 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
924 sd->imbalance_pct = 117;
925 sd->cache_nice_tries = 1;
926 sd->busy_idx = 2;
927
928#ifdef CONFIG_NUMA
929 } else if (sd->flags & SD_NUMA) {
930 sd->cache_nice_tries = 2;
931 sd->busy_idx = 3;
932 sd->idle_idx = 2;
933
934 sd->flags |= SD_SERIALIZE;
935 if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
936 sd->flags &= ~(SD_BALANCE_EXEC |
937 SD_BALANCE_FORK |
938 SD_WAKE_AFFINE);
939 }
940
941#endif
942 } else {
943 sd->flags |= SD_PREFER_SIBLING;
944 sd->cache_nice_tries = 1;
945 sd->busy_idx = 2;
946 sd->idle_idx = 1;
947 }
948
949 /*
950  * For all levels sharing cache, connect a sched_domain_shared
951 * instance.
952 */
953 if (sd->flags & SD_SHARE_PKG_RESOURCES) {
954 sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
955 atomic_inc(&sd->shared->ref);
956 atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
957 }
958
959 sd->private = sdd;
960
961 return sd;
962}
963
964/*
965 * Topology list, bottom-up.
966 */
967static struct sched_domain_topology_level default_topology[] = {
968#ifdef CONFIG_SCHED_SMT
969 { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
970#endif
971#ifdef CONFIG_SCHED_MC
972 { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
973#endif
974 { cpu_cpu_mask, SD_INIT_NAME(DIE) },
975 { NULL, },
976};
977
978static struct sched_domain_topology_level *sched_domain_topology =
979 default_topology;
980
981#define for_each_sd_topology(tl) \
982 for (tl = sched_domain_topology; tl->mask; tl++)
983
984void set_sched_topology(struct sched_domain_topology_level *tl)
985{
986 if (WARN_ON_ONCE(sched_smp_initialized))
987 return;
988
989 sched_domain_topology = tl;
990}
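/*
 * Editor's sketch: an architecture that wants a different domain hierarchy
 * replaces the default table early in boot, before SMP is brought up.  The
 * example names below are made up for illustration.
 */
#if 0
static struct sched_domain_topology_level example_topology[] = {
	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
	{ example_cluster_mask, example_cluster_flags, SD_INIT_NAME(CLS) },
	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
	{ NULL, },
};

void __init example_arch_topology_init(void)
{
	set_sched_topology(example_topology);
}
#endif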
991
992#ifdef CONFIG_NUMA
993
994static const struct cpumask *sd_numa_mask(int cpu)
995{
996 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
997}
998
999static void sched_numa_warn(const char *str)
1000{
1001 	static bool done = false;
1002 	int i, j;
1003
1004 if (done)
1005 return;
1006
1007 done = true;
1008
1009 printk(KERN_WARNING "ERROR: %s\n\n", str);
1010
1011 for (i = 0; i < nr_node_ids; i++) {
1012 printk(KERN_WARNING " ");
1013 for (j = 0; j < nr_node_ids; j++)
1014 			printk(KERN_CONT "%02d ", node_distance(i, j));
1015 printk(KERN_CONT "\n");
1016 }
1017 printk(KERN_WARNING "\n");
1018}
1019
1020bool find_numa_distance(int distance)
1021{
1022 int i;
1023
1024 if (distance == node_distance(0, 0))
1025 return true;
1026
1027 for (i = 0; i < sched_domains_numa_levels; i++) {
1028 if (sched_domains_numa_distance[i] == distance)
1029 return true;
1030 }
1031
1032 return false;
1033}
1034
1035/*
1036 * A system can have three types of NUMA topology:
1037 * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
1038 * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
1039 * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
1040 *
1041 * The difference between a glueless mesh topology and a backplane
1042 * topology lies in whether communication between not directly
1043 * connected nodes goes through intermediary nodes (where programs
1044 * could run), or through backplane controllers. This affects
1045 * placement of programs.
1046 *
1047 * The type of topology can be discerned with the following tests:
1048 * - If the maximum distance between any nodes is 1 hop, the system
1049 * is directly connected.
1050 * - If for two nodes A and B, located N > 1 hops away from each other,
1051 * there is an intermediary node C, which is < N hops away from both
1052 * nodes A and B, the system is a glueless mesh.
1053 */
1054static void init_numa_topology_type(void)
1055{
1056 int a, b, c, n;
1057
1058 n = sched_max_numa_distance;
1059
1060 if (sched_domains_numa_levels <= 1) {
1061 sched_numa_topology_type = NUMA_DIRECT;
1062 return;
1063 }
1064
1065 for_each_online_node(a) {
1066 for_each_online_node(b) {
1067 /* Find two nodes furthest removed from each other. */
1068 if (node_distance(a, b) < n)
1069 continue;
1070
1071 /* Is there an intermediary node between a and b? */
1072 for_each_online_node(c) {
1073 if (node_distance(a, c) < n &&
1074 node_distance(b, c) < n) {
1075 sched_numa_topology_type =
1076 NUMA_GLUELESS_MESH;
1077 return;
1078 }
1079 }
1080
1081 sched_numa_topology_type = NUMA_BACKPLANE;
1082 return;
1083 }
1084 }
1085}
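/*
 * Editor's example (assumed SLIT values): two nodes at distances
 *
 *	10 20
 *	20 10
 *
 * produce a single NUMA level, so the check above reports NUMA_DIRECT.  With
 * three nodes where node_distance(0, 2) == 30 but node 1 sits at 20 from both
 * ends, node 1 is the intermediary and the result is NUMA_GLUELESS_MESH; if
 * no node is closer than the maximum distance to both far ends, the
 * fall-through is NUMA_BACKPLANE.
 */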
1086
1087void sched_init_numa(void)
1088{
1089 int next_distance, curr_distance = node_distance(0, 0);
1090 struct sched_domain_topology_level *tl;
1091 int level = 0;
1092 int i, j, k;
1093
1094 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
1095 if (!sched_domains_numa_distance)
1096 return;
1097
1098 /*
1099 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
1100 * unique distances in the node_distance() table.
1101 *
1102 * Assumes node_distance(0,j) includes all distances in
1103 * node_distance(i,j) in order to avoid cubic time.
1104 */
1105 next_distance = curr_distance;
1106 for (i = 0; i < nr_node_ids; i++) {
1107 for (j = 0; j < nr_node_ids; j++) {
1108 for (k = 0; k < nr_node_ids; k++) {
1109 int distance = node_distance(i, k);
1110
1111 if (distance > curr_distance &&
1112 (distance < next_distance ||
1113 next_distance == curr_distance))
1114 next_distance = distance;
1115
1116 /*
1117 				 * While not a strong assumption, it would be nice to know
1118 				 * about cases where node A is connected to B but B is not
1119 				 * equally connected to A.
1120 */
1121 if (sched_debug() && node_distance(k, i) != distance)
1122 sched_numa_warn("Node-distance not symmetric");
1123
1124 if (sched_debug() && i && !find_numa_distance(distance))
1125 sched_numa_warn("Node-0 not representative");
1126 }
1127 if (next_distance != curr_distance) {
1128 sched_domains_numa_distance[level++] = next_distance;
1129 sched_domains_numa_levels = level;
1130 curr_distance = next_distance;
1131 } else break;
1132 }
1133
1134 /*
1135 * In case of sched_debug() we verify the above assumption.
1136 */
1137 if (!sched_debug())
1138 break;
1139 }
1140
1141 if (!level)
1142 return;
1143
1144 /*
1145 * 'level' contains the number of unique distances, excluding the
1146 * identity distance node_distance(i,i).
1147 *
1148 * The sched_domains_numa_distance[] array includes the actual distance
1149 * numbers.
1150 */
1151
1152 /*
1153 * Here, we should temporarily reset sched_domains_numa_levels to 0.
1154 	 * If we fail to allocate memory for the array sched_domains_numa_masks[][],
1155 	 * the array will contain fewer than 'level' members. This could be
1156 	 * dangerous when we use it to iterate the array sched_domains_numa_masks[][]
1157 * in other functions.
1158 *
1159 * We reset it to 'level' at the end of this function.
1160 */
1161 sched_domains_numa_levels = 0;
1162
1163 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
1164 if (!sched_domains_numa_masks)
1165 return;
1166
1167 /*
1168 * Now for each level, construct a mask per node which contains all
1169 * CPUs of nodes that are that many hops away from us.
1170 */
1171 for (i = 0; i < level; i++) {
1172 sched_domains_numa_masks[i] =
1173 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
1174 if (!sched_domains_numa_masks[i])
1175 return;
1176
1177 for (j = 0; j < nr_node_ids; j++) {
1178 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
1179 if (!mask)
1180 return;
1181
1182 sched_domains_numa_masks[i][j] = mask;
1183
1184 for_each_node(k) {
1185 if (node_distance(j, k) > sched_domains_numa_distance[i])
1186 continue;
1187
1188 cpumask_or(mask, mask, cpumask_of_node(k));
1189 }
1190 }
1191 }
1192
1193 /* Compute default topology size */
1194 for (i = 0; sched_domain_topology[i].mask; i++);
1195
1196 tl = kzalloc((i + level + 1) *
1197 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
1198 if (!tl)
1199 return;
1200
1201 /*
1202 * Copy the default topology bits..
1203 */
1204 for (i = 0; sched_domain_topology[i].mask; i++)
1205 tl[i] = sched_domain_topology[i];
1206
1207 /*
1208 * .. and append 'j' levels of NUMA goodness.
1209 */
1210 for (j = 0; j < level; i++, j++) {
1211 tl[i] = (struct sched_domain_topology_level){
1212 .mask = sd_numa_mask,
1213 .sd_flags = cpu_numa_flags,
1214 .flags = SDTL_OVERLAP,
1215 .numa_level = j,
1216 SD_INIT_NAME(NUMA)
1217 };
1218 }
1219
1220 sched_domain_topology = tl;
1221
1222 sched_domains_numa_levels = level;
1223 sched_max_numa_distance = sched_domains_numa_distance[level - 1];
1224
1225 init_numa_topology_type();
1226}
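/*
 * Editor's example (assumed distance table): on a four-node box whose unique
 * remote distances are 20 and 30, the deduplication above records two levels,
 * so the final table becomes the default SMT/MC/DIE levels plus two
 * SDTL_OVERLAP NUMA levels -- one grouping nodes within distance 20, one
 * spanning all nodes -- and sched_max_numa_distance ends up as 30.
 */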
1227
1228void sched_domains_numa_masks_set(unsigned int cpu)
1229{
1230 int node = cpu_to_node(cpu);
1231 int i, j;
1232
1233 for (i = 0; i < sched_domains_numa_levels; i++) {
1234 for (j = 0; j < nr_node_ids; j++) {
1235 if (node_distance(j, node) <= sched_domains_numa_distance[i])
1236 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
1237 }
1238 }
1239}
1240
1241void sched_domains_numa_masks_clear(unsigned int cpu)
1242{
1243 int i, j;
1244
1245 for (i = 0; i < sched_domains_numa_levels; i++) {
1246 for (j = 0; j < nr_node_ids; j++)
1247 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
1248 }
1249}
1250
1251#endif /* CONFIG_NUMA */
1252
1253static int __sdt_alloc(const struct cpumask *cpu_map)
1254{
1255 struct sched_domain_topology_level *tl;
1256 int j;
1257
1258 for_each_sd_topology(tl) {
1259 struct sd_data *sdd = &tl->data;
1260
1261 sdd->sd = alloc_percpu(struct sched_domain *);
1262 if (!sdd->sd)
1263 return -ENOMEM;
1264
1265 sdd->sds = alloc_percpu(struct sched_domain_shared *);
1266 if (!sdd->sds)
1267 return -ENOMEM;
1268
1269 sdd->sg = alloc_percpu(struct sched_group *);
1270 if (!sdd->sg)
1271 return -ENOMEM;
1272
1273 sdd->sgc = alloc_percpu(struct sched_group_capacity *);
1274 if (!sdd->sgc)
1275 return -ENOMEM;
1276
1277 for_each_cpu(j, cpu_map) {
1278 struct sched_domain *sd;
1279 struct sched_domain_shared *sds;
1280 struct sched_group *sg;
1281 struct sched_group_capacity *sgc;
1282
1283 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
1284 GFP_KERNEL, cpu_to_node(j));
1285 if (!sd)
1286 return -ENOMEM;
1287
1288 *per_cpu_ptr(sdd->sd, j) = sd;
1289
1290 sds = kzalloc_node(sizeof(struct sched_domain_shared),
1291 GFP_KERNEL, cpu_to_node(j));
1292 if (!sds)
1293 return -ENOMEM;
1294
1295 *per_cpu_ptr(sdd->sds, j) = sds;
1296
1297 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
1298 GFP_KERNEL, cpu_to_node(j));
1299 if (!sg)
1300 return -ENOMEM;
1301
1302 sg->next = sg;
1303
1304 *per_cpu_ptr(sdd->sg, j) = sg;
1305
1306 sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
1307 GFP_KERNEL, cpu_to_node(j));
1308 if (!sgc)
1309 return -ENOMEM;
1310
1311 *per_cpu_ptr(sdd->sgc, j) = sgc;
1312 }
1313 }
1314
1315 return 0;
1316}
1317
1318static void __sdt_free(const struct cpumask *cpu_map)
1319{
1320 struct sched_domain_topology_level *tl;
1321 int j;
1322
1323 for_each_sd_topology(tl) {
1324 struct sd_data *sdd = &tl->data;
1325
1326 for_each_cpu(j, cpu_map) {
1327 struct sched_domain *sd;
1328
1329 if (sdd->sd) {
1330 sd = *per_cpu_ptr(sdd->sd, j);
1331 if (sd && (sd->flags & SD_OVERLAP))
1332 free_sched_groups(sd->groups, 0);
1333 kfree(*per_cpu_ptr(sdd->sd, j));
1334 }
1335
1336 if (sdd->sds)
1337 kfree(*per_cpu_ptr(sdd->sds, j));
1338 if (sdd->sg)
1339 kfree(*per_cpu_ptr(sdd->sg, j));
1340 if (sdd->sgc)
1341 kfree(*per_cpu_ptr(sdd->sgc, j));
1342 }
1343 free_percpu(sdd->sd);
1344 sdd->sd = NULL;
1345 free_percpu(sdd->sds);
1346 sdd->sds = NULL;
1347 free_percpu(sdd->sg);
1348 sdd->sg = NULL;
1349 free_percpu(sdd->sgc);
1350 sdd->sgc = NULL;
1351 }
1352}
1353
1354struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
1355 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
1356 struct sched_domain *child, int cpu)
1357{
1358 struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
1359
1360 if (child) {
1361 sd->level = child->level + 1;
1362 sched_domain_level_max = max(sched_domain_level_max, sd->level);
1363 child->parent = sd;
1364
1365 if (!cpumask_subset(sched_domain_span(child),
1366 sched_domain_span(sd))) {
1367 pr_err("BUG: arch topology borken\n");
1368#ifdef CONFIG_SCHED_DEBUG
1369 pr_err(" the %s domain not a subset of the %s domain\n",
1370 child->name, sd->name);
1371#endif
1372 /* Fixup, ensure @sd has at least @child cpus. */
1373 cpumask_or(sched_domain_span(sd),
1374 sched_domain_span(sd),
1375 sched_domain_span(child));
1376 }
1377
1378 }
1379 set_domain_attribute(sd, attr);
1380
1381 return sd;
1382}
1383
1384/*
1385 * Build sched domains for a given set of CPUs and attach the sched domains
1386  * to the individual CPUs.
1387 */
1388static int
1389build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
1390{
1391 enum s_alloc alloc_state;
1392 struct sched_domain *sd;
1393 struct s_data d;
1394 struct rq *rq = NULL;
1395 int i, ret = -ENOMEM;
1396
1397 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
1398 if (alloc_state != sa_rootdomain)
1399 goto error;
1400
1401 /* Set up domains for CPUs specified by the cpu_map: */
1402 for_each_cpu(i, cpu_map) {
1403 struct sched_domain_topology_level *tl;
1404
1405 sd = NULL;
1406 for_each_sd_topology(tl) {
1407 sd = build_sched_domain(tl, cpu_map, attr, sd, i);
1408 if (tl == sched_domain_topology)
1409 *per_cpu_ptr(d.sd, i) = sd;
1410 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
1411 sd->flags |= SD_OVERLAP;
1412 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
1413 break;
1414 }
1415 }
1416
1417 /* Build the groups for the domains */
1418 for_each_cpu(i, cpu_map) {
1419 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
1420 sd->span_weight = cpumask_weight(sched_domain_span(sd));
1421 if (sd->flags & SD_OVERLAP) {
1422 if (build_overlap_sched_groups(sd, i))
1423 goto error;
1424 } else {
1425 if (build_sched_groups(sd, i))
1426 goto error;
1427 }
1428 }
1429 }
1430
1431 /* Calculate CPU capacity for physical packages and nodes */
1432 for (i = nr_cpumask_bits-1; i >= 0; i--) {
1433 if (!cpumask_test_cpu(i, cpu_map))
1434 continue;
1435
1436 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
1437 claim_allocations(i, sd);
1438 init_sched_groups_capacity(i, sd);
1439 }
1440 }
1441
1442 /* Attach the domains */
1443 rcu_read_lock();
1444 for_each_cpu(i, cpu_map) {
1445 rq = cpu_rq(i);
1446 sd = *per_cpu_ptr(d.sd, i);
1447
1448 /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
1449 if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
1450 WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
1451
1452 cpu_attach_domain(sd, d.rd, i);
1453 }
1454 rcu_read_unlock();
1455
1456 if (rq && sched_debug_enabled) {
1457 pr_info("span: %*pbl (max cpu_capacity = %lu)\n",
1458 cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
1459 }
1460
1461 ret = 0;
1462error:
1463 __free_domain_allocs(&d, alloc_state, cpu_map);
1464 return ret;
1465}
1466
1467/* Current sched domains: */
1468static cpumask_var_t *doms_cur;
1469
1470/* Number of sched domains in 'doms_cur': */
1471static int ndoms_cur;
1472
1473 /* Attributes of custom domains in 'doms_cur': */
1474static struct sched_domain_attr *dattr_cur;
1475
1476/*
1477 * Special case: If a kmalloc() of a doms_cur partition (array of
1478  * cpumask) fails, then fall back to a single sched domain,
1479 * as determined by the single cpumask fallback_doms.
1480 */
1481cpumask_var_t fallback_doms;
1482
1483/*
1484 * arch_update_cpu_topology lets virtualized architectures update the
1485 * CPU core maps. It is supposed to return 1 if the topology changed
1486 * or 0 if it stayed the same.
1487 */
1488int __weak arch_update_cpu_topology(void)
1489{
1490 return 0;
1491}
1492
1493cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
1494{
1495 int i;
1496 cpumask_var_t *doms;
1497
1498 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
1499 if (!doms)
1500 return NULL;
1501 for (i = 0; i < ndoms; i++) {
1502 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
1503 free_sched_domains(doms, i);
1504 return NULL;
1505 }
1506 }
1507 return doms;
1508}
1509
1510void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
1511{
1512 unsigned int i;
1513 for (i = 0; i < ndoms; i++)
1514 free_cpumask_var(doms[i]);
1515 kfree(doms);
1516}
1517
1518/*
1519 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
1520 * For now this just excludes isolated CPUs, but could be used to
1521 * exclude other special cases in the future.
1522 */
1523int init_sched_domains(const struct cpumask *cpu_map)
1524{
1525 int err;
1526
1527 arch_update_cpu_topology();
1528 ndoms_cur = 1;
1529 doms_cur = alloc_sched_domains(ndoms_cur);
1530 if (!doms_cur)
1531 doms_cur = &fallback_doms;
1532 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
1533 err = build_sched_domains(doms_cur[0], NULL);
1534 register_sched_domain_sysctl();
1535
1536 return err;
1537}
1538
1539/*
1540  * Detach sched domains from a group of CPUs specified in cpu_map.
1541  * These CPUs will now be attached to the NULL domain.
1542 */
1543static void detach_destroy_domains(const struct cpumask *cpu_map)
1544{
1545 int i;
1546
1547 rcu_read_lock();
1548 for_each_cpu(i, cpu_map)
1549 cpu_attach_domain(NULL, &def_root_domain, i);
1550 rcu_read_unlock();
1551}
1552
1553/* handle null as "default" */
1554static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
1555 struct sched_domain_attr *new, int idx_new)
1556{
1557 struct sched_domain_attr tmp;
1558
1559 /* Fast path: */
1560 if (!new && !cur)
1561 return 1;
1562
1563 tmp = SD_ATTR_INIT;
1564 return !memcmp(cur ? (cur + idx_cur) : &tmp,
1565 new ? (new + idx_new) : &tmp,
1566 sizeof(struct sched_domain_attr));
1567}
1568
1569/*
1570 * Partition sched domains as specified by the 'ndoms_new'
1571 * cpumasks in the array doms_new[] of cpumasks. This compares
1572 * doms_new[] to the current sched domain partitioning, doms_cur[].
1573 * It destroys each deleted domain and builds each new domain.
1574 *
1575 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
1576  * The masks don't intersect (don't overlap). We should set up one
1577 * sched domain for each mask. CPUs not in any of the cpumasks will
1578 * not be load balanced. If the same cpumask appears both in the
1579 * current 'doms_cur' domains and in the new 'doms_new', we can leave
1580 * it as it is.
1581 *
1582  * The passed-in 'doms_new' should be allocated using
1583  * alloc_sched_domains. This routine takes ownership of it and will
1584  * free_sched_domains it when done with it. If the caller failed the
1585  * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
1586  * and partition_sched_domains() will fall back to the single partition
1587  * 'fallback_doms'; this also forces the domains to be rebuilt.
1588 *
1589 * If doms_new == NULL it will be replaced with cpu_online_mask.
1590 * ndoms_new == 0 is a special case for destroying existing domains,
1591 * and it will not create the default domain.
1592 *
1593 * Call with hotplug lock held
1594 */
1595void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
1596 struct sched_domain_attr *dattr_new)
1597{
1598 int i, j, n;
1599 int new_topology;
1600
1601 mutex_lock(&sched_domains_mutex);
1602
1603 /* Always unregister in case we don't destroy any domains: */
1604 unregister_sched_domain_sysctl();
1605
1606 /* Let the architecture update CPU core mappings: */
1607 new_topology = arch_update_cpu_topology();
1608
1609 n = doms_new ? ndoms_new : 0;
1610
1611 /* Destroy deleted domains: */
1612 for (i = 0; i < ndoms_cur; i++) {
1613 for (j = 0; j < n && !new_topology; j++) {
1614 if (cpumask_equal(doms_cur[i], doms_new[j])
1615 && dattrs_equal(dattr_cur, i, dattr_new, j))
1616 goto match1;
1617 }
1618 /* No match - a current sched domain not in new doms_new[] */
1619 detach_destroy_domains(doms_cur[i]);
1620match1:
1621 ;
1622 }
1623
1624 n = ndoms_cur;
1625 if (doms_new == NULL) {
1626 n = 0;
1627 doms_new = &fallback_doms;
1628 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
1629 WARN_ON_ONCE(dattr_new);
1630 }
1631
1632 /* Build new domains: */
1633 for (i = 0; i < ndoms_new; i++) {
1634 for (j = 0; j < n && !new_topology; j++) {
1635 if (cpumask_equal(doms_new[i], doms_cur[j])
1636 && dattrs_equal(dattr_new, i, dattr_cur, j))
1637 goto match2;
1638 }
1639 /* No match - add a new doms_new */
1640 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
1641match2:
1642 ;
1643 }
1644
1645 /* Remember the new sched domains: */
1646 if (doms_cur != &fallback_doms)
1647 free_sched_domains(doms_cur, ndoms_cur);
1648
1649 kfree(dattr_cur);
1650 doms_cur = doms_new;
1651 dattr_cur = dattr_new;
1652 ndoms_cur = ndoms_new;
1653
1654 register_sched_domain_sysctl();
1655
1656 mutex_unlock(&sched_domains_mutex);
1657}
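/*
 * Editor's sketch of a typical caller (cpuset-style), not part of this file;
 * the helper name is made up.  The hotplug lock must be held, as noted above.
 */
#if 0
static void example_rebuild_two_partitions(const struct cpumask *a,
					    const struct cpumask *b)
{
	cpumask_var_t *doms = alloc_sched_domains(2);

	if (!doms) {
		/* Fall back to a single domain spanning the active CPUs: */
		partition_sched_domains(1, NULL, NULL);
		return;
	}

	cpumask_copy(doms[0], a);
	cpumask_copy(doms[1], b);

	/* partition_sched_domains() takes ownership of @doms: */
	partition_sched_domains(2, doms, NULL);
}
#endif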
1658