author     Ingo Molnar <mingo@kernel.org>    2017-02-01 07:10:18 -0500
committer  Ingo Molnar <mingo@kernel.org>    2017-02-07 04:58:12 -0500
commit     f2cb13609d5397cdd747f3ed6fb651233851717d (patch)
tree       0714785a7b04430b41346653178afc7b9a7bca70 /kernel
parent     004172bdad644327dc7a6543186b9d7b529ee944 (diff)
sched/topology: Split out scheduler topology code from core.c into topology.c
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/sched/Makefile    |    2
-rw-r--r--  kernel/sched/core.c      | 1659
-rw-r--r--  kernel/sched/sched.h     |   23
-rw-r--r--  kernel/sched/topology.c  | 1658
4 files changed, 1684 insertions, 1658 deletions
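
The kernel/sched/sched.h hunk (+23 lines) is not part of this excerpt, but the core.c changes below drop 'static' from sched_smp_initialized, set_rq_online() and set_rq_offline(), and remove the DEFINE_MUTEX(sched_domains_mutex) definition from core.c, so the shared header presumably carries declarations along these lines. This is only a sketch of what the split implies, not the actual hunk:

/*
 * Hypothetical sketch of the shared declarations implied by this split;
 * the real kernel/sched/sched.h change is not shown in this excerpt.
 */
struct rq;

extern struct mutex sched_domains_mutex;	/* now defined in topology.c (moved out of core.c) */
extern bool sched_smp_initialized;		/* loses 'static' in core.c below */

extern void set_rq_online(struct rq *rq);	/* loses 'static' in core.c below */
extern void set_rq_offline(struct rq *rq);

With declarations like these in the internal header, the ~1650 lines of sched-domain, root-domain and NUMA setup code removed from core.c below can keep using the runqueue online/offline helpers from the new kernel/sched/topology.c.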
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 5e59b832ae2b..130ce8ac725b 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -18,7 +18,7 @@ endif
18 | obj-y += core.o loadavg.o clock.o cputime.o | 18 | obj-y += core.o loadavg.o clock.o cputime.o |
19 | obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o | 19 | obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o |
20 | obj-y += wait.o swait.o completion.o idle.o | 20 | obj-y += wait.o swait.o completion.o idle.o |
21 | obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o | 21 | obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o |
22 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o | 22 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o |
23 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 23 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
24 | obj-$(CONFIG_SCHED_DEBUG) += debug.o | 24 | obj-$(CONFIG_SCHED_DEBUG) += debug.o |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1cea6c61fb01..e4aa470ed454 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -31,7 +31,6 @@
31 | #define CREATE_TRACE_POINTS | 31 | #define CREATE_TRACE_POINTS |
32 | #include <trace/events/sched.h> | 32 | #include <trace/events/sched.h> |
33 | 33 | ||
34 | DEFINE_MUTEX(sched_domains_mutex); | ||
35 | DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 34 | DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
36 | 35 | ||
37 | /* | 36 | /* |
@@ -5446,7 +5445,7 @@ out:
5446 | 5445 | ||
5447 | #ifdef CONFIG_SMP | 5446 | #ifdef CONFIG_SMP |
5448 | 5447 | ||
5449 | static bool sched_smp_initialized __read_mostly; | 5448 | bool sched_smp_initialized __read_mostly; |
5450 | 5449 | ||
5451 | #ifdef CONFIG_NUMA_BALANCING | 5450 | #ifdef CONFIG_NUMA_BALANCING |
5452 | /* Migrate current task p to target_cpu */ | 5451 | /* Migrate current task p to target_cpu */ |
@@ -5643,7 +5642,7 @@ static void migrate_tasks(struct rq *dead_rq)
5643 | } | 5642 | } |
5644 | #endif /* CONFIG_HOTPLUG_CPU */ | 5643 | #endif /* CONFIG_HOTPLUG_CPU */ |
5645 | 5644 | ||
5646 | static void set_rq_online(struct rq *rq) | 5645 | void set_rq_online(struct rq *rq) |
5647 | { | 5646 | { |
5648 | if (!rq->online) { | 5647 | if (!rq->online) { |
5649 | const struct sched_class *class; | 5648 | const struct sched_class *class; |
@@ -5658,7 +5657,7 @@ static void set_rq_online(struct rq *rq)
5658 | } | 5657 | } |
5659 | } | 5658 | } |
5660 | 5659 | ||
5661 | static void set_rq_offline(struct rq *rq) | 5660 | void set_rq_offline(struct rq *rq) |
5662 | { | 5661 | { |
5663 | if (rq->online) { | 5662 | if (rq->online) { |
5664 | const struct sched_class *class; | 5663 | const struct sched_class *class; |
@@ -5680,1658 +5679,6 @@ static void set_cpu_rq_start_time(unsigned int cpu)
5680 | rq->age_stamp = sched_clock_cpu(cpu); | 5679 | rq->age_stamp = sched_clock_cpu(cpu); |
5681 | } | 5680 | } |
5682 | 5681 | ||
5683 | /* Protected by sched_domains_mutex: */ | ||
5684 | static cpumask_var_t sched_domains_tmpmask; | ||
5685 | |||
5686 | #ifdef CONFIG_SCHED_DEBUG | ||
5687 | |||
5688 | static __read_mostly int sched_debug_enabled; | ||
5689 | |||
5690 | static int __init sched_debug_setup(char *str) | ||
5691 | { | ||
5692 | sched_debug_enabled = 1; | ||
5693 | |||
5694 | return 0; | ||
5695 | } | ||
5696 | early_param("sched_debug", sched_debug_setup); | ||
5697 | |||
5698 | static inline bool sched_debug(void) | ||
5699 | { | ||
5700 | return sched_debug_enabled; | ||
5701 | } | ||
5702 | |||
5703 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | ||
5704 | struct cpumask *groupmask) | ||
5705 | { | ||
5706 | struct sched_group *group = sd->groups; | ||
5707 | |||
5708 | cpumask_clear(groupmask); | ||
5709 | |||
5710 | printk(KERN_DEBUG "%*s domain %d: ", level, "", level); | ||
5711 | |||
5712 | if (!(sd->flags & SD_LOAD_BALANCE)) { | ||
5713 | printk("does not load-balance\n"); | ||
5714 | if (sd->parent) | ||
5715 | printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" | ||
5716 | " has parent"); | ||
5717 | return -1; | ||
5718 | } | ||
5719 | |||
5720 | printk(KERN_CONT "span %*pbl level %s\n", | ||
5721 | cpumask_pr_args(sched_domain_span(sd)), sd->name); | ||
5722 | |||
5723 | if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { | ||
5724 | printk(KERN_ERR "ERROR: domain->span does not contain " | ||
5725 | "CPU%d\n", cpu); | ||
5726 | } | ||
5727 | if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { | ||
5728 | printk(KERN_ERR "ERROR: domain->groups does not contain" | ||
5729 | " CPU%d\n", cpu); | ||
5730 | } | ||
5731 | |||
5732 | printk(KERN_DEBUG "%*s groups:", level + 1, ""); | ||
5733 | do { | ||
5734 | if (!group) { | ||
5735 | printk("\n"); | ||
5736 | printk(KERN_ERR "ERROR: group is NULL\n"); | ||
5737 | break; | ||
5738 | } | ||
5739 | |||
5740 | if (!cpumask_weight(sched_group_cpus(group))) { | ||
5741 | printk(KERN_CONT "\n"); | ||
5742 | printk(KERN_ERR "ERROR: empty group\n"); | ||
5743 | break; | ||
5744 | } | ||
5745 | |||
5746 | if (!(sd->flags & SD_OVERLAP) && | ||
5747 | cpumask_intersects(groupmask, sched_group_cpus(group))) { | ||
5748 | printk(KERN_CONT "\n"); | ||
5749 | printk(KERN_ERR "ERROR: repeated CPUs\n"); | ||
5750 | break; | ||
5751 | } | ||
5752 | |||
5753 | cpumask_or(groupmask, groupmask, sched_group_cpus(group)); | ||
5754 | |||
5755 | printk(KERN_CONT " %*pbl", | ||
5756 | cpumask_pr_args(sched_group_cpus(group))); | ||
5757 | if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { | ||
5758 | printk(KERN_CONT " (cpu_capacity = %lu)", | ||
5759 | group->sgc->capacity); | ||
5760 | } | ||
5761 | |||
5762 | group = group->next; | ||
5763 | } while (group != sd->groups); | ||
5764 | printk(KERN_CONT "\n"); | ||
5765 | |||
5766 | if (!cpumask_equal(sched_domain_span(sd), groupmask)) | ||
5767 | printk(KERN_ERR "ERROR: groups don't span domain->span\n"); | ||
5768 | |||
5769 | if (sd->parent && | ||
5770 | !cpumask_subset(groupmask, sched_domain_span(sd->parent))) | ||
5771 | printk(KERN_ERR "ERROR: parent span is not a superset " | ||
5772 | "of domain->span\n"); | ||
5773 | return 0; | ||
5774 | } | ||
5775 | |||
5776 | static void sched_domain_debug(struct sched_domain *sd, int cpu) | ||
5777 | { | ||
5778 | int level = 0; | ||
5779 | |||
5780 | if (!sched_debug_enabled) | ||
5781 | return; | ||
5782 | |||
5783 | if (!sd) { | ||
5784 | printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); | ||
5785 | return; | ||
5786 | } | ||
5787 | |||
5788 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); | ||
5789 | |||
5790 | for (;;) { | ||
5791 | if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) | ||
5792 | break; | ||
5793 | level++; | ||
5794 | sd = sd->parent; | ||
5795 | if (!sd) | ||
5796 | break; | ||
5797 | } | ||
5798 | } | ||
5799 | #else /* !CONFIG_SCHED_DEBUG */ | ||
5800 | |||
5801 | # define sched_debug_enabled 0 | ||
5802 | # define sched_domain_debug(sd, cpu) do { } while (0) | ||
5803 | static inline bool sched_debug(void) | ||
5804 | { | ||
5805 | return false; | ||
5806 | } | ||
5807 | #endif /* CONFIG_SCHED_DEBUG */ | ||
5808 | |||
5809 | static int sd_degenerate(struct sched_domain *sd) | ||
5810 | { | ||
5811 | if (cpumask_weight(sched_domain_span(sd)) == 1) | ||
5812 | return 1; | ||
5813 | |||
5814 | /* Following flags need at least 2 groups */ | ||
5815 | if (sd->flags & (SD_LOAD_BALANCE | | ||
5816 | SD_BALANCE_NEWIDLE | | ||
5817 | SD_BALANCE_FORK | | ||
5818 | SD_BALANCE_EXEC | | ||
5819 | SD_SHARE_CPUCAPACITY | | ||
5820 | SD_ASYM_CPUCAPACITY | | ||
5821 | SD_SHARE_PKG_RESOURCES | | ||
5822 | SD_SHARE_POWERDOMAIN)) { | ||
5823 | if (sd->groups != sd->groups->next) | ||
5824 | return 0; | ||
5825 | } | ||
5826 | |||
5827 | /* Following flags don't use groups */ | ||
5828 | if (sd->flags & (SD_WAKE_AFFINE)) | ||
5829 | return 0; | ||
5830 | |||
5831 | return 1; | ||
5832 | } | ||
5833 | |||
5834 | static int | ||
5835 | sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | ||
5836 | { | ||
5837 | unsigned long cflags = sd->flags, pflags = parent->flags; | ||
5838 | |||
5839 | if (sd_degenerate(parent)) | ||
5840 | return 1; | ||
5841 | |||
5842 | if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) | ||
5843 | return 0; | ||
5844 | |||
5845 | /* Flags needing groups don't count if only 1 group in parent */ | ||
5846 | if (parent->groups == parent->groups->next) { | ||
5847 | pflags &= ~(SD_LOAD_BALANCE | | ||
5848 | SD_BALANCE_NEWIDLE | | ||
5849 | SD_BALANCE_FORK | | ||
5850 | SD_BALANCE_EXEC | | ||
5851 | SD_ASYM_CPUCAPACITY | | ||
5852 | SD_SHARE_CPUCAPACITY | | ||
5853 | SD_SHARE_PKG_RESOURCES | | ||
5854 | SD_PREFER_SIBLING | | ||
5855 | SD_SHARE_POWERDOMAIN); | ||
5856 | if (nr_node_ids == 1) | ||
5857 | pflags &= ~SD_SERIALIZE; | ||
5858 | } | ||
5859 | if (~cflags & pflags) | ||
5860 | return 0; | ||
5861 | |||
5862 | return 1; | ||
5863 | } | ||
5864 | |||
5865 | static void free_rootdomain(struct rcu_head *rcu) | ||
5866 | { | ||
5867 | struct root_domain *rd = container_of(rcu, struct root_domain, rcu); | ||
5868 | |||
5869 | cpupri_cleanup(&rd->cpupri); | ||
5870 | cpudl_cleanup(&rd->cpudl); | ||
5871 | free_cpumask_var(rd->dlo_mask); | ||
5872 | free_cpumask_var(rd->rto_mask); | ||
5873 | free_cpumask_var(rd->online); | ||
5874 | free_cpumask_var(rd->span); | ||
5875 | kfree(rd); | ||
5876 | } | ||
5877 | |||
5878 | static void rq_attach_root(struct rq *rq, struct root_domain *rd) | ||
5879 | { | ||
5880 | struct root_domain *old_rd = NULL; | ||
5881 | unsigned long flags; | ||
5882 | |||
5883 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
5884 | |||
5885 | if (rq->rd) { | ||
5886 | old_rd = rq->rd; | ||
5887 | |||
5888 | if (cpumask_test_cpu(rq->cpu, old_rd->online)) | ||
5889 | set_rq_offline(rq); | ||
5890 | |||
5891 | cpumask_clear_cpu(rq->cpu, old_rd->span); | ||
5892 | |||
5893 | /* | ||
5894 | * If we dont want to free the old_rd yet then | ||
5895 | * set old_rd to NULL to skip the freeing later | ||
5896 | * in this function: | ||
5897 | */ | ||
5898 | if (!atomic_dec_and_test(&old_rd->refcount)) | ||
5899 | old_rd = NULL; | ||
5900 | } | ||
5901 | |||
5902 | atomic_inc(&rd->refcount); | ||
5903 | rq->rd = rd; | ||
5904 | |||
5905 | cpumask_set_cpu(rq->cpu, rd->span); | ||
5906 | if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) | ||
5907 | set_rq_online(rq); | ||
5908 | |||
5909 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
5910 | |||
5911 | if (old_rd) | ||
5912 | call_rcu_sched(&old_rd->rcu, free_rootdomain); | ||
5913 | } | ||
5914 | |||
5915 | static int init_rootdomain(struct root_domain *rd) | ||
5916 | { | ||
5917 | memset(rd, 0, sizeof(*rd)); | ||
5918 | |||
5919 | if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) | ||
5920 | goto out; | ||
5921 | if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL)) | ||
5922 | goto free_span; | ||
5923 | if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) | ||
5924 | goto free_online; | ||
5925 | if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) | ||
5926 | goto free_dlo_mask; | ||
5927 | |||
5928 | init_dl_bw(&rd->dl_bw); | ||
5929 | if (cpudl_init(&rd->cpudl) != 0) | ||
5930 | goto free_rto_mask; | ||
5931 | |||
5932 | if (cpupri_init(&rd->cpupri) != 0) | ||
5933 | goto free_cpudl; | ||
5934 | return 0; | ||
5935 | |||
5936 | free_cpudl: | ||
5937 | cpudl_cleanup(&rd->cpudl); | ||
5938 | free_rto_mask: | ||
5939 | free_cpumask_var(rd->rto_mask); | ||
5940 | free_dlo_mask: | ||
5941 | free_cpumask_var(rd->dlo_mask); | ||
5942 | free_online: | ||
5943 | free_cpumask_var(rd->online); | ||
5944 | free_span: | ||
5945 | free_cpumask_var(rd->span); | ||
5946 | out: | ||
5947 | return -ENOMEM; | ||
5948 | } | ||
5949 | |||
5950 | /* | ||
5951 | * By default the system creates a single root-domain with all CPUs as | ||
5952 | * members (mimicking the global state we have today). | ||
5953 | */ | ||
5954 | struct root_domain def_root_domain; | ||
5955 | |||
5956 | static void init_defrootdomain(void) | ||
5957 | { | ||
5958 | init_rootdomain(&def_root_domain); | ||
5959 | |||
5960 | atomic_set(&def_root_domain.refcount, 1); | ||
5961 | } | ||
5962 | |||
5963 | static struct root_domain *alloc_rootdomain(void) | ||
5964 | { | ||
5965 | struct root_domain *rd; | ||
5966 | |||
5967 | rd = kmalloc(sizeof(*rd), GFP_KERNEL); | ||
5968 | if (!rd) | ||
5969 | return NULL; | ||
5970 | |||
5971 | if (init_rootdomain(rd) != 0) { | ||
5972 | kfree(rd); | ||
5973 | return NULL; | ||
5974 | } | ||
5975 | |||
5976 | return rd; | ||
5977 | } | ||
5978 | |||
5979 | static void free_sched_groups(struct sched_group *sg, int free_sgc) | ||
5980 | { | ||
5981 | struct sched_group *tmp, *first; | ||
5982 | |||
5983 | if (!sg) | ||
5984 | return; | ||
5985 | |||
5986 | first = sg; | ||
5987 | do { | ||
5988 | tmp = sg->next; | ||
5989 | |||
5990 | if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) | ||
5991 | kfree(sg->sgc); | ||
5992 | |||
5993 | kfree(sg); | ||
5994 | sg = tmp; | ||
5995 | } while (sg != first); | ||
5996 | } | ||
5997 | |||
5998 | static void destroy_sched_domain(struct sched_domain *sd) | ||
5999 | { | ||
6000 | /* | ||
6001 | * If its an overlapping domain it has private groups, iterate and | ||
6002 | * nuke them all. | ||
6003 | */ | ||
6004 | if (sd->flags & SD_OVERLAP) { | ||
6005 | free_sched_groups(sd->groups, 1); | ||
6006 | } else if (atomic_dec_and_test(&sd->groups->ref)) { | ||
6007 | kfree(sd->groups->sgc); | ||
6008 | kfree(sd->groups); | ||
6009 | } | ||
6010 | if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) | ||
6011 | kfree(sd->shared); | ||
6012 | kfree(sd); | ||
6013 | } | ||
6014 | |||
6015 | static void destroy_sched_domains_rcu(struct rcu_head *rcu) | ||
6016 | { | ||
6017 | struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); | ||
6018 | |||
6019 | while (sd) { | ||
6020 | struct sched_domain *parent = sd->parent; | ||
6021 | destroy_sched_domain(sd); | ||
6022 | sd = parent; | ||
6023 | } | ||
6024 | } | ||
6025 | |||
6026 | static void destroy_sched_domains(struct sched_domain *sd) | ||
6027 | { | ||
6028 | if (sd) | ||
6029 | call_rcu(&sd->rcu, destroy_sched_domains_rcu); | ||
6030 | } | ||
6031 | |||
6032 | /* | ||
6033 | * Keep a special pointer to the highest sched_domain that has | ||
6034 | * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this | ||
6035 | * allows us to avoid some pointer chasing select_idle_sibling(). | ||
6036 | * | ||
6037 | * Also keep a unique ID per domain (we use the first CPU number in | ||
6038 | * the cpumask of the domain), this allows us to quickly tell if | ||
6039 | * two CPUs are in the same cache domain, see cpus_share_cache(). | ||
6040 | */ | ||
6041 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); | ||
6042 | DEFINE_PER_CPU(int, sd_llc_size); | ||
6043 | DEFINE_PER_CPU(int, sd_llc_id); | ||
6044 | DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); | ||
6045 | DEFINE_PER_CPU(struct sched_domain *, sd_numa); | ||
6046 | DEFINE_PER_CPU(struct sched_domain *, sd_asym); | ||
6047 | |||
6048 | static void update_top_cache_domain(int cpu) | ||
6049 | { | ||
6050 | struct sched_domain_shared *sds = NULL; | ||
6051 | struct sched_domain *sd; | ||
6052 | int id = cpu; | ||
6053 | int size = 1; | ||
6054 | |||
6055 | sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); | ||
6056 | if (sd) { | ||
6057 | id = cpumask_first(sched_domain_span(sd)); | ||
6058 | size = cpumask_weight(sched_domain_span(sd)); | ||
6059 | sds = sd->shared; | ||
6060 | } | ||
6061 | |||
6062 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); | ||
6063 | per_cpu(sd_llc_size, cpu) = size; | ||
6064 | per_cpu(sd_llc_id, cpu) = id; | ||
6065 | rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); | ||
6066 | |||
6067 | sd = lowest_flag_domain(cpu, SD_NUMA); | ||
6068 | rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); | ||
6069 | |||
6070 | sd = highest_flag_domain(cpu, SD_ASYM_PACKING); | ||
6071 | rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); | ||
6072 | } | ||
6073 | |||
6074 | /* | ||
6075 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | ||
6076 | * hold the hotplug lock. | ||
6077 | */ | ||
6078 | static void | ||
6079 | cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | ||
6080 | { | ||
6081 | struct rq *rq = cpu_rq(cpu); | ||
6082 | struct sched_domain *tmp; | ||
6083 | |||
6084 | /* Remove the sched domains which do not contribute to scheduling. */ | ||
6085 | for (tmp = sd; tmp; ) { | ||
6086 | struct sched_domain *parent = tmp->parent; | ||
6087 | if (!parent) | ||
6088 | break; | ||
6089 | |||
6090 | if (sd_parent_degenerate(tmp, parent)) { | ||
6091 | tmp->parent = parent->parent; | ||
6092 | if (parent->parent) | ||
6093 | parent->parent->child = tmp; | ||
6094 | /* | ||
6095 | * Transfer SD_PREFER_SIBLING down in case of a | ||
6096 | * degenerate parent; the spans match for this | ||
6097 | * so the property transfers. | ||
6098 | */ | ||
6099 | if (parent->flags & SD_PREFER_SIBLING) | ||
6100 | tmp->flags |= SD_PREFER_SIBLING; | ||
6101 | destroy_sched_domain(parent); | ||
6102 | } else | ||
6103 | tmp = tmp->parent; | ||
6104 | } | ||
6105 | |||
6106 | if (sd && sd_degenerate(sd)) { | ||
6107 | tmp = sd; | ||
6108 | sd = sd->parent; | ||
6109 | destroy_sched_domain(tmp); | ||
6110 | if (sd) | ||
6111 | sd->child = NULL; | ||
6112 | } | ||
6113 | |||
6114 | sched_domain_debug(sd, cpu); | ||
6115 | |||
6116 | rq_attach_root(rq, rd); | ||
6117 | tmp = rq->sd; | ||
6118 | rcu_assign_pointer(rq->sd, sd); | ||
6119 | destroy_sched_domains(tmp); | ||
6120 | |||
6121 | update_top_cache_domain(cpu); | ||
6122 | } | ||
6123 | |||
6124 | /* Setup the mask of CPUs configured for isolated domains */ | ||
6125 | static int __init isolated_cpu_setup(char *str) | ||
6126 | { | ||
6127 | int ret; | ||
6128 | |||
6129 | alloc_bootmem_cpumask_var(&cpu_isolated_map); | ||
6130 | ret = cpulist_parse(str, cpu_isolated_map); | ||
6131 | if (ret) { | ||
6132 | pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); | ||
6133 | return 0; | ||
6134 | } | ||
6135 | return 1; | ||
6136 | } | ||
6137 | __setup("isolcpus=", isolated_cpu_setup); | ||
6138 | |||
6139 | struct s_data { | ||
6140 | struct sched_domain ** __percpu sd; | ||
6141 | struct root_domain *rd; | ||
6142 | }; | ||
6143 | |||
6144 | enum s_alloc { | ||
6145 | sa_rootdomain, | ||
6146 | sa_sd, | ||
6147 | sa_sd_storage, | ||
6148 | sa_none, | ||
6149 | }; | ||
6150 | |||
6151 | /* | ||
6152 | * Build an iteration mask that can exclude certain CPUs from the upwards | ||
6153 | * domain traversal. | ||
6154 | * | ||
6155 | * Asymmetric node setups can result in situations where the domain tree is of | ||
6156 | * unequal depth, make sure to skip domains that already cover the entire | ||
6157 | * range. | ||
6158 | * | ||
6159 | * In that case build_sched_domains() will have terminated the iteration early | ||
6160 | * and our sibling sd spans will be empty. Domains should always include the | ||
6161 | * CPU they're built on, so check that. | ||
6162 | */ | ||
6163 | static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) | ||
6164 | { | ||
6165 | const struct cpumask *span = sched_domain_span(sd); | ||
6166 | struct sd_data *sdd = sd->private; | ||
6167 | struct sched_domain *sibling; | ||
6168 | int i; | ||
6169 | |||
6170 | for_each_cpu(i, span) { | ||
6171 | sibling = *per_cpu_ptr(sdd->sd, i); | ||
6172 | if (!cpumask_test_cpu(i, sched_domain_span(sibling))) | ||
6173 | continue; | ||
6174 | |||
6175 | cpumask_set_cpu(i, sched_group_mask(sg)); | ||
6176 | } | ||
6177 | } | ||
6178 | |||
6179 | /* | ||
6180 | * Return the canonical balance CPU for this group, this is the first CPU | ||
6181 | * of this group that's also in the iteration mask. | ||
6182 | */ | ||
6183 | int group_balance_cpu(struct sched_group *sg) | ||
6184 | { | ||
6185 | return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); | ||
6186 | } | ||
6187 | |||
6188 | static int | ||
6189 | build_overlap_sched_groups(struct sched_domain *sd, int cpu) | ||
6190 | { | ||
6191 | struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; | ||
6192 | const struct cpumask *span = sched_domain_span(sd); | ||
6193 | struct cpumask *covered = sched_domains_tmpmask; | ||
6194 | struct sd_data *sdd = sd->private; | ||
6195 | struct sched_domain *sibling; | ||
6196 | int i; | ||
6197 | |||
6198 | cpumask_clear(covered); | ||
6199 | |||
6200 | for_each_cpu(i, span) { | ||
6201 | struct cpumask *sg_span; | ||
6202 | |||
6203 | if (cpumask_test_cpu(i, covered)) | ||
6204 | continue; | ||
6205 | |||
6206 | sibling = *per_cpu_ptr(sdd->sd, i); | ||
6207 | |||
6208 | /* See the comment near build_group_mask(). */ | ||
6209 | if (!cpumask_test_cpu(i, sched_domain_span(sibling))) | ||
6210 | continue; | ||
6211 | |||
6212 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
6213 | GFP_KERNEL, cpu_to_node(cpu)); | ||
6214 | |||
6215 | if (!sg) | ||
6216 | goto fail; | ||
6217 | |||
6218 | sg_span = sched_group_cpus(sg); | ||
6219 | if (sibling->child) | ||
6220 | cpumask_copy(sg_span, sched_domain_span(sibling->child)); | ||
6221 | else | ||
6222 | cpumask_set_cpu(i, sg_span); | ||
6223 | |||
6224 | cpumask_or(covered, covered, sg_span); | ||
6225 | |||
6226 | sg->sgc = *per_cpu_ptr(sdd->sgc, i); | ||
6227 | if (atomic_inc_return(&sg->sgc->ref) == 1) | ||
6228 | build_group_mask(sd, sg); | ||
6229 | |||
6230 | /* | ||
6231 | * Initialize sgc->capacity such that even if we mess up the | ||
6232 | * domains and no possible iteration will get us here, we won't | ||
6233 | * die on a /0 trap. | ||
6234 | */ | ||
6235 | sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); | ||
6236 | sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; | ||
6237 | |||
6238 | /* | ||
6239 | * Make sure the first group of this domain contains the | ||
6240 | * canonical balance CPU. Otherwise the sched_domain iteration | ||
6241 | * breaks. See update_sg_lb_stats(). | ||
6242 | */ | ||
6243 | if ((!groups && cpumask_test_cpu(cpu, sg_span)) || | ||
6244 | group_balance_cpu(sg) == cpu) | ||
6245 | groups = sg; | ||
6246 | |||
6247 | if (!first) | ||
6248 | first = sg; | ||
6249 | if (last) | ||
6250 | last->next = sg; | ||
6251 | last = sg; | ||
6252 | last->next = first; | ||
6253 | } | ||
6254 | sd->groups = groups; | ||
6255 | |||
6256 | return 0; | ||
6257 | |||
6258 | fail: | ||
6259 | free_sched_groups(first, 0); | ||
6260 | |||
6261 | return -ENOMEM; | ||
6262 | } | ||
6263 | |||
6264 | static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) | ||
6265 | { | ||
6266 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); | ||
6267 | struct sched_domain *child = sd->child; | ||
6268 | |||
6269 | if (child) | ||
6270 | cpu = cpumask_first(sched_domain_span(child)); | ||
6271 | |||
6272 | if (sg) { | ||
6273 | *sg = *per_cpu_ptr(sdd->sg, cpu); | ||
6274 | (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); | ||
6275 | |||
6276 | /* For claim_allocations: */ | ||
6277 | atomic_set(&(*sg)->sgc->ref, 1); | ||
6278 | } | ||
6279 | |||
6280 | return cpu; | ||
6281 | } | ||
6282 | |||
6283 | /* | ||
6284 | * build_sched_groups will build a circular linked list of the groups | ||
6285 | * covered by the given span, and will set each group's ->cpumask correctly, | ||
6286 | * and ->cpu_capacity to 0. | ||
6287 | * | ||
6288 | * Assumes the sched_domain tree is fully constructed | ||
6289 | */ | ||
6290 | static int | ||
6291 | build_sched_groups(struct sched_domain *sd, int cpu) | ||
6292 | { | ||
6293 | struct sched_group *first = NULL, *last = NULL; | ||
6294 | struct sd_data *sdd = sd->private; | ||
6295 | const struct cpumask *span = sched_domain_span(sd); | ||
6296 | struct cpumask *covered; | ||
6297 | int i; | ||
6298 | |||
6299 | get_group(cpu, sdd, &sd->groups); | ||
6300 | atomic_inc(&sd->groups->ref); | ||
6301 | |||
6302 | if (cpu != cpumask_first(span)) | ||
6303 | return 0; | ||
6304 | |||
6305 | lockdep_assert_held(&sched_domains_mutex); | ||
6306 | covered = sched_domains_tmpmask; | ||
6307 | |||
6308 | cpumask_clear(covered); | ||
6309 | |||
6310 | for_each_cpu(i, span) { | ||
6311 | struct sched_group *sg; | ||
6312 | int group, j; | ||
6313 | |||
6314 | if (cpumask_test_cpu(i, covered)) | ||
6315 | continue; | ||
6316 | |||
6317 | group = get_group(i, sdd, &sg); | ||
6318 | cpumask_setall(sched_group_mask(sg)); | ||
6319 | |||
6320 | for_each_cpu(j, span) { | ||
6321 | if (get_group(j, sdd, NULL) != group) | ||
6322 | continue; | ||
6323 | |||
6324 | cpumask_set_cpu(j, covered); | ||
6325 | cpumask_set_cpu(j, sched_group_cpus(sg)); | ||
6326 | } | ||
6327 | |||
6328 | if (!first) | ||
6329 | first = sg; | ||
6330 | if (last) | ||
6331 | last->next = sg; | ||
6332 | last = sg; | ||
6333 | } | ||
6334 | last->next = first; | ||
6335 | |||
6336 | return 0; | ||
6337 | } | ||
6338 | |||
6339 | /* | ||
6340 | * Initialize sched groups cpu_capacity. | ||
6341 | * | ||
6342 | * cpu_capacity indicates the capacity of sched group, which is used while | ||
6343 | * distributing the load between different sched groups in a sched domain. | ||
6344 | * Typically cpu_capacity for all the groups in a sched domain will be same | ||
6345 | * unless there are asymmetries in the topology. If there are asymmetries, | ||
6346 | * group having more cpu_capacity will pickup more load compared to the | ||
6347 | * group having less cpu_capacity. | ||
6348 | */ | ||
6349 | static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) | ||
6350 | { | ||
6351 | struct sched_group *sg = sd->groups; | ||
6352 | |||
6353 | WARN_ON(!sg); | ||
6354 | |||
6355 | do { | ||
6356 | int cpu, max_cpu = -1; | ||
6357 | |||
6358 | sg->group_weight = cpumask_weight(sched_group_cpus(sg)); | ||
6359 | |||
6360 | if (!(sd->flags & SD_ASYM_PACKING)) | ||
6361 | goto next; | ||
6362 | |||
6363 | for_each_cpu(cpu, sched_group_cpus(sg)) { | ||
6364 | if (max_cpu < 0) | ||
6365 | max_cpu = cpu; | ||
6366 | else if (sched_asym_prefer(cpu, max_cpu)) | ||
6367 | max_cpu = cpu; | ||
6368 | } | ||
6369 | sg->asym_prefer_cpu = max_cpu; | ||
6370 | |||
6371 | next: | ||
6372 | sg = sg->next; | ||
6373 | } while (sg != sd->groups); | ||
6374 | |||
6375 | if (cpu != group_balance_cpu(sg)) | ||
6376 | return; | ||
6377 | |||
6378 | update_group_capacity(sd, cpu); | ||
6379 | } | ||
6380 | |||
6381 | /* | ||
6382 | * Initializers for schedule domains | ||
6383 | * Non-inlined to reduce accumulated stack pressure in build_sched_domains() | ||
6384 | */ | ||
6385 | |||
6386 | static int default_relax_domain_level = -1; | ||
6387 | int sched_domain_level_max; | ||
6388 | |||
6389 | static int __init setup_relax_domain_level(char *str) | ||
6390 | { | ||
6391 | if (kstrtoint(str, 0, &default_relax_domain_level)) | ||
6392 | pr_warn("Unable to set relax_domain_level\n"); | ||
6393 | |||
6394 | return 1; | ||
6395 | } | ||
6396 | __setup("relax_domain_level=", setup_relax_domain_level); | ||
6397 | |||
6398 | static void set_domain_attribute(struct sched_domain *sd, | ||
6399 | struct sched_domain_attr *attr) | ||
6400 | { | ||
6401 | int request; | ||
6402 | |||
6403 | if (!attr || attr->relax_domain_level < 0) { | ||
6404 | if (default_relax_domain_level < 0) | ||
6405 | return; | ||
6406 | else | ||
6407 | request = default_relax_domain_level; | ||
6408 | } else | ||
6409 | request = attr->relax_domain_level; | ||
6410 | if (request < sd->level) { | ||
6411 | /* Turn off idle balance on this domain: */ | ||
6412 | sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); | ||
6413 | } else { | ||
6414 | /* Turn on idle balance on this domain: */ | ||
6415 | sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); | ||
6416 | } | ||
6417 | } | ||
6418 | |||
6419 | static void __sdt_free(const struct cpumask *cpu_map); | ||
6420 | static int __sdt_alloc(const struct cpumask *cpu_map); | ||
6421 | |||
6422 | static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | ||
6423 | const struct cpumask *cpu_map) | ||
6424 | { | ||
6425 | switch (what) { | ||
6426 | case sa_rootdomain: | ||
6427 | if (!atomic_read(&d->rd->refcount)) | ||
6428 | free_rootdomain(&d->rd->rcu); | ||
6429 | /* Fall through */ | ||
6430 | case sa_sd: | ||
6431 | free_percpu(d->sd); | ||
6432 | /* Fall through */ | ||
6433 | case sa_sd_storage: | ||
6434 | __sdt_free(cpu_map); | ||
6435 | /* Fall through */ | ||
6436 | case sa_none: | ||
6437 | break; | ||
6438 | } | ||
6439 | } | ||
6440 | |||
6441 | static enum s_alloc | ||
6442 | __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) | ||
6443 | { | ||
6444 | memset(d, 0, sizeof(*d)); | ||
6445 | |||
6446 | if (__sdt_alloc(cpu_map)) | ||
6447 | return sa_sd_storage; | ||
6448 | d->sd = alloc_percpu(struct sched_domain *); | ||
6449 | if (!d->sd) | ||
6450 | return sa_sd_storage; | ||
6451 | d->rd = alloc_rootdomain(); | ||
6452 | if (!d->rd) | ||
6453 | return sa_sd; | ||
6454 | return sa_rootdomain; | ||
6455 | } | ||
6456 | |||
6457 | /* | ||
6458 | * NULL the sd_data elements we've used to build the sched_domain and | ||
6459 | * sched_group structure so that the subsequent __free_domain_allocs() | ||
6460 | * will not free the data we're using. | ||
6461 | */ | ||
6462 | static void claim_allocations(int cpu, struct sched_domain *sd) | ||
6463 | { | ||
6464 | struct sd_data *sdd = sd->private; | ||
6465 | |||
6466 | WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); | ||
6467 | *per_cpu_ptr(sdd->sd, cpu) = NULL; | ||
6468 | |||
6469 | if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref)) | ||
6470 | *per_cpu_ptr(sdd->sds, cpu) = NULL; | ||
6471 | |||
6472 | if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) | ||
6473 | *per_cpu_ptr(sdd->sg, cpu) = NULL; | ||
6474 | |||
6475 | if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) | ||
6476 | *per_cpu_ptr(sdd->sgc, cpu) = NULL; | ||
6477 | } | ||
6478 | |||
6479 | #ifdef CONFIG_NUMA | ||
6480 | static int sched_domains_numa_levels; | ||
6481 | enum numa_topology_type sched_numa_topology_type; | ||
6482 | static int *sched_domains_numa_distance; | ||
6483 | int sched_max_numa_distance; | ||
6484 | static struct cpumask ***sched_domains_numa_masks; | ||
6485 | static int sched_domains_curr_level; | ||
6486 | #endif | ||
6487 | |||
6488 | /* | ||
6489 | * SD_flags allowed in topology descriptions. | ||
6490 | * | ||
6491 | * These flags are purely descriptive of the topology and do not prescribe | ||
6492 | * behaviour. Behaviour is artificial and mapped in the below sd_init() | ||
6493 | * function: | ||
6494 | * | ||
6495 | * SD_SHARE_CPUCAPACITY - describes SMT topologies | ||
6496 | * SD_SHARE_PKG_RESOURCES - describes shared caches | ||
6497 | * SD_NUMA - describes NUMA topologies | ||
6498 | * SD_SHARE_POWERDOMAIN - describes shared power domain | ||
6499 | * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies | ||
6500 | * | ||
6501 | * Odd one out, which beside describing the topology has a quirk also | ||
6502 | * prescribes the desired behaviour that goes along with it: | ||
6503 | * | ||
6504 | * SD_ASYM_PACKING - describes SMT quirks | ||
6505 | */ | ||
6506 | #define TOPOLOGY_SD_FLAGS \ | ||
6507 | (SD_SHARE_CPUCAPACITY | \ | ||
6508 | SD_SHARE_PKG_RESOURCES | \ | ||
6509 | SD_NUMA | \ | ||
6510 | SD_ASYM_PACKING | \ | ||
6511 | SD_ASYM_CPUCAPACITY | \ | ||
6512 | SD_SHARE_POWERDOMAIN) | ||
6513 | |||
6514 | static struct sched_domain * | ||
6515 | sd_init(struct sched_domain_topology_level *tl, | ||
6516 | const struct cpumask *cpu_map, | ||
6517 | struct sched_domain *child, int cpu) | ||
6518 | { | ||
6519 | struct sd_data *sdd = &tl->data; | ||
6520 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); | ||
6521 | int sd_id, sd_weight, sd_flags = 0; | ||
6522 | |||
6523 | #ifdef CONFIG_NUMA | ||
6524 | /* | ||
6525 | * Ugly hack to pass state to sd_numa_mask()... | ||
6526 | */ | ||
6527 | sched_domains_curr_level = tl->numa_level; | ||
6528 | #endif | ||
6529 | |||
6530 | sd_weight = cpumask_weight(tl->mask(cpu)); | ||
6531 | |||
6532 | if (tl->sd_flags) | ||
6533 | sd_flags = (*tl->sd_flags)(); | ||
6534 | if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, | ||
6535 | "wrong sd_flags in topology description\n")) | ||
6536 | sd_flags &= ~TOPOLOGY_SD_FLAGS; | ||
6537 | |||
6538 | *sd = (struct sched_domain){ | ||
6539 | .min_interval = sd_weight, | ||
6540 | .max_interval = 2*sd_weight, | ||
6541 | .busy_factor = 32, | ||
6542 | .imbalance_pct = 125, | ||
6543 | |||
6544 | .cache_nice_tries = 0, | ||
6545 | .busy_idx = 0, | ||
6546 | .idle_idx = 0, | ||
6547 | .newidle_idx = 0, | ||
6548 | .wake_idx = 0, | ||
6549 | .forkexec_idx = 0, | ||
6550 | |||
6551 | .flags = 1*SD_LOAD_BALANCE | ||
6552 | | 1*SD_BALANCE_NEWIDLE | ||
6553 | | 1*SD_BALANCE_EXEC | ||
6554 | | 1*SD_BALANCE_FORK | ||
6555 | | 0*SD_BALANCE_WAKE | ||
6556 | | 1*SD_WAKE_AFFINE | ||
6557 | | 0*SD_SHARE_CPUCAPACITY | ||
6558 | | 0*SD_SHARE_PKG_RESOURCES | ||
6559 | | 0*SD_SERIALIZE | ||
6560 | | 0*SD_PREFER_SIBLING | ||
6561 | | 0*SD_NUMA | ||
6562 | | sd_flags | ||
6563 | , | ||
6564 | |||
6565 | .last_balance = jiffies, | ||
6566 | .balance_interval = sd_weight, | ||
6567 | .smt_gain = 0, | ||
6568 | .max_newidle_lb_cost = 0, | ||
6569 | .next_decay_max_lb_cost = jiffies, | ||
6570 | .child = child, | ||
6571 | #ifdef CONFIG_SCHED_DEBUG | ||
6572 | .name = tl->name, | ||
6573 | #endif | ||
6574 | }; | ||
6575 | |||
6576 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); | ||
6577 | sd_id = cpumask_first(sched_domain_span(sd)); | ||
6578 | |||
6579 | /* | ||
6580 | * Convert topological properties into behaviour. | ||
6581 | */ | ||
6582 | |||
6583 | if (sd->flags & SD_ASYM_CPUCAPACITY) { | ||
6584 | struct sched_domain *t = sd; | ||
6585 | |||
6586 | for_each_lower_domain(t) | ||
6587 | t->flags |= SD_BALANCE_WAKE; | ||
6588 | } | ||
6589 | |||
6590 | if (sd->flags & SD_SHARE_CPUCAPACITY) { | ||
6591 | sd->flags |= SD_PREFER_SIBLING; | ||
6592 | sd->imbalance_pct = 110; | ||
6593 | sd->smt_gain = 1178; /* ~15% */ | ||
6594 | |||
6595 | } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { | ||
6596 | sd->imbalance_pct = 117; | ||
6597 | sd->cache_nice_tries = 1; | ||
6598 | sd->busy_idx = 2; | ||
6599 | |||
6600 | #ifdef CONFIG_NUMA | ||
6601 | } else if (sd->flags & SD_NUMA) { | ||
6602 | sd->cache_nice_tries = 2; | ||
6603 | sd->busy_idx = 3; | ||
6604 | sd->idle_idx = 2; | ||
6605 | |||
6606 | sd->flags |= SD_SERIALIZE; | ||
6607 | if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { | ||
6608 | sd->flags &= ~(SD_BALANCE_EXEC | | ||
6609 | SD_BALANCE_FORK | | ||
6610 | SD_WAKE_AFFINE); | ||
6611 | } | ||
6612 | |||
6613 | #endif | ||
6614 | } else { | ||
6615 | sd->flags |= SD_PREFER_SIBLING; | ||
6616 | sd->cache_nice_tries = 1; | ||
6617 | sd->busy_idx = 2; | ||
6618 | sd->idle_idx = 1; | ||
6619 | } | ||
6620 | |||
6621 | /* | ||
6622 | * For all levels sharing cache; connect a sched_domain_shared | ||
6623 | * instance. | ||
6624 | */ | ||
6625 | if (sd->flags & SD_SHARE_PKG_RESOURCES) { | ||
6626 | sd->shared = *per_cpu_ptr(sdd->sds, sd_id); | ||
6627 | atomic_inc(&sd->shared->ref); | ||
6628 | atomic_set(&sd->shared->nr_busy_cpus, sd_weight); | ||
6629 | } | ||
6630 | |||
6631 | sd->private = sdd; | ||
6632 | |||
6633 | return sd; | ||
6634 | } | ||
6635 | |||
6636 | /* | ||
6637 | * Topology list, bottom-up. | ||
6638 | */ | ||
6639 | static struct sched_domain_topology_level default_topology[] = { | ||
6640 | #ifdef CONFIG_SCHED_SMT | ||
6641 | { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, | ||
6642 | #endif | ||
6643 | #ifdef CONFIG_SCHED_MC | ||
6644 | { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, | ||
6645 | #endif | ||
6646 | { cpu_cpu_mask, SD_INIT_NAME(DIE) }, | ||
6647 | { NULL, }, | ||
6648 | }; | ||
6649 | |||
6650 | static struct sched_domain_topology_level *sched_domain_topology = | ||
6651 | default_topology; | ||
6652 | |||
6653 | #define for_each_sd_topology(tl) \ | ||
6654 | for (tl = sched_domain_topology; tl->mask; tl++) | ||
6655 | |||
6656 | void set_sched_topology(struct sched_domain_topology_level *tl) | ||
6657 | { | ||
6658 | if (WARN_ON_ONCE(sched_smp_initialized)) | ||
6659 | return; | ||
6660 | |||
6661 | sched_domain_topology = tl; | ||
6662 | } | ||
6663 | |||
6664 | #ifdef CONFIG_NUMA | ||
6665 | |||
6666 | static const struct cpumask *sd_numa_mask(int cpu) | ||
6667 | { | ||
6668 | return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; | ||
6669 | } | ||
6670 | |||
6671 | static void sched_numa_warn(const char *str) | ||
6672 | { | ||
6673 | static int done = false; | ||
6674 | int i,j; | ||
6675 | |||
6676 | if (done) | ||
6677 | return; | ||
6678 | |||
6679 | done = true; | ||
6680 | |||
6681 | printk(KERN_WARNING "ERROR: %s\n\n", str); | ||
6682 | |||
6683 | for (i = 0; i < nr_node_ids; i++) { | ||
6684 | printk(KERN_WARNING " "); | ||
6685 | for (j = 0; j < nr_node_ids; j++) | ||
6686 | printk(KERN_CONT "%02d ", node_distance(i,j)); | ||
6687 | printk(KERN_CONT "\n"); | ||
6688 | } | ||
6689 | printk(KERN_WARNING "\n"); | ||
6690 | } | ||
6691 | |||
6692 | bool find_numa_distance(int distance) | ||
6693 | { | ||
6694 | int i; | ||
6695 | |||
6696 | if (distance == node_distance(0, 0)) | ||
6697 | return true; | ||
6698 | |||
6699 | for (i = 0; i < sched_domains_numa_levels; i++) { | ||
6700 | if (sched_domains_numa_distance[i] == distance) | ||
6701 | return true; | ||
6702 | } | ||
6703 | |||
6704 | return false; | ||
6705 | } | ||
6706 | |||
6707 | /* | ||
6708 | * A system can have three types of NUMA topology: | ||
6709 | * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system | ||
6710 | * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes | ||
6711 | * NUMA_BACKPLANE: nodes can reach other nodes through a backplane | ||
6712 | * | ||
6713 | * The difference between a glueless mesh topology and a backplane | ||
6714 | * topology lies in whether communication between not directly | ||
6715 | * connected nodes goes through intermediary nodes (where programs | ||
6716 | * could run), or through backplane controllers. This affects | ||
6717 | * placement of programs. | ||
6718 | * | ||
6719 | * The type of topology can be discerned with the following tests: | ||
6720 | * - If the maximum distance between any nodes is 1 hop, the system | ||
6721 | * is directly connected. | ||
6722 | * - If for two nodes A and B, located N > 1 hops away from each other, | ||
6723 | * there is an intermediary node C, which is < N hops away from both | ||
6724 | * nodes A and B, the system is a glueless mesh. | ||
6725 | */ | ||
6726 | static void init_numa_topology_type(void) | ||
6727 | { | ||
6728 | int a, b, c, n; | ||
6729 | |||
6730 | n = sched_max_numa_distance; | ||
6731 | |||
6732 | if (sched_domains_numa_levels <= 1) { | ||
6733 | sched_numa_topology_type = NUMA_DIRECT; | ||
6734 | return; | ||
6735 | } | ||
6736 | |||
6737 | for_each_online_node(a) { | ||
6738 | for_each_online_node(b) { | ||
6739 | /* Find two nodes furthest removed from each other. */ | ||
6740 | if (node_distance(a, b) < n) | ||
6741 | continue; | ||
6742 | |||
6743 | /* Is there an intermediary node between a and b? */ | ||
6744 | for_each_online_node(c) { | ||
6745 | if (node_distance(a, c) < n && | ||
6746 | node_distance(b, c) < n) { | ||
6747 | sched_numa_topology_type = | ||
6748 | NUMA_GLUELESS_MESH; | ||
6749 | return; | ||
6750 | } | ||
6751 | } | ||
6752 | |||
6753 | sched_numa_topology_type = NUMA_BACKPLANE; | ||
6754 | return; | ||
6755 | } | ||
6756 | } | ||
6757 | } | ||
6758 | |||
6759 | static void sched_init_numa(void) | ||
6760 | { | ||
6761 | int next_distance, curr_distance = node_distance(0, 0); | ||
6762 | struct sched_domain_topology_level *tl; | ||
6763 | int level = 0; | ||
6764 | int i, j, k; | ||
6765 | |||
6766 | sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); | ||
6767 | if (!sched_domains_numa_distance) | ||
6768 | return; | ||
6769 | |||
6770 | /* | ||
6771 | * O(nr_nodes^2) deduplicating selection sort -- in order to find the | ||
6772 | * unique distances in the node_distance() table. | ||
6773 | * | ||
6774 | * Assumes node_distance(0,j) includes all distances in | ||
6775 | * node_distance(i,j) in order to avoid cubic time. | ||
6776 | */ | ||
6777 | next_distance = curr_distance; | ||
6778 | for (i = 0; i < nr_node_ids; i++) { | ||
6779 | for (j = 0; j < nr_node_ids; j++) { | ||
6780 | for (k = 0; k < nr_node_ids; k++) { | ||
6781 | int distance = node_distance(i, k); | ||
6782 | |||
6783 | if (distance > curr_distance && | ||
6784 | (distance < next_distance || | ||
6785 | next_distance == curr_distance)) | ||
6786 | next_distance = distance; | ||
6787 | |||
6788 | /* | ||
6789 | * While not a strong assumption it would be nice to know | ||
6790 | * about cases where if node A is connected to B, B is not | ||
6791 | * equally connected to A. | ||
6792 | */ | ||
6793 | if (sched_debug() && node_distance(k, i) != distance) | ||
6794 | sched_numa_warn("Node-distance not symmetric"); | ||
6795 | |||
6796 | if (sched_debug() && i && !find_numa_distance(distance)) | ||
6797 | sched_numa_warn("Node-0 not representative"); | ||
6798 | } | ||
6799 | if (next_distance != curr_distance) { | ||
6800 | sched_domains_numa_distance[level++] = next_distance; | ||
6801 | sched_domains_numa_levels = level; | ||
6802 | curr_distance = next_distance; | ||
6803 | } else break; | ||
6804 | } | ||
6805 | |||
6806 | /* | ||
6807 | * In case of sched_debug() we verify the above assumption. | ||
6808 | */ | ||
6809 | if (!sched_debug()) | ||
6810 | break; | ||
6811 | } | ||
6812 | |||
6813 | if (!level) | ||
6814 | return; | ||
6815 | |||
6816 | /* | ||
6817 | * 'level' contains the number of unique distances, excluding the | ||
6818 | * identity distance node_distance(i,i). | ||
6819 | * | ||
6820 | * The sched_domains_numa_distance[] array includes the actual distance | ||
6821 | * numbers. | ||
6822 | */ | ||
6823 | |||
6824 | /* | ||
6825 | * Here, we should temporarily reset sched_domains_numa_levels to 0. | ||
6826 | * If it fails to allocate memory for array sched_domains_numa_masks[][], | ||
6827 | * the array will contain less then 'level' members. This could be | ||
6828 | * dangerous when we use it to iterate array sched_domains_numa_masks[][] | ||
6829 | * in other functions. | ||
6830 | * | ||
6831 | * We reset it to 'level' at the end of this function. | ||
6832 | */ | ||
6833 | sched_domains_numa_levels = 0; | ||
6834 | |||
6835 | sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); | ||
6836 | if (!sched_domains_numa_masks) | ||
6837 | return; | ||
6838 | |||
6839 | /* | ||
6840 | * Now for each level, construct a mask per node which contains all | ||
6841 | * CPUs of nodes that are that many hops away from us. | ||
6842 | */ | ||
6843 | for (i = 0; i < level; i++) { | ||
6844 | sched_domains_numa_masks[i] = | ||
6845 | kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); | ||
6846 | if (!sched_domains_numa_masks[i]) | ||
6847 | return; | ||
6848 | |||
6849 | for (j = 0; j < nr_node_ids; j++) { | ||
6850 | struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); | ||
6851 | if (!mask) | ||
6852 | return; | ||
6853 | |||
6854 | sched_domains_numa_masks[i][j] = mask; | ||
6855 | |||
6856 | for_each_node(k) { | ||
6857 | if (node_distance(j, k) > sched_domains_numa_distance[i]) | ||
6858 | continue; | ||
6859 | |||
6860 | cpumask_or(mask, mask, cpumask_of_node(k)); | ||
6861 | } | ||
6862 | } | ||
6863 | } | ||
6864 | |||
6865 | /* Compute default topology size */ | ||
6866 | for (i = 0; sched_domain_topology[i].mask; i++); | ||
6867 | |||
6868 | tl = kzalloc((i + level + 1) * | ||
6869 | sizeof(struct sched_domain_topology_level), GFP_KERNEL); | ||
6870 | if (!tl) | ||
6871 | return; | ||
6872 | |||
6873 | /* | ||
6874 | * Copy the default topology bits.. | ||
6875 | */ | ||
6876 | for (i = 0; sched_domain_topology[i].mask; i++) | ||
6877 | tl[i] = sched_domain_topology[i]; | ||
6878 | |||
6879 | /* | ||
6880 | * .. and append 'j' levels of NUMA goodness. | ||
6881 | */ | ||
6882 | for (j = 0; j < level; i++, j++) { | ||
6883 | tl[i] = (struct sched_domain_topology_level){ | ||
6884 | .mask = sd_numa_mask, | ||
6885 | .sd_flags = cpu_numa_flags, | ||
6886 | .flags = SDTL_OVERLAP, | ||
6887 | .numa_level = j, | ||
6888 | SD_INIT_NAME(NUMA) | ||
6889 | }; | ||
6890 | } | ||
6891 | |||
6892 | sched_domain_topology = tl; | ||
6893 | |||
6894 | sched_domains_numa_levels = level; | ||
6895 | sched_max_numa_distance = sched_domains_numa_distance[level - 1]; | ||
6896 | |||
6897 | init_numa_topology_type(); | ||
6898 | } | ||
6899 | |||
6900 | static void sched_domains_numa_masks_set(unsigned int cpu) | ||
6901 | { | ||
6902 | int node = cpu_to_node(cpu); | ||
6903 | int i, j; | ||
6904 | |||
6905 | for (i = 0; i < sched_domains_numa_levels; i++) { | ||
6906 | for (j = 0; j < nr_node_ids; j++) { | ||
6907 | if (node_distance(j, node) <= sched_domains_numa_distance[i]) | ||
6908 | cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); | ||
6909 | } | ||
6910 | } | ||
6911 | } | ||
6912 | |||
6913 | static void sched_domains_numa_masks_clear(unsigned int cpu) | ||
6914 | { | ||
6915 | int i, j; | ||
6916 | |||
6917 | for (i = 0; i < sched_domains_numa_levels; i++) { | ||
6918 | for (j = 0; j < nr_node_ids; j++) | ||
6919 | cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); | ||
6920 | } | ||
6921 | } | ||
6922 | |||
6923 | #else | ||
6924 | static inline void sched_init_numa(void) { } | ||
6925 | static void sched_domains_numa_masks_set(unsigned int cpu) { } | ||
6926 | static void sched_domains_numa_masks_clear(unsigned int cpu) { } | ||
6927 | #endif /* CONFIG_NUMA */ | ||
6928 | |||
6929 | static int __sdt_alloc(const struct cpumask *cpu_map) | ||
6930 | { | ||
6931 | struct sched_domain_topology_level *tl; | ||
6932 | int j; | ||
6933 | |||
6934 | for_each_sd_topology(tl) { | ||
6935 | struct sd_data *sdd = &tl->data; | ||
6936 | |||
6937 | sdd->sd = alloc_percpu(struct sched_domain *); | ||
6938 | if (!sdd->sd) | ||
6939 | return -ENOMEM; | ||
6940 | |||
6941 | sdd->sds = alloc_percpu(struct sched_domain_shared *); | ||
6942 | if (!sdd->sds) | ||
6943 | return -ENOMEM; | ||
6944 | |||
6945 | sdd->sg = alloc_percpu(struct sched_group *); | ||
6946 | if (!sdd->sg) | ||
6947 | return -ENOMEM; | ||
6948 | |||
6949 | sdd->sgc = alloc_percpu(struct sched_group_capacity *); | ||
6950 | if (!sdd->sgc) | ||
6951 | return -ENOMEM; | ||
6952 | |||
6953 | for_each_cpu(j, cpu_map) { | ||
6954 | struct sched_domain *sd; | ||
6955 | struct sched_domain_shared *sds; | ||
6956 | struct sched_group *sg; | ||
6957 | struct sched_group_capacity *sgc; | ||
6958 | |||
6959 | sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), | ||
6960 | GFP_KERNEL, cpu_to_node(j)); | ||
6961 | if (!sd) | ||
6962 | return -ENOMEM; | ||
6963 | |||
6964 | *per_cpu_ptr(sdd->sd, j) = sd; | ||
6965 | |||
6966 | sds = kzalloc_node(sizeof(struct sched_domain_shared), | ||
6967 | GFP_KERNEL, cpu_to_node(j)); | ||
6968 | if (!sds) | ||
6969 | return -ENOMEM; | ||
6970 | |||
6971 | *per_cpu_ptr(sdd->sds, j) = sds; | ||
6972 | |||
6973 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
6974 | GFP_KERNEL, cpu_to_node(j)); | ||
6975 | if (!sg) | ||
6976 | return -ENOMEM; | ||
6977 | |||
6978 | sg->next = sg; | ||
6979 | |||
6980 | *per_cpu_ptr(sdd->sg, j) = sg; | ||
6981 | |||
6982 | sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(), | ||
6983 | GFP_KERNEL, cpu_to_node(j)); | ||
6984 | if (!sgc) | ||
6985 | return -ENOMEM; | ||
6986 | |||
6987 | *per_cpu_ptr(sdd->sgc, j) = sgc; | ||
6988 | } | ||
6989 | } | ||
6990 | |||
6991 | return 0; | ||
6992 | } | ||
6993 | |||
6994 | static void __sdt_free(const struct cpumask *cpu_map) | ||
6995 | { | ||
6996 | struct sched_domain_topology_level *tl; | ||
6997 | int j; | ||
6998 | |||
6999 | for_each_sd_topology(tl) { | ||
7000 | struct sd_data *sdd = &tl->data; | ||
7001 | |||
7002 | for_each_cpu(j, cpu_map) { | ||
7003 | struct sched_domain *sd; | ||
7004 | |||
7005 | if (sdd->sd) { | ||
7006 | sd = *per_cpu_ptr(sdd->sd, j); | ||
7007 | if (sd && (sd->flags & SD_OVERLAP)) | ||
7008 | free_sched_groups(sd->groups, 0); | ||
7009 | kfree(*per_cpu_ptr(sdd->sd, j)); | ||
7010 | } | ||
7011 | |||
7012 | if (sdd->sds) | ||
7013 | kfree(*per_cpu_ptr(sdd->sds, j)); | ||
7014 | if (sdd->sg) | ||
7015 | kfree(*per_cpu_ptr(sdd->sg, j)); | ||
7016 | if (sdd->sgc) | ||
7017 | kfree(*per_cpu_ptr(sdd->sgc, j)); | ||
7018 | } | ||
7019 | free_percpu(sdd->sd); | ||
7020 | sdd->sd = NULL; | ||
7021 | free_percpu(sdd->sds); | ||
7022 | sdd->sds = NULL; | ||
7023 | free_percpu(sdd->sg); | ||
7024 | sdd->sg = NULL; | ||
7025 | free_percpu(sdd->sgc); | ||
7026 | sdd->sgc = NULL; | ||
7027 | } | ||
7028 | } | ||
7029 | |||
7030 | struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | ||
7031 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
7032 | struct sched_domain *child, int cpu) | ||
7033 | { | ||
7034 | struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu); | ||
7035 | |||
7036 | if (child) { | ||
7037 | sd->level = child->level + 1; | ||
7038 | sched_domain_level_max = max(sched_domain_level_max, sd->level); | ||
7039 | child->parent = sd; | ||
7040 | |||
7041 | if (!cpumask_subset(sched_domain_span(child), | ||
7042 | sched_domain_span(sd))) { | ||
7043 | pr_err("BUG: arch topology borken\n"); | ||
7044 | #ifdef CONFIG_SCHED_DEBUG | ||
7045 | pr_err(" the %s domain not a subset of the %s domain\n", | ||
7046 | child->name, sd->name); | ||
7047 | #endif | ||
7048 | /* Fixup, ensure @sd has at least @child cpus. */ | ||
7049 | cpumask_or(sched_domain_span(sd), | ||
7050 | sched_domain_span(sd), | ||
7051 | sched_domain_span(child)); | ||
7052 | } | ||
7053 | |||
7054 | } | ||
7055 | set_domain_attribute(sd, attr); | ||
7056 | |||
7057 | return sd; | ||
7058 | } | ||
7059 | |||
7060 | /* | ||
7061 | * Build sched domains for a given set of CPUs and attach the sched domains | ||
7062 | * to the individual CPUs | ||
7063 | */ | ||
7064 | static int | ||
7065 | build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) | ||
7066 | { | ||
7067 | enum s_alloc alloc_state; | ||
7068 | struct sched_domain *sd; | ||
7069 | struct s_data d; | ||
7070 | struct rq *rq = NULL; | ||
7071 | int i, ret = -ENOMEM; | ||
7072 | |||
7073 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); | ||
7074 | if (alloc_state != sa_rootdomain) | ||
7075 | goto error; | ||
7076 | |||
7077 | /* Set up domains for CPUs specified by the cpu_map: */ | ||
7078 | for_each_cpu(i, cpu_map) { | ||
7079 | struct sched_domain_topology_level *tl; | ||
7080 | |||
7081 | sd = NULL; | ||
7082 | for_each_sd_topology(tl) { | ||
7083 | sd = build_sched_domain(tl, cpu_map, attr, sd, i); | ||
7084 | if (tl == sched_domain_topology) | ||
7085 | *per_cpu_ptr(d.sd, i) = sd; | ||
7086 | if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) | ||
7087 | sd->flags |= SD_OVERLAP; | ||
7088 | if (cpumask_equal(cpu_map, sched_domain_span(sd))) | ||
7089 | break; | ||
7090 | } | ||
7091 | } | ||
7092 | |||
7093 | /* Build the groups for the domains */ | ||
7094 | for_each_cpu(i, cpu_map) { | ||
7095 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { | ||
7096 | sd->span_weight = cpumask_weight(sched_domain_span(sd)); | ||
7097 | if (sd->flags & SD_OVERLAP) { | ||
7098 | if (build_overlap_sched_groups(sd, i)) | ||
7099 | goto error; | ||
7100 | } else { | ||
7101 | if (build_sched_groups(sd, i)) | ||
7102 | goto error; | ||
7103 | } | ||
7104 | } | ||
7105 | } | ||
7106 | |||
7107 | /* Calculate CPU capacity for physical packages and nodes */ | ||
7108 | for (i = nr_cpumask_bits-1; i >= 0; i--) { | ||
7109 | if (!cpumask_test_cpu(i, cpu_map)) | ||
7110 | continue; | ||
7111 | |||
7112 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { | ||
7113 | claim_allocations(i, sd); | ||
7114 | init_sched_groups_capacity(i, sd); | ||
7115 | } | ||
7116 | } | ||
7117 | |||
7118 | /* Attach the domains */ | ||
7119 | rcu_read_lock(); | ||
7120 | for_each_cpu(i, cpu_map) { | ||
7121 | rq = cpu_rq(i); | ||
7122 | sd = *per_cpu_ptr(d.sd, i); | ||
7123 | |||
7124 | /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ | ||
7125 | if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity)) | ||
7126 | WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig); | ||
7127 | |||
7128 | cpu_attach_domain(sd, d.rd, i); | ||
7129 | } | ||
7130 | rcu_read_unlock(); | ||
7131 | |||
7132 | if (rq && sched_debug_enabled) { | ||
7133 | pr_info("span: %*pbl (max cpu_capacity = %lu)\n", | ||
7134 | cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); | ||
7135 | } | ||
7136 | |||
7137 | ret = 0; | ||
7138 | error: | ||
7139 | __free_domain_allocs(&d, alloc_state, cpu_map); | ||
7140 | return ret; | ||
7141 | } | ||
7142 | |||
7143 | /* Current sched domains: */ | ||
7144 | static cpumask_var_t *doms_cur; | ||
7145 | |||
7146 | /* Number of sched domains in 'doms_cur': */ | ||
7147 | static int ndoms_cur; | ||
7148 | |||
7149 | /* Attribues of custom domains in 'doms_cur' */ | ||
7150 | static struct sched_domain_attr *dattr_cur; | ||
7151 | |||
7152 | /* | ||
7153 | * Special case: If a kmalloc() of a doms_cur partition (array of | ||
7154 | * cpumask) fails, then fallback to a single sched domain, | ||
7155 | * as determined by the single cpumask fallback_doms. | ||
7156 | */ | ||
7157 | static cpumask_var_t fallback_doms; | ||
7158 | |||
7159 | /* | ||
7160 | * arch_update_cpu_topology lets virtualized architectures update the | ||
7161 | * CPU core maps. It is supposed to return 1 if the topology changed | ||
7162 | * or 0 if it stayed the same. | ||
7163 | */ | ||
7164 | int __weak arch_update_cpu_topology(void) | ||
7165 | { | ||
7166 | return 0; | ||
7167 | } | ||
7168 | |||
7169 | cpumask_var_t *alloc_sched_domains(unsigned int ndoms) | ||
7170 | { | ||
7171 | int i; | ||
7172 | cpumask_var_t *doms; | ||
7173 | |||
7174 | doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); | ||
7175 | if (!doms) | ||
7176 | return NULL; | ||
7177 | for (i = 0; i < ndoms; i++) { | ||
7178 | if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { | ||
7179 | free_sched_domains(doms, i); | ||
7180 | return NULL; | ||
7181 | } | ||
7182 | } | ||
7183 | return doms; | ||
7184 | } | ||
7185 | |||
7186 | void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) | ||
7187 | { | ||
7188 | unsigned int i; | ||
7189 | for (i = 0; i < ndoms; i++) | ||
7190 | free_cpumask_var(doms[i]); | ||
7191 | kfree(doms); | ||
7192 | } | ||
7193 | |||
7194 | /* | ||
7195 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | ||
7196 | * For now this just excludes isolated CPUs, but could be used to | ||
7197 | * exclude other special cases in the future. | ||
7198 | */ | ||
7199 | static int init_sched_domains(const struct cpumask *cpu_map) | ||
7200 | { | ||
7201 | int err; | ||
7202 | |||
7203 | arch_update_cpu_topology(); | ||
7204 | ndoms_cur = 1; | ||
7205 | doms_cur = alloc_sched_domains(ndoms_cur); | ||
7206 | if (!doms_cur) | ||
7207 | doms_cur = &fallback_doms; | ||
7208 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); | ||
7209 | err = build_sched_domains(doms_cur[0], NULL); | ||
7210 | register_sched_domain_sysctl(); | ||
7211 | |||
7212 | return err; | ||
7213 | } | ||
7214 | |||
7215 | /* | ||
7216 | * Detach sched domains from a group of CPUs specified in cpu_map | ||
7217 | * These CPUs will now be attached to the NULL domain | ||
7218 | */ | ||
7219 | static void detach_destroy_domains(const struct cpumask *cpu_map) | ||
7220 | { | ||
7221 | int i; | ||
7222 | |||
7223 | rcu_read_lock(); | ||
7224 | for_each_cpu(i, cpu_map) | ||
7225 | cpu_attach_domain(NULL, &def_root_domain, i); | ||
7226 | rcu_read_unlock(); | ||
7227 | } | ||
7228 | |||
7229 | /* handle null as "default" */ | ||
7230 | static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, | ||
7231 | struct sched_domain_attr *new, int idx_new) | ||
7232 | { | ||
7233 | struct sched_domain_attr tmp; | ||
7234 | |||
7235 | /* Fast path: */ | ||
7236 | if (!new && !cur) | ||
7237 | return 1; | ||
7238 | |||
7239 | tmp = SD_ATTR_INIT; | ||
7240 | return !memcmp(cur ? (cur + idx_cur) : &tmp, | ||
7241 | new ? (new + idx_new) : &tmp, | ||
7242 | sizeof(struct sched_domain_attr)); | ||
7243 | } | ||
7244 | |||
7245 | /* | ||
7246 | * Partition sched domains as specified by the 'ndoms_new' | ||
7247 | * cpumasks in the array doms_new[] of cpumasks. This compares | ||
7248 | * doms_new[] to the current sched domain partitioning, doms_cur[]. | ||
7249 | * It destroys each deleted domain and builds each new domain. | ||
7250 | * | ||
7251 | * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. | ||
7252 | * The masks don't intersect (don't overlap). We should set up one | ||
7253 | * sched domain for each mask. CPUs not in any of the cpumasks will | ||
7254 | * not be load balanced. If the same cpumask appears both in the | ||
7255 | * current 'doms_cur' domains and in the new 'doms_new', we can leave | ||
7256 | * it as it is. | ||
7257 | * | ||
7258 | * The passed-in 'doms_new' should be allocated using | ||
7259 | * alloc_sched_domains. This routine takes ownership of it and will | ||
7260 | * free_sched_domains it when done with it. If the caller failed the | ||
7261 | * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, | ||
7262 | * and partition_sched_domains() will fall back to the single partition | ||
7263 | * 'fallback_doms'; this also forces the domains to be rebuilt. | ||
7264 | * | ||
7265 | * If doms_new == NULL it will be replaced with cpu_active_mask (minus isolated CPUs). | ||
7266 | * ndoms_new == 0 is a special case for destroying existing domains, | ||
7267 | * and it will not create the default domain. | ||
7268 | * | ||
7269 | * Call with hotplug lock held | ||
7270 | */ | ||
7271 | void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], | ||
7272 | struct sched_domain_attr *dattr_new) | ||
7273 | { | ||
7274 | int i, j, n; | ||
7275 | int new_topology; | ||
7276 | |||
7277 | mutex_lock(&sched_domains_mutex); | ||
7278 | |||
7279 | /* Always unregister in case we don't destroy any domains: */ | ||
7280 | unregister_sched_domain_sysctl(); | ||
7281 | |||
7282 | /* Let the architecture update CPU core mappings: */ | ||
7283 | new_topology = arch_update_cpu_topology(); | ||
7284 | |||
7285 | n = doms_new ? ndoms_new : 0; | ||
7286 | |||
7287 | /* Destroy deleted domains: */ | ||
7288 | for (i = 0; i < ndoms_cur; i++) { | ||
7289 | for (j = 0; j < n && !new_topology; j++) { | ||
7290 | if (cpumask_equal(doms_cur[i], doms_new[j]) | ||
7291 | && dattrs_equal(dattr_cur, i, dattr_new, j)) | ||
7292 | goto match1; | ||
7293 | } | ||
7294 | /* No match - a current sched domain not in new doms_new[] */ | ||
7295 | detach_destroy_domains(doms_cur[i]); | ||
7296 | match1: | ||
7297 | ; | ||
7298 | } | ||
7299 | |||
7300 | n = ndoms_cur; | ||
7301 | if (doms_new == NULL) { | ||
7302 | n = 0; | ||
7303 | doms_new = &fallback_doms; | ||
7304 | cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); | ||
7305 | WARN_ON_ONCE(dattr_new); | ||
7306 | } | ||
7307 | |||
7308 | /* Build new domains: */ | ||
7309 | for (i = 0; i < ndoms_new; i++) { | ||
7310 | for (j = 0; j < n && !new_topology; j++) { | ||
7311 | if (cpumask_equal(doms_new[i], doms_cur[j]) | ||
7312 | && dattrs_equal(dattr_new, i, dattr_cur, j)) | ||
7313 | goto match2; | ||
7314 | } | ||
7315 | /* No match - add a new doms_new */ | ||
7316 | build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); | ||
7317 | match2: | ||
7318 | ; | ||
7319 | } | ||
7320 | |||
7321 | /* Remember the new sched domains: */ | ||
7322 | if (doms_cur != &fallback_doms) | ||
7323 | free_sched_domains(doms_cur, ndoms_cur); | ||
7324 | |||
7325 | kfree(dattr_cur); | ||
7326 | doms_cur = doms_new; | ||
7327 | dattr_cur = dattr_new; | ||
7328 | ndoms_cur = ndoms_new; | ||
7329 | |||
7330 | register_sched_domain_sysctl(); | ||
7331 | |||
7332 | mutex_unlock(&sched_domains_mutex); | ||
7333 | } | ||
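A hedged caller sketch of the ownership contract spelled out in the comment above; the node-based two-way split is purely illustrative, and real callers (the cpuset code, for instance) derive the partitions from their own configuration:

	/* Caller holds the hotplug lock, as required above. */
	cpumask_var_t *doms = alloc_sched_domains(2);

	if (doms) {
		cpumask_copy(doms[0], cpumask_of_node(0));	/* illustrative partition #1 */
		cpumask_copy(doms[1], cpumask_of_node(1));	/* illustrative partition #2 */
		partition_sched_domains(2, doms, NULL);		/* takes ownership of 'doms' */
	} else {
		/* Failed alloc: ask for the single 'fallback_doms' partition. */
		partition_sched_domains(1, NULL, NULL);
	}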
7334 | |||
7335 | /* | 5682 | /* |
7336 | * used to mark begin/end of suspend/resume: | 5683 | * used to mark begin/end of suspend/resume: |
7337 | */ | 5684 | */ |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 8ff5cc539e8a..17ed94b9b413 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -223,7 +223,7 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) | |||
223 | dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; | 223 | dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; |
224 | } | 224 | } |
225 | 225 | ||
226 | extern struct mutex sched_domains_mutex; | 226 | extern void init_dl_bw(struct dl_bw *dl_b); |
227 | 227 | ||
228 | #ifdef CONFIG_CGROUP_SCHED | 228 | #ifdef CONFIG_CGROUP_SCHED |
229 | 229 | ||
@@ -584,6 +584,13 @@ struct root_domain { | |||
584 | }; | 584 | }; |
585 | 585 | ||
586 | extern struct root_domain def_root_domain; | 586 | extern struct root_domain def_root_domain; |
587 | extern struct mutex sched_domains_mutex; | ||
588 | extern cpumask_var_t fallback_doms; | ||
589 | extern cpumask_var_t sched_domains_tmpmask; | ||
590 | |||
591 | extern void init_defrootdomain(void); | ||
592 | extern int init_sched_domains(const struct cpumask *cpu_map); | ||
593 | extern void rq_attach_root(struct rq *rq, struct root_domain *rd); | ||
587 | 594 | ||
588 | #endif /* CONFIG_SMP */ | 595 | #endif /* CONFIG_SMP */ |
589 | 596 | ||
@@ -886,6 +893,16 @@ extern int sched_max_numa_distance; | |||
886 | extern bool find_numa_distance(int distance); | 893 | extern bool find_numa_distance(int distance); |
887 | #endif | 894 | #endif |
888 | 895 | ||
896 | #ifdef CONFIG_NUMA | ||
897 | extern void sched_init_numa(void); | ||
898 | extern void sched_domains_numa_masks_set(unsigned int cpu); | ||
899 | extern void sched_domains_numa_masks_clear(unsigned int cpu); | ||
900 | #else | ||
901 | static inline void sched_init_numa(void) { } | ||
902 | static inline void sched_domains_numa_masks_set(unsigned int cpu) { } | ||
903 | static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } | ||
904 | #endif | ||
905 | |||
889 | #ifdef CONFIG_NUMA_BALANCING | 906 | #ifdef CONFIG_NUMA_BALANCING |
890 | /* The regions in numa_faults array from task_struct */ | 907 | /* The regions in numa_faults array from task_struct */ |
891 | enum numa_faults_stats { | 908 | enum numa_faults_stats { |
@@ -1752,6 +1769,10 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) | |||
1752 | __release(rq2->lock); | 1769 | __release(rq2->lock); |
1753 | } | 1770 | } |
1754 | 1771 | ||
1772 | extern void set_rq_online (struct rq *rq); | ||
1773 | extern void set_rq_offline(struct rq *rq); | ||
1774 | extern bool sched_smp_initialized; | ||
1775 | |||
1755 | #else /* CONFIG_SMP */ | 1776 | #else /* CONFIG_SMP */ |
1756 | 1777 | ||
1757 | /* | 1778 | /* |
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c new file mode 100644 index 000000000000..1b0b4fb12837 --- /dev/null +++ b/kernel/sched/topology.c | |||
@@ -0,0 +1,1658 @@ | |||
1 | /* | ||
2 | * Scheduler topology setup/handling methods | ||
3 | */ | ||
4 | #include <linux/sched.h> | ||
5 | #include <linux/mutex.h> | ||
6 | |||
7 | #include "sched.h" | ||
8 | |||
9 | DEFINE_MUTEX(sched_domains_mutex); | ||
10 | |||
11 | /* Protected by sched_domains_mutex: */ | ||
12 | cpumask_var_t sched_domains_tmpmask; | ||
13 | |||
14 | #ifdef CONFIG_SCHED_DEBUG | ||
15 | |||
16 | static __read_mostly int sched_debug_enabled; | ||
17 | |||
18 | static int __init sched_debug_setup(char *str) | ||
19 | { | ||
20 | sched_debug_enabled = 1; | ||
21 | |||
22 | return 0; | ||
23 | } | ||
24 | early_param("sched_debug", sched_debug_setup); | ||
25 | |||
26 | static inline bool sched_debug(void) | ||
27 | { | ||
28 | return sched_debug_enabled; | ||
29 | } | ||
30 | |||
31 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | ||
32 | struct cpumask *groupmask) | ||
33 | { | ||
34 | struct sched_group *group = sd->groups; | ||
35 | |||
36 | cpumask_clear(groupmask); | ||
37 | |||
38 | printk(KERN_DEBUG "%*s domain %d: ", level, "", level); | ||
39 | |||
40 | if (!(sd->flags & SD_LOAD_BALANCE)) { | ||
41 | printk("does not load-balance\n"); | ||
42 | if (sd->parent) | ||
43 | printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" | ||
44 | " has parent"); | ||
45 | return -1; | ||
46 | } | ||
47 | |||
48 | printk(KERN_CONT "span %*pbl level %s\n", | ||
49 | cpumask_pr_args(sched_domain_span(sd)), sd->name); | ||
50 | |||
51 | if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { | ||
52 | printk(KERN_ERR "ERROR: domain->span does not contain " | ||
53 | "CPU%d\n", cpu); | ||
54 | } | ||
55 | if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { | ||
56 | printk(KERN_ERR "ERROR: domain->groups does not contain" | ||
57 | " CPU%d\n", cpu); | ||
58 | } | ||
59 | |||
60 | printk(KERN_DEBUG "%*s groups:", level + 1, ""); | ||
61 | do { | ||
62 | if (!group) { | ||
63 | printk("\n"); | ||
64 | printk(KERN_ERR "ERROR: group is NULL\n"); | ||
65 | break; | ||
66 | } | ||
67 | |||
68 | if (!cpumask_weight(sched_group_cpus(group))) { | ||
69 | printk(KERN_CONT "\n"); | ||
70 | printk(KERN_ERR "ERROR: empty group\n"); | ||
71 | break; | ||
72 | } | ||
73 | |||
74 | if (!(sd->flags & SD_OVERLAP) && | ||
75 | cpumask_intersects(groupmask, sched_group_cpus(group))) { | ||
76 | printk(KERN_CONT "\n"); | ||
77 | printk(KERN_ERR "ERROR: repeated CPUs\n"); | ||
78 | break; | ||
79 | } | ||
80 | |||
81 | cpumask_or(groupmask, groupmask, sched_group_cpus(group)); | ||
82 | |||
83 | printk(KERN_CONT " %*pbl", | ||
84 | cpumask_pr_args(sched_group_cpus(group))); | ||
85 | if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { | ||
86 | printk(KERN_CONT " (cpu_capacity = %lu)", | ||
87 | group->sgc->capacity); | ||
88 | } | ||
89 | |||
90 | group = group->next; | ||
91 | } while (group != sd->groups); | ||
92 | printk(KERN_CONT "\n"); | ||
93 | |||
94 | if (!cpumask_equal(sched_domain_span(sd), groupmask)) | ||
95 | printk(KERN_ERR "ERROR: groups don't span domain->span\n"); | ||
96 | |||
97 | if (sd->parent && | ||
98 | !cpumask_subset(groupmask, sched_domain_span(sd->parent))) | ||
99 | printk(KERN_ERR "ERROR: parent span is not a superset " | ||
100 | "of domain->span\n"); | ||
101 | return 0; | ||
102 | } | ||
103 | |||
104 | static void sched_domain_debug(struct sched_domain *sd, int cpu) | ||
105 | { | ||
106 | int level = 0; | ||
107 | |||
108 | if (!sched_debug_enabled) | ||
109 | return; | ||
110 | |||
111 | if (!sd) { | ||
112 | printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); | ||
113 | return; | ||
114 | } | ||
115 | |||
116 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); | ||
117 | |||
118 | for (;;) { | ||
119 | if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) | ||
120 | break; | ||
121 | level++; | ||
122 | sd = sd->parent; | ||
123 | if (!sd) | ||
124 | break; | ||
125 | } | ||
126 | } | ||
127 | #else /* !CONFIG_SCHED_DEBUG */ | ||
128 | |||
129 | # define sched_debug_enabled 0 | ||
130 | # define sched_domain_debug(sd, cpu) do { } while (0) | ||
131 | static inline bool sched_debug(void) | ||
132 | { | ||
133 | return false; | ||
134 | } | ||
135 | #endif /* CONFIG_SCHED_DEBUG */ | ||
136 | |||
137 | static int sd_degenerate(struct sched_domain *sd) | ||
138 | { | ||
139 | if (cpumask_weight(sched_domain_span(sd)) == 1) | ||
140 | return 1; | ||
141 | |||
142 | /* Following flags need at least 2 groups */ | ||
143 | if (sd->flags & (SD_LOAD_BALANCE | | ||
144 | SD_BALANCE_NEWIDLE | | ||
145 | SD_BALANCE_FORK | | ||
146 | SD_BALANCE_EXEC | | ||
147 | SD_SHARE_CPUCAPACITY | | ||
148 | SD_ASYM_CPUCAPACITY | | ||
149 | SD_SHARE_PKG_RESOURCES | | ||
150 | SD_SHARE_POWERDOMAIN)) { | ||
151 | if (sd->groups != sd->groups->next) | ||
152 | return 0; | ||
153 | } | ||
154 | |||
155 | /* Following flags don't use groups */ | ||
156 | if (sd->flags & (SD_WAKE_AFFINE)) | ||
157 | return 0; | ||
158 | |||
159 | return 1; | ||
160 | } | ||
161 | |||
162 | static int | ||
163 | sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | ||
164 | { | ||
165 | unsigned long cflags = sd->flags, pflags = parent->flags; | ||
166 | |||
167 | if (sd_degenerate(parent)) | ||
168 | return 1; | ||
169 | |||
170 | if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) | ||
171 | return 0; | ||
172 | |||
173 | /* Flags needing groups don't count if only 1 group in parent */ | ||
174 | if (parent->groups == parent->groups->next) { | ||
175 | pflags &= ~(SD_LOAD_BALANCE | | ||
176 | SD_BALANCE_NEWIDLE | | ||
177 | SD_BALANCE_FORK | | ||
178 | SD_BALANCE_EXEC | | ||
179 | SD_ASYM_CPUCAPACITY | | ||
180 | SD_SHARE_CPUCAPACITY | | ||
181 | SD_SHARE_PKG_RESOURCES | | ||
182 | SD_PREFER_SIBLING | | ||
183 | SD_SHARE_POWERDOMAIN); | ||
184 | if (nr_node_ids == 1) | ||
185 | pflags &= ~SD_SERIALIZE; | ||
186 | } | ||
187 | if (~cflags & pflags) | ||
188 | return 0; | ||
189 | |||
190 | return 1; | ||
191 | } | ||
192 | |||
193 | static void free_rootdomain(struct rcu_head *rcu) | ||
194 | { | ||
195 | struct root_domain *rd = container_of(rcu, struct root_domain, rcu); | ||
196 | |||
197 | cpupri_cleanup(&rd->cpupri); | ||
198 | cpudl_cleanup(&rd->cpudl); | ||
199 | free_cpumask_var(rd->dlo_mask); | ||
200 | free_cpumask_var(rd->rto_mask); | ||
201 | free_cpumask_var(rd->online); | ||
202 | free_cpumask_var(rd->span); | ||
203 | kfree(rd); | ||
204 | } | ||
205 | |||
206 | void rq_attach_root(struct rq *rq, struct root_domain *rd) | ||
207 | { | ||
208 | struct root_domain *old_rd = NULL; | ||
209 | unsigned long flags; | ||
210 | |||
211 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
212 | |||
213 | if (rq->rd) { | ||
214 | old_rd = rq->rd; | ||
215 | |||
216 | if (cpumask_test_cpu(rq->cpu, old_rd->online)) | ||
217 | set_rq_offline(rq); | ||
218 | |||
219 | cpumask_clear_cpu(rq->cpu, old_rd->span); | ||
220 | |||
221 | /* | ||
222 | * If we don't want to free the old_rd yet then | ||
223 | * set old_rd to NULL to skip the freeing later | ||
224 | * in this function: | ||
225 | */ | ||
226 | if (!atomic_dec_and_test(&old_rd->refcount)) | ||
227 | old_rd = NULL; | ||
228 | } | ||
229 | |||
230 | atomic_inc(&rd->refcount); | ||
231 | rq->rd = rd; | ||
232 | |||
233 | cpumask_set_cpu(rq->cpu, rd->span); | ||
234 | if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) | ||
235 | set_rq_online(rq); | ||
236 | |||
237 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
238 | |||
239 | if (old_rd) | ||
240 | call_rcu_sched(&old_rd->rcu, free_rootdomain); | ||
241 | } | ||
242 | |||
243 | static int init_rootdomain(struct root_domain *rd) | ||
244 | { | ||
245 | memset(rd, 0, sizeof(*rd)); | ||
246 | |||
247 | if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) | ||
248 | goto out; | ||
249 | if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL)) | ||
250 | goto free_span; | ||
251 | if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) | ||
252 | goto free_online; | ||
253 | if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) | ||
254 | goto free_dlo_mask; | ||
255 | |||
256 | init_dl_bw(&rd->dl_bw); | ||
257 | if (cpudl_init(&rd->cpudl) != 0) | ||
258 | goto free_rto_mask; | ||
259 | |||
260 | if (cpupri_init(&rd->cpupri) != 0) | ||
261 | goto free_cpudl; | ||
262 | return 0; | ||
263 | |||
264 | free_cpudl: | ||
265 | cpudl_cleanup(&rd->cpudl); | ||
266 | free_rto_mask: | ||
267 | free_cpumask_var(rd->rto_mask); | ||
268 | free_dlo_mask: | ||
269 | free_cpumask_var(rd->dlo_mask); | ||
270 | free_online: | ||
271 | free_cpumask_var(rd->online); | ||
272 | free_span: | ||
273 | free_cpumask_var(rd->span); | ||
274 | out: | ||
275 | return -ENOMEM; | ||
276 | } | ||
277 | |||
278 | /* | ||
279 | * By default the system creates a single root-domain with all CPUs as | ||
280 | * members (mimicking the global state we have today). | ||
281 | */ | ||
282 | struct root_domain def_root_domain; | ||
283 | |||
284 | void init_defrootdomain(void) | ||
285 | { | ||
286 | init_rootdomain(&def_root_domain); | ||
287 | |||
288 | atomic_set(&def_root_domain.refcount, 1); | ||
289 | } | ||
290 | |||
291 | static struct root_domain *alloc_rootdomain(void) | ||
292 | { | ||
293 | struct root_domain *rd; | ||
294 | |||
295 | rd = kmalloc(sizeof(*rd), GFP_KERNEL); | ||
296 | if (!rd) | ||
297 | return NULL; | ||
298 | |||
299 | if (init_rootdomain(rd) != 0) { | ||
300 | kfree(rd); | ||
301 | return NULL; | ||
302 | } | ||
303 | |||
304 | return rd; | ||
305 | } | ||
306 | |||
307 | static void free_sched_groups(struct sched_group *sg, int free_sgc) | ||
308 | { | ||
309 | struct sched_group *tmp, *first; | ||
310 | |||
311 | if (!sg) | ||
312 | return; | ||
313 | |||
314 | first = sg; | ||
315 | do { | ||
316 | tmp = sg->next; | ||
317 | |||
318 | if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) | ||
319 | kfree(sg->sgc); | ||
320 | |||
321 | kfree(sg); | ||
322 | sg = tmp; | ||
323 | } while (sg != first); | ||
324 | } | ||
325 | |||
326 | static void destroy_sched_domain(struct sched_domain *sd) | ||
327 | { | ||
328 | /* | ||
329 | * If it's an overlapping domain it has private groups; iterate and | ||
330 | * nuke them all. | ||
331 | */ | ||
332 | if (sd->flags & SD_OVERLAP) { | ||
333 | free_sched_groups(sd->groups, 1); | ||
334 | } else if (atomic_dec_and_test(&sd->groups->ref)) { | ||
335 | kfree(sd->groups->sgc); | ||
336 | kfree(sd->groups); | ||
337 | } | ||
338 | if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) | ||
339 | kfree(sd->shared); | ||
340 | kfree(sd); | ||
341 | } | ||
342 | |||
343 | static void destroy_sched_domains_rcu(struct rcu_head *rcu) | ||
344 | { | ||
345 | struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); | ||
346 | |||
347 | while (sd) { | ||
348 | struct sched_domain *parent = sd->parent; | ||
349 | destroy_sched_domain(sd); | ||
350 | sd = parent; | ||
351 | } | ||
352 | } | ||
353 | |||
354 | static void destroy_sched_domains(struct sched_domain *sd) | ||
355 | { | ||
356 | if (sd) | ||
357 | call_rcu(&sd->rcu, destroy_sched_domains_rcu); | ||
358 | } | ||
359 | |||
360 | /* | ||
361 | * Keep a special pointer to the highest sched_domain that has | ||
362 | * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this | ||
363 | * allows us to avoid some pointer chasing in select_idle_sibling(). | ||
364 | * | ||
365 | * Also keep a unique ID per domain (we use the first CPU number in | ||
366 | * the cpumask of the domain), this allows us to quickly tell if | ||
367 | * two CPUs are in the same cache domain, see cpus_share_cache(). | ||
368 | */ | ||
369 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); | ||
370 | DEFINE_PER_CPU(int, sd_llc_size); | ||
371 | DEFINE_PER_CPU(int, sd_llc_id); | ||
372 | DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); | ||
373 | DEFINE_PER_CPU(struct sched_domain *, sd_numa); | ||
374 | DEFINE_PER_CPU(struct sched_domain *, sd_asym); | ||
375 | |||
376 | static void update_top_cache_domain(int cpu) | ||
377 | { | ||
378 | struct sched_domain_shared *sds = NULL; | ||
379 | struct sched_domain *sd; | ||
380 | int id = cpu; | ||
381 | int size = 1; | ||
382 | |||
383 | sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); | ||
384 | if (sd) { | ||
385 | id = cpumask_first(sched_domain_span(sd)); | ||
386 | size = cpumask_weight(sched_domain_span(sd)); | ||
387 | sds = sd->shared; | ||
388 | } | ||
389 | |||
390 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); | ||
391 | per_cpu(sd_llc_size, cpu) = size; | ||
392 | per_cpu(sd_llc_id, cpu) = id; | ||
393 | rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); | ||
394 | |||
395 | sd = lowest_flag_domain(cpu, SD_NUMA); | ||
396 | rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); | ||
397 | |||
398 | sd = highest_flag_domain(cpu, SD_ASYM_PACKING); | ||
399 | rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); | ||
400 | } | ||
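The comment above this block mentions cpus_share_cache(); the per-CPU sd_llc_id updated here is what turns that query into a plain comparison. A sketch of that check (my_cpus_share_cache() is a stand-in name; the real helper lives in core.c):

	static bool my_cpus_share_cache(int this_cpu, int that_cpu)
	{
		/* Same LLC id => both CPUs sit under the same last-level-cache domain. */
		return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
	}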
401 | |||
402 | /* | ||
403 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | ||
404 | * hold the hotplug lock. | ||
405 | */ | ||
406 | static void | ||
407 | cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | ||
408 | { | ||
409 | struct rq *rq = cpu_rq(cpu); | ||
410 | struct sched_domain *tmp; | ||
411 | |||
412 | /* Remove the sched domains which do not contribute to scheduling. */ | ||
413 | for (tmp = sd; tmp; ) { | ||
414 | struct sched_domain *parent = tmp->parent; | ||
415 | if (!parent) | ||
416 | break; | ||
417 | |||
418 | if (sd_parent_degenerate(tmp, parent)) { | ||
419 | tmp->parent = parent->parent; | ||
420 | if (parent->parent) | ||
421 | parent->parent->child = tmp; | ||
422 | /* | ||
423 | * Transfer SD_PREFER_SIBLING down in case of a | ||
424 | * degenerate parent; the spans match for this | ||
425 | * so the property transfers. | ||
426 | */ | ||
427 | if (parent->flags & SD_PREFER_SIBLING) | ||
428 | tmp->flags |= SD_PREFER_SIBLING; | ||
429 | destroy_sched_domain(parent); | ||
430 | } else | ||
431 | tmp = tmp->parent; | ||
432 | } | ||
433 | |||
434 | if (sd && sd_degenerate(sd)) { | ||
435 | tmp = sd; | ||
436 | sd = sd->parent; | ||
437 | destroy_sched_domain(tmp); | ||
438 | if (sd) | ||
439 | sd->child = NULL; | ||
440 | } | ||
441 | |||
442 | sched_domain_debug(sd, cpu); | ||
443 | |||
444 | rq_attach_root(rq, rd); | ||
445 | tmp = rq->sd; | ||
446 | rcu_assign_pointer(rq->sd, sd); | ||
447 | destroy_sched_domains(tmp); | ||
448 | |||
449 | update_top_cache_domain(cpu); | ||
450 | } | ||
451 | |||
452 | /* Set up the mask of CPUs configured for isolated domains */ | ||
453 | static int __init isolated_cpu_setup(char *str) | ||
454 | { | ||
455 | int ret; | ||
456 | |||
457 | alloc_bootmem_cpumask_var(&cpu_isolated_map); | ||
458 | ret = cpulist_parse(str, cpu_isolated_map); | ||
459 | if (ret) { | ||
460 | pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); | ||
461 | return 0; | ||
462 | } | ||
463 | return 1; | ||
464 | } | ||
465 | __setup("isolcpus=", isolated_cpu_setup); | ||
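The parameter is parsed with cpulist_parse(), so it accepts ranges and comma-separated lists; a boot command line entry such as the following (the CPU numbers are only an example) keeps those CPUs out of the domains built below:

	isolcpus=2-5,8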
466 | |||
467 | struct s_data { | ||
468 | struct sched_domain ** __percpu sd; | ||
469 | struct root_domain *rd; | ||
470 | }; | ||
471 | |||
472 | enum s_alloc { | ||
473 | sa_rootdomain, | ||
474 | sa_sd, | ||
475 | sa_sd_storage, | ||
476 | sa_none, | ||
477 | }; | ||
478 | |||
479 | /* | ||
480 | * Build an iteration mask that can exclude certain CPUs from the upwards | ||
481 | * domain traversal. | ||
482 | * | ||
483 | * Asymmetric node setups can result in situations where the domain tree is of | ||
484 | * unequal depth; make sure to skip domains that already cover the entire | ||
485 | * range. | ||
486 | * | ||
487 | * In that case build_sched_domains() will have terminated the iteration early | ||
488 | * and our sibling sd spans will be empty. Domains should always include the | ||
489 | * CPU they're built on, so check that. | ||
490 | */ | ||
491 | static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) | ||
492 | { | ||
493 | const struct cpumask *span = sched_domain_span(sd); | ||
494 | struct sd_data *sdd = sd->private; | ||
495 | struct sched_domain *sibling; | ||
496 | int i; | ||
497 | |||
498 | for_each_cpu(i, span) { | ||
499 | sibling = *per_cpu_ptr(sdd->sd, i); | ||
500 | if (!cpumask_test_cpu(i, sched_domain_span(sibling))) | ||
501 | continue; | ||
502 | |||
503 | cpumask_set_cpu(i, sched_group_mask(sg)); | ||
504 | } | ||
505 | } | ||
506 | |||
507 | /* | ||
508 | * Return the canonical balance CPU for this group, this is the first CPU | ||
509 | * of this group that's also in the iteration mask. | ||
510 | */ | ||
511 | int group_balance_cpu(struct sched_group *sg) | ||
512 | { | ||
513 | return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); | ||
514 | } | ||
515 | |||
516 | static int | ||
517 | build_overlap_sched_groups(struct sched_domain *sd, int cpu) | ||
518 | { | ||
519 | struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; | ||
520 | const struct cpumask *span = sched_domain_span(sd); | ||
521 | struct cpumask *covered = sched_domains_tmpmask; | ||
522 | struct sd_data *sdd = sd->private; | ||
523 | struct sched_domain *sibling; | ||
524 | int i; | ||
525 | |||
526 | cpumask_clear(covered); | ||
527 | |||
528 | for_each_cpu(i, span) { | ||
529 | struct cpumask *sg_span; | ||
530 | |||
531 | if (cpumask_test_cpu(i, covered)) | ||
532 | continue; | ||
533 | |||
534 | sibling = *per_cpu_ptr(sdd->sd, i); | ||
535 | |||
536 | /* See the comment near build_group_mask(). */ | ||
537 | if (!cpumask_test_cpu(i, sched_domain_span(sibling))) | ||
538 | continue; | ||
539 | |||
540 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
541 | GFP_KERNEL, cpu_to_node(cpu)); | ||
542 | |||
543 | if (!sg) | ||
544 | goto fail; | ||
545 | |||
546 | sg_span = sched_group_cpus(sg); | ||
547 | if (sibling->child) | ||
548 | cpumask_copy(sg_span, sched_domain_span(sibling->child)); | ||
549 | else | ||
550 | cpumask_set_cpu(i, sg_span); | ||
551 | |||
552 | cpumask_or(covered, covered, sg_span); | ||
553 | |||
554 | sg->sgc = *per_cpu_ptr(sdd->sgc, i); | ||
555 | if (atomic_inc_return(&sg->sgc->ref) == 1) | ||
556 | build_group_mask(sd, sg); | ||
557 | |||
558 | /* | ||
559 | * Initialize sgc->capacity such that even if we mess up the | ||
560 | * domains and no possible iteration will get us here, we won't | ||
561 | * die on a /0 trap. | ||
562 | */ | ||
563 | sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); | ||
564 | sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; | ||
565 | |||
566 | /* | ||
567 | * Make sure the first group of this domain contains the | ||
568 | * canonical balance CPU. Otherwise the sched_domain iteration | ||
569 | * breaks. See update_sg_lb_stats(). | ||
570 | */ | ||
571 | if ((!groups && cpumask_test_cpu(cpu, sg_span)) || | ||
572 | group_balance_cpu(sg) == cpu) | ||
573 | groups = sg; | ||
574 | |||
575 | if (!first) | ||
576 | first = sg; | ||
577 | if (last) | ||
578 | last->next = sg; | ||
579 | last = sg; | ||
580 | last->next = first; | ||
581 | } | ||
582 | sd->groups = groups; | ||
583 | |||
584 | return 0; | ||
585 | |||
586 | fail: | ||
587 | free_sched_groups(first, 0); | ||
588 | |||
589 | return -ENOMEM; | ||
590 | } | ||
591 | |||
592 | static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) | ||
593 | { | ||
594 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); | ||
595 | struct sched_domain *child = sd->child; | ||
596 | |||
597 | if (child) | ||
598 | cpu = cpumask_first(sched_domain_span(child)); | ||
599 | |||
600 | if (sg) { | ||
601 | *sg = *per_cpu_ptr(sdd->sg, cpu); | ||
602 | (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); | ||
603 | |||
604 | /* For claim_allocations: */ | ||
605 | atomic_set(&(*sg)->sgc->ref, 1); | ||
606 | } | ||
607 | |||
608 | return cpu; | ||
609 | } | ||
610 | |||
611 | /* | ||
612 | * build_sched_groups will build a circular linked list of the groups | ||
613 | * covered by the given span, and will set each group's ->cpumask correctly, | ||
614 | * and ->cpu_capacity to 0. | ||
615 | * | ||
616 | * Assumes the sched_domain tree is fully constructed | ||
617 | */ | ||
618 | static int | ||
619 | build_sched_groups(struct sched_domain *sd, int cpu) | ||
620 | { | ||
621 | struct sched_group *first = NULL, *last = NULL; | ||
622 | struct sd_data *sdd = sd->private; | ||
623 | const struct cpumask *span = sched_domain_span(sd); | ||
624 | struct cpumask *covered; | ||
625 | int i; | ||
626 | |||
627 | get_group(cpu, sdd, &sd->groups); | ||
628 | atomic_inc(&sd->groups->ref); | ||
629 | |||
630 | if (cpu != cpumask_first(span)) | ||
631 | return 0; | ||
632 | |||
633 | lockdep_assert_held(&sched_domains_mutex); | ||
634 | covered = sched_domains_tmpmask; | ||
635 | |||
636 | cpumask_clear(covered); | ||
637 | |||
638 | for_each_cpu(i, span) { | ||
639 | struct sched_group *sg; | ||
640 | int group, j; | ||
641 | |||
642 | if (cpumask_test_cpu(i, covered)) | ||
643 | continue; | ||
644 | |||
645 | group = get_group(i, sdd, &sg); | ||
646 | cpumask_setall(sched_group_mask(sg)); | ||
647 | |||
648 | for_each_cpu(j, span) { | ||
649 | if (get_group(j, sdd, NULL) != group) | ||
650 | continue; | ||
651 | |||
652 | cpumask_set_cpu(j, covered); | ||
653 | cpumask_set_cpu(j, sched_group_cpus(sg)); | ||
654 | } | ||
655 | |||
656 | if (!first) | ||
657 | first = sg; | ||
658 | if (last) | ||
659 | last->next = sg; | ||
660 | last = sg; | ||
661 | } | ||
662 | last->next = first; | ||
663 | |||
664 | return 0; | ||
665 | } | ||
666 | |||
667 | /* | ||
668 | * Initialize sched groups cpu_capacity. | ||
669 | * | ||
670 | * cpu_capacity indicates the capacity of a sched group, which is used while | ||
671 | * distributing the load between different sched groups in a sched domain. | ||
672 | * Typically, cpu_capacity for all the groups in a sched domain will be the same | ||
673 | * unless there are asymmetries in the topology. If there are asymmetries, the | ||
674 | * group having more cpu_capacity will pick up more load than the | ||
675 | * group having less cpu_capacity. | ||
676 | */ | ||
677 | static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) | ||
678 | { | ||
679 | struct sched_group *sg = sd->groups; | ||
680 | |||
681 | WARN_ON(!sg); | ||
682 | |||
683 | do { | ||
684 | int cpu, max_cpu = -1; | ||
685 | |||
686 | sg->group_weight = cpumask_weight(sched_group_cpus(sg)); | ||
687 | |||
688 | if (!(sd->flags & SD_ASYM_PACKING)) | ||
689 | goto next; | ||
690 | |||
691 | for_each_cpu(cpu, sched_group_cpus(sg)) { | ||
692 | if (max_cpu < 0) | ||
693 | max_cpu = cpu; | ||
694 | else if (sched_asym_prefer(cpu, max_cpu)) | ||
695 | max_cpu = cpu; | ||
696 | } | ||
697 | sg->asym_prefer_cpu = max_cpu; | ||
698 | |||
699 | next: | ||
700 | sg = sg->next; | ||
701 | } while (sg != sd->groups); | ||
702 | |||
703 | if (cpu != group_balance_cpu(sg)) | ||
704 | return; | ||
705 | |||
706 | update_group_capacity(sd, cpu); | ||
707 | } | ||
708 | |||
709 | /* | ||
710 | * Initializers for schedule domains | ||
711 | * Non-inlined to reduce accumulated stack pressure in build_sched_domains() | ||
712 | */ | ||
713 | |||
714 | static int default_relax_domain_level = -1; | ||
715 | int sched_domain_level_max; | ||
716 | |||
717 | static int __init setup_relax_domain_level(char *str) | ||
718 | { | ||
719 | if (kstrtoint(str, 0, &default_relax_domain_level)) | ||
720 | pr_warn("Unable to set relax_domain_level\n"); | ||
721 | |||
722 | return 1; | ||
723 | } | ||
724 | __setup("relax_domain_level=", setup_relax_domain_level); | ||
725 | |||
726 | static void set_domain_attribute(struct sched_domain *sd, | ||
727 | struct sched_domain_attr *attr) | ||
728 | { | ||
729 | int request; | ||
730 | |||
731 | if (!attr || attr->relax_domain_level < 0) { | ||
732 | if (default_relax_domain_level < 0) | ||
733 | return; | ||
734 | else | ||
735 | request = default_relax_domain_level; | ||
736 | } else | ||
737 | request = attr->relax_domain_level; | ||
738 | if (request < sd->level) { | ||
739 | /* Turn off idle balance on this domain: */ | ||
740 | sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); | ||
741 | } else { | ||
742 | /* Turn on idle balance on this domain: */ | ||
743 | sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); | ||
744 | } | ||
745 | } | ||
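As a concrete reading of the logic above (assuming the default SMT/MC/DIE topology stack, i.e. domain levels 0, 1 and 2), booting with the following turns SD_BALANCE_WAKE and SD_BALANCE_NEWIDLE on for the SMT and MC domains and off for DIE and any NUMA levels, because only those have sd->level greater than the requested value:

	relax_domain_level=1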
746 | |||
747 | static void __sdt_free(const struct cpumask *cpu_map); | ||
748 | static int __sdt_alloc(const struct cpumask *cpu_map); | ||
749 | |||
750 | static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | ||
751 | const struct cpumask *cpu_map) | ||
752 | { | ||
753 | switch (what) { | ||
754 | case sa_rootdomain: | ||
755 | if (!atomic_read(&d->rd->refcount)) | ||
756 | free_rootdomain(&d->rd->rcu); | ||
757 | /* Fall through */ | ||
758 | case sa_sd: | ||
759 | free_percpu(d->sd); | ||
760 | /* Fall through */ | ||
761 | case sa_sd_storage: | ||
762 | __sdt_free(cpu_map); | ||
763 | /* Fall through */ | ||
764 | case sa_none: | ||
765 | break; | ||
766 | } | ||
767 | } | ||
768 | |||
769 | static enum s_alloc | ||
770 | __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) | ||
771 | { | ||
772 | memset(d, 0, sizeof(*d)); | ||
773 | |||
774 | if (__sdt_alloc(cpu_map)) | ||
775 | return sa_sd_storage; | ||
776 | d->sd = alloc_percpu(struct sched_domain *); | ||
777 | if (!d->sd) | ||
778 | return sa_sd_storage; | ||
779 | d->rd = alloc_rootdomain(); | ||
780 | if (!d->rd) | ||
781 | return sa_sd; | ||
782 | return sa_rootdomain; | ||
783 | } | ||
784 | |||
785 | /* | ||
786 | * NULL the sd_data elements we've used to build the sched_domain and | ||
787 | * sched_group structure so that the subsequent __free_domain_allocs() | ||
788 | * will not free the data we're using. | ||
789 | */ | ||
790 | static void claim_allocations(int cpu, struct sched_domain *sd) | ||
791 | { | ||
792 | struct sd_data *sdd = sd->private; | ||
793 | |||
794 | WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); | ||
795 | *per_cpu_ptr(sdd->sd, cpu) = NULL; | ||
796 | |||
797 | if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref)) | ||
798 | *per_cpu_ptr(sdd->sds, cpu) = NULL; | ||
799 | |||
800 | if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) | ||
801 | *per_cpu_ptr(sdd->sg, cpu) = NULL; | ||
802 | |||
803 | if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) | ||
804 | *per_cpu_ptr(sdd->sgc, cpu) = NULL; | ||
805 | } | ||
806 | |||
807 | #ifdef CONFIG_NUMA | ||
808 | static int sched_domains_numa_levels; | ||
809 | enum numa_topology_type sched_numa_topology_type; | ||
810 | static int *sched_domains_numa_distance; | ||
811 | int sched_max_numa_distance; | ||
812 | static struct cpumask ***sched_domains_numa_masks; | ||
813 | static int sched_domains_curr_level; | ||
814 | #endif | ||
815 | |||
816 | /* | ||
817 | * SD_flags allowed in topology descriptions. | ||
818 | * | ||
819 | * These flags are purely descriptive of the topology and do not prescribe | ||
820 | * behaviour. Behaviour is artificial and mapped in the below sd_init() | ||
821 | * function: | ||
822 | * | ||
823 | * SD_SHARE_CPUCAPACITY - describes SMT topologies | ||
824 | * SD_SHARE_PKG_RESOURCES - describes shared caches | ||
825 | * SD_NUMA - describes NUMA topologies | ||
826 | * SD_SHARE_POWERDOMAIN - describes shared power domain | ||
827 | * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies | ||
828 | * | ||
829 | * Odd one out, which besides describing the topology also | ||
830 | * prescribes the desired behaviour that goes along with it: | ||
831 | * | ||
832 | * SD_ASYM_PACKING - describes SMT quirks | ||
833 | */ | ||
834 | #define TOPOLOGY_SD_FLAGS \ | ||
835 | (SD_SHARE_CPUCAPACITY | \ | ||
836 | SD_SHARE_PKG_RESOURCES | \ | ||
837 | SD_NUMA | \ | ||
838 | SD_ASYM_PACKING | \ | ||
839 | SD_ASYM_CPUCAPACITY | \ | ||
840 | SD_SHARE_POWERDOMAIN) | ||
841 | |||
842 | static struct sched_domain * | ||
843 | sd_init(struct sched_domain_topology_level *tl, | ||
844 | const struct cpumask *cpu_map, | ||
845 | struct sched_domain *child, int cpu) | ||
846 | { | ||
847 | struct sd_data *sdd = &tl->data; | ||
848 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); | ||
849 | int sd_id, sd_weight, sd_flags = 0; | ||
850 | |||
851 | #ifdef CONFIG_NUMA | ||
852 | /* | ||
853 | * Ugly hack to pass state to sd_numa_mask()... | ||
854 | */ | ||
855 | sched_domains_curr_level = tl->numa_level; | ||
856 | #endif | ||
857 | |||
858 | sd_weight = cpumask_weight(tl->mask(cpu)); | ||
859 | |||
860 | if (tl->sd_flags) | ||
861 | sd_flags = (*tl->sd_flags)(); | ||
862 | if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, | ||
863 | "wrong sd_flags in topology description\n")) | ||
864 | sd_flags &= ~TOPOLOGY_SD_FLAGS; | ||
865 | |||
866 | *sd = (struct sched_domain){ | ||
867 | .min_interval = sd_weight, | ||
868 | .max_interval = 2*sd_weight, | ||
869 | .busy_factor = 32, | ||
870 | .imbalance_pct = 125, | ||
871 | |||
872 | .cache_nice_tries = 0, | ||
873 | .busy_idx = 0, | ||
874 | .idle_idx = 0, | ||
875 | .newidle_idx = 0, | ||
876 | .wake_idx = 0, | ||
877 | .forkexec_idx = 0, | ||
878 | |||
879 | .flags = 1*SD_LOAD_BALANCE | ||
880 | | 1*SD_BALANCE_NEWIDLE | ||
881 | | 1*SD_BALANCE_EXEC | ||
882 | | 1*SD_BALANCE_FORK | ||
883 | | 0*SD_BALANCE_WAKE | ||
884 | | 1*SD_WAKE_AFFINE | ||
885 | | 0*SD_SHARE_CPUCAPACITY | ||
886 | | 0*SD_SHARE_PKG_RESOURCES | ||
887 | | 0*SD_SERIALIZE | ||
888 | | 0*SD_PREFER_SIBLING | ||
889 | | 0*SD_NUMA | ||
890 | | sd_flags | ||
891 | , | ||
892 | |||
893 | .last_balance = jiffies, | ||
894 | .balance_interval = sd_weight, | ||
895 | .smt_gain = 0, | ||
896 | .max_newidle_lb_cost = 0, | ||
897 | .next_decay_max_lb_cost = jiffies, | ||
898 | .child = child, | ||
899 | #ifdef CONFIG_SCHED_DEBUG | ||
900 | .name = tl->name, | ||
901 | #endif | ||
902 | }; | ||
903 | |||
904 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); | ||
905 | sd_id = cpumask_first(sched_domain_span(sd)); | ||
906 | |||
907 | /* | ||
908 | * Convert topological properties into behaviour. | ||
909 | */ | ||
910 | |||
911 | if (sd->flags & SD_ASYM_CPUCAPACITY) { | ||
912 | struct sched_domain *t = sd; | ||
913 | |||
914 | for_each_lower_domain(t) | ||
915 | t->flags |= SD_BALANCE_WAKE; | ||
916 | } | ||
917 | |||
918 | if (sd->flags & SD_SHARE_CPUCAPACITY) { | ||
919 | sd->flags |= SD_PREFER_SIBLING; | ||
920 | sd->imbalance_pct = 110; | ||
921 | sd->smt_gain = 1178; /* ~15% */ | ||
922 | |||
923 | } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { | ||
924 | sd->imbalance_pct = 117; | ||
925 | sd->cache_nice_tries = 1; | ||
926 | sd->busy_idx = 2; | ||
927 | |||
928 | #ifdef CONFIG_NUMA | ||
929 | } else if (sd->flags & SD_NUMA) { | ||
930 | sd->cache_nice_tries = 2; | ||
931 | sd->busy_idx = 3; | ||
932 | sd->idle_idx = 2; | ||
933 | |||
934 | sd->flags |= SD_SERIALIZE; | ||
935 | if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { | ||
936 | sd->flags &= ~(SD_BALANCE_EXEC | | ||
937 | SD_BALANCE_FORK | | ||
938 | SD_WAKE_AFFINE); | ||
939 | } | ||
940 | |||
941 | #endif | ||
942 | } else { | ||
943 | sd->flags |= SD_PREFER_SIBLING; | ||
944 | sd->cache_nice_tries = 1; | ||
945 | sd->busy_idx = 2; | ||
946 | sd->idle_idx = 1; | ||
947 | } | ||
948 | |||
949 | /* | ||
950 | * For all levels sharing cache; connect a sched_domain_shared | ||
951 | * instance. | ||
952 | */ | ||
953 | if (sd->flags & SD_SHARE_PKG_RESOURCES) { | ||
954 | sd->shared = *per_cpu_ptr(sdd->sds, sd_id); | ||
955 | atomic_inc(&sd->shared->ref); | ||
956 | atomic_set(&sd->shared->nr_busy_cpus, sd_weight); | ||
957 | } | ||
958 | |||
959 | sd->private = sdd; | ||
960 | |||
961 | return sd; | ||
962 | } | ||
963 | |||
964 | /* | ||
965 | * Topology list, bottom-up. | ||
966 | */ | ||
967 | static struct sched_domain_topology_level default_topology[] = { | ||
968 | #ifdef CONFIG_SCHED_SMT | ||
969 | { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, | ||
970 | #endif | ||
971 | #ifdef CONFIG_SCHED_MC | ||
972 | { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, | ||
973 | #endif | ||
974 | { cpu_cpu_mask, SD_INIT_NAME(DIE) }, | ||
975 | { NULL, }, | ||
976 | }; | ||
977 | |||
978 | static struct sched_domain_topology_level *sched_domain_topology = | ||
979 | default_topology; | ||
980 | |||
981 | #define for_each_sd_topology(tl) \ | ||
982 | for (tl = sched_domain_topology; tl->mask; tl++) | ||
983 | |||
984 | void set_sched_topology(struct sched_domain_topology_level *tl) | ||
985 | { | ||
986 | if (WARN_ON_ONCE(sched_smp_initialized)) | ||
987 | return; | ||
988 | |||
989 | sched_domain_topology = tl; | ||
990 | } | ||
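A sketch of how an architecture might install its own table through this hook, mirroring default_topology[] above; my_arch_topology[] and my_arch_init_sched_topology() are made-up names, and the choice to drop the MC level is purely illustrative:

	/* Hypothetical arch code: SMT and DIE levels only, no MC level. */
	static struct sched_domain_topology_level my_arch_topology[] = {
	#ifdef CONFIG_SCHED_SMT
		{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
	#endif
		{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
		{ NULL, },
	};

	static void __init my_arch_init_sched_topology(void)
	{
		/* Must run before sched_smp_initialized is set, see the WARN above. */
		set_sched_topology(my_arch_topology);
	}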
991 | |||
992 | #ifdef CONFIG_NUMA | ||
993 | |||
994 | static const struct cpumask *sd_numa_mask(int cpu) | ||
995 | { | ||
996 | return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; | ||
997 | } | ||
998 | |||
999 | static void sched_numa_warn(const char *str) | ||
1000 | { | ||
1001 | static int done = false; | ||
1002 | int i,j; | ||
1003 | |||
1004 | if (done) | ||
1005 | return; | ||
1006 | |||
1007 | done = true; | ||
1008 | |||
1009 | printk(KERN_WARNING "ERROR: %s\n\n", str); | ||
1010 | |||
1011 | for (i = 0; i < nr_node_ids; i++) { | ||
1012 | printk(KERN_WARNING " "); | ||
1013 | for (j = 0; j < nr_node_ids; j++) | ||
1014 | printk(KERN_CONT "%02d ", node_distance(i,j)); | ||
1015 | printk(KERN_CONT "\n"); | ||
1016 | } | ||
1017 | printk(KERN_WARNING "\n"); | ||
1018 | } | ||
1019 | |||
1020 | bool find_numa_distance(int distance) | ||
1021 | { | ||
1022 | int i; | ||
1023 | |||
1024 | if (distance == node_distance(0, 0)) | ||
1025 | return true; | ||
1026 | |||
1027 | for (i = 0; i < sched_domains_numa_levels; i++) { | ||
1028 | if (sched_domains_numa_distance[i] == distance) | ||
1029 | return true; | ||
1030 | } | ||
1031 | |||
1032 | return false; | ||
1033 | } | ||
1034 | |||
1035 | /* | ||
1036 | * A system can have three types of NUMA topology: | ||
1037 | * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system | ||
1038 | * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes | ||
1039 | * NUMA_BACKPLANE: nodes can reach other nodes through a backplane | ||
1040 | * | ||
1041 | * The difference between a glueless mesh topology and a backplane | ||
1042 | * topology lies in whether communication between not directly | ||
1043 | * connected nodes goes through intermediary nodes (where programs | ||
1044 | * could run), or through backplane controllers. This affects | ||
1045 | * placement of programs. | ||
1046 | * | ||
1047 | * The type of topology can be discerned with the following tests: | ||
1048 | * - If the maximum distance between any nodes is 1 hop, the system | ||
1049 | * is directly connected. | ||
1050 | * - If for two nodes A and B, located N > 1 hops away from each other, | ||
1051 | * there is an intermediary node C, which is < N hops away from both | ||
1052 | * nodes A and B, the system is a glueless mesh. | ||
1053 | */ | ||
1054 | static void init_numa_topology_type(void) | ||
1055 | { | ||
1056 | int a, b, c, n; | ||
1057 | |||
1058 | n = sched_max_numa_distance; | ||
1059 | |||
1060 | if (sched_domains_numa_levels <= 1) { | ||
1061 | sched_numa_topology_type = NUMA_DIRECT; | ||
1062 | return; | ||
1063 | } | ||
1064 | |||
1065 | for_each_online_node(a) { | ||
1066 | for_each_online_node(b) { | ||
1067 | /* Find two nodes furthest removed from each other. */ | ||
1068 | if (node_distance(a, b) < n) | ||
1069 | continue; | ||
1070 | |||
1071 | /* Is there an intermediary node between a and b? */ | ||
1072 | for_each_online_node(c) { | ||
1073 | if (node_distance(a, c) < n && | ||
1074 | node_distance(b, c) < n) { | ||
1075 | sched_numa_topology_type = | ||
1076 | NUMA_GLUELESS_MESH; | ||
1077 | return; | ||
1078 | } | ||
1079 | } | ||
1080 | |||
1081 | sched_numa_topology_type = NUMA_BACKPLANE; | ||
1082 | return; | ||
1083 | } | ||
1084 | } | ||
1085 | } | ||
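A small worked example of the classification above, with made-up distances: on four nodes arranged in a ring (10 locally, 20 to a neighbour, 30 to the opposite node), the furthest pair, say nodes 0 and 2, has neighbour node 1 at distance 20 < 30 from both ends, so the machine is classified as NUMA_GLUELESS_MESH. If instead two pairs of nodes, (0,1) and (2,3), only reach each other across a backplane (20 within a pair, 40 across), no node is strictly closer than 40 to both ends of the furthest pair and the result is NUMA_BACKPLANE.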
1086 | |||
1087 | void sched_init_numa(void) | ||
1088 | { | ||
1089 | int next_distance, curr_distance = node_distance(0, 0); | ||
1090 | struct sched_domain_topology_level *tl; | ||
1091 | int level = 0; | ||
1092 | int i, j, k; | ||
1093 | |||
1094 | sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); | ||
1095 | if (!sched_domains_numa_distance) | ||
1096 | return; | ||
1097 | |||
1098 | /* | ||
1099 | * O(nr_nodes^2) deduplicating selection sort -- in order to find the | ||
1100 | * unique distances in the node_distance() table. | ||
1101 | * | ||
1102 | * Assumes node_distance(0,j) includes all distances in | ||
1103 | * node_distance(i,j) in order to avoid cubic time. | ||
1104 | */ | ||
1105 | next_distance = curr_distance; | ||
1106 | for (i = 0; i < nr_node_ids; i++) { | ||
1107 | for (j = 0; j < nr_node_ids; j++) { | ||
1108 | for (k = 0; k < nr_node_ids; k++) { | ||
1109 | int distance = node_distance(i, k); | ||
1110 | |||
1111 | if (distance > curr_distance && | ||
1112 | (distance < next_distance || | ||
1113 | next_distance == curr_distance)) | ||
1114 | next_distance = distance; | ||
1115 | |||
1116 | /* | ||
1117 | * While not a strong assumption, it would be nice to know | ||
1118 | * about cases where node A is connected to B but B is not | ||
1119 | * equally connected to A. | ||
1120 | */ | ||
1121 | if (sched_debug() && node_distance(k, i) != distance) | ||
1122 | sched_numa_warn("Node-distance not symmetric"); | ||
1123 | |||
1124 | if (sched_debug() && i && !find_numa_distance(distance)) | ||
1125 | sched_numa_warn("Node-0 not representative"); | ||
1126 | } | ||
1127 | if (next_distance != curr_distance) { | ||
1128 | sched_domains_numa_distance[level++] = next_distance; | ||
1129 | sched_domains_numa_levels = level; | ||
1130 | curr_distance = next_distance; | ||
1131 | } else break; | ||
1132 | } | ||
1133 | |||
1134 | /* | ||
1135 | * In case of sched_debug() we verify the above assumption. | ||
1136 | */ | ||
1137 | if (!sched_debug()) | ||
1138 | break; | ||
1139 | } | ||
1140 | |||
1141 | if (!level) | ||
1142 | return; | ||
1143 | |||
1144 | /* | ||
1145 | * 'level' contains the number of unique distances, excluding the | ||
1146 | * identity distance node_distance(i,i). | ||
1147 | * | ||
1148 | * The sched_domains_numa_distance[] array includes the actual distance | ||
1149 | * numbers. | ||
1150 | */ | ||
1151 | |||
1152 | /* | ||
1153 | * Here, we should temporarily reset sched_domains_numa_levels to 0. | ||
1154 | * If the allocation of the sched_domains_numa_masks[][] array fails, | ||
1155 | * the array will contain fewer than 'level' members. This could be | ||
1156 | * dangerous when we use it to iterate over sched_domains_numa_masks[][] | ||
1157 | * in other functions. | ||
1158 | * | ||
1159 | * We reset it to 'level' at the end of this function. | ||
1160 | */ | ||
1161 | sched_domains_numa_levels = 0; | ||
1162 | |||
1163 | sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); | ||
1164 | if (!sched_domains_numa_masks) | ||
1165 | return; | ||
1166 | |||
1167 | /* | ||
1168 | * Now for each level, construct a mask per node which contains all | ||
1169 | * CPUs of nodes that are that many hops away from us. | ||
1170 | */ | ||
1171 | for (i = 0; i < level; i++) { | ||
1172 | sched_domains_numa_masks[i] = | ||
1173 | kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); | ||
1174 | if (!sched_domains_numa_masks[i]) | ||
1175 | return; | ||
1176 | |||
1177 | for (j = 0; j < nr_node_ids; j++) { | ||
1178 | struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); | ||
1179 | if (!mask) | ||
1180 | return; | ||
1181 | |||
1182 | sched_domains_numa_masks[i][j] = mask; | ||
1183 | |||
1184 | for_each_node(k) { | ||
1185 | if (node_distance(j, k) > sched_domains_numa_distance[i]) | ||
1186 | continue; | ||
1187 | |||
1188 | cpumask_or(mask, mask, cpumask_of_node(k)); | ||
1189 | } | ||
1190 | } | ||
1191 | } | ||
1192 | |||
1193 | /* Compute default topology size */ | ||
1194 | for (i = 0; sched_domain_topology[i].mask; i++); | ||
1195 | |||
1196 | tl = kzalloc((i + level + 1) * | ||
1197 | sizeof(struct sched_domain_topology_level), GFP_KERNEL); | ||
1198 | if (!tl) | ||
1199 | return; | ||
1200 | |||
1201 | /* | ||
1202 | * Copy the default topology bits.. | ||
1203 | */ | ||
1204 | for (i = 0; sched_domain_topology[i].mask; i++) | ||
1205 | tl[i] = sched_domain_topology[i]; | ||
1206 | |||
1207 | /* | ||
1208 | * .. and append 'j' levels of NUMA goodness. | ||
1209 | */ | ||
1210 | for (j = 0; j < level; i++, j++) { | ||
1211 | tl[i] = (struct sched_domain_topology_level){ | ||
1212 | .mask = sd_numa_mask, | ||
1213 | .sd_flags = cpu_numa_flags, | ||
1214 | .flags = SDTL_OVERLAP, | ||
1215 | .numa_level = j, | ||
1216 | SD_INIT_NAME(NUMA) | ||
1217 | }; | ||
1218 | } | ||
1219 | |||
1220 | sched_domain_topology = tl; | ||
1221 | |||
1222 | sched_domains_numa_levels = level; | ||
1223 | sched_max_numa_distance = sched_domains_numa_distance[level - 1]; | ||
1224 | |||
1225 | init_numa_topology_type(); | ||
1226 | } | ||
1227 | |||
1228 | void sched_domains_numa_masks_set(unsigned int cpu) | ||
1229 | { | ||
1230 | int node = cpu_to_node(cpu); | ||
1231 | int i, j; | ||
1232 | |||
1233 | for (i = 0; i < sched_domains_numa_levels; i++) { | ||
1234 | for (j = 0; j < nr_node_ids; j++) { | ||
1235 | if (node_distance(j, node) <= sched_domains_numa_distance[i]) | ||
1236 | cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); | ||
1237 | } | ||
1238 | } | ||
1239 | } | ||
1240 | |||
1241 | void sched_domains_numa_masks_clear(unsigned int cpu) | ||
1242 | { | ||
1243 | int i, j; | ||
1244 | |||
1245 | for (i = 0; i < sched_domains_numa_levels; i++) { | ||
1246 | for (j = 0; j < nr_node_ids; j++) | ||
1247 | cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); | ||
1248 | } | ||
1249 | } | ||
1250 | |||
1251 | #endif /* CONFIG_NUMA */ | ||
1252 | |||
1253 | static int __sdt_alloc(const struct cpumask *cpu_map) | ||
1254 | { | ||
1255 | struct sched_domain_topology_level *tl; | ||
1256 | int j; | ||
1257 | |||
1258 | for_each_sd_topology(tl) { | ||
1259 | struct sd_data *sdd = &tl->data; | ||
1260 | |||
1261 | sdd->sd = alloc_percpu(struct sched_domain *); | ||
1262 | if (!sdd->sd) | ||
1263 | return -ENOMEM; | ||
1264 | |||
1265 | sdd->sds = alloc_percpu(struct sched_domain_shared *); | ||
1266 | if (!sdd->sds) | ||
1267 | return -ENOMEM; | ||
1268 | |||
1269 | sdd->sg = alloc_percpu(struct sched_group *); | ||
1270 | if (!sdd->sg) | ||
1271 | return -ENOMEM; | ||
1272 | |||
1273 | sdd->sgc = alloc_percpu(struct sched_group_capacity *); | ||
1274 | if (!sdd->sgc) | ||
1275 | return -ENOMEM; | ||
1276 | |||
1277 | for_each_cpu(j, cpu_map) { | ||
1278 | struct sched_domain *sd; | ||
1279 | struct sched_domain_shared *sds; | ||
1280 | struct sched_group *sg; | ||
1281 | struct sched_group_capacity *sgc; | ||
1282 | |||
1283 | sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), | ||
1284 | GFP_KERNEL, cpu_to_node(j)); | ||
1285 | if (!sd) | ||
1286 | return -ENOMEM; | ||
1287 | |||
1288 | *per_cpu_ptr(sdd->sd, j) = sd; | ||
1289 | |||
1290 | sds = kzalloc_node(sizeof(struct sched_domain_shared), | ||
1291 | GFP_KERNEL, cpu_to_node(j)); | ||
1292 | if (!sds) | ||
1293 | return -ENOMEM; | ||
1294 | |||
1295 | *per_cpu_ptr(sdd->sds, j) = sds; | ||
1296 | |||
1297 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
1298 | GFP_KERNEL, cpu_to_node(j)); | ||
1299 | if (!sg) | ||
1300 | return -ENOMEM; | ||
1301 | |||
1302 | sg->next = sg; | ||
1303 | |||
1304 | *per_cpu_ptr(sdd->sg, j) = sg; | ||
1305 | |||
1306 | sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(), | ||
1307 | GFP_KERNEL, cpu_to_node(j)); | ||
1308 | if (!sgc) | ||
1309 | return -ENOMEM; | ||
1310 | |||
1311 | *per_cpu_ptr(sdd->sgc, j) = sgc; | ||
1312 | } | ||
1313 | } | ||
1314 | |||
1315 | return 0; | ||
1316 | } | ||
1317 | |||
1318 | static void __sdt_free(const struct cpumask *cpu_map) | ||
1319 | { | ||
1320 | struct sched_domain_topology_level *tl; | ||
1321 | int j; | ||
1322 | |||
1323 | for_each_sd_topology(tl) { | ||
1324 | struct sd_data *sdd = &tl->data; | ||
1325 | |||
1326 | for_each_cpu(j, cpu_map) { | ||
1327 | struct sched_domain *sd; | ||
1328 | |||
1329 | if (sdd->sd) { | ||
1330 | sd = *per_cpu_ptr(sdd->sd, j); | ||
1331 | if (sd && (sd->flags & SD_OVERLAP)) | ||
1332 | free_sched_groups(sd->groups, 0); | ||
1333 | kfree(*per_cpu_ptr(sdd->sd, j)); | ||
1334 | } | ||
1335 | |||
1336 | if (sdd->sds) | ||
1337 | kfree(*per_cpu_ptr(sdd->sds, j)); | ||
1338 | if (sdd->sg) | ||
1339 | kfree(*per_cpu_ptr(sdd->sg, j)); | ||
1340 | if (sdd->sgc) | ||
1341 | kfree(*per_cpu_ptr(sdd->sgc, j)); | ||
1342 | } | ||
1343 | free_percpu(sdd->sd); | ||
1344 | sdd->sd = NULL; | ||
1345 | free_percpu(sdd->sds); | ||
1346 | sdd->sds = NULL; | ||
1347 | free_percpu(sdd->sg); | ||
1348 | sdd->sg = NULL; | ||
1349 | free_percpu(sdd->sgc); | ||
1350 | sdd->sgc = NULL; | ||
1351 | } | ||
1352 | } | ||
1353 | |||
1354 | struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | ||
1355 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
1356 | struct sched_domain *child, int cpu) | ||
1357 | { | ||
1358 | struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu); | ||
1359 | |||
1360 | if (child) { | ||
1361 | sd->level = child->level + 1; | ||
1362 | sched_domain_level_max = max(sched_domain_level_max, sd->level); | ||
1363 | child->parent = sd; | ||
1364 | |||
1365 | if (!cpumask_subset(sched_domain_span(child), | ||
1366 | sched_domain_span(sd))) { | ||
1367 | pr_err("BUG: arch topology borken\n"); | ||
1368 | #ifdef CONFIG_SCHED_DEBUG | ||
1369 | pr_err(" the %s domain not a subset of the %s domain\n", | ||
1370 | child->name, sd->name); | ||
1371 | #endif | ||
1372 | /* Fixup, ensure @sd has at least @child cpus. */ | ||
1373 | cpumask_or(sched_domain_span(sd), | ||
1374 | sched_domain_span(sd), | ||
1375 | sched_domain_span(child)); | ||
1376 | } | ||
1377 | |||
1378 | } | ||
1379 | set_domain_attribute(sd, attr); | ||
1380 | |||
1381 | return sd; | ||
1382 | } | ||
1383 | |||
1384 | /* | ||
1385 | * Build sched domains for a given set of CPUs and attach the sched domains | ||
1386 | * to the individual CPUs | ||
1387 | */ | ||
1388 | static int | ||
1389 | build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) | ||
1390 | { | ||
1391 | enum s_alloc alloc_state; | ||
1392 | struct sched_domain *sd; | ||
1393 | struct s_data d; | ||
1394 | struct rq *rq = NULL; | ||
1395 | int i, ret = -ENOMEM; | ||
1396 | |||
1397 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); | ||
1398 | if (alloc_state != sa_rootdomain) | ||
1399 | goto error; | ||
1400 | |||
1401 | /* Set up domains for CPUs specified by the cpu_map: */ | ||
1402 | for_each_cpu(i, cpu_map) { | ||
1403 | struct sched_domain_topology_level *tl; | ||
1404 | |||
1405 | sd = NULL; | ||
1406 | for_each_sd_topology(tl) { | ||
1407 | sd = build_sched_domain(tl, cpu_map, attr, sd, i); | ||
1408 | if (tl == sched_domain_topology) | ||
1409 | *per_cpu_ptr(d.sd, i) = sd; | ||
1410 | if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) | ||
1411 | sd->flags |= SD_OVERLAP; | ||
1412 | if (cpumask_equal(cpu_map, sched_domain_span(sd))) | ||
1413 | break; | ||
1414 | } | ||
1415 | } | ||
1416 | |||
1417 | /* Build the groups for the domains */ | ||
1418 | for_each_cpu(i, cpu_map) { | ||
1419 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { | ||
1420 | sd->span_weight = cpumask_weight(sched_domain_span(sd)); | ||
1421 | if (sd->flags & SD_OVERLAP) { | ||
1422 | if (build_overlap_sched_groups(sd, i)) | ||
1423 | goto error; | ||
1424 | } else { | ||
1425 | if (build_sched_groups(sd, i)) | ||
1426 | goto error; | ||
1427 | } | ||
1428 | } | ||
1429 | } | ||
1430 | |||
1431 | /* Calculate CPU capacity for physical packages and nodes */ | ||
1432 | for (i = nr_cpumask_bits-1; i >= 0; i--) { | ||
1433 | if (!cpumask_test_cpu(i, cpu_map)) | ||
1434 | continue; | ||
1435 | |||
1436 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { | ||
1437 | claim_allocations(i, sd); | ||
1438 | init_sched_groups_capacity(i, sd); | ||
1439 | } | ||
1440 | } | ||
1441 | |||
1442 | /* Attach the domains */ | ||
1443 | rcu_read_lock(); | ||
1444 | for_each_cpu(i, cpu_map) { | ||
1445 | rq = cpu_rq(i); | ||
1446 | sd = *per_cpu_ptr(d.sd, i); | ||
1447 | |||
1448 | /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ | ||
1449 | if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity)) | ||
1450 | WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig); | ||
1451 | |||
1452 | cpu_attach_domain(sd, d.rd, i); | ||
1453 | } | ||
1454 | rcu_read_unlock(); | ||
1455 | |||
1456 | if (rq && sched_debug_enabled) { | ||
1457 | pr_info("span: %*pbl (max cpu_capacity = %lu)\n", | ||
1458 | cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); | ||
1459 | } | ||
1460 | |||
1461 | ret = 0; | ||
1462 | error: | ||
1463 | __free_domain_allocs(&d, alloc_state, cpu_map); | ||
1464 | return ret; | ||
1465 | } | ||
1466 | |||
1467 | /* Current sched domains: */ | ||
1468 | static cpumask_var_t *doms_cur; | ||
1469 | |||
1470 | /* Number of sched domains in 'doms_cur': */ | ||
1471 | static int ndoms_cur; | ||
1472 | |||
1473 | /* Attributes of custom domains in 'doms_cur' */ | ||
1474 | static struct sched_domain_attr *dattr_cur; | ||
1475 | |||
1476 | /* | ||
1477 | * Special case: If a kmalloc() of a doms_cur partition (array of | ||
1478 | * cpumask) fails, then fallback to a single sched domain, | ||
1479 | * as determined by the single cpumask fallback_doms. | ||
1480 | */ | ||
1481 | cpumask_var_t fallback_doms; | ||
1482 | |||
1483 | /* | ||
1484 | * arch_update_cpu_topology lets virtualized architectures update the | ||
1485 | * CPU core maps. It is supposed to return 1 if the topology changed | ||
1486 | * or 0 if it stayed the same. | ||
1487 | */ | ||
1488 | int __weak arch_update_cpu_topology(void) | ||
1489 | { | ||
1490 | return 0; | ||
1491 | } | ||
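
Because this default is declared __weak, an architecture or virtualization layer can supply a strong definition; partition_sched_domains() below treats a non-zero return as "the topology changed" and rebuilds every partition instead of reusing matching ones. A minimal, hypothetical override might look like this, where topology_changed_since_last_call() stands in for whatever firmware or hypervisor query the platform actually needs:

int arch_update_cpu_topology(void)
{
	/* Re-read the platform's core maps and report whether anything moved. */
	if (!topology_changed_since_last_call())	/* hypothetical helper */
		return 0;

	return 1;	/* non-zero forces all sched domains to be rebuilt */
}
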
1492 | |||
1493 | cpumask_var_t *alloc_sched_domains(unsigned int ndoms) | ||
1494 | { | ||
1495 | int i; | ||
1496 | cpumask_var_t *doms; | ||
1497 | |||
1498 | doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); | ||
1499 | if (!doms) | ||
1500 | return NULL; | ||
1501 | for (i = 0; i < ndoms; i++) { | ||
1502 | if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { | ||
1503 | free_sched_domains(doms, i); | ||
1504 | return NULL; | ||
1505 | } | ||
1506 | } | ||
1507 | return doms; | ||
1508 | } | ||
1509 | |||
1510 | void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) | ||
1511 | { | ||
1512 | unsigned int i; | ||
1513 | for (i = 0; i < ndoms; i++) | ||
1514 | free_cpumask_var(doms[i]); | ||
1515 | kfree(doms); | ||
1516 | } | ||
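
Together these two helpers form the allocation contract that partition_sched_domains() (further down) relies on: the caller allocates the array, fills in non-overlapping cpumasks, and then hands ownership over. A usage sketch follows; the function name and the CPU split are made up purely for illustration:

static int example_repartition(void)
{
	cpumask_var_t *doms = alloc_sched_domains(2);

	get_online_cpus();	/* partition_sched_domains() wants the hotplug lock held */

	if (!doms) {
		/* Documented fallback: rebuild the single 'fallback_doms' partition. */
		partition_sched_domains(1, NULL, NULL);
		put_online_cpus();
		return -ENOMEM;
	}

	/* Two non-overlapping partitions; the split here is arbitrary. */
	cpumask_clear(doms[0]);
	cpumask_set_cpu(0, doms[0]);
	cpumask_set_cpu(1, doms[0]);
	cpumask_andnot(doms[1], cpu_active_mask, doms[0]);

	/* Ownership of 'doms' passes to the scheduler; it is freed there. */
	partition_sched_domains(2, doms, NULL);

	put_online_cpus();
	return 0;
}
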
1517 | |||
1518 | /* | ||
1519 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | ||
1520 | * For now this just excludes isolated CPUs, but could be used to | ||
1521 | * exclude other special cases in the future. | ||
1522 | */ | ||
1523 | int init_sched_domains(const struct cpumask *cpu_map) | ||
1524 | { | ||
1525 | int err; | ||
1526 | |||
1527 | arch_update_cpu_topology(); | ||
1528 | ndoms_cur = 1; | ||
1529 | doms_cur = alloc_sched_domains(ndoms_cur); | ||
1530 | if (!doms_cur) | ||
1531 | doms_cur = &fallback_doms; | ||
1532 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); | ||
1533 | err = build_sched_domains(doms_cur[0], NULL); | ||
1534 | register_sched_domain_sysctl(); | ||
1535 | |||
1536 | return err; | ||
1537 | } | ||
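
init_sched_domains() is intended to run once, early in SMP bring-up, before cpusets or hotplug can repartition anything. A boot-time caller would look roughly like the sketch below (not the verbatim core.c code; it assumes sched_domains_mutex is what serializes against later repartitioning):

static void __init example_smp_bringup(void)
{
	mutex_lock(&sched_domains_mutex);
	/* Builds one domain tree covering the active, non-isolated CPUs. */
	init_sched_domains(cpu_active_mask);
	mutex_unlock(&sched_domains_mutex);
}
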
1538 | |||
1539 | /* | ||
1540 | * Detach sched domains from a group of CPUs specified in cpu_map. | ||
1541 | * These CPUs will now be attached to the NULL domain. | ||
1542 | */ | ||
1543 | static void detach_destroy_domains(const struct cpumask *cpu_map) | ||
1544 | { | ||
1545 | int i; | ||
1546 | |||
1547 | rcu_read_lock(); | ||
1548 | for_each_cpu(i, cpu_map) | ||
1549 | cpu_attach_domain(NULL, &def_root_domain, i); | ||
1550 | rcu_read_unlock(); | ||
1551 | } | ||
1552 | |||
1553 | /* Handle NULL as "default" */ | ||
1554 | static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, | ||
1555 | struct sched_domain_attr *new, int idx_new) | ||
1556 | { | ||
1557 | struct sched_domain_attr tmp; | ||
1558 | |||
1559 | /* Fast path: */ | ||
1560 | if (!new && !cur) | ||
1561 | return 1; | ||
1562 | |||
1563 | tmp = SD_ATTR_INIT; | ||
1564 | return !memcmp(cur ? (cur + idx_cur) : &tmp, | ||
1565 | new ? (new + idx_new) : &tmp, | ||
1566 | sizeof(struct sched_domain_attr)); | ||
1567 | } | ||
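
A NULL attribute array on either side is treated as "all defaults", i.e. as if every entry were SD_ATTR_INIT. In this kernel struct sched_domain_attr only carries relax_domain_level, which SD_ATTR_INIT sets to -1, so the comparison reduces to the cases below; dattrs_equal() is static, so this sketch only makes sense inside this file and is illustrative only:

static void example_dattr_compare(void)
{
	struct sched_domain_attr a = { .relax_domain_level = 2 };

	WARN_ON(!dattrs_equal(NULL, 0, NULL, 3));	/* both sides are "default"        */
	WARN_ON(dattrs_equal(&a, 0, NULL, 0));		/* 2 != -1, the SD_ATTR_INIT value */
	WARN_ON(!dattrs_equal(&a, 0, &a, 0));		/* identical attributes            */
}
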
1568 | |||
1569 | /* | ||
1570 | * Partition sched domains as specified by the 'ndoms_new' | ||
1571 | * cpumasks in the doms_new[] array. This compares | ||
1572 | * doms_new[] to the current sched domain partitioning, doms_cur[]. | ||
1573 | * It destroys each deleted domain and builds each new domain. | ||
1574 | * | ||
1575 | * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. | ||
1576 | * The masks don't intersect (don't overlap). We should set up one | ||
1577 | * sched domain for each mask. CPUs not in any of the cpumasks will | ||
1578 | * not be load balanced. If the same cpumask appears both in the | ||
1579 | * current 'doms_cur' domains and in the new 'doms_new', we can leave | ||
1580 | * it as it is. | ||
1581 | * | ||
1582 | * The passed-in 'doms_new' should be allocated using | ||
1583 | * alloc_sched_domains(). This routine takes ownership of it and will | ||
1584 | * free it with free_sched_domains() when done. If the caller failed the | ||
1585 | * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, | ||
1586 | * and partition_sched_domains() will fall back to the single partition | ||
1587 | * 'fallback_doms'; this also forces the domains to be rebuilt. | ||
1588 | * | ||
1589 | * If doms_new == NULL it will be replaced with cpu_active_mask (minus isolated CPUs). | ||
1590 | * ndoms_new == 0 is a special case for destroying existing domains, | ||
1591 | * and it will not create the default domain. | ||
1592 | * | ||
1593 | * Call with hotplug lock held | ||
1594 | */ | ||
1595 | void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], | ||
1596 | struct sched_domain_attr *dattr_new) | ||
1597 | { | ||
1598 | int i, j, n; | ||
1599 | int new_topology; | ||
1600 | |||
1601 | mutex_lock(&sched_domains_mutex); | ||
1602 | |||
1603 | /* Always unregister in case we don't destroy any domains: */ | ||
1604 | unregister_sched_domain_sysctl(); | ||
1605 | |||
1606 | /* Let the architecture update CPU core mappings: */ | ||
1607 | new_topology = arch_update_cpu_topology(); | ||
1608 | |||
1609 | n = doms_new ? ndoms_new : 0; | ||
1610 | |||
1611 | /* Destroy deleted domains: */ | ||
1612 | for (i = 0; i < ndoms_cur; i++) { | ||
1613 | for (j = 0; j < n && !new_topology; j++) { | ||
1614 | if (cpumask_equal(doms_cur[i], doms_new[j]) | ||
1615 | && dattrs_equal(dattr_cur, i, dattr_new, j)) | ||
1616 | goto match1; | ||
1617 | } | ||
1618 | /* No match - a current sched domain not in new doms_new[] */ | ||
1619 | detach_destroy_domains(doms_cur[i]); | ||
1620 | match1: | ||
1621 | ; | ||
1622 | } | ||
1623 | |||
1624 | n = ndoms_cur; | ||
1625 | if (doms_new == NULL) { | ||
1626 | n = 0; | ||
1627 | doms_new = &fallback_doms; | ||
1628 | cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); | ||
1629 | WARN_ON_ONCE(dattr_new); | ||
1630 | } | ||
1631 | |||
1632 | /* Build new domains: */ | ||
1633 | for (i = 0; i < ndoms_new; i++) { | ||
1634 | for (j = 0; j < n && !new_topology; j++) { | ||
1635 | if (cpumask_equal(doms_new[i], doms_cur[j]) | ||
1636 | && dattrs_equal(dattr_new, i, dattr_cur, j)) | ||
1637 | goto match2; | ||
1638 | } | ||
1639 | /* No match - build new sched domains for this cpumask */ | ||
1640 | build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); | ||
1641 | match2: | ||
1642 | ; | ||
1643 | } | ||
1644 | |||
1645 | /* Remember the new sched domains: */ | ||
1646 | if (doms_cur != &fallback_doms) | ||
1647 | free_sched_domains(doms_cur, ndoms_cur); | ||
1648 | |||
1649 | kfree(dattr_cur); | ||
1650 | doms_cur = doms_new; | ||
1651 | dattr_cur = dattr_new; | ||
1652 | ndoms_cur = ndoms_new; | ||
1653 | |||
1654 | register_sched_domain_sysctl(); | ||
1655 | |||
1656 | mutex_unlock(&sched_domains_mutex); | ||
1657 | } | ||
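
In practice cpuset is the main caller of partition_sched_domains(); the three calling conventions documented above boil down to the sketch below (not actual cpuset code, and it assumes the hotplug lock is already held):

static void example_callers(int ndoms, cpumask_var_t doms[],
			    struct sched_domain_attr *attrs)
{
	/* Regular repartition: ownership of 'doms' and 'attrs' passes to the scheduler. */
	partition_sched_domains(ndoms, doms, attrs);

	/* Allocation failed (or reset): one 'fallback_doms' partition, rebuild forced. */
	partition_sched_domains(1, NULL, NULL);

	/* Destroy all current domains without creating the default one. */
	partition_sched_domains(0, NULL, NULL);
}
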
1658 | |||