diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/sched.c | 290 |
1 files changed, 236 insertions, 54 deletions
diff --git a/kernel/sched.c b/kernel/sched.c index 5f889d0cbfc..50860ad5b62 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -4779,7 +4779,7 @@ static int sd_parent_degenerate(struct sched_domain *sd, | |||
4779 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | 4779 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must |
4780 | * hold the hotplug lock. | 4780 | * hold the hotplug lock. |
4781 | */ | 4781 | */ |
4782 | void cpu_attach_domain(struct sched_domain *sd, int cpu) | 4782 | static void cpu_attach_domain(struct sched_domain *sd, int cpu) |
4783 | { | 4783 | { |
4784 | runqueue_t *rq = cpu_rq(cpu); | 4784 | runqueue_t *rq = cpu_rq(cpu); |
4785 | struct sched_domain *tmp; | 4785 | struct sched_domain *tmp; |
@@ -4802,7 +4802,7 @@ void cpu_attach_domain(struct sched_domain *sd, int cpu) | |||
4802 | } | 4802 | } |
4803 | 4803 | ||
4804 | /* cpus with isolated domains */ | 4804 | /* cpus with isolated domains */ |
4805 | cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; | 4805 | static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; |
4806 | 4806 | ||
4807 | /* Setup the mask of cpus configured for isolated domains */ | 4807 | /* Setup the mask of cpus configured for isolated domains */ |
4808 | static int __init isolated_cpu_setup(char *str) | 4808 | static int __init isolated_cpu_setup(char *str) |
@@ -4830,8 +4830,8 @@ __setup ("isolcpus=", isolated_cpu_setup); | |||
4830 | * covered by the given span, and will set each group's ->cpumask correctly, | 4830 | * covered by the given span, and will set each group's ->cpumask correctly, |
4831 | * and ->cpu_power to 0. | 4831 | * and ->cpu_power to 0. |
4832 | */ | 4832 | */ |
4833 | void init_sched_build_groups(struct sched_group groups[], | 4833 | static void init_sched_build_groups(struct sched_group groups[], cpumask_t span, |
4834 | cpumask_t span, int (*group_fn)(int cpu)) | 4834 | int (*group_fn)(int cpu)) |
4835 | { | 4835 | { |
4836 | struct sched_group *first = NULL, *last = NULL; | 4836 | struct sched_group *first = NULL, *last = NULL; |
4837 | cpumask_t covered = CPU_MASK_NONE; | 4837 | cpumask_t covered = CPU_MASK_NONE; |
@@ -4864,12 +4864,85 @@ void init_sched_build_groups(struct sched_group groups[], | |||
4864 | last->next = first; | 4864 | last->next = first; |
4865 | } | 4865 | } |
4866 | 4866 | ||
4867 | #define SD_NODES_PER_DOMAIN 16 | ||
4867 | 4868 | ||
4868 | #ifdef ARCH_HAS_SCHED_DOMAIN | 4869 | #ifdef CONFIG_NUMA |
4869 | extern void build_sched_domains(const cpumask_t *cpu_map); | 4870 | /** |
4870 | extern void arch_init_sched_domains(const cpumask_t *cpu_map); | 4871 | * find_next_best_node - find the next node to include in a sched_domain |
4871 | extern void arch_destroy_sched_domains(const cpumask_t *cpu_map); | 4872 | * @node: node whose sched_domain we're building |
4872 | #else | 4873 | * @used_nodes: nodes already in the sched_domain |
4874 | * | ||
4875 | * Find the next node to include in a given scheduling domain. Simply | ||
4876 | * finds the closest node not already in the @used_nodes map. | ||
4877 | * | ||
4878 | * TODO: @used_nodes should be a nodemask_t rather than a raw bitmap. | |
4879 | */ | ||
4880 | static int find_next_best_node(int node, unsigned long *used_nodes) | ||
4881 | { | ||
4882 | int i, n, val, min_val, best_node = 0; | ||
4883 | |||
4884 | min_val = INT_MAX; | ||
4885 | |||
4886 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
4887 | /* Start at @node */ | ||
4888 | n = (node + i) % MAX_NUMNODES; | ||
4889 | |||
4890 | if (!nr_cpus_node(n)) | ||
4891 | continue; | ||
4892 | |||
4893 | /* Skip already used nodes */ | ||
4894 | if (test_bit(n, used_nodes)) | ||
4895 | continue; | ||
4896 | |||
4897 | /* Simple min distance search */ | ||
4898 | val = node_distance(node, n); | ||
4899 | |||
4900 | if (val < min_val) { | ||
4901 | min_val = val; | ||
4902 | best_node = n; | ||
4903 | } | ||
4904 | } | ||
4905 | |||
4906 | set_bit(best_node, used_nodes); | ||
4907 | return best_node; | ||
4908 | } | ||
4909 | |||
4910 | /** | ||
4911 | * sched_domain_node_span - get a cpumask for a node's sched_domain | ||
4912 | * @node: node whose cpumask we're constructing | ||
4913 | * The span covers up to SD_NODES_PER_DOMAIN nodes, starting with @node. | |
4914 | * | ||
4915 | * Given a node, construct a good cpumask for its sched_domain to span. It | ||
4916 | * should be one that prevents unnecessary balancing, but also spreads tasks | ||
4917 | * out optimally. | ||
4918 | */ | ||
4919 | static cpumask_t sched_domain_node_span(int node) | ||
4920 | { | ||
4921 | int i; | ||
4922 | cpumask_t span, nodemask; | ||
4923 | DECLARE_BITMAP(used_nodes, MAX_NUMNODES); | ||
4924 | |||
4925 | cpus_clear(span); | ||
4926 | bitmap_zero(used_nodes, MAX_NUMNODES); | ||
4927 | |||
4928 | nodemask = node_to_cpumask(node); | ||
4929 | cpus_or(span, span, nodemask); | ||
4930 | set_bit(node, used_nodes); | ||
4931 | |||
4932 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { | ||
4933 | int next_node = find_next_best_node(node, used_nodes); | ||
4934 | nodemask = node_to_cpumask(next_node); | ||
4935 | cpus_or(span, span, nodemask); | ||
4936 | } | ||
4937 | |||
4938 | return span; | ||
4939 | } | ||
4940 | #endif | ||
4941 | |||
4942 | /* | ||
4943 | * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we | ||
4944 | * can switch it on easily if needed. | ||
4945 | */ | ||
4873 | #ifdef CONFIG_SCHED_SMT | 4946 | #ifdef CONFIG_SCHED_SMT |
4874 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); | 4947 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); |
4875 | static struct sched_group sched_group_cpus[NR_CPUS]; | 4948 | static struct sched_group sched_group_cpus[NR_CPUS]; |
@@ -4891,36 +4964,20 @@ static int cpu_to_phys_group(int cpu) | |||
4891 | } | 4964 | } |
4892 | 4965 | ||
4893 | #ifdef CONFIG_NUMA | 4966 | #ifdef CONFIG_NUMA |
4894 | |||
4895 | static DEFINE_PER_CPU(struct sched_domain, node_domains); | ||
4896 | static struct sched_group sched_group_nodes[MAX_NUMNODES]; | ||
4897 | static int cpu_to_node_group(int cpu) | ||
4898 | { | ||
4899 | return cpu_to_node(cpu); | ||
4900 | } | ||
4901 | #endif | ||
4902 | |||
4903 | #if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) | ||
4904 | /* | 4967 | /* |
4905 | * The domains setup code relies on siblings not spanning | 4968 | * The init_sched_build_groups can't handle what we want to do with node |
4906 | * multiple nodes. Make sure the architecture has a proper | 4969 | * groups, so roll our own. Now each node has its own list of groups which |
4907 | * siblings map: | 4970 | * gets dynamically allocated. |
4908 | */ | 4971 | */ |
4909 | static void check_sibling_maps(void) | 4972 | static DEFINE_PER_CPU(struct sched_domain, node_domains); |
4910 | { | 4973 | static struct sched_group *sched_group_nodes[MAX_NUMNODES]; |
4911 | int i, j; | ||
4912 | 4974 | ||
4913 | for_each_online_cpu(i) { | 4975 | static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); |
4914 | for_each_cpu_mask(j, cpu_sibling_map[i]) { | 4976 | static struct sched_group sched_group_allnodes[MAX_NUMNODES]; |
4915 | if (cpu_to_node(i) != cpu_to_node(j)) { | 4977 | |
4916 | printk(KERN_INFO "warning: CPU %d siblings map " | 4978 | static int cpu_to_allnodes_group(int cpu) |
4917 | "to different node - isolating " | 4979 | { |
4918 | "them.\n", i); | 4980 | return cpu_to_node(cpu); |
4919 | cpu_sibling_map[i] = cpumask_of_cpu(i); | ||
4920 | break; | ||
4921 | } | ||
4922 | } | ||
4923 | } | ||
4924 | } | 4981 | } |
4925 | #endif | 4982 | #endif |
4926 | 4983 | ||
@@ -4928,7 +4985,7 @@ static void check_sibling_maps(void) | |||
4928 | * Build sched domains for a given set of cpus and attach the sched domains | 4985 | * Build sched domains for a given set of cpus and attach the sched domains |
4929 | * to the individual cpus | 4986 | * to the individual cpus |
4930 | */ | 4987 | */ |
4931 | static void build_sched_domains(const cpumask_t *cpu_map) | 4988 | void build_sched_domains(const cpumask_t *cpu_map) |
4932 | { | 4989 | { |
4933 | int i; | 4990 | int i; |
4934 | 4991 | ||
@@ -4943,11 +5000,22 @@ static void build_sched_domains(const cpumask_t *cpu_map) | |||
4943 | cpus_and(nodemask, nodemask, *cpu_map); | 5000 | cpus_and(nodemask, nodemask, *cpu_map); |
4944 | 5001 | ||
4945 | #ifdef CONFIG_NUMA | 5002 | #ifdef CONFIG_NUMA |
5003 | if (num_online_cpus() | ||
5004 | > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { | ||
5005 | sd = &per_cpu(allnodes_domains, i); | ||
5006 | *sd = SD_ALLNODES_INIT; | ||
5007 | sd->span = *cpu_map; | ||
5008 | group = cpu_to_allnodes_group(i); | ||
5009 | sd->groups = &sched_group_allnodes[group]; | ||
5010 | p = sd; | ||
5011 | } else | ||
5012 | p = NULL; | ||
5013 | |||
4946 | sd = &per_cpu(node_domains, i); | 5014 | sd = &per_cpu(node_domains, i); |
4947 | group = cpu_to_node_group(i); | ||
4948 | *sd = SD_NODE_INIT; | 5015 | *sd = SD_NODE_INIT; |
4949 | sd->span = *cpu_map; | 5016 | sd->span = sched_domain_node_span(cpu_to_node(i)); |
4950 | sd->groups = &sched_group_nodes[group]; | 5017 | sd->parent = p; |
5018 | cpus_and(sd->span, sd->span, *cpu_map); | ||
4951 | #endif | 5019 | #endif |
4952 | 5020 | ||
4953 | p = sd; | 5021 | p = sd; |
@@ -4972,7 +5040,7 @@ static void build_sched_domains(const cpumask_t *cpu_map) | |||
4972 | 5040 | ||
4973 | #ifdef CONFIG_SCHED_SMT | 5041 | #ifdef CONFIG_SCHED_SMT |
4974 | /* Set up CPU (sibling) groups */ | 5042 | /* Set up CPU (sibling) groups */ |
4975 | for_each_online_cpu(i) { | 5043 | for_each_cpu_mask(i, *cpu_map) { |
4976 | cpumask_t this_sibling_map = cpu_sibling_map[i]; | 5044 | cpumask_t this_sibling_map = cpu_sibling_map[i]; |
4977 | cpus_and(this_sibling_map, this_sibling_map, *cpu_map); | 5045 | cpus_and(this_sibling_map, this_sibling_map, *cpu_map); |
4978 | if (i != first_cpu(this_sibling_map)) | 5046 | if (i != first_cpu(this_sibling_map)) |
@@ -4997,8 +5065,74 @@ static void build_sched_domains(const cpumask_t *cpu_map) | |||
4997 | 5065 | ||
4998 | #ifdef CONFIG_NUMA | 5066 | #ifdef CONFIG_NUMA |
4999 | /* Set up node groups */ | 5067 | /* Set up node groups */ |
5000 | init_sched_build_groups(sched_group_nodes, *cpu_map, | 5068 | init_sched_build_groups(sched_group_allnodes, *cpu_map, |
5001 | &cpu_to_node_group); | 5069 | &cpu_to_allnodes_group); |
5070 | |||
5071 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
5072 | /* Set up node groups */ | ||
5073 | struct sched_group *sg, *prev; | ||
5074 | cpumask_t nodemask = node_to_cpumask(i); | ||
5075 | cpumask_t domainspan; | ||
5076 | cpumask_t covered = CPU_MASK_NONE; | ||
5077 | int j; | ||
5078 | |||
5079 | cpus_and(nodemask, nodemask, *cpu_map); | ||
5080 | if (cpus_empty(nodemask)) | ||
5081 | continue; | ||
5082 | |||
5083 | domainspan = sched_domain_node_span(i); | ||
5084 | cpus_and(domainspan, domainspan, *cpu_map); | ||
5085 | |||
5086 | sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); | ||
5087 | sched_group_nodes[i] = sg; | ||
5088 | for_each_cpu_mask(j, nodemask) { | ||
5089 | struct sched_domain *sd; | ||
5090 | sd = &per_cpu(node_domains, j); | ||
5091 | sd->groups = sg; | ||
5092 | if (sd->groups == NULL) { | ||
5093 | /* Turn off balancing if we have no groups */ | ||
5094 | sd->flags = 0; | ||
5095 | } | ||
5096 | } | ||
5097 | if (!sg) { | ||
5098 | printk(KERN_WARNING | ||
5099 | "Can not alloc domain group for node %d\n", i); | ||
5100 | continue; | ||
5101 | } | ||
5102 | sg->cpu_power = 0; | ||
5103 | sg->cpumask = nodemask; | ||
5104 | cpus_or(covered, covered, nodemask); | ||
5105 | prev = sg; | ||
5106 | |||
5107 | for (j = 0; j < MAX_NUMNODES; j++) { | ||
5108 | cpumask_t tmp, notcovered; | ||
5109 | int n = (i + j) % MAX_NUMNODES; | ||
5110 | |||
5111 | cpus_complement(notcovered, covered); | ||
5112 | cpus_and(tmp, notcovered, *cpu_map); | ||
5113 | cpus_and(tmp, tmp, domainspan); | ||
5114 | if (cpus_empty(tmp)) | ||
5115 | break; | ||
5116 | |||
5117 | nodemask = node_to_cpumask(n); | ||
5118 | cpus_and(tmp, tmp, nodemask); | ||
5119 | if (cpus_empty(tmp)) | ||
5120 | continue; | ||
5121 | |||
5122 | sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); | ||
5123 | if (!sg) { | ||
5124 | printk(KERN_WARNING | ||
5125 | "Can not alloc domain group for node %d\n", j); | ||
5126 | break; | ||
5127 | } | ||
5128 | sg->cpu_power = 0; | ||
5129 | sg->cpumask = tmp; | ||
5130 | cpus_or(covered, covered, tmp); | ||
5131 | prev->next = sg; | ||
5132 | prev = sg; | ||
5133 | } | ||
5134 | prev->next = sched_group_nodes[i]; | ||
5135 | } | ||
5002 | #endif | 5136 | #endif |
5003 | 5137 | ||
5004 | /* Calculate CPU power for physical packages and nodes */ | 5138 | /* Calculate CPU power for physical packages and nodes */ |
@@ -5017,14 +5151,46 @@ static void build_sched_domains(const cpumask_t *cpu_map) | |||
5017 | sd->groups->cpu_power = power; | 5151 | sd->groups->cpu_power = power; |
5018 | 5152 | ||
5019 | #ifdef CONFIG_NUMA | 5153 | #ifdef CONFIG_NUMA |
5020 | if (i == first_cpu(sd->groups->cpumask)) { | 5154 | sd = &per_cpu(allnodes_domains, i); |
5021 | /* Only add "power" once for each physical package. */ | 5155 | if (sd->groups) { |
5022 | sd = &per_cpu(node_domains, i); | 5156 | power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * |
5023 | sd->groups->cpu_power += power; | 5157 | (cpus_weight(sd->groups->cpumask)-1) / 10; |
5158 | sd->groups->cpu_power = power; | ||
5024 | } | 5159 | } |
5025 | #endif | 5160 | #endif |
5026 | } | 5161 | } |
5027 | 5162 | ||
5163 | #ifdef CONFIG_NUMA | ||
5164 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
5165 | struct sched_group *sg = sched_group_nodes[i]; | ||
5166 | int j; | ||
5167 | |||
5168 | if (sg == NULL) | ||
5169 | continue; | ||
5170 | next_sg: | ||
5171 | for_each_cpu_mask(j, sg->cpumask) { | ||
5172 | struct sched_domain *sd; | ||
5173 | int power; | ||
5174 | |||
5175 | sd = &per_cpu(phys_domains, j); | ||
5176 | if (j != first_cpu(sd->groups->cpumask)) { | ||
5177 | /* | ||
5178 | * Only add "power" once for each | ||
5179 | * physical package. | ||
5180 | */ | ||
5181 | continue; | ||
5182 | } | ||
5183 | power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * | ||
5184 | (cpus_weight(sd->groups->cpumask)-1) / 10; | ||
5185 | |||
5186 | sg->cpu_power += power; | ||
5187 | } | ||
5188 | sg = sg->next; | ||
5189 | if (sg != sched_group_nodes[i]) | ||
5190 | goto next_sg; | ||
5191 | } | ||
5192 | #endif | ||
5193 | |||
5028 | /* Attach the domains */ | 5194 | /* Attach the domains */ |
5029 | for_each_cpu_mask(i, *cpu_map) { | 5195 | for_each_cpu_mask(i, *cpu_map) { |
5030 | struct sched_domain *sd; | 5196 | struct sched_domain *sd; |
@@ -5039,13 +5205,10 @@ static void build_sched_domains(const cpumask_t *cpu_map) | |||
5039 | /* | 5205 | /* |
5040 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | 5206 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. |
5041 | */ | 5207 | */ |
5042 | static void arch_init_sched_domains(cpumask_t *cpu_map) | 5208 | static void arch_init_sched_domains(const cpumask_t *cpu_map) |
5043 | { | 5209 | { |
5044 | cpumask_t cpu_default_map; | 5210 | cpumask_t cpu_default_map; |
5045 | 5211 | ||
5046 | #if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) | ||
5047 | check_sibling_maps(); | ||
5048 | #endif | ||
5049 | /* | 5212 | /* |
5050 | * Setup mask for cpus without special case scheduling requirements. | 5213 | * Setup mask for cpus without special case scheduling requirements. |
5051 | * For now this just excludes isolated cpus, but could be used to | 5214 | * For now this just excludes isolated cpus, but could be used to |
@@ -5058,10 +5221,29 @@ static void arch_init_sched_domains(cpumask_t *cpu_map) | |||
5058 | 5221 | ||
5059 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) | 5222 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) |
5060 | { | 5223 | { |
5061 | /* Do nothing: everything is statically allocated. */ | 5224 | #ifdef CONFIG_NUMA |
5062 | } | 5225 | int i; |
5226 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
5227 | cpumask_t nodemask = node_to_cpumask(i); | ||
5228 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | ||
5063 | 5229 | ||
5064 | #endif /* ARCH_HAS_SCHED_DOMAIN */ | 5230 | cpus_and(nodemask, nodemask, *cpu_map); |
5231 | if (cpus_empty(nodemask)) | ||
5232 | continue; | ||
5233 | |||
5234 | if (sg == NULL) | ||
5235 | continue; | ||
5236 | sg = sg->next; | ||
5237 | next_sg: | ||
5238 | oldsg = sg; | ||
5239 | sg = sg->next; | ||
5240 | kfree(oldsg); | ||
5241 | if (oldsg != sched_group_nodes[i]) | ||
5242 | goto next_sg; | ||
5243 | sched_group_nodes[i] = NULL; | ||
5244 | } | ||
5245 | #endif | ||
5246 | } | ||
5065 | 5247 | ||
5066 | /* | 5248 | /* |
5067 | * Detach sched domains from a group of cpus specified in cpu_map | 5249 | * Detach sched domains from a group of cpus specified in cpu_map |