Diffstat (limited to 'kernel/sched/topology.c')
-rw-r--r-- | kernel/sched/topology.c | 1658 |
1 file changed, 1658 insertions, 0 deletions
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
new file mode 100644
index 000000000000..1b0b4fb12837
--- /dev/null
+++ b/kernel/sched/topology.c
@@ -0,0 +1,1658 @@ | |||
1 | /* | ||
2 | * Scheduler topology setup/handling methods | ||
3 | */ | ||
4 | #include <linux/sched.h> | ||
5 | #include <linux/mutex.h> | ||
6 | |||
7 | #include "sched.h" | ||
8 | |||
9 | DEFINE_MUTEX(sched_domains_mutex); | ||
10 | |||
11 | /* Protected by sched_domains_mutex: */ | ||
12 | cpumask_var_t sched_domains_tmpmask; | ||
13 | |||
14 | #ifdef CONFIG_SCHED_DEBUG | ||
15 | |||
16 | static __read_mostly int sched_debug_enabled; | ||
17 | |||
18 | static int __init sched_debug_setup(char *str) | ||
19 | { | ||
20 | sched_debug_enabled = 1; | ||
21 | |||
22 | return 0; | ||
23 | } | ||
24 | early_param("sched_debug", sched_debug_setup); | ||
25 | |||
26 | static inline bool sched_debug(void) | ||
27 | { | ||
28 | return sched_debug_enabled; | ||
29 | } | ||
30 | |||
31 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | ||
32 | struct cpumask *groupmask) | ||
33 | { | ||
34 | struct sched_group *group = sd->groups; | ||
35 | |||
36 | cpumask_clear(groupmask); | ||
37 | |||
38 | printk(KERN_DEBUG "%*s domain %d: ", level, "", level); | ||
39 | |||
40 | if (!(sd->flags & SD_LOAD_BALANCE)) { | ||
41 | printk("does not load-balance\n"); | ||
42 | if (sd->parent) | ||
43 | printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" | ||
44 | " has parent"); | ||
45 | return -1; | ||
46 | } | ||
47 | |||
48 | printk(KERN_CONT "span %*pbl level %s\n", | ||
49 | cpumask_pr_args(sched_domain_span(sd)), sd->name); | ||
50 | |||
51 | if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { | ||
52 | printk(KERN_ERR "ERROR: domain->span does not contain " | ||
53 | "CPU%d\n", cpu); | ||
54 | } | ||
55 | if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { | ||
56 | printk(KERN_ERR "ERROR: domain->groups does not contain" | ||
57 | " CPU%d\n", cpu); | ||
58 | } | ||
59 | |||
60 | printk(KERN_DEBUG "%*s groups:", level + 1, ""); | ||
61 | do { | ||
62 | if (!group) { | ||
63 | printk("\n"); | ||
64 | printk(KERN_ERR "ERROR: group is NULL\n"); | ||
65 | break; | ||
66 | } | ||
67 | |||
68 | if (!cpumask_weight(sched_group_cpus(group))) { | ||
69 | printk(KERN_CONT "\n"); | ||
70 | printk(KERN_ERR "ERROR: empty group\n"); | ||
71 | break; | ||
72 | } | ||
73 | |||
74 | if (!(sd->flags & SD_OVERLAP) && | ||
75 | cpumask_intersects(groupmask, sched_group_cpus(group))) { | ||
76 | printk(KERN_CONT "\n"); | ||
77 | printk(KERN_ERR "ERROR: repeated CPUs\n"); | ||
78 | break; | ||
79 | } | ||
80 | |||
81 | cpumask_or(groupmask, groupmask, sched_group_cpus(group)); | ||
82 | |||
83 | printk(KERN_CONT " %*pbl", | ||
84 | cpumask_pr_args(sched_group_cpus(group))); | ||
85 | if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { | ||
86 | printk(KERN_CONT " (cpu_capacity = %lu)", | ||
87 | group->sgc->capacity); | ||
88 | } | ||
89 | |||
90 | group = group->next; | ||
91 | } while (group != sd->groups); | ||
92 | printk(KERN_CONT "\n"); | ||
93 | |||
94 | if (!cpumask_equal(sched_domain_span(sd), groupmask)) | ||
95 | printk(KERN_ERR "ERROR: groups don't span domain->span\n"); | ||
96 | |||
97 | if (sd->parent && | ||
98 | !cpumask_subset(groupmask, sched_domain_span(sd->parent))) | ||
99 | printk(KERN_ERR "ERROR: parent span is not a superset " | ||
100 | "of domain->span\n"); | ||
101 | return 0; | ||
102 | } | ||
103 | |||
104 | static void sched_domain_debug(struct sched_domain *sd, int cpu) | ||
105 | { | ||
106 | int level = 0; | ||
107 | |||
108 | if (!sched_debug_enabled) | ||
109 | return; | ||
110 | |||
111 | if (!sd) { | ||
112 | printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); | ||
113 | return; | ||
114 | } | ||
115 | |||
116 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); | ||
117 | |||
118 | for (;;) { | ||
119 | if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) | ||
120 | break; | ||
121 | level++; | ||
122 | sd = sd->parent; | ||
123 | if (!sd) | ||
124 | break; | ||
125 | } | ||
126 | } | ||
127 | #else /* !CONFIG_SCHED_DEBUG */ | ||
128 | |||
129 | # define sched_debug_enabled 0 | ||
130 | # define sched_domain_debug(sd, cpu) do { } while (0) | ||
131 | static inline bool sched_debug(void) | ||
132 | { | ||
133 | return false; | ||
134 | } | ||
135 | #endif /* CONFIG_SCHED_DEBUG */ | ||
136 | |||
137 | static int sd_degenerate(struct sched_domain *sd) | ||
138 | { | ||
139 | if (cpumask_weight(sched_domain_span(sd)) == 1) | ||
140 | return 1; | ||
141 | |||
142 | /* Following flags need at least 2 groups */ | ||
143 | if (sd->flags & (SD_LOAD_BALANCE | | ||
144 | SD_BALANCE_NEWIDLE | | ||
145 | SD_BALANCE_FORK | | ||
146 | SD_BALANCE_EXEC | | ||
147 | SD_SHARE_CPUCAPACITY | | ||
148 | SD_ASYM_CPUCAPACITY | | ||
149 | SD_SHARE_PKG_RESOURCES | | ||
150 | SD_SHARE_POWERDOMAIN)) { | ||
151 | if (sd->groups != sd->groups->next) | ||
152 | return 0; | ||
153 | } | ||
154 | |||
155 | /* Following flags don't use groups */ | ||
156 | if (sd->flags & (SD_WAKE_AFFINE)) | ||
157 | return 0; | ||
158 | |||
159 | return 1; | ||
160 | } | ||
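/*
 * To make the check above concrete: a domain is pointless when it spans a
 * single CPU, or when none of its remaining flags can do anything useful
 * with a single group. A minimal user-space sketch of the same predicate
 * (the simplified struct and the flag values are hypothetical, not the
 * kernel's):
 *
 *	#include <stdbool.h>
 *
 *	#define F_LOAD_BALANCE	0x1	// needs at least two groups
 *	#define F_WAKE_AFFINE	0x2	// useful even with one group
 *
 *	struct dom { int nr_cpus; int nr_groups; unsigned int flags; };
 *
 *	static bool dom_degenerate(const struct dom *d)
 *	{
 *		if (d->nr_cpus == 1)
 *			return true;		// spans one CPU only
 *		if ((d->flags & F_LOAD_BALANCE) && d->nr_groups > 1)
 *			return false;		// balancing has work to do
 *		if (d->flags & F_WAKE_AFFINE)
 *			return false;		// works without groups
 *		return true;
 *	}
 */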
161 | |||
162 | static int | ||
163 | sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | ||
164 | { | ||
165 | unsigned long cflags = sd->flags, pflags = parent->flags; | ||
166 | |||
167 | if (sd_degenerate(parent)) | ||
168 | return 1; | ||
169 | |||
170 | if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) | ||
171 | return 0; | ||
172 | |||
173 | /* Flags needing groups don't count if only 1 group in parent */ | ||
174 | if (parent->groups == parent->groups->next) { | ||
175 | pflags &= ~(SD_LOAD_BALANCE | | ||
176 | SD_BALANCE_NEWIDLE | | ||
177 | SD_BALANCE_FORK | | ||
178 | SD_BALANCE_EXEC | | ||
179 | SD_ASYM_CPUCAPACITY | | ||
180 | SD_SHARE_CPUCAPACITY | | ||
181 | SD_SHARE_PKG_RESOURCES | | ||
182 | SD_PREFER_SIBLING | | ||
183 | SD_SHARE_POWERDOMAIN); | ||
184 | if (nr_node_ids == 1) | ||
185 | pflags &= ~SD_SERIALIZE; | ||
186 | } | ||
187 | if (~cflags & pflags) | ||
188 | return 0; | ||
189 | |||
190 | return 1; | ||
191 | } | ||
192 | |||
193 | static void free_rootdomain(struct rcu_head *rcu) | ||
194 | { | ||
195 | struct root_domain *rd = container_of(rcu, struct root_domain, rcu); | ||
196 | |||
197 | cpupri_cleanup(&rd->cpupri); | ||
198 | cpudl_cleanup(&rd->cpudl); | ||
199 | free_cpumask_var(rd->dlo_mask); | ||
200 | free_cpumask_var(rd->rto_mask); | ||
201 | free_cpumask_var(rd->online); | ||
202 | free_cpumask_var(rd->span); | ||
203 | kfree(rd); | ||
204 | } | ||
205 | |||
206 | void rq_attach_root(struct rq *rq, struct root_domain *rd) | ||
207 | { | ||
208 | struct root_domain *old_rd = NULL; | ||
209 | unsigned long flags; | ||
210 | |||
211 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
212 | |||
213 | if (rq->rd) { | ||
214 | old_rd = rq->rd; | ||
215 | |||
216 | if (cpumask_test_cpu(rq->cpu, old_rd->online)) | ||
217 | set_rq_offline(rq); | ||
218 | |||
219 | cpumask_clear_cpu(rq->cpu, old_rd->span); | ||
220 | |||
221 | /* | ||
222 | * If we don't want to free the old_rd yet then | ||
223 | * set old_rd to NULL to skip the freeing later | ||
224 | * in this function: | ||
225 | */ | ||
226 | if (!atomic_dec_and_test(&old_rd->refcount)) | ||
227 | old_rd = NULL; | ||
228 | } | ||
229 | |||
230 | atomic_inc(&rd->refcount); | ||
231 | rq->rd = rd; | ||
232 | |||
233 | cpumask_set_cpu(rq->cpu, rd->span); | ||
234 | if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) | ||
235 | set_rq_online(rq); | ||
236 | |||
237 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
238 | |||
239 | if (old_rd) | ||
240 | call_rcu_sched(&old_rd->rcu, free_rootdomain); | ||
241 | } | ||
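/*
 * The root domain switch above follows a reference-counting pattern: the
 * new rd is pinned before the runqueue starts using it, and the old rd is
 * only freed once its last runqueue has let go (and, in the kernel, only
 * after an RCU-sched grace period). A minimal sketch of the refcount half
 * using C11 atomics instead of atomic_t (names are hypothetical):
 *
 *	#include <stdatomic.h>
 *	#include <stdlib.h>
 *
 *	struct rdom { atomic_int refcount; };
 *
 *	static void rdom_get(struct rdom *rd)
 *	{
 *		atomic_fetch_add(&rd->refcount, 1);
 *	}
 *
 *	static void rdom_put(struct rdom *rd)
 *	{
 *		// last reference dropped; the kernel would defer the
 *		// actual kfree() until after a grace period
 *		if (atomic_fetch_sub(&rd->refcount, 1) == 1)
 *			free(rd);
 *	}
 */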
242 | |||
243 | static int init_rootdomain(struct root_domain *rd) | ||
244 | { | ||
245 | memset(rd, 0, sizeof(*rd)); | ||
246 | |||
247 | if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) | ||
248 | goto out; | ||
249 | if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL)) | ||
250 | goto free_span; | ||
251 | if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) | ||
252 | goto free_online; | ||
253 | if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) | ||
254 | goto free_dlo_mask; | ||
255 | |||
256 | init_dl_bw(&rd->dl_bw); | ||
257 | if (cpudl_init(&rd->cpudl) != 0) | ||
258 | goto free_rto_mask; | ||
259 | |||
260 | if (cpupri_init(&rd->cpupri) != 0) | ||
261 | goto free_cpudl; | ||
262 | return 0; | ||
263 | |||
264 | free_cpudl: | ||
265 | cpudl_cleanup(&rd->cpudl); | ||
266 | free_rto_mask: | ||
267 | free_cpumask_var(rd->rto_mask); | ||
268 | free_dlo_mask: | ||
269 | free_cpumask_var(rd->dlo_mask); | ||
270 | free_online: | ||
271 | free_cpumask_var(rd->online); | ||
272 | free_span: | ||
273 | free_cpumask_var(rd->span); | ||
274 | out: | ||
275 | return -ENOMEM; | ||
276 | } | ||
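/*
 * init_rootdomain() uses the usual kernel error-unwinding idiom: each
 * allocation that fails jumps to a label that frees everything set up
 * before it, in reverse order. The same shape in a standalone sketch
 * (plain malloc() standing in for the cpumask/cpudl/cpupri helpers):
 *
 *	#include <errno.h>
 *	#include <stdlib.h>
 *
 *	static int init_pair(void **a, void **b)
 *	{
 *		*a = malloc(64);
 *		if (!*a)
 *			goto out;
 *		*b = malloc(64);
 *		if (!*b)
 *			goto free_a;
 *		return 0;
 *
 *	free_a:
 *		free(*a);
 *	out:
 *		return -ENOMEM;
 *	}
 */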
277 | |||
278 | /* | ||
279 | * By default the system creates a single root-domain with all CPUs as | ||
280 | * members (mimicking the global state we have today). | ||
281 | */ | ||
282 | struct root_domain def_root_domain; | ||
283 | |||
284 | void init_defrootdomain(void) | ||
285 | { | ||
286 | init_rootdomain(&def_root_domain); | ||
287 | |||
288 | atomic_set(&def_root_domain.refcount, 1); | ||
289 | } | ||
290 | |||
291 | static struct root_domain *alloc_rootdomain(void) | ||
292 | { | ||
293 | struct root_domain *rd; | ||
294 | |||
295 | rd = kmalloc(sizeof(*rd), GFP_KERNEL); | ||
296 | if (!rd) | ||
297 | return NULL; | ||
298 | |||
299 | if (init_rootdomain(rd) != 0) { | ||
300 | kfree(rd); | ||
301 | return NULL; | ||
302 | } | ||
303 | |||
304 | return rd; | ||
305 | } | ||
306 | |||
307 | static void free_sched_groups(struct sched_group *sg, int free_sgc) | ||
308 | { | ||
309 | struct sched_group *tmp, *first; | ||
310 | |||
311 | if (!sg) | ||
312 | return; | ||
313 | |||
314 | first = sg; | ||
315 | do { | ||
316 | tmp = sg->next; | ||
317 | |||
318 | if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) | ||
319 | kfree(sg->sgc); | ||
320 | |||
321 | kfree(sg); | ||
322 | sg = tmp; | ||
323 | } while (sg != first); | ||
324 | } | ||
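/*
 * sched_group lists are circular and singly linked, which is why the loop
 * above saves ->next before freeing and stops once it wraps back to the
 * first element. The same walk over a plain circular list:
 *
 *	#include <stdlib.h>
 *
 *	struct node { struct node *next; };
 *
 *	static void free_ring(struct node *n)
 *	{
 *		struct node *first = n, *tmp;
 *
 *		if (!n)
 *			return;
 *		do {
 *			tmp = n->next;	// grab ->next before n is gone
 *			free(n);
 *			n = tmp;
 *		} while (n != first);
 *	}
 */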
325 | |||
326 | static void destroy_sched_domain(struct sched_domain *sd) | ||
327 | { | ||
328 | /* | ||
329 | * If it's an overlapping domain it has private groups; iterate and | ||
330 | * nuke them all. | ||
331 | */ | ||
332 | if (sd->flags & SD_OVERLAP) { | ||
333 | free_sched_groups(sd->groups, 1); | ||
334 | } else if (atomic_dec_and_test(&sd->groups->ref)) { | ||
335 | kfree(sd->groups->sgc); | ||
336 | kfree(sd->groups); | ||
337 | } | ||
338 | if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) | ||
339 | kfree(sd->shared); | ||
340 | kfree(sd); | ||
341 | } | ||
342 | |||
343 | static void destroy_sched_domains_rcu(struct rcu_head *rcu) | ||
344 | { | ||
345 | struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); | ||
346 | |||
347 | while (sd) { | ||
348 | struct sched_domain *parent = sd->parent; | ||
349 | destroy_sched_domain(sd); | ||
350 | sd = parent; | ||
351 | } | ||
352 | } | ||
353 | |||
354 | static void destroy_sched_domains(struct sched_domain *sd) | ||
355 | { | ||
356 | if (sd) | ||
357 | call_rcu(&sd->rcu, destroy_sched_domains_rcu); | ||
358 | } | ||
359 | |||
360 | /* | ||
361 | * Keep a special pointer to the highest sched_domain that has | ||
362 | * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this | ||
363 | * allows us to avoid some pointer chasing in select_idle_sibling(). | ||
364 | * | ||
365 | * Also keep a unique ID per domain (we use the first CPU number in | ||
366 | * the cpumask of the domain), this allows us to quickly tell if | ||
367 | * two CPUs are in the same cache domain, see cpus_share_cache(). | ||
368 | */ | ||
369 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); | ||
370 | DEFINE_PER_CPU(int, sd_llc_size); | ||
371 | DEFINE_PER_CPU(int, sd_llc_id); | ||
372 | DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); | ||
373 | DEFINE_PER_CPU(struct sched_domain *, sd_numa); | ||
374 | DEFINE_PER_CPU(struct sched_domain *, sd_asym); | ||
375 | |||
376 | static void update_top_cache_domain(int cpu) | ||
377 | { | ||
378 | struct sched_domain_shared *sds = NULL; | ||
379 | struct sched_domain *sd; | ||
380 | int id = cpu; | ||
381 | int size = 1; | ||
382 | |||
383 | sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); | ||
384 | if (sd) { | ||
385 | id = cpumask_first(sched_domain_span(sd)); | ||
386 | size = cpumask_weight(sched_domain_span(sd)); | ||
387 | sds = sd->shared; | ||
388 | } | ||
389 | |||
390 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); | ||
391 | per_cpu(sd_llc_size, cpu) = size; | ||
392 | per_cpu(sd_llc_id, cpu) = id; | ||
393 | rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); | ||
394 | |||
395 | sd = lowest_flag_domain(cpu, SD_NUMA); | ||
396 | rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); | ||
397 | |||
398 | sd = highest_flag_domain(cpu, SD_ASYM_PACKING); | ||
399 | rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); | ||
400 | } | ||
401 | |||
402 | /* | ||
403 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | ||
404 | * hold the hotplug lock. | ||
405 | */ | ||
406 | static void | ||
407 | cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | ||
408 | { | ||
409 | struct rq *rq = cpu_rq(cpu); | ||
410 | struct sched_domain *tmp; | ||
411 | |||
412 | /* Remove the sched domains which do not contribute to scheduling. */ | ||
413 | for (tmp = sd; tmp; ) { | ||
414 | struct sched_domain *parent = tmp->parent; | ||
415 | if (!parent) | ||
416 | break; | ||
417 | |||
418 | if (sd_parent_degenerate(tmp, parent)) { | ||
419 | tmp->parent = parent->parent; | ||
420 | if (parent->parent) | ||
421 | parent->parent->child = tmp; | ||
422 | /* | ||
423 | * Transfer SD_PREFER_SIBLING down in case of a | ||
424 | * degenerate parent; the spans match for this | ||
425 | * so the property transfers. | ||
426 | */ | ||
427 | if (parent->flags & SD_PREFER_SIBLING) | ||
428 | tmp->flags |= SD_PREFER_SIBLING; | ||
429 | destroy_sched_domain(parent); | ||
430 | } else | ||
431 | tmp = tmp->parent; | ||
432 | } | ||
433 | |||
434 | if (sd && sd_degenerate(sd)) { | ||
435 | tmp = sd; | ||
436 | sd = sd->parent; | ||
437 | destroy_sched_domain(tmp); | ||
438 | if (sd) | ||
439 | sd->child = NULL; | ||
440 | } | ||
441 | |||
442 | sched_domain_debug(sd, cpu); | ||
443 | |||
444 | rq_attach_root(rq, rd); | ||
445 | tmp = rq->sd; | ||
446 | rcu_assign_pointer(rq->sd, sd); | ||
447 | destroy_sched_domains(tmp); | ||
448 | |||
449 | update_top_cache_domain(cpu); | ||
450 | } | ||
451 | |||
452 | /* Set up the mask of CPUs configured for isolated domains */ | ||
453 | static int __init isolated_cpu_setup(char *str) | ||
454 | { | ||
455 | int ret; | ||
456 | |||
457 | alloc_bootmem_cpumask_var(&cpu_isolated_map); | ||
458 | ret = cpulist_parse(str, cpu_isolated_map); | ||
459 | if (ret) { | ||
460 | pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); | ||
461 | return 0; | ||
462 | } | ||
463 | return 1; | ||
464 | } | ||
465 | __setup("isolcpus=", isolated_cpu_setup); | ||
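/*
 * Usage example: booting with "isolcpus=2,4-7" makes cpulist_parse() set
 * CPUs 2 and 4-7 in cpu_isolated_map; init_sched_domains() below then
 * leaves those CPUs out of every sched domain, so the load balancer never
 * touches them and only explicit CPU affinity places tasks there.
 */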
466 | |||
467 | struct s_data { | ||
468 | struct sched_domain ** __percpu sd; | ||
469 | struct root_domain *rd; | ||
470 | }; | ||
471 | |||
472 | enum s_alloc { | ||
473 | sa_rootdomain, | ||
474 | sa_sd, | ||
475 | sa_sd_storage, | ||
476 | sa_none, | ||
477 | }; | ||
478 | |||
479 | /* | ||
480 | * Build an iteration mask that can exclude certain CPUs from the upwards | ||
481 | * domain traversal. | ||
482 | * | ||
483 | * Asymmetric node setups can result in situations where the domain tree is of | ||
484 | * unequal depth; make sure to skip domains that already cover the entire | ||
485 | * range. | ||
486 | * | ||
487 | * In that case build_sched_domains() will have terminated the iteration early | ||
488 | * and our sibling sd spans will be empty. Domains should always include the | ||
489 | * CPU they're built on, so check that. | ||
490 | */ | ||
491 | static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) | ||
492 | { | ||
493 | const struct cpumask *span = sched_domain_span(sd); | ||
494 | struct sd_data *sdd = sd->private; | ||
495 | struct sched_domain *sibling; | ||
496 | int i; | ||
497 | |||
498 | for_each_cpu(i, span) { | ||
499 | sibling = *per_cpu_ptr(sdd->sd, i); | ||
500 | if (!cpumask_test_cpu(i, sched_domain_span(sibling))) | ||
501 | continue; | ||
502 | |||
503 | cpumask_set_cpu(i, sched_group_mask(sg)); | ||
504 | } | ||
505 | } | ||
506 | |||
507 | /* | ||
508 | * Return the canonical balance CPU for this group, this is the first CPU | ||
509 | * of this group that's also in the iteration mask. | ||
510 | */ | ||
511 | int group_balance_cpu(struct sched_group *sg) | ||
512 | { | ||
513 | return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); | ||
514 | } | ||
515 | |||
516 | static int | ||
517 | build_overlap_sched_groups(struct sched_domain *sd, int cpu) | ||
518 | { | ||
519 | struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; | ||
520 | const struct cpumask *span = sched_domain_span(sd); | ||
521 | struct cpumask *covered = sched_domains_tmpmask; | ||
522 | struct sd_data *sdd = sd->private; | ||
523 | struct sched_domain *sibling; | ||
524 | int i; | ||
525 | |||
526 | cpumask_clear(covered); | ||
527 | |||
528 | for_each_cpu(i, span) { | ||
529 | struct cpumask *sg_span; | ||
530 | |||
531 | if (cpumask_test_cpu(i, covered)) | ||
532 | continue; | ||
533 | |||
534 | sibling = *per_cpu_ptr(sdd->sd, i); | ||
535 | |||
536 | /* See the comment near build_group_mask(). */ | ||
537 | if (!cpumask_test_cpu(i, sched_domain_span(sibling))) | ||
538 | continue; | ||
539 | |||
540 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
541 | GFP_KERNEL, cpu_to_node(cpu)); | ||
542 | |||
543 | if (!sg) | ||
544 | goto fail; | ||
545 | |||
546 | sg_span = sched_group_cpus(sg); | ||
547 | if (sibling->child) | ||
548 | cpumask_copy(sg_span, sched_domain_span(sibling->child)); | ||
549 | else | ||
550 | cpumask_set_cpu(i, sg_span); | ||
551 | |||
552 | cpumask_or(covered, covered, sg_span); | ||
553 | |||
554 | sg->sgc = *per_cpu_ptr(sdd->sgc, i); | ||
555 | if (atomic_inc_return(&sg->sgc->ref) == 1) | ||
556 | build_group_mask(sd, sg); | ||
557 | |||
558 | /* | ||
559 | * Initialize sgc->capacity such that even if we mess up the | ||
560 | * domains and no possible iteration will get us here, we won't | ||
561 | * die on a /0 trap. | ||
562 | */ | ||
563 | sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); | ||
564 | sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; | ||
565 | |||
566 | /* | ||
567 | * Make sure the first group of this domain contains the | ||
568 | * canonical balance CPU. Otherwise the sched_domain iteration | ||
569 | * breaks. See update_sg_lb_stats(). | ||
570 | */ | ||
571 | if ((!groups && cpumask_test_cpu(cpu, sg_span)) || | ||
572 | group_balance_cpu(sg) == cpu) | ||
573 | groups = sg; | ||
574 | |||
575 | if (!first) | ||
576 | first = sg; | ||
577 | if (last) | ||
578 | last->next = sg; | ||
579 | last = sg; | ||
580 | last->next = first; | ||
581 | } | ||
582 | sd->groups = groups; | ||
583 | |||
584 | return 0; | ||
585 | |||
586 | fail: | ||
587 | free_sched_groups(first, 0); | ||
588 | |||
589 | return -ENOMEM; | ||
590 | } | ||
591 | |||
592 | static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) | ||
593 | { | ||
594 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); | ||
595 | struct sched_domain *child = sd->child; | ||
596 | |||
597 | if (child) | ||
598 | cpu = cpumask_first(sched_domain_span(child)); | ||
599 | |||
600 | if (sg) { | ||
601 | *sg = *per_cpu_ptr(sdd->sg, cpu); | ||
602 | (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); | ||
603 | |||
604 | /* For claim_allocations: */ | ||
605 | atomic_set(&(*sg)->sgc->ref, 1); | ||
606 | } | ||
607 | |||
608 | return cpu; | ||
609 | } | ||
610 | |||
611 | /* | ||
612 | * build_sched_groups will build a circular linked list of the groups | ||
613 | * covered by the given span, set each group's ->cpumask correctly, and | ||
614 | * initialize each group's ->cpu_capacity to 0. | ||
615 | * | ||
616 | * Assumes the sched_domain tree is fully constructed. | ||
617 | */ | ||
618 | static int | ||
619 | build_sched_groups(struct sched_domain *sd, int cpu) | ||
620 | { | ||
621 | struct sched_group *first = NULL, *last = NULL; | ||
622 | struct sd_data *sdd = sd->private; | ||
623 | const struct cpumask *span = sched_domain_span(sd); | ||
624 | struct cpumask *covered; | ||
625 | int i; | ||
626 | |||
627 | get_group(cpu, sdd, &sd->groups); | ||
628 | atomic_inc(&sd->groups->ref); | ||
629 | |||
630 | if (cpu != cpumask_first(span)) | ||
631 | return 0; | ||
632 | |||
633 | lockdep_assert_held(&sched_domains_mutex); | ||
634 | covered = sched_domains_tmpmask; | ||
635 | |||
636 | cpumask_clear(covered); | ||
637 | |||
638 | for_each_cpu(i, span) { | ||
639 | struct sched_group *sg; | ||
640 | int group, j; | ||
641 | |||
642 | if (cpumask_test_cpu(i, covered)) | ||
643 | continue; | ||
644 | |||
645 | group = get_group(i, sdd, &sg); | ||
646 | cpumask_setall(sched_group_mask(sg)); | ||
647 | |||
648 | for_each_cpu(j, span) { | ||
649 | if (get_group(j, sdd, NULL) != group) | ||
650 | continue; | ||
651 | |||
652 | cpumask_set_cpu(j, covered); | ||
653 | cpumask_set_cpu(j, sched_group_cpus(sg)); | ||
654 | } | ||
655 | |||
656 | if (!first) | ||
657 | first = sg; | ||
658 | if (last) | ||
659 | last->next = sg; | ||
660 | last = sg; | ||
661 | } | ||
662 | last->next = first; | ||
663 | |||
664 | return 0; | ||
665 | } | ||
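/*
 * Worked example (hypothetical 4-CPU machine): for an MC domain spanning
 * CPUs 0-3 whose SMT children span {0,1} and {2,3}, get_group() picks the
 * first CPU of each child span (0 and 2) as the group representative, so
 * the loop above links a ring of two groups:
 *
 *	group{0,1} -> group{2,3} -> group{0,1} -> ...
 *
 * Only the first CPU of the span (CPU 0 here) actually builds the ring;
 * the other CPUs return early after taking a reference to the group they
 * were handed by get_group().
 */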
666 | |||
667 | /* | ||
668 | * Initialize sched groups cpu_capacity. | ||
669 | * | ||
670 | * cpu_capacity indicates the capacity of a sched group, which is used while | ||
671 | * distributing the load between different sched groups in a sched domain. | ||
672 | * Typically cpu_capacity for all the groups in a sched domain will be the same | ||
673 | * unless there are asymmetries in the topology. If there are asymmetries, the | ||
674 | * group having more cpu_capacity will pick up more load compared to the | ||
675 | * group having less cpu_capacity. | ||
676 | */ | ||
677 | static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) | ||
678 | { | ||
679 | struct sched_group *sg = sd->groups; | ||
680 | |||
681 | WARN_ON(!sg); | ||
682 | |||
683 | do { | ||
684 | int cpu, max_cpu = -1; | ||
685 | |||
686 | sg->group_weight = cpumask_weight(sched_group_cpus(sg)); | ||
687 | |||
688 | if (!(sd->flags & SD_ASYM_PACKING)) | ||
689 | goto next; | ||
690 | |||
691 | for_each_cpu(cpu, sched_group_cpus(sg)) { | ||
692 | if (max_cpu < 0) | ||
693 | max_cpu = cpu; | ||
694 | else if (sched_asym_prefer(cpu, max_cpu)) | ||
695 | max_cpu = cpu; | ||
696 | } | ||
697 | sg->asym_prefer_cpu = max_cpu; | ||
698 | |||
699 | next: | ||
700 | sg = sg->next; | ||
701 | } while (sg != sd->groups); | ||
702 | |||
703 | if (cpu != group_balance_cpu(sg)) | ||
704 | return; | ||
705 | |||
706 | update_group_capacity(sd, cpu); | ||
707 | } | ||
708 | |||
709 | /* | ||
710 | * Initializers for sched domains. | ||
711 | * Non-inlined to reduce accumulated stack pressure in build_sched_domains(). | ||
712 | */ | ||
713 | |||
714 | static int default_relax_domain_level = -1; | ||
715 | int sched_domain_level_max; | ||
716 | |||
717 | static int __init setup_relax_domain_level(char *str) | ||
718 | { | ||
719 | if (kstrtoint(str, 0, &default_relax_domain_level)) | ||
720 | pr_warn("Unable to set relax_domain_level\n"); | ||
721 | |||
722 | return 1; | ||
723 | } | ||
724 | __setup("relax_domain_level=", setup_relax_domain_level); | ||
725 | |||
726 | static void set_domain_attribute(struct sched_domain *sd, | ||
727 | struct sched_domain_attr *attr) | ||
728 | { | ||
729 | int request; | ||
730 | |||
731 | if (!attr || attr->relax_domain_level < 0) { | ||
732 | if (default_relax_domain_level < 0) | ||
733 | return; | ||
734 | else | ||
735 | request = default_relax_domain_level; | ||
736 | } else | ||
737 | request = attr->relax_domain_level; | ||
738 | if (request < sd->level) { | ||
739 | /* Turn off idle balance on this domain: */ | ||
740 | sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); | ||
741 | } else { | ||
742 | /* Turn on idle balance on this domain: */ | ||
743 | sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); | ||
744 | } | ||
745 | } | ||
746 | |||
747 | static void __sdt_free(const struct cpumask *cpu_map); | ||
748 | static int __sdt_alloc(const struct cpumask *cpu_map); | ||
749 | |||
750 | static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | ||
751 | const struct cpumask *cpu_map) | ||
752 | { | ||
753 | switch (what) { | ||
754 | case sa_rootdomain: | ||
755 | if (!atomic_read(&d->rd->refcount)) | ||
756 | free_rootdomain(&d->rd->rcu); | ||
757 | /* Fall through */ | ||
758 | case sa_sd: | ||
759 | free_percpu(d->sd); | ||
760 | /* Fall through */ | ||
761 | case sa_sd_storage: | ||
762 | __sdt_free(cpu_map); | ||
763 | /* Fall through */ | ||
764 | case sa_none: | ||
765 | break; | ||
766 | } | ||
767 | } | ||
768 | |||
769 | static enum s_alloc | ||
770 | __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) | ||
771 | { | ||
772 | memset(d, 0, sizeof(*d)); | ||
773 | |||
774 | if (__sdt_alloc(cpu_map)) | ||
775 | return sa_sd_storage; | ||
776 | d->sd = alloc_percpu(struct sched_domain *); | ||
777 | if (!d->sd) | ||
778 | return sa_sd_storage; | ||
779 | d->rd = alloc_rootdomain(); | ||
780 | if (!d->rd) | ||
781 | return sa_sd; | ||
782 | return sa_rootdomain; | ||
783 | } | ||
784 | |||
785 | /* | ||
786 | * NULL the sd_data elements we've used to build the sched_domain and | ||
787 | * sched_group structure so that the subsequent __free_domain_allocs() | ||
788 | * will not free the data we're using. | ||
789 | */ | ||
790 | static void claim_allocations(int cpu, struct sched_domain *sd) | ||
791 | { | ||
792 | struct sd_data *sdd = sd->private; | ||
793 | |||
794 | WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); | ||
795 | *per_cpu_ptr(sdd->sd, cpu) = NULL; | ||
796 | |||
797 | if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref)) | ||
798 | *per_cpu_ptr(sdd->sds, cpu) = NULL; | ||
799 | |||
800 | if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) | ||
801 | *per_cpu_ptr(sdd->sg, cpu) = NULL; | ||
802 | |||
803 | if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) | ||
804 | *per_cpu_ptr(sdd->sgc, cpu) = NULL; | ||
805 | } | ||
806 | |||
807 | #ifdef CONFIG_NUMA | ||
808 | static int sched_domains_numa_levels; | ||
809 | enum numa_topology_type sched_numa_topology_type; | ||
810 | static int *sched_domains_numa_distance; | ||
811 | int sched_max_numa_distance; | ||
812 | static struct cpumask ***sched_domains_numa_masks; | ||
813 | static int sched_domains_curr_level; | ||
814 | #endif | ||
815 | |||
816 | /* | ||
817 | * SD_flags allowed in topology descriptions. | ||
818 | * | ||
819 | * These flags are purely descriptive of the topology and do not prescribe | ||
820 | * behaviour. Behaviour is artificial and mapped in the below sd_init() | ||
821 | * function: | ||
822 | * | ||
823 | * SD_SHARE_CPUCAPACITY - describes SMT topologies | ||
824 | * SD_SHARE_PKG_RESOURCES - describes shared caches | ||
825 | * SD_NUMA - describes NUMA topologies | ||
826 | * SD_SHARE_POWERDOMAIN - describes shared power domain | ||
827 | * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies | ||
828 | * | ||
829 | * The odd one out: besides describing the topology, this one also | ||
830 | * prescribes the desired behaviour that goes along with it: | ||
831 | * | ||
832 | * SD_ASYM_PACKING - describes SMT quirks | ||
833 | */ | ||
834 | #define TOPOLOGY_SD_FLAGS \ | ||
835 | (SD_SHARE_CPUCAPACITY | \ | ||
836 | SD_SHARE_PKG_RESOURCES | \ | ||
837 | SD_NUMA | \ | ||
838 | SD_ASYM_PACKING | \ | ||
839 | SD_ASYM_CPUCAPACITY | \ | ||
840 | SD_SHARE_POWERDOMAIN) | ||
841 | |||
842 | static struct sched_domain * | ||
843 | sd_init(struct sched_domain_topology_level *tl, | ||
844 | const struct cpumask *cpu_map, | ||
845 | struct sched_domain *child, int cpu) | ||
846 | { | ||
847 | struct sd_data *sdd = &tl->data; | ||
848 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); | ||
849 | int sd_id, sd_weight, sd_flags = 0; | ||
850 | |||
851 | #ifdef CONFIG_NUMA | ||
852 | /* | ||
853 | * Ugly hack to pass state to sd_numa_mask()... | ||
854 | */ | ||
855 | sched_domains_curr_level = tl->numa_level; | ||
856 | #endif | ||
857 | |||
858 | sd_weight = cpumask_weight(tl->mask(cpu)); | ||
859 | |||
860 | if (tl->sd_flags) | ||
861 | sd_flags = (*tl->sd_flags)(); | ||
862 | if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, | ||
863 | "wrong sd_flags in topology description\n")) | ||
864 | sd_flags &= ~TOPOLOGY_SD_FLAGS; | ||
865 | |||
866 | *sd = (struct sched_domain){ | ||
867 | .min_interval = sd_weight, | ||
868 | .max_interval = 2*sd_weight, | ||
869 | .busy_factor = 32, | ||
870 | .imbalance_pct = 125, | ||
871 | |||
872 | .cache_nice_tries = 0, | ||
873 | .busy_idx = 0, | ||
874 | .idle_idx = 0, | ||
875 | .newidle_idx = 0, | ||
876 | .wake_idx = 0, | ||
877 | .forkexec_idx = 0, | ||
878 | |||
879 | .flags = 1*SD_LOAD_BALANCE | ||
880 | | 1*SD_BALANCE_NEWIDLE | ||
881 | | 1*SD_BALANCE_EXEC | ||
882 | | 1*SD_BALANCE_FORK | ||
883 | | 0*SD_BALANCE_WAKE | ||
884 | | 1*SD_WAKE_AFFINE | ||
885 | | 0*SD_SHARE_CPUCAPACITY | ||
886 | | 0*SD_SHARE_PKG_RESOURCES | ||
887 | | 0*SD_SERIALIZE | ||
888 | | 0*SD_PREFER_SIBLING | ||
889 | | 0*SD_NUMA | ||
890 | | sd_flags | ||
891 | , | ||
892 | |||
893 | .last_balance = jiffies, | ||
894 | .balance_interval = sd_weight, | ||
895 | .smt_gain = 0, | ||
896 | .max_newidle_lb_cost = 0, | ||
897 | .next_decay_max_lb_cost = jiffies, | ||
898 | .child = child, | ||
899 | #ifdef CONFIG_SCHED_DEBUG | ||
900 | .name = tl->name, | ||
901 | #endif | ||
902 | }; | ||
903 | |||
904 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); | ||
905 | sd_id = cpumask_first(sched_domain_span(sd)); | ||
906 | |||
907 | /* | ||
908 | * Convert topological properties into behaviour. | ||
909 | */ | ||
910 | |||
911 | if (sd->flags & SD_ASYM_CPUCAPACITY) { | ||
912 | struct sched_domain *t = sd; | ||
913 | |||
914 | for_each_lower_domain(t) | ||
915 | t->flags |= SD_BALANCE_WAKE; | ||
916 | } | ||
917 | |||
918 | if (sd->flags & SD_SHARE_CPUCAPACITY) { | ||
919 | sd->flags |= SD_PREFER_SIBLING; | ||
920 | sd->imbalance_pct = 110; | ||
921 | sd->smt_gain = 1178; /* ~15% */ | ||
922 | |||
923 | } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { | ||
924 | sd->imbalance_pct = 117; | ||
925 | sd->cache_nice_tries = 1; | ||
926 | sd->busy_idx = 2; | ||
927 | |||
928 | #ifdef CONFIG_NUMA | ||
929 | } else if (sd->flags & SD_NUMA) { | ||
930 | sd->cache_nice_tries = 2; | ||
931 | sd->busy_idx = 3; | ||
932 | sd->idle_idx = 2; | ||
933 | |||
934 | sd->flags |= SD_SERIALIZE; | ||
935 | if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { | ||
936 | sd->flags &= ~(SD_BALANCE_EXEC | | ||
937 | SD_BALANCE_FORK | | ||
938 | SD_WAKE_AFFINE); | ||
939 | } | ||
940 | |||
941 | #endif | ||
942 | } else { | ||
943 | sd->flags |= SD_PREFER_SIBLING; | ||
944 | sd->cache_nice_tries = 1; | ||
945 | sd->busy_idx = 2; | ||
946 | sd->idle_idx = 1; | ||
947 | } | ||
948 | |||
949 | /* | ||
950 | * For all levels sharing cache; connect a sched_domain_shared | ||
951 | * instance. | ||
952 | */ | ||
953 | if (sd->flags & SD_SHARE_PKG_RESOURCES) { | ||
954 | sd->shared = *per_cpu_ptr(sdd->sds, sd_id); | ||
955 | atomic_inc(&sd->shared->ref); | ||
956 | atomic_set(&sd->shared->nr_busy_cpus, sd_weight); | ||
957 | } | ||
958 | |||
959 | sd->private = sdd; | ||
960 | |||
961 | return sd; | ||
962 | } | ||
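/*
 * The tuning values above are percentages and capacity units:
 * imbalance_pct = 125 means a group must be roughly 25% busier than the
 * local one before load is pulled; the 117 and 110 used at the
 * cache-sharing and SMT levels lower that threshold because migrating
 * between such CPUs is cheap. smt_gain = 1178 is approximately
 * 1.15 * SCHED_CAPACITY_SCALE (1024), i.e. the ~15% extra throughput a
 * core is assumed to gain from its SMT siblings.
 */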
963 | |||
964 | /* | ||
965 | * Topology list, bottom-up. | ||
966 | */ | ||
967 | static struct sched_domain_topology_level default_topology[] = { | ||
968 | #ifdef CONFIG_SCHED_SMT | ||
969 | { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, | ||
970 | #endif | ||
971 | #ifdef CONFIG_SCHED_MC | ||
972 | { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, | ||
973 | #endif | ||
974 | { cpu_cpu_mask, SD_INIT_NAME(DIE) }, | ||
975 | { NULL, }, | ||
976 | }; | ||
977 | |||
978 | static struct sched_domain_topology_level *sched_domain_topology = | ||
979 | default_topology; | ||
980 | |||
981 | #define for_each_sd_topology(tl) \ | ||
982 | for (tl = sched_domain_topology; tl->mask; tl++) | ||
983 | |||
984 | void set_sched_topology(struct sched_domain_topology_level *tl) | ||
985 | { | ||
986 | if (WARN_ON_ONCE(sched_smp_initialized)) | ||
987 | return; | ||
988 | |||
989 | sched_domain_topology = tl; | ||
990 | } | ||
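/*
 * A minimal sketch of how an architecture could install its own table
 * instead of default_topology[]. Here "my_cluster_mask" and the CLS level
 * are hypothetical placeholders the architecture would have to provide;
 * the other entries reuse the helpers used above:
 *
 *	static struct sched_domain_topology_level my_topology[] = {
 *	#ifdef CONFIG_SCHED_SMT
 *		{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
 *	#endif
 *		{ my_cluster_mask, SD_INIT_NAME(CLS) },
 *		{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
 *		{ NULL, },
 *	};
 *
 * followed by set_sched_topology(my_topology) from the arch's early SMP
 * setup code, i.e. before sched_smp_initialized is set.
 */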
991 | |||
992 | #ifdef CONFIG_NUMA | ||
993 | |||
994 | static const struct cpumask *sd_numa_mask(int cpu) | ||
995 | { | ||
996 | return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; | ||
997 | } | ||
998 | |||
999 | static void sched_numa_warn(const char *str) | ||
1000 | { | ||
1001 | static int done = false; | ||
1002 | int i,j; | ||
1003 | |||
1004 | if (done) | ||
1005 | return; | ||
1006 | |||
1007 | done = true; | ||
1008 | |||
1009 | printk(KERN_WARNING "ERROR: %s\n\n", str); | ||
1010 | |||
1011 | for (i = 0; i < nr_node_ids; i++) { | ||
1012 | printk(KERN_WARNING " "); | ||
1013 | for (j = 0; j < nr_node_ids; j++) | ||
1014 | printk(KERN_CONT "%02d ", node_distance(i,j)); | ||
1015 | printk(KERN_CONT "\n"); | ||
1016 | } | ||
1017 | printk(KERN_WARNING "\n"); | ||
1018 | } | ||
1019 | |||
1020 | bool find_numa_distance(int distance) | ||
1021 | { | ||
1022 | int i; | ||
1023 | |||
1024 | if (distance == node_distance(0, 0)) | ||
1025 | return true; | ||
1026 | |||
1027 | for (i = 0; i < sched_domains_numa_levels; i++) { | ||
1028 | if (sched_domains_numa_distance[i] == distance) | ||
1029 | return true; | ||
1030 | } | ||
1031 | |||
1032 | return false; | ||
1033 | } | ||
1034 | |||
1035 | /* | ||
1036 | * A system can have three types of NUMA topology: | ||
1037 | * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system | ||
1038 | * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes | ||
1039 | * NUMA_BACKPLANE: nodes can reach other nodes through a backplane | ||
1040 | * | ||
1041 | * The difference between a glueless mesh topology and a backplane | ||
1042 | * topology lies in whether communication between not directly | ||
1043 | * connected nodes goes through intermediary nodes (where programs | ||
1044 | * could run), or through backplane controllers. This affects | ||
1045 | * placement of programs. | ||
1046 | * | ||
1047 | * The type of topology can be discerned with the following tests: | ||
1048 | * - If the maximum distance between any nodes is 1 hop, the system | ||
1049 | * is directly connected. | ||
1050 | * - If for two nodes A and B, located N > 1 hops away from each other, | ||
1051 | * there is an intermediary node C, which is < N hops away from both | ||
1052 | * nodes A and B, the system is a glueless mesh. | ||
1053 | */ | ||
1054 | static void init_numa_topology_type(void) | ||
1055 | { | ||
1056 | int a, b, c, n; | ||
1057 | |||
1058 | n = sched_max_numa_distance; | ||
1059 | |||
1060 | if (sched_domains_numa_levels <= 1) { | ||
1061 | sched_numa_topology_type = NUMA_DIRECT; | ||
1062 | return; | ||
1063 | } | ||
1064 | |||
1065 | for_each_online_node(a) { | ||
1066 | for_each_online_node(b) { | ||
1067 | /* Find two nodes furthest removed from each other. */ | ||
1068 | if (node_distance(a, b) < n) | ||
1069 | continue; | ||
1070 | |||
1071 | /* Is there an intermediary node between a and b? */ | ||
1072 | for_each_online_node(c) { | ||
1073 | if (node_distance(a, c) < n && | ||
1074 | node_distance(b, c) < n) { | ||
1075 | sched_numa_topology_type = | ||
1076 | NUMA_GLUELESS_MESH; | ||
1077 | return; | ||
1078 | } | ||
1079 | } | ||
1080 | |||
1081 | sched_numa_topology_type = NUMA_BACKPLANE; | ||
1082 | return; | ||
1083 | } | ||
1084 | } | ||
1085 | } | ||
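/*
 * Worked example for the classification above, using two hypothetical
 * 4-node distance tables (10 is the local distance node_distance(i,i)):
 *
 *	    10 20 20 20		10 20 30 40
 *	    20 10 20 20		20 10 20 30
 *	    20 20 10 20		30 20 10 20
 *	    20 20 20 10		40 30 20 10
 *
 * Left: a single remote distance, every node one hop from every other,
 * so there is only one NUMA level and the type is NUMA_DIRECT.
 * Right: the maximum distance is 40 (nodes 0 and 3), but node 1 is closer
 * than 40 to both of them, so the type is NUMA_GLUELESS_MESH. Were there
 * no such intermediate node, the table would classify as NUMA_BACKPLANE.
 */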
1086 | |||
1087 | void sched_init_numa(void) | ||
1088 | { | ||
1089 | int next_distance, curr_distance = node_distance(0, 0); | ||
1090 | struct sched_domain_topology_level *tl; | ||
1091 | int level = 0; | ||
1092 | int i, j, k; | ||
1093 | |||
1094 | sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); | ||
1095 | if (!sched_domains_numa_distance) | ||
1096 | return; | ||
1097 | |||
1098 | /* | ||
1099 | * O(nr_nodes^2) deduplicating selection sort -- in order to find the | ||
1100 | * unique distances in the node_distance() table. | ||
1101 | * | ||
1102 | * Assumes node_distance(0,j) includes all distances in | ||
1103 | * node_distance(i,j) in order to avoid cubic time. | ||
1104 | */ | ||
1105 | next_distance = curr_distance; | ||
1106 | for (i = 0; i < nr_node_ids; i++) { | ||
1107 | for (j = 0; j < nr_node_ids; j++) { | ||
1108 | for (k = 0; k < nr_node_ids; k++) { | ||
1109 | int distance = node_distance(i, k); | ||
1110 | |||
1111 | if (distance > curr_distance && | ||
1112 | (distance < next_distance || | ||
1113 | next_distance == curr_distance)) | ||
1114 | next_distance = distance; | ||
1115 | |||
1116 | /* | ||
1117 | * While not a strong assumption, it would be nice to know | ||
1118 | * about cases where node A is connected to B but B is not | ||
1119 | * equally connected to A. | ||
1120 | */ | ||
1121 | if (sched_debug() && node_distance(k, i) != distance) | ||
1122 | sched_numa_warn("Node-distance not symmetric"); | ||
1123 | |||
1124 | if (sched_debug() && i && !find_numa_distance(distance)) | ||
1125 | sched_numa_warn("Node-0 not representative"); | ||
1126 | } | ||
1127 | if (next_distance != curr_distance) { | ||
1128 | sched_domains_numa_distance[level++] = next_distance; | ||
1129 | sched_domains_numa_levels = level; | ||
1130 | curr_distance = next_distance; | ||
1131 | } else break; | ||
1132 | } | ||
1133 | |||
1134 | /* | ||
1135 | * In case of sched_debug() we verify the above assumption. | ||
1136 | */ | ||
1137 | if (!sched_debug()) | ||
1138 | break; | ||
1139 | } | ||
1140 | |||
1141 | if (!level) | ||
1142 | return; | ||
1143 | |||
1144 | /* | ||
1145 | * 'level' contains the number of unique distances, excluding the | ||
1146 | * identity distance node_distance(i,i). | ||
1147 | * | ||
1148 | * The sched_domains_numa_distance[] array includes the actual distance | ||
1149 | * numbers. | ||
1150 | */ | ||
1151 | |||
1152 | /* | ||
1153 | * Here, we should temporarily reset sched_domains_numa_levels to 0. | ||
1154 | * If we fail to allocate memory for the sched_domains_numa_masks[][] | ||
1155 | * array, it will contain fewer than 'level' members. This could be | ||
1156 | * dangerous when we use it to iterate the sched_domains_numa_masks[][] | ||
1157 | * in other functions. | ||
1158 | * | ||
1159 | * We reset it to 'level' at the end of this function. | ||
1160 | */ | ||
1161 | sched_domains_numa_levels = 0; | ||
1162 | |||
1163 | sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); | ||
1164 | if (!sched_domains_numa_masks) | ||
1165 | return; | ||
1166 | |||
1167 | /* | ||
1168 | * Now for each level, construct a mask per node which contains all | ||
1169 | * CPUs of nodes that are that many hops away from us. | ||
1170 | */ | ||
1171 | for (i = 0; i < level; i++) { | ||
1172 | sched_domains_numa_masks[i] = | ||
1173 | kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); | ||
1174 | if (!sched_domains_numa_masks[i]) | ||
1175 | return; | ||
1176 | |||
1177 | for (j = 0; j < nr_node_ids; j++) { | ||
1178 | struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); | ||
1179 | if (!mask) | ||
1180 | return; | ||
1181 | |||
1182 | sched_domains_numa_masks[i][j] = mask; | ||
1183 | |||
1184 | for_each_node(k) { | ||
1185 | if (node_distance(j, k) > sched_domains_numa_distance[i]) | ||
1186 | continue; | ||
1187 | |||
1188 | cpumask_or(mask, mask, cpumask_of_node(k)); | ||
1189 | } | ||
1190 | } | ||
1191 | } | ||
1192 | |||
1193 | /* Compute default topology size */ | ||
1194 | for (i = 0; sched_domain_topology[i].mask; i++); | ||
1195 | |||
1196 | tl = kzalloc((i + level + 1) * | ||
1197 | sizeof(struct sched_domain_topology_level), GFP_KERNEL); | ||
1198 | if (!tl) | ||
1199 | return; | ||
1200 | |||
1201 | /* | ||
1202 | * Copy the default topology bits.. | ||
1203 | */ | ||
1204 | for (i = 0; sched_domain_topology[i].mask; i++) | ||
1205 | tl[i] = sched_domain_topology[i]; | ||
1206 | |||
1207 | /* | ||
1208 | * .. and append 'j' levels of NUMA goodness. | ||
1209 | */ | ||
1210 | for (j = 0; j < level; i++, j++) { | ||
1211 | tl[i] = (struct sched_domain_topology_level){ | ||
1212 | .mask = sd_numa_mask, | ||
1213 | .sd_flags = cpu_numa_flags, | ||
1214 | .flags = SDTL_OVERLAP, | ||
1215 | .numa_level = j, | ||
1216 | SD_INIT_NAME(NUMA) | ||
1217 | }; | ||
1218 | } | ||
1219 | |||
1220 | sched_domain_topology = tl; | ||
1221 | |||
1222 | sched_domains_numa_levels = level; | ||
1223 | sched_max_numa_distance = sched_domains_numa_distance[level - 1]; | ||
1224 | |||
1225 | init_numa_topology_type(); | ||
1226 | } | ||
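/*
 * A user-space sketch of the distance de-duplication done above, for a
 * hypothetical 4-node table (relying, as the kernel does, on row 0
 * containing every distance that occurs in the table):
 *
 *	#include <stdio.h>
 *
 *	#define NR_NODES 4
 *
 *	static const int dist[NR_NODES][NR_NODES] = {
 *		{ 10, 20, 30, 40 },
 *		{ 20, 10, 20, 30 },
 *		{ 30, 20, 10, 20 },
 *		{ 40, 30, 20, 10 },
 *	};
 *
 *	int main(void)
 *	{
 *		int curr = dist[0][0], levels = 0;
 *
 *		for (;;) {
 *			int next = curr;
 *
 *			// smallest distance strictly greater than 'curr'
 *			for (int k = 0; k < NR_NODES; k++) {
 *				int d = dist[0][k];
 *
 *				if (d > curr && (next == curr || d < next))
 *					next = d;
 *			}
 *			if (next == curr)
 *				break;
 *			printf("level %d: distance %d\n", levels++, next);
 *			curr = next;
 *		}
 *		return 0;	// prints distances 20, 30, 40: three levels
 *	}
 */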
1227 | |||
1228 | void sched_domains_numa_masks_set(unsigned int cpu) | ||
1229 | { | ||
1230 | int node = cpu_to_node(cpu); | ||
1231 | int i, j; | ||
1232 | |||
1233 | for (i = 0; i < sched_domains_numa_levels; i++) { | ||
1234 | for (j = 0; j < nr_node_ids; j++) { | ||
1235 | if (node_distance(j, node) <= sched_domains_numa_distance[i]) | ||
1236 | cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); | ||
1237 | } | ||
1238 | } | ||
1239 | } | ||
1240 | |||
1241 | void sched_domains_numa_masks_clear(unsigned int cpu) | ||
1242 | { | ||
1243 | int i, j; | ||
1244 | |||
1245 | for (i = 0; i < sched_domains_numa_levels; i++) { | ||
1246 | for (j = 0; j < nr_node_ids; j++) | ||
1247 | cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); | ||
1248 | } | ||
1249 | } | ||
1250 | |||
1251 | #endif /* CONFIG_NUMA */ | ||
1252 | |||
1253 | static int __sdt_alloc(const struct cpumask *cpu_map) | ||
1254 | { | ||
1255 | struct sched_domain_topology_level *tl; | ||
1256 | int j; | ||
1257 | |||
1258 | for_each_sd_topology(tl) { | ||
1259 | struct sd_data *sdd = &tl->data; | ||
1260 | |||
1261 | sdd->sd = alloc_percpu(struct sched_domain *); | ||
1262 | if (!sdd->sd) | ||
1263 | return -ENOMEM; | ||
1264 | |||
1265 | sdd->sds = alloc_percpu(struct sched_domain_shared *); | ||
1266 | if (!sdd->sds) | ||
1267 | return -ENOMEM; | ||
1268 | |||
1269 | sdd->sg = alloc_percpu(struct sched_group *); | ||
1270 | if (!sdd->sg) | ||
1271 | return -ENOMEM; | ||
1272 | |||
1273 | sdd->sgc = alloc_percpu(struct sched_group_capacity *); | ||
1274 | if (!sdd->sgc) | ||
1275 | return -ENOMEM; | ||
1276 | |||
1277 | for_each_cpu(j, cpu_map) { | ||
1278 | struct sched_domain *sd; | ||
1279 | struct sched_domain_shared *sds; | ||
1280 | struct sched_group *sg; | ||
1281 | struct sched_group_capacity *sgc; | ||
1282 | |||
1283 | sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), | ||
1284 | GFP_KERNEL, cpu_to_node(j)); | ||
1285 | if (!sd) | ||
1286 | return -ENOMEM; | ||
1287 | |||
1288 | *per_cpu_ptr(sdd->sd, j) = sd; | ||
1289 | |||
1290 | sds = kzalloc_node(sizeof(struct sched_domain_shared), | ||
1291 | GFP_KERNEL, cpu_to_node(j)); | ||
1292 | if (!sds) | ||
1293 | return -ENOMEM; | ||
1294 | |||
1295 | *per_cpu_ptr(sdd->sds, j) = sds; | ||
1296 | |||
1297 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
1298 | GFP_KERNEL, cpu_to_node(j)); | ||
1299 | if (!sg) | ||
1300 | return -ENOMEM; | ||
1301 | |||
1302 | sg->next = sg; | ||
1303 | |||
1304 | *per_cpu_ptr(sdd->sg, j) = sg; | ||
1305 | |||
1306 | sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(), | ||
1307 | GFP_KERNEL, cpu_to_node(j)); | ||
1308 | if (!sgc) | ||
1309 | return -ENOMEM; | ||
1310 | |||
1311 | *per_cpu_ptr(sdd->sgc, j) = sgc; | ||
1312 | } | ||
1313 | } | ||
1314 | |||
1315 | return 0; | ||
1316 | } | ||
1317 | |||
1318 | static void __sdt_free(const struct cpumask *cpu_map) | ||
1319 | { | ||
1320 | struct sched_domain_topology_level *tl; | ||
1321 | int j; | ||
1322 | |||
1323 | for_each_sd_topology(tl) { | ||
1324 | struct sd_data *sdd = &tl->data; | ||
1325 | |||
1326 | for_each_cpu(j, cpu_map) { | ||
1327 | struct sched_domain *sd; | ||
1328 | |||
1329 | if (sdd->sd) { | ||
1330 | sd = *per_cpu_ptr(sdd->sd, j); | ||
1331 | if (sd && (sd->flags & SD_OVERLAP)) | ||
1332 | free_sched_groups(sd->groups, 0); | ||
1333 | kfree(*per_cpu_ptr(sdd->sd, j)); | ||
1334 | } | ||
1335 | |||
1336 | if (sdd->sds) | ||
1337 | kfree(*per_cpu_ptr(sdd->sds, j)); | ||
1338 | if (sdd->sg) | ||
1339 | kfree(*per_cpu_ptr(sdd->sg, j)); | ||
1340 | if (sdd->sgc) | ||
1341 | kfree(*per_cpu_ptr(sdd->sgc, j)); | ||
1342 | } | ||
1343 | free_percpu(sdd->sd); | ||
1344 | sdd->sd = NULL; | ||
1345 | free_percpu(sdd->sds); | ||
1346 | sdd->sds = NULL; | ||
1347 | free_percpu(sdd->sg); | ||
1348 | sdd->sg = NULL; | ||
1349 | free_percpu(sdd->sgc); | ||
1350 | sdd->sgc = NULL; | ||
1351 | } | ||
1352 | } | ||
1353 | |||
1354 | struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | ||
1355 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
1356 | struct sched_domain *child, int cpu) | ||
1357 | { | ||
1358 | struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu); | ||
1359 | |||
1360 | if (child) { | ||
1361 | sd->level = child->level + 1; | ||
1362 | sched_domain_level_max = max(sched_domain_level_max, sd->level); | ||
1363 | child->parent = sd; | ||
1364 | |||
1365 | if (!cpumask_subset(sched_domain_span(child), | ||
1366 | sched_domain_span(sd))) { | ||
1367 | pr_err("BUG: arch topology borken\n"); | ||
1368 | #ifdef CONFIG_SCHED_DEBUG | ||
1369 | pr_err(" the %s domain not a subset of the %s domain\n", | ||
1370 | child->name, sd->name); | ||
1371 | #endif | ||
1372 | /* Fixup, ensure @sd has at least @child cpus. */ | ||
1373 | cpumask_or(sched_domain_span(sd), | ||
1374 | sched_domain_span(sd), | ||
1375 | sched_domain_span(child)); | ||
1376 | } | ||
1377 | |||
1378 | } | ||
1379 | set_domain_attribute(sd, attr); | ||
1380 | |||
1381 | return sd; | ||
1382 | } | ||
1383 | |||
1384 | /* | ||
1385 | * Build sched domains for a given set of CPUs and attach the sched domains | ||
1386 | * to the individual CPUs. | ||
1387 | */ | ||
1388 | static int | ||
1389 | build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) | ||
1390 | { | ||
1391 | enum s_alloc alloc_state; | ||
1392 | struct sched_domain *sd; | ||
1393 | struct s_data d; | ||
1394 | struct rq *rq = NULL; | ||
1395 | int i, ret = -ENOMEM; | ||
1396 | |||
1397 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); | ||
1398 | if (alloc_state != sa_rootdomain) | ||
1399 | goto error; | ||
1400 | |||
1401 | /* Set up domains for CPUs specified by the cpu_map: */ | ||
1402 | for_each_cpu(i, cpu_map) { | ||
1403 | struct sched_domain_topology_level *tl; | ||
1404 | |||
1405 | sd = NULL; | ||
1406 | for_each_sd_topology(tl) { | ||
1407 | sd = build_sched_domain(tl, cpu_map, attr, sd, i); | ||
1408 | if (tl == sched_domain_topology) | ||
1409 | *per_cpu_ptr(d.sd, i) = sd; | ||
1410 | if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) | ||
1411 | sd->flags |= SD_OVERLAP; | ||
1412 | if (cpumask_equal(cpu_map, sched_domain_span(sd))) | ||
1413 | break; | ||
1414 | } | ||
1415 | } | ||
1416 | |||
1417 | /* Build the groups for the domains */ | ||
1418 | for_each_cpu(i, cpu_map) { | ||
1419 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { | ||
1420 | sd->span_weight = cpumask_weight(sched_domain_span(sd)); | ||
1421 | if (sd->flags & SD_OVERLAP) { | ||
1422 | if (build_overlap_sched_groups(sd, i)) | ||
1423 | goto error; | ||
1424 | } else { | ||
1425 | if (build_sched_groups(sd, i)) | ||
1426 | goto error; | ||
1427 | } | ||
1428 | } | ||
1429 | } | ||
1430 | |||
1431 | /* Calculate CPU capacity for physical packages and nodes */ | ||
1432 | for (i = nr_cpumask_bits-1; i >= 0; i--) { | ||
1433 | if (!cpumask_test_cpu(i, cpu_map)) | ||
1434 | continue; | ||
1435 | |||
1436 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { | ||
1437 | claim_allocations(i, sd); | ||
1438 | init_sched_groups_capacity(i, sd); | ||
1439 | } | ||
1440 | } | ||
1441 | |||
1442 | /* Attach the domains */ | ||
1443 | rcu_read_lock(); | ||
1444 | for_each_cpu(i, cpu_map) { | ||
1445 | rq = cpu_rq(i); | ||
1446 | sd = *per_cpu_ptr(d.sd, i); | ||
1447 | |||
1448 | /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ | ||
1449 | if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity)) | ||
1450 | WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig); | ||
1451 | |||
1452 | cpu_attach_domain(sd, d.rd, i); | ||
1453 | } | ||
1454 | rcu_read_unlock(); | ||
1455 | |||
1456 | if (rq && sched_debug_enabled) { | ||
1457 | pr_info("span: %*pbl (max cpu_capacity = %lu)\n", | ||
1458 | cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); | ||
1459 | } | ||
1460 | |||
1461 | ret = 0; | ||
1462 | error: | ||
1463 | __free_domain_allocs(&d, alloc_state, cpu_map); | ||
1464 | return ret; | ||
1465 | } | ||
1466 | |||
1467 | /* Current sched domains: */ | ||
1468 | static cpumask_var_t *doms_cur; | ||
1469 | |||
1470 | /* Number of sched domains in 'doms_cur': */ | ||
1471 | static int ndoms_cur; | ||
1472 | |||
1473 | /* Attributes of custom domains in 'doms_cur': */ | ||
1474 | static struct sched_domain_attr *dattr_cur; | ||
1475 | |||
1476 | /* | ||
1477 | * Special case: If a kmalloc() of a doms_cur partition (array of | ||
1478 | * cpumask) fails, then fall back to a single sched domain, | ||
1479 | * as determined by the single cpumask fallback_doms. | ||
1480 | */ | ||
1481 | cpumask_var_t fallback_doms; | ||
1482 | |||
1483 | /* | ||
1484 | * arch_update_cpu_topology lets virtualized architectures update the | ||
1485 | * CPU core maps. It is supposed to return 1 if the topology changed | ||
1486 | * or 0 if it stayed the same. | ||
1487 | */ | ||
1488 | int __weak arch_update_cpu_topology(void) | ||
1489 | { | ||
1490 | return 0; | ||
1491 | } | ||
1492 | |||
1493 | cpumask_var_t *alloc_sched_domains(unsigned int ndoms) | ||
1494 | { | ||
1495 | int i; | ||
1496 | cpumask_var_t *doms; | ||
1497 | |||
1498 | doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); | ||
1499 | if (!doms) | ||
1500 | return NULL; | ||
1501 | for (i = 0; i < ndoms; i++) { | ||
1502 | if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { | ||
1503 | free_sched_domains(doms, i); | ||
1504 | return NULL; | ||
1505 | } | ||
1506 | } | ||
1507 | return doms; | ||
1508 | } | ||
1509 | |||
1510 | void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) | ||
1511 | { | ||
1512 | unsigned int i; | ||
1513 | for (i = 0; i < ndoms; i++) | ||
1514 | free_cpumask_var(doms[i]); | ||
1515 | kfree(doms); | ||
1516 | } | ||
1517 | |||
1518 | /* | ||
1519 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | ||
1520 | * For now this just excludes isolated CPUs, but could be used to | ||
1521 | * exclude other special cases in the future. | ||
1522 | */ | ||
1523 | int init_sched_domains(const struct cpumask *cpu_map) | ||
1524 | { | ||
1525 | int err; | ||
1526 | |||
1527 | arch_update_cpu_topology(); | ||
1528 | ndoms_cur = 1; | ||
1529 | doms_cur = alloc_sched_domains(ndoms_cur); | ||
1530 | if (!doms_cur) | ||
1531 | doms_cur = &fallback_doms; | ||
1532 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); | ||
1533 | err = build_sched_domains(doms_cur[0], NULL); | ||
1534 | register_sched_domain_sysctl(); | ||
1535 | |||
1536 | return err; | ||
1537 | } | ||
1538 | |||
1539 | /* | ||
1540 | * Detach sched domains from a group of CPUs specified in cpu_map. | ||
1541 | * These CPUs will now be attached to the NULL domain. | ||
1542 | */ | ||
1543 | static void detach_destroy_domains(const struct cpumask *cpu_map) | ||
1544 | { | ||
1545 | int i; | ||
1546 | |||
1547 | rcu_read_lock(); | ||
1548 | for_each_cpu(i, cpu_map) | ||
1549 | cpu_attach_domain(NULL, &def_root_domain, i); | ||
1550 | rcu_read_unlock(); | ||
1551 | } | ||
1552 | |||
1553 | /* handle null as "default" */ | ||
1554 | static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, | ||
1555 | struct sched_domain_attr *new, int idx_new) | ||
1556 | { | ||
1557 | struct sched_domain_attr tmp; | ||
1558 | |||
1559 | /* Fast path: */ | ||
1560 | if (!new && !cur) | ||
1561 | return 1; | ||
1562 | |||
1563 | tmp = SD_ATTR_INIT; | ||
1564 | return !memcmp(cur ? (cur + idx_cur) : &tmp, | ||
1565 | new ? (new + idx_new) : &tmp, | ||
1566 | sizeof(struct sched_domain_attr)); | ||
1567 | } | ||
1568 | |||
1569 | /* | ||
1570 | * Partition sched domains as specified by the 'ndoms_new' | ||
1571 | * cpumasks in the array doms_new[] of cpumasks. This compares | ||
1572 | * doms_new[] to the current sched domain partitioning, doms_cur[]. | ||
1573 | * It destroys each deleted domain and builds each new domain. | ||
1574 | * | ||
1575 | * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. | ||
1576 | * The masks don't intersect (don't overlap). We should set up one | ||
1577 | * sched domain for each mask. CPUs not in any of the cpumasks will | ||
1578 | * not be load balanced. If the same cpumask appears both in the | ||
1579 | * current 'doms_cur' domains and in the new 'doms_new', we can leave | ||
1580 | * it as it is. | ||
1581 | * | ||
1582 | * The passed-in 'doms_new' should be allocated using | ||
1583 | * alloc_sched_domains(). This routine takes ownership of it and will | ||
1584 | * free_sched_domains() it when done with it. If the caller failed the | ||
1585 | * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, | ||
1586 | * and partition_sched_domains() will fall back to the single partition | ||
1587 | * 'fallback_doms'; this also forces the domains to be rebuilt. | ||
1588 | * | ||
1589 | * If doms_new == NULL it will be replaced with the active, non-isolated CPUs. | ||
1590 | * ndoms_new == 0 is a special case for destroying existing domains, | ||
1591 | * and it will not create the default domain. | ||
1592 | * | ||
1593 | * Call with hotplug lock held | ||
1594 | */ | ||
1595 | void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], | ||
1596 | struct sched_domain_attr *dattr_new) | ||
1597 | { | ||
1598 | int i, j, n; | ||
1599 | int new_topology; | ||
1600 | |||
1601 | mutex_lock(&sched_domains_mutex); | ||
1602 | |||
1603 | /* Always unregister in case we don't destroy any domains: */ | ||
1604 | unregister_sched_domain_sysctl(); | ||
1605 | |||
1606 | /* Let the architecture update CPU core mappings: */ | ||
1607 | new_topology = arch_update_cpu_topology(); | ||
1608 | |||
1609 | n = doms_new ? ndoms_new : 0; | ||
1610 | |||
1611 | /* Destroy deleted domains: */ | ||
1612 | for (i = 0; i < ndoms_cur; i++) { | ||
1613 | for (j = 0; j < n && !new_topology; j++) { | ||
1614 | if (cpumask_equal(doms_cur[i], doms_new[j]) | ||
1615 | && dattrs_equal(dattr_cur, i, dattr_new, j)) | ||
1616 | goto match1; | ||
1617 | } | ||
1618 | /* No match - a current sched domain not in new doms_new[] */ | ||
1619 | detach_destroy_domains(doms_cur[i]); | ||
1620 | match1: | ||
1621 | ; | ||
1622 | } | ||
1623 | |||
1624 | n = ndoms_cur; | ||
1625 | if (doms_new == NULL) { | ||
1626 | n = 0; | ||
1627 | doms_new = &fallback_doms; | ||
1628 | cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); | ||
1629 | WARN_ON_ONCE(dattr_new); | ||
1630 | } | ||
1631 | |||
1632 | /* Build new domains: */ | ||
1633 | for (i = 0; i < ndoms_new; i++) { | ||
1634 | for (j = 0; j < n && !new_topology; j++) { | ||
1635 | if (cpumask_equal(doms_new[i], doms_cur[j]) | ||
1636 | && dattrs_equal(dattr_new, i, dattr_cur, j)) | ||
1637 | goto match2; | ||
1638 | } | ||
1639 | /* No match - add a new doms_new */ | ||
1640 | build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); | ||
1641 | match2: | ||
1642 | ; | ||
1643 | } | ||
1644 | |||
1645 | /* Remember the new sched domains: */ | ||
1646 | if (doms_cur != &fallback_doms) | ||
1647 | free_sched_domains(doms_cur, ndoms_cur); | ||
1648 | |||
1649 | kfree(dattr_cur); | ||
1650 | doms_cur = doms_new; | ||
1651 | dattr_cur = dattr_new; | ||
1652 | ndoms_cur = ndoms_new; | ||
1653 | |||
1654 | register_sched_domain_sysctl(); | ||
1655 | |||
1656 | mutex_unlock(&sched_domains_mutex); | ||
1657 | } | ||
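/*
 * Sketch of a caller (in the kernel this is the cpuset code): splitting
 * an 8-CPU machine into two load-balancing partitions, CPUs 0-3 and 4-7.
 * Error handling and the real origin of the masks are elided; ownership
 * of 'doms' passes to partition_sched_domains() as described above.
 *
 *	cpumask_var_t *doms = alloc_sched_domains(2);
 *
 *	cpulist_parse("0-3", doms[0]);
 *	cpulist_parse("4-7", doms[1]);
 *
 *	get_online_cpus();
 *	partition_sched_domains(2, doms, NULL);
 *	put_online_cpus();
 *
 * Afterwards tasks are only balanced within their own partition; a CPU
 * that is in neither mask is not load balanced at all.
 */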
1658 | |||