diff options
Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r-- | kernel/cpuset.c | 123 |
1 files changed, 48 insertions, 75 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 5e348ae37ce9..0ddb48d40e4d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -103,9 +103,6 @@ struct cpuset { | |||
103 | /* for custom sched domain */ | 103 | /* for custom sched domain */ |
104 | int relax_domain_level; | 104 | int relax_domain_level; |
105 | 105 | ||
106 | /* used for walking a cpuset hierarchy */ | ||
107 | struct list_head stack_list; | ||
108 | |||
109 | struct work_struct hotplug_work; | 106 | struct work_struct hotplug_work; |
110 | }; | 107 | }; |
111 | 108 | ||
@@ -207,6 +204,20 @@ static struct cpuset top_cpuset = { | |||
207 | cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \ | 204 | cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \ |
208 | if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp))))) | 205 | if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp))))) |
209 | 206 | ||
207 | /** | ||
208 | * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants | ||
209 | * @des_cs: loop cursor pointing to the current descendant | ||
210 | * @pos_cgrp: used for iteration | ||
211 | * @root_cs: target cpuset whose descendants are walked | ||
212 | * | ||
213 | * Walk @des_cs through the online descendants of @root_cs. Must be used | ||
214 | * with RCU read locked. The caller may modify @pos_cgrp by calling | ||
215 | * cgroup_rightmost_descendant() to skip a subtree. | ||
216 | */ | ||
217 | #define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \ | ||
218 | cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \ | ||
219 | if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp))))) | ||
220 | |||
210 | /* | 221 | /* |
211 | * There are two global mutexes guarding cpuset structures - cpuset_mutex | 222 | * There are two global mutexes guarding cpuset structures - cpuset_mutex |
212 | * and callback_mutex. The latter may nest inside the former. We also | 223 | * and callback_mutex. The latter may nest inside the former. We also |
@@ -507,31 +518,24 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) | |||
507 | return; | 518 | return; |
508 | } | 519 | } |
509 | 520 | ||
510 | static void | 521 | static void update_domain_attr_tree(struct sched_domain_attr *dattr, |
511 | update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) | 522 | struct cpuset *root_cs) |
512 | { | 523 | { |
513 | LIST_HEAD(q); | 524 | struct cpuset *cp; |
514 | 525 | struct cgroup *pos_cgrp; | |
515 | list_add(&c->stack_list, &q); | ||
516 | while (!list_empty(&q)) { | ||
517 | struct cpuset *cp; | ||
518 | struct cgroup *cont; | ||
519 | struct cpuset *child; | ||
520 | |||
521 | cp = list_first_entry(&q, struct cpuset, stack_list); | ||
522 | list_del(q.next); | ||
523 | 526 | ||
524 | if (cpumask_empty(cp->cpus_allowed)) | 527 | rcu_read_lock(); |
528 | cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { | ||
529 | /* skip the whole subtree if @cp doesn't have any CPU */ | ||
530 | if (cpumask_empty(cp->cpus_allowed)) { | ||
531 | pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); | ||
525 | continue; | 532 | continue; |
533 | } | ||
526 | 534 | ||
527 | if (is_sched_load_balance(cp)) | 535 | if (is_sched_load_balance(cp)) |
528 | update_domain_attr(dattr, cp); | 536 | update_domain_attr(dattr, cp); |
529 | |||
530 | rcu_read_lock(); | ||
531 | cpuset_for_each_child(child, cont, cp) | ||
532 | list_add_tail(&child->stack_list, &q); | ||
533 | rcu_read_unlock(); | ||
534 | } | 537 | } |
538 | rcu_read_unlock(); | ||
535 | } | 539 | } |
536 | 540 | ||
537 | /* | 541 | /* |
@@ -591,7 +595,6 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) | |||
591 | static int generate_sched_domains(cpumask_var_t **domains, | 595 | static int generate_sched_domains(cpumask_var_t **domains, |
592 | struct sched_domain_attr **attributes) | 596 | struct sched_domain_attr **attributes) |
593 | { | 597 | { |
594 | LIST_HEAD(q); /* queue of cpusets to be scanned */ | ||
595 | struct cpuset *cp; /* scans q */ | 598 | struct cpuset *cp; /* scans q */ |
596 | struct cpuset **csa; /* array of all cpuset ptrs */ | 599 | struct cpuset **csa; /* array of all cpuset ptrs */ |
597 | int csn; /* how many cpuset ptrs in csa so far */ | 600 | int csn; /* how many cpuset ptrs in csa so far */ |
@@ -600,6 +603,7 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
600 | struct sched_domain_attr *dattr; /* attributes for custom domains */ | 603 | struct sched_domain_attr *dattr; /* attributes for custom domains */ |
601 | int ndoms = 0; /* number of sched domains in result */ | 604 | int ndoms = 0; /* number of sched domains in result */ |
602 | int nslot; /* next empty doms[] struct cpumask slot */ | 605 | int nslot; /* next empty doms[] struct cpumask slot */ |
606 | struct cgroup *pos_cgrp; | ||
603 | 607 | ||
604 | doms = NULL; | 608 | doms = NULL; |
605 | dattr = NULL; | 609 | dattr = NULL; |
@@ -627,33 +631,27 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
627 | goto done; | 631 | goto done; |
628 | csn = 0; | 632 | csn = 0; |
629 | 633 | ||
630 | list_add(&top_cpuset.stack_list, &q); | 634 | rcu_read_lock(); |
631 | while (!list_empty(&q)) { | 635 | cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) { |
632 | struct cgroup *cont; | ||
633 | struct cpuset *child; /* scans child cpusets of cp */ | ||
634 | |||
635 | cp = list_first_entry(&q, struct cpuset, stack_list); | ||
636 | list_del(q.next); | ||
637 | |||
638 | if (cpumask_empty(cp->cpus_allowed)) | ||
639 | continue; | ||
640 | |||
641 | /* | 636 | /* |
642 | * All child cpusets contain a subset of the parent's cpus, so | 637 | * Continue traversing beyond @cp iff @cp has some CPUs and |
643 | * just skip them, and then we call update_domain_attr_tree() | 638 | * isn't load balancing. The former is obvious. The |
644 | * to calc relax_domain_level of the corresponding sched | 639 | * latter: All child cpusets contain a subset of the |
645 | * domain. | 640 | * parent's cpus, so just skip them, and then we call |
641 | * update_domain_attr_tree() to calc relax_domain_level of | ||
642 | * the corresponding sched domain. | ||
646 | */ | 643 | */ |
647 | if (is_sched_load_balance(cp)) { | 644 | if (!cpumask_empty(cp->cpus_allowed) && |
648 | csa[csn++] = cp; | 645 | !is_sched_load_balance(cp)) |
649 | continue; | 646 | continue; |
650 | } | ||
651 | 647 | ||
652 | rcu_read_lock(); | 648 | if (is_sched_load_balance(cp)) |
653 | cpuset_for_each_child(child, cont, cp) | 649 | csa[csn++] = cp; |
654 | list_add_tail(&child->stack_list, &q); | 650 | |
655 | rcu_read_unlock(); | 651 | /* skip @cp's subtree */ |
656 | } | 652 | pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); |
653 | } | ||
654 | rcu_read_unlock(); | ||
657 | 655 | ||
658 | for (i = 0; i < csn; i++) | 656 | for (i = 0; i < csn; i++) |
659 | csa[i]->pn = i; | 657 | csa[i]->pn = i; |
@@ -2068,31 +2066,6 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) | |||
2068 | move_member_tasks_to_cpuset(cs, parent); | 2066 | move_member_tasks_to_cpuset(cs, parent); |
2069 | } | 2067 | } |
2070 | 2068 | ||
2071 | /* | ||
2072 | * Helper function to traverse cpusets. | ||
2073 | * It can be used to walk the cpuset tree from top to bottom, completing | ||
2074 | * one layer before dropping down to the next (thus always processing a | ||
2075 | * node before any of its children). | ||
2076 | */ | ||
2077 | static struct cpuset *cpuset_next(struct list_head *queue) | ||
2078 | { | ||
2079 | struct cpuset *cp; | ||
2080 | struct cpuset *child; /* scans child cpusets of cp */ | ||
2081 | struct cgroup *cont; | ||
2082 | |||
2083 | if (list_empty(queue)) | ||
2084 | return NULL; | ||
2085 | |||
2086 | cp = list_first_entry(queue, struct cpuset, stack_list); | ||
2087 | list_del(queue->next); | ||
2088 | rcu_read_lock(); | ||
2089 | cpuset_for_each_child(child, cont, cp) | ||
2090 | list_add_tail(&child->stack_list, queue); | ||
2091 | rcu_read_unlock(); | ||
2092 | |||
2093 | return cp; | ||
2094 | } | ||
2095 | |||
2096 | /** | 2069 | /** |
2097 | * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset | 2070 | * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset |
2098 | * @cs: cpuset in interest | 2071 | * @cs: cpuset in interest |
@@ -2229,12 +2202,12 @@ static void cpuset_hotplug_workfn(struct work_struct *work) | |||
2229 | /* if cpus or mems went down, we need to propagate to descendants */ | 2202 | /* if cpus or mems went down, we need to propagate to descendants */ |
2230 | if (cpus_offlined || mems_offlined) { | 2203 | if (cpus_offlined || mems_offlined) { |
2231 | struct cpuset *cs; | 2204 | struct cpuset *cs; |
2232 | LIST_HEAD(queue); | 2205 | struct cgroup *pos_cgrp; |
2233 | 2206 | ||
2234 | list_add_tail(&top_cpuset.stack_list, &queue); | 2207 | rcu_read_lock(); |
2235 | while ((cs = cpuset_next(&queue))) | 2208 | cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) |
2236 | if (cs != &top_cpuset) | 2209 | schedule_cpuset_propagate_hotplug(cs); |
2237 | schedule_cpuset_propagate_hotplug(cs); | 2210 | rcu_read_unlock(); |
2238 | } | 2211 | } |
2239 | 2212 | ||
2240 | mutex_unlock(&cpuset_mutex); | 2213 | mutex_unlock(&cpuset_mutex); |