aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--kernel/cpuset.c123
1 files changed, 48 insertions, 75 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 5e348ae37ce9..0ddb48d40e4d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -103,9 +103,6 @@ struct cpuset {
103 /* for custom sched domain */ 103 /* for custom sched domain */
104 int relax_domain_level; 104 int relax_domain_level;
105 105
106 /* used for walking a cpuset hierarchy */
107 struct list_head stack_list;
108
109 struct work_struct hotplug_work; 106 struct work_struct hotplug_work;
110}; 107};
111 108
@@ -207,6 +204,20 @@ static struct cpuset top_cpuset = {
207 cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \ 204 cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \
208 if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp))))) 205 if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp)))))
209 206
207/**
208 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
209 * @des_cs: loop cursor pointing to the current descendant
210 * @pos_cgrp: used for iteration
211 * @root_cs: target cpuset whose descendants are walked
212 *
213 * Walk @des_cs through the online descendants of @root_cs. Must be used
214 * with RCU read locked. The caller may modify @pos_cgrp by calling
215 * cgroup_rightmost_descendant() to skip subtree.
216 */
217#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \
218 cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \
219 if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp)))))
220
210/* 221/*
211 * There are two global mutexes guarding cpuset structures - cpuset_mutex 222 * There are two global mutexes guarding cpuset structures - cpuset_mutex
212 * and callback_mutex. The latter may nest inside the former. We also 223 * and callback_mutex. The latter may nest inside the former. We also
@@ -507,31 +518,24 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
507 return; 518 return;
508} 519}
509 520
510static void 521static void update_domain_attr_tree(struct sched_domain_attr *dattr,
511update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) 522 struct cpuset *root_cs)
512{ 523{
513 LIST_HEAD(q); 524 struct cpuset *cp;
514 525 struct cgroup *pos_cgrp;
515 list_add(&c->stack_list, &q);
516 while (!list_empty(&q)) {
517 struct cpuset *cp;
518 struct cgroup *cont;
519 struct cpuset *child;
520
521 cp = list_first_entry(&q, struct cpuset, stack_list);
522 list_del(q.next);
523 526
524 if (cpumask_empty(cp->cpus_allowed)) 527 rcu_read_lock();
528 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
529 /* skip the whole subtree if @cp doesn't have any CPU */
530 if (cpumask_empty(cp->cpus_allowed)) {
531 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
525 continue; 532 continue;
533 }
526 534
527 if (is_sched_load_balance(cp)) 535 if (is_sched_load_balance(cp))
528 update_domain_attr(dattr, cp); 536 update_domain_attr(dattr, cp);
529
530 rcu_read_lock();
531 cpuset_for_each_child(child, cont, cp)
532 list_add_tail(&child->stack_list, &q);
533 rcu_read_unlock();
534 } 537 }
538 rcu_read_unlock();
535} 539}
536 540
537/* 541/*
@@ -591,7 +595,6 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
591static int generate_sched_domains(cpumask_var_t **domains, 595static int generate_sched_domains(cpumask_var_t **domains,
592 struct sched_domain_attr **attributes) 596 struct sched_domain_attr **attributes)
593{ 597{
594 LIST_HEAD(q); /* queue of cpusets to be scanned */
595 struct cpuset *cp; /* scans q */ 598 struct cpuset *cp; /* scans q */
596 struct cpuset **csa; /* array of all cpuset ptrs */ 599 struct cpuset **csa; /* array of all cpuset ptrs */
597 int csn; /* how many cpuset ptrs in csa so far */ 600 int csn; /* how many cpuset ptrs in csa so far */
@@ -600,6 +603,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
600 struct sched_domain_attr *dattr; /* attributes for custom domains */ 603 struct sched_domain_attr *dattr; /* attributes for custom domains */
601 int ndoms = 0; /* number of sched domains in result */ 604 int ndoms = 0; /* number of sched domains in result */
602 int nslot; /* next empty doms[] struct cpumask slot */ 605 int nslot; /* next empty doms[] struct cpumask slot */
606 struct cgroup *pos_cgrp;
603 607
604 doms = NULL; 608 doms = NULL;
605 dattr = NULL; 609 dattr = NULL;
@@ -627,33 +631,27 @@ static int generate_sched_domains(cpumask_var_t **domains,
627 goto done; 631 goto done;
628 csn = 0; 632 csn = 0;
629 633
630 list_add(&top_cpuset.stack_list, &q); 634 rcu_read_lock();
631 while (!list_empty(&q)) { 635 cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) {
632 struct cgroup *cont;
633 struct cpuset *child; /* scans child cpusets of cp */
634
635 cp = list_first_entry(&q, struct cpuset, stack_list);
636 list_del(q.next);
637
638 if (cpumask_empty(cp->cpus_allowed))
639 continue;
640
641 /* 636 /*
642 * All child cpusets contain a subset of the parent's cpus, so 637 * Continue traversing beyond @cp iff @cp has some CPUs and
643 * just skip them, and then we call update_domain_attr_tree() 638 * isn't load balancing. The former is obvious. The
644 * to calc relax_domain_level of the corresponding sched 639 * latter: All child cpusets contain a subset of the
645 * domain. 640 * parent's cpus, so just skip them, and then we call
641 * update_domain_attr_tree() to calc relax_domain_level of
642 * the corresponding sched domain.
646 */ 643 */
647 if (is_sched_load_balance(cp)) { 644 if (!cpumask_empty(cp->cpus_allowed) &&
648 csa[csn++] = cp; 645 !is_sched_load_balance(cp))
649 continue; 646 continue;
650 }
651 647
652 rcu_read_lock(); 648 if (is_sched_load_balance(cp))
653 cpuset_for_each_child(child, cont, cp) 649 csa[csn++] = cp;
654 list_add_tail(&child->stack_list, &q); 650
655 rcu_read_unlock(); 651 /* skip @cp's subtree */
656 } 652 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
653 }
654 rcu_read_unlock();
657 655
658 for (i = 0; i < csn; i++) 656 for (i = 0; i < csn; i++)
659 csa[i]->pn = i; 657 csa[i]->pn = i;
@@ -2068,31 +2066,6 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2068 move_member_tasks_to_cpuset(cs, parent); 2066 move_member_tasks_to_cpuset(cs, parent);
2069} 2067}
2070 2068
2071/*
2072 * Helper function to traverse cpusets.
2073 * It can be used to walk the cpuset tree from top to bottom, completing
2074 * one layer before dropping down to the next (thus always processing a
2075 * node before any of its children).
2076 */
2077static struct cpuset *cpuset_next(struct list_head *queue)
2078{
2079 struct cpuset *cp;
2080 struct cpuset *child; /* scans child cpusets of cp */
2081 struct cgroup *cont;
2082
2083 if (list_empty(queue))
2084 return NULL;
2085
2086 cp = list_first_entry(queue, struct cpuset, stack_list);
2087 list_del(queue->next);
2088 rcu_read_lock();
2089 cpuset_for_each_child(child, cont, cp)
2090 list_add_tail(&child->stack_list, queue);
2091 rcu_read_unlock();
2092
2093 return cp;
2094}
2095
2096/** 2069/**
2097 * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset 2070 * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset
2098 * @cs: cpuset in interest 2071 * @cs: cpuset in interest
@@ -2229,12 +2202,12 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2229 /* if cpus or mems went down, we need to propagate to descendants */ 2202 /* if cpus or mems went down, we need to propagate to descendants */
2230 if (cpus_offlined || mems_offlined) { 2203 if (cpus_offlined || mems_offlined) {
2231 struct cpuset *cs; 2204 struct cpuset *cs;
2232 LIST_HEAD(queue); 2205 struct cgroup *pos_cgrp;
2233 2206
2234 list_add_tail(&top_cpuset.stack_list, &queue); 2207 rcu_read_lock();
2235 while ((cs = cpuset_next(&queue))) 2208 cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset)
2236 if (cs != &top_cpuset) 2209 schedule_cpuset_propagate_hotplug(cs);
2237 schedule_cpuset_propagate_hotplug(cs); 2210 rcu_read_unlock();
2238 } 2211 }
2239 2212
2240 mutex_unlock(&cpuset_mutex); 2213 mutex_unlock(&cpuset_mutex);