aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/cpuset.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r--kernel/cpuset.c351
1 files changed, 198 insertions, 153 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d5ab79cf516d..eab7bd6628e0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -14,6 +14,8 @@
14 * 2003-10-22 Updates by Stephen Hemminger. 14 * 2003-10-22 Updates by Stephen Hemminger.
15 * 2004 May-July Rework by Paul Jackson. 15 * 2004 May-July Rework by Paul Jackson.
16 * 2006 Rework by Paul Menage to use generic cgroups 16 * 2006 Rework by Paul Menage to use generic cgroups
17 * 2008 Rework of the scheduler domains and CPU hotplug handling
18 * by Max Krasnyansky
17 * 19 *
18 * This file is subject to the terms and conditions of the GNU General Public 20 * This file is subject to the terms and conditions of the GNU General Public
19 * License. See the file COPYING in the main directory of the Linux 21 * License. See the file COPYING in the main directory of the Linux
@@ -236,9 +238,11 @@ static struct cpuset top_cpuset = {
236 238
237static DEFINE_MUTEX(callback_mutex); 239static DEFINE_MUTEX(callback_mutex);
238 240
239/* This is ugly, but preserves the userspace API for existing cpuset 241/*
242 * This is ugly, but preserves the userspace API for existing cpuset
240 * users. If someone tries to mount the "cpuset" filesystem, we 243 * users. If someone tries to mount the "cpuset" filesystem, we
241 * silently switch it to mount "cgroup" instead */ 244 * silently switch it to mount "cgroup" instead
245 */
242static int cpuset_get_sb(struct file_system_type *fs_type, 246static int cpuset_get_sb(struct file_system_type *fs_type,
243 int flags, const char *unused_dev_name, 247 int flags, const char *unused_dev_name,
244 void *data, struct vfsmount *mnt) 248 void *data, struct vfsmount *mnt)
@@ -473,10 +477,9 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
473} 477}
474 478
475/* 479/*
476 * Helper routine for rebuild_sched_domains(). 480 * Helper routine for generate_sched_domains().
477 * Do cpusets a, b have overlapping cpus_allowed masks? 481 * Do cpusets a, b have overlapping cpus_allowed masks?
478 */ 482 */
479
480static int cpusets_overlap(struct cpuset *a, struct cpuset *b) 483static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
481{ 484{
482 return cpus_intersects(a->cpus_allowed, b->cpus_allowed); 485 return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
@@ -518,26 +521,15 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
518} 521}
519 522
520/* 523/*
521 * rebuild_sched_domains() 524 * generate_sched_domains()
522 * 525 *
523 * This routine will be called to rebuild the scheduler's dynamic 526 * This function builds a partial partition of the systems CPUs
524 * sched domains: 527 * A 'partial partition' is a set of non-overlapping subsets whose
525 * - if the flag 'sched_load_balance' of any cpuset with non-empty 528 * union is a subset of that set.
526 * 'cpus' changes, 529 * The output of this function needs to be passed to kernel/sched.c
527 * - or if the 'cpus' allowed changes in any cpuset which has that 530 * partition_sched_domains() routine, which will rebuild the scheduler's
528 * flag enabled, 531 * load balancing domains (sched domains) as specified by that partial
529 * - or if the 'sched_relax_domain_level' of any cpuset which has 532 * partition.
530 * that flag enabled and with non-empty 'cpus' changes,
531 * - or if any cpuset with non-empty 'cpus' is removed,
532 * - or if a cpu gets offlined.
533 *
534 * This routine builds a partial partition of the systems CPUs
535 * (the set of non-overlappping cpumask_t's in the array 'part'
536 * below), and passes that partial partition to the kernel/sched.c
537 * partition_sched_domains() routine, which will rebuild the
538 * schedulers load balancing domains (sched domains) as specified
539 * by that partial partition. A 'partial partition' is a set of
540 * non-overlapping subsets whose union is a subset of that set.
541 * 533 *
542 * See "What is sched_load_balance" in Documentation/cpusets.txt 534 * See "What is sched_load_balance" in Documentation/cpusets.txt
543 * for a background explanation of this. 535 * for a background explanation of this.
@@ -547,13 +539,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
547 * domains when operating in the severe memory shortage situations 539 * domains when operating in the severe memory shortage situations
548 * that could cause allocation failures below. 540 * that could cause allocation failures below.
549 * 541 *
550 * Call with cgroup_mutex held. May take callback_mutex during 542 * Must be called with cgroup_lock held.
551 * call due to the kfifo_alloc() and kmalloc() calls. May nest
552 * a call to the get_online_cpus()/put_online_cpus() pair.
553 * Must not be called holding callback_mutex, because we must not
554 * call get_online_cpus() while holding callback_mutex. Elsewhere
555 * the kernel nests callback_mutex inside get_online_cpus() calls.
556 * So the reverse nesting would risk an ABBA deadlock.
557 * 543 *
558 * The three key local variables below are: 544 * The three key local variables below are:
559 * q - a linked-list queue of cpuset pointers, used to implement a 545 * q - a linked-list queue of cpuset pointers, used to implement a
@@ -588,10 +574,10 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
588 * element of the partition (one sched domain) to be passed to 574 * element of the partition (one sched domain) to be passed to
589 * partition_sched_domains(). 575 * partition_sched_domains().
590 */ 576 */
591 577static int generate_sched_domains(cpumask_t **domains,
592void rebuild_sched_domains(void) 578 struct sched_domain_attr **attributes)
593{ 579{
594 LIST_HEAD(q); /* queue of cpusets to be scanned*/ 580 LIST_HEAD(q); /* queue of cpusets to be scanned */
595 struct cpuset *cp; /* scans q */ 581 struct cpuset *cp; /* scans q */
596 struct cpuset **csa; /* array of all cpuset ptrs */ 582 struct cpuset **csa; /* array of all cpuset ptrs */
597 int csn; /* how many cpuset ptrs in csa so far */ 583 int csn; /* how many cpuset ptrs in csa so far */
@@ -601,23 +587,26 @@ void rebuild_sched_domains(void)
601 int ndoms; /* number of sched domains in result */ 587 int ndoms; /* number of sched domains in result */
602 int nslot; /* next empty doms[] cpumask_t slot */ 588 int nslot; /* next empty doms[] cpumask_t slot */
603 589
604 csa = NULL; 590 ndoms = 0;
605 doms = NULL; 591 doms = NULL;
606 dattr = NULL; 592 dattr = NULL;
593 csa = NULL;
607 594
608 /* Special case for the 99% of systems with one, full, sched domain */ 595 /* Special case for the 99% of systems with one, full, sched domain */
609 if (is_sched_load_balance(&top_cpuset)) { 596 if (is_sched_load_balance(&top_cpuset)) {
610 ndoms = 1;
611 doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 597 doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
612 if (!doms) 598 if (!doms)
613 goto rebuild; 599 goto done;
600
614 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); 601 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
615 if (dattr) { 602 if (dattr) {
616 *dattr = SD_ATTR_INIT; 603 *dattr = SD_ATTR_INIT;
617 update_domain_attr_tree(dattr, &top_cpuset); 604 update_domain_attr_tree(dattr, &top_cpuset);
618 } 605 }
619 *doms = top_cpuset.cpus_allowed; 606 *doms = top_cpuset.cpus_allowed;
620 goto rebuild; 607
608 ndoms = 1;
609 goto done;
621 } 610 }
622 611
623 csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); 612 csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
@@ -680,61 +669,141 @@ restart:
680 } 669 }
681 } 670 }
682 671
683 /* Convert <csn, csa> to <ndoms, doms> */ 672 /*
673 * Now we know how many domains to create.
674 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
675 */
684 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); 676 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
685 if (!doms) 677 if (!doms) {
686 goto rebuild; 678 ndoms = 0;
679 goto done;
680 }
681
682 /*
683 * The rest of the code, including the scheduler, can deal with
684 * dattr==NULL case. No need to abort if alloc fails.
685 */
687 dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); 686 dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
688 687
689 for (nslot = 0, i = 0; i < csn; i++) { 688 for (nslot = 0, i = 0; i < csn; i++) {
690 struct cpuset *a = csa[i]; 689 struct cpuset *a = csa[i];
690 cpumask_t *dp;
691 int apn = a->pn; 691 int apn = a->pn;
692 692
693 if (apn >= 0) { 693 if (apn < 0) {
694 cpumask_t *dp = doms + nslot; 694 /* Skip completed partitions */
695 695 continue;
696 if (nslot == ndoms) { 696 }
697 static int warnings = 10; 697
698 if (warnings) { 698 dp = doms + nslot;
699 printk(KERN_WARNING 699
700 "rebuild_sched_domains confused:" 700 if (nslot == ndoms) {
701 " nslot %d, ndoms %d, csn %d, i %d," 701 static int warnings = 10;
702 " apn %d\n", 702 if (warnings) {
703 nslot, ndoms, csn, i, apn); 703 printk(KERN_WARNING
704 warnings--; 704 "rebuild_sched_domains confused:"
705 } 705 " nslot %d, ndoms %d, csn %d, i %d,"
706 continue; 706 " apn %d\n",
707 nslot, ndoms, csn, i, apn);
708 warnings--;
707 } 709 }
710 continue;
711 }
708 712
709 cpus_clear(*dp); 713 cpus_clear(*dp);
710 if (dattr) 714 if (dattr)
711 *(dattr + nslot) = SD_ATTR_INIT; 715 *(dattr + nslot) = SD_ATTR_INIT;
712 for (j = i; j < csn; j++) { 716 for (j = i; j < csn; j++) {
713 struct cpuset *b = csa[j]; 717 struct cpuset *b = csa[j];
714 718
715 if (apn == b->pn) { 719 if (apn == b->pn) {
716 cpus_or(*dp, *dp, b->cpus_allowed); 720 cpus_or(*dp, *dp, b->cpus_allowed);
717 b->pn = -1; 721 if (dattr)
718 if (dattr) 722 update_domain_attr_tree(dattr + nslot, b);
719 update_domain_attr_tree(dattr 723
720 + nslot, b); 724 /* Done with this partition */
721 } 725 b->pn = -1;
722 } 726 }
723 nslot++;
724 } 727 }
728 nslot++;
725 } 729 }
726 BUG_ON(nslot != ndoms); 730 BUG_ON(nslot != ndoms);
727 731
728rebuild: 732done:
729 /* Have scheduler rebuild sched domains */ 733 kfree(csa);
734
735 *domains = doms;
736 *attributes = dattr;
737 return ndoms;
738}
739
740/*
741 * Rebuild scheduler domains.
742 *
743 * Call with neither cgroup_mutex held nor within get_online_cpus().
744 * Takes both cgroup_mutex and get_online_cpus().
745 *
746 * Cannot be directly called from cpuset code handling changes
747 * to the cpuset pseudo-filesystem, because it cannot be called
748 * from code that already holds cgroup_mutex.
749 */
750static void do_rebuild_sched_domains(struct work_struct *unused)
751{
752 struct sched_domain_attr *attr;
753 cpumask_t *doms;
754 int ndoms;
755
730 get_online_cpus(); 756 get_online_cpus();
731 partition_sched_domains(ndoms, doms, dattr); 757
758 /* Generate domain masks and attrs */
759 cgroup_lock();
760 ndoms = generate_sched_domains(&doms, &attr);
761 cgroup_unlock();
762
763 /* Have scheduler rebuild the domains */
764 partition_sched_domains(ndoms, doms, attr);
765
732 put_online_cpus(); 766 put_online_cpus();
767}
733 768
734done: 769static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
735 kfree(csa); 770
736 /* Don't kfree(doms) -- partition_sched_domains() does that. */ 771/*
737 /* Don't kfree(dattr) -- partition_sched_domains() does that. */ 772 * Rebuild scheduler domains, asynchronously via workqueue.
773 *
774 * If the flag 'sched_load_balance' of any cpuset with non-empty
775 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
776 * which has that flag enabled, or if any cpuset with a non-empty
777 * 'cpus' is removed, then call this routine to rebuild the
778 * scheduler's dynamic sched domains.
779 *
780 * The rebuild_sched_domains() and partition_sched_domains()
781 * routines must nest cgroup_lock() inside get_online_cpus(),
782 * but such cpuset changes as these must nest that locking the
783 * other way, holding cgroup_lock() for much of the code.
784 *
785 * So in order to avoid an ABBA deadlock, the cpuset code handling
786 * these user changes delegates the actual sched domain rebuilding
787 * to a separate workqueue thread, which ends up processing the
788 * above do_rebuild_sched_domains() function.
789 */
790static void async_rebuild_sched_domains(void)
791{
792 schedule_work(&rebuild_sched_domains_work);
793}
794
795/*
796 * Accomplishes the same scheduler domain rebuild as the above
797 * async_rebuild_sched_domains(), however it directly calls the
798 * rebuild routine synchronously rather than calling it via an
799 * asynchronous work thread.
800 *
801 * This can only be called from code that is not holding
802 * cgroup_mutex (not nested in a cgroup_lock() call.)
803 */
804void rebuild_sched_domains(void)
805{
806 do_rebuild_sched_domains(NULL);
738} 807}
739 808
740/** 809/**
@@ -774,37 +843,25 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
774/** 843/**
775 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. 844 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
776 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed 845 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
846 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
777 * 847 *
778 * Called with cgroup_mutex held 848 * Called with cgroup_mutex held
779 * 849 *
780 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 850 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
781 * calling callback functions for each. 851 * calling callback functions for each.
782 * 852 *
783 * Return 0 if successful, -errno if not. 853 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
854 * if @heap != NULL.
784 */ 855 */
785static int update_tasks_cpumask(struct cpuset *cs) 856static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
786{ 857{
787 struct cgroup_scanner scan; 858 struct cgroup_scanner scan;
788 struct ptr_heap heap;
789 int retval;
790
791 /*
792 * cgroup_scan_tasks() will initialize heap->gt for us.
793 * heap_init() is still needed here for we should not change
794 * cs->cpus_allowed when heap_init() fails.
795 */
796 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
797 if (retval)
798 return retval;
799 859
800 scan.cg = cs->css.cgroup; 860 scan.cg = cs->css.cgroup;
801 scan.test_task = cpuset_test_cpumask; 861 scan.test_task = cpuset_test_cpumask;
802 scan.process_task = cpuset_change_cpumask; 862 scan.process_task = cpuset_change_cpumask;
803 scan.heap = &heap; 863 scan.heap = heap;
804 retval = cgroup_scan_tasks(&scan); 864 cgroup_scan_tasks(&scan);
805
806 heap_free(&heap);
807 return retval;
808} 865}
809 866
810/** 867/**
@@ -814,6 +871,7 @@ static int update_tasks_cpumask(struct cpuset *cs)
814 */ 871 */
815static int update_cpumask(struct cpuset *cs, const char *buf) 872static int update_cpumask(struct cpuset *cs, const char *buf)
816{ 873{
874 struct ptr_heap heap;
817 struct cpuset trialcs; 875 struct cpuset trialcs;
818 int retval; 876 int retval;
819 int is_load_balanced; 877 int is_load_balanced;
@@ -848,6 +906,10 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
848 if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) 906 if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
849 return 0; 907 return 0;
850 908
909 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
910 if (retval)
911 return retval;
912
851 is_load_balanced = is_sched_load_balance(&trialcs); 913 is_load_balanced = is_sched_load_balance(&trialcs);
852 914
853 mutex_lock(&callback_mutex); 915 mutex_lock(&callback_mutex);
@@ -858,12 +920,12 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
858 * Scan tasks in the cpuset, and update the cpumasks of any 920 * Scan tasks in the cpuset, and update the cpumasks of any
859 * that need an update. 921 * that need an update.
860 */ 922 */
861 retval = update_tasks_cpumask(cs); 923 update_tasks_cpumask(cs, &heap);
862 if (retval < 0) 924
863 return retval; 925 heap_free(&heap);
864 926
865 if (is_load_balanced) 927 if (is_load_balanced)
866 rebuild_sched_domains(); 928 async_rebuild_sched_domains();
867 return 0; 929 return 0;
868} 930}
869 931
@@ -1090,7 +1152,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1090 if (val != cs->relax_domain_level) { 1152 if (val != cs->relax_domain_level) {
1091 cs->relax_domain_level = val; 1153 cs->relax_domain_level = val;
1092 if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) 1154 if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
1093 rebuild_sched_domains(); 1155 async_rebuild_sched_domains();
1094 } 1156 }
1095 1157
1096 return 0; 1158 return 0;
@@ -1131,7 +1193,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1131 mutex_unlock(&callback_mutex); 1193 mutex_unlock(&callback_mutex);
1132 1194
1133 if (cpus_nonempty && balance_flag_changed) 1195 if (cpus_nonempty && balance_flag_changed)
1134 rebuild_sched_domains(); 1196 async_rebuild_sched_domains();
1135 1197
1136 return 0; 1198 return 0;
1137} 1199}
@@ -1492,6 +1554,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
1492 default: 1554 default:
1493 BUG(); 1555 BUG();
1494 } 1556 }
1557
1558 /* Unreachable but makes gcc happy */
1559 return 0;
1495} 1560}
1496 1561
1497static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) 1562static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
@@ -1504,6 +1569,9 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
1504 default: 1569 default:
1505 BUG(); 1570 BUG();
1506 } 1571 }
1572
1573 /* Unrechable but makes gcc happy */
1574 return 0;
1507} 1575}
1508 1576
1509 1577
@@ -1692,15 +1760,9 @@ static struct cgroup_subsys_state *cpuset_create(
1692} 1760}
1693 1761
1694/* 1762/*
1695 * Locking note on the strange update_flag() call below:
1696 *
1697 * If the cpuset being removed has its flag 'sched_load_balance' 1763 * If the cpuset being removed has its flag 'sched_load_balance'
1698 * enabled, then simulate turning sched_load_balance off, which 1764 * enabled, then simulate turning sched_load_balance off, which
1699 * will call rebuild_sched_domains(). The get_online_cpus() 1765 * will call async_rebuild_sched_domains().
1700 * call in rebuild_sched_domains() must not be made while holding
1701 * callback_mutex. Elsewhere the kernel nests callback_mutex inside
1702 * get_online_cpus() calls. So the reverse nesting would risk an
1703 * ABBA deadlock.
1704 */ 1766 */
1705 1767
1706static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) 1768static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
@@ -1719,7 +1781,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1719struct cgroup_subsys cpuset_subsys = { 1781struct cgroup_subsys cpuset_subsys = {
1720 .name = "cpuset", 1782 .name = "cpuset",
1721 .create = cpuset_create, 1783 .create = cpuset_create,
1722 .destroy = cpuset_destroy, 1784 .destroy = cpuset_destroy,
1723 .can_attach = cpuset_can_attach, 1785 .can_attach = cpuset_can_attach,
1724 .attach = cpuset_attach, 1786 .attach = cpuset_attach,
1725 .populate = cpuset_populate, 1787 .populate = cpuset_populate,
@@ -1811,7 +1873,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
1811} 1873}
1812 1874
1813/* 1875/*
1814 * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs 1876 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
1815 * or memory nodes, we need to walk over the cpuset hierarchy, 1877 * or memory nodes, we need to walk over the cpuset hierarchy,
1816 * removing that CPU or node from all cpusets. If this removes the 1878 * removing that CPU or node from all cpusets. If this removes the
1817 * last CPU or node from a cpuset, then move the tasks in the empty 1879 * last CPU or node from a cpuset, then move the tasks in the empty
@@ -1859,7 +1921,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
1859 * that has tasks along with an empty 'mems'. But if we did see such 1921 * that has tasks along with an empty 'mems'. But if we did see such
1860 * a cpuset, we'd handle it just like we do if its 'cpus' was empty. 1922 * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
1861 */ 1923 */
1862static void scan_for_empty_cpusets(const struct cpuset *root) 1924static void scan_for_empty_cpusets(struct cpuset *root)
1863{ 1925{
1864 LIST_HEAD(queue); 1926 LIST_HEAD(queue);
1865 struct cpuset *cp; /* scans cpusets being updated */ 1927 struct cpuset *cp; /* scans cpusets being updated */
@@ -1896,42 +1958,13 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
1896 nodes_empty(cp->mems_allowed)) 1958 nodes_empty(cp->mems_allowed))
1897 remove_tasks_in_empty_cpuset(cp); 1959 remove_tasks_in_empty_cpuset(cp);
1898 else { 1960 else {
1899 update_tasks_cpumask(cp); 1961 update_tasks_cpumask(cp, NULL);
1900 update_tasks_nodemask(cp, &oldmems); 1962 update_tasks_nodemask(cp, &oldmems);
1901 } 1963 }
1902 } 1964 }
1903} 1965}
1904 1966
1905/* 1967/*
1906 * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
1907 * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to
1908 * track what's online after any CPU or memory node hotplug or unplug event.
1909 *
1910 * Since there are two callers of this routine, one for CPU hotplug
1911 * events and one for memory node hotplug events, we could have coded
1912 * two separate routines here. We code it as a single common routine
1913 * in order to minimize text size.
1914 */
1915
1916static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
1917{
1918 cgroup_lock();
1919
1920 top_cpuset.cpus_allowed = cpu_online_map;
1921 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
1922 scan_for_empty_cpusets(&top_cpuset);
1923
1924 /*
1925 * Scheduler destroys domains on hotplug events.
1926 * Rebuild them based on the current settings.
1927 */
1928 if (rebuild_sd)
1929 rebuild_sched_domains();
1930
1931 cgroup_unlock();
1932}
1933
1934/*
1935 * The top_cpuset tracks what CPUs and Memory Nodes are online, 1968 * The top_cpuset tracks what CPUs and Memory Nodes are online,
1936 * period. This is necessary in order to make cpusets transparent 1969 * period. This is necessary in order to make cpusets transparent
1937 * (of no affect) on systems that are actively using CPU hotplug 1970 * (of no affect) on systems that are actively using CPU hotplug
@@ -1939,40 +1972,52 @@ static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
1939 * 1972 *
1940 * This routine ensures that top_cpuset.cpus_allowed tracks 1973 * This routine ensures that top_cpuset.cpus_allowed tracks
1941 * cpu_online_map on each CPU hotplug (cpuhp) event. 1974 * cpu_online_map on each CPU hotplug (cpuhp) event.
1975 *
1976 * Called within get_online_cpus(). Needs to call cgroup_lock()
1977 * before calling generate_sched_domains().
1942 */ 1978 */
1943 1979static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
1944static int cpuset_handle_cpuhp(struct notifier_block *unused_nb,
1945 unsigned long phase, void *unused_cpu) 1980 unsigned long phase, void *unused_cpu)
1946{ 1981{
1982 struct sched_domain_attr *attr;
1983 cpumask_t *doms;
1984 int ndoms;
1985
1947 switch (phase) { 1986 switch (phase) {
1948 case CPU_UP_CANCELED:
1949 case CPU_UP_CANCELED_FROZEN:
1950 case CPU_DOWN_FAILED:
1951 case CPU_DOWN_FAILED_FROZEN:
1952 case CPU_ONLINE: 1987 case CPU_ONLINE:
1953 case CPU_ONLINE_FROZEN: 1988 case CPU_ONLINE_FROZEN:
1954 case CPU_DEAD: 1989 case CPU_DEAD:
1955 case CPU_DEAD_FROZEN: 1990 case CPU_DEAD_FROZEN:
1956 common_cpu_mem_hotplug_unplug(1);
1957 break; 1991 break;
1992
1958 default: 1993 default:
1959 return NOTIFY_DONE; 1994 return NOTIFY_DONE;
1960 } 1995 }
1961 1996
1997 cgroup_lock();
1998 top_cpuset.cpus_allowed = cpu_online_map;
1999 scan_for_empty_cpusets(&top_cpuset);
2000 ndoms = generate_sched_domains(&doms, &attr);
2001 cgroup_unlock();
2002
2003 /* Have scheduler rebuild the domains */
2004 partition_sched_domains(ndoms, doms, attr);
2005
1962 return NOTIFY_OK; 2006 return NOTIFY_OK;
1963} 2007}
1964 2008
1965#ifdef CONFIG_MEMORY_HOTPLUG 2009#ifdef CONFIG_MEMORY_HOTPLUG
1966/* 2010/*
1967 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. 2011 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
1968 * Call this routine anytime after you change 2012 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
1969 * node_states[N_HIGH_MEMORY]. 2013 * See also the previous routine cpuset_track_online_cpus().
1970 * See also the previous routine cpuset_handle_cpuhp().
1971 */ 2014 */
1972
1973void cpuset_track_online_nodes(void) 2015void cpuset_track_online_nodes(void)
1974{ 2016{
1975 common_cpu_mem_hotplug_unplug(0); 2017 cgroup_lock();
2018 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2019 scan_for_empty_cpusets(&top_cpuset);
2020 cgroup_unlock();
1976} 2021}
1977#endif 2022#endif
1978 2023
@@ -1987,7 +2032,7 @@ void __init cpuset_init_smp(void)
1987 top_cpuset.cpus_allowed = cpu_online_map; 2032 top_cpuset.cpus_allowed = cpu_online_map;
1988 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2033 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
1989 2034
1990 hotcpu_notifier(cpuset_handle_cpuhp, 0); 2035 hotcpu_notifier(cpuset_track_online_cpus, 0);
1991} 2036}
1992 2037
1993/** 2038/**