aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMax Krasnyansky <maxk@qualcomm.com>2008-08-11 17:33:53 -0400
committerIngo Molnar <mingo@elte.hu>2008-08-14 05:23:51 -0400
commitcf417141cbb3a4ceb5cca15b2c1f099bd0a6603c (patch)
tree958d4539b90d72ba6a7573181ce5ef7153003b4d
parentb635acec48bcaa9183fcbf4e3955616b0d4119b5 (diff)
sched, cpuset: rework sched domains and CPU hotplug handling (v4)
This is an updated version of my previous cpuset patch on top of the latest mainline git. The patch fixes CPU hotplug handling issues in the current cpusets code. Namely circular locking in rebuild_sched_domains() and unsafe access to the cpu_online_map in the cpuset cpu hotplug handler. This version includes changes suggested by Paul Jackson (naming, comments, style, etc). I also got rid of the separate workqueue thread because it is now safe to call get_online_cpus() from workqueue callbacks. Here are some more details: rebuild_sched_domains() is the only way to rebuild sched domains correctly based on the current cpuset settings. What this means is that we need to be able to call it from different contexts, like cpu hotplug for example. Also latest scheduler code in -tip now calls rebuild_sched_domains() directly from functions like arch_reinit_sched_domains(). In order to support that properly we need to rework cpuset locking rules to avoid circular dependencies, which is what this patch does. New lock nesting rules are explained in the comments. We can now safely call rebuild_sched_domains() from virtually any context. The only requirement is that it needs to be called under get_online_cpus(). This allows cpu hotplug handlers and the scheduler to call rebuild_sched_domains() directly. The rest of the cpuset code now offloads sched domains rebuilds to a workqueue (async_rebuild_sched_domains()). This version of the patch addresses comments from the previous review. I fixed all miss-formated comments and trailing spaces. I also factored out the code that builds domain masks and split up CPU and memory hotplug handling. This was needed to simplify locking, to avoid unsafe access to the cpu_online_map from mem hotplug handler, and in general to make things cleaner. The patch passes moderate testing (building kernel with -j 16, creating & removing domains and bringing cpus off/online at the same time) on the quad-core2 based machine. It passes lockdep checks, even with preemptable RCU enabled. This time I also tested in with suspend/resume path and everything is working as expected. Signed-off-by: Max Krasnyansky <maxk@qualcomm.com> Acked-by: Paul Jackson <pj@sgi.com> Cc: menage@google.com Cc: a.p.zijlstra@chello.nl Cc: vegard.nossum@gmail.com Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--kernel/cpuset.c312
1 files changed, 182 insertions, 130 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d5ab79cf516d..f227bc172690 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -14,6 +14,8 @@
14 * 2003-10-22 Updates by Stephen Hemminger. 14 * 2003-10-22 Updates by Stephen Hemminger.
15 * 2004 May-July Rework by Paul Jackson. 15 * 2004 May-July Rework by Paul Jackson.
16 * 2006 Rework by Paul Menage to use generic cgroups 16 * 2006 Rework by Paul Menage to use generic cgroups
17 * 2008 Rework of the scheduler domains and CPU hotplug handling
18 * by Max Krasnyansky
17 * 19 *
18 * This file is subject to the terms and conditions of the GNU General Public 20 * This file is subject to the terms and conditions of the GNU General Public
19 * License. See the file COPYING in the main directory of the Linux 21 * License. See the file COPYING in the main directory of the Linux
@@ -236,9 +238,11 @@ static struct cpuset top_cpuset = {
236 238
237static DEFINE_MUTEX(callback_mutex); 239static DEFINE_MUTEX(callback_mutex);
238 240
239/* This is ugly, but preserves the userspace API for existing cpuset 241/*
242 * This is ugly, but preserves the userspace API for existing cpuset
240 * users. If someone tries to mount the "cpuset" filesystem, we 243 * users. If someone tries to mount the "cpuset" filesystem, we
241 * silently switch it to mount "cgroup" instead */ 244 * silently switch it to mount "cgroup" instead
245 */
242static int cpuset_get_sb(struct file_system_type *fs_type, 246static int cpuset_get_sb(struct file_system_type *fs_type,
243 int flags, const char *unused_dev_name, 247 int flags, const char *unused_dev_name,
244 void *data, struct vfsmount *mnt) 248 void *data, struct vfsmount *mnt)
@@ -473,10 +477,9 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
473} 477}
474 478
475/* 479/*
476 * Helper routine for rebuild_sched_domains(). 480 * Helper routine for generate_sched_domains().
477 * Do cpusets a, b have overlapping cpus_allowed masks? 481 * Do cpusets a, b have overlapping cpus_allowed masks?
478 */ 482 */
479
480static int cpusets_overlap(struct cpuset *a, struct cpuset *b) 483static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
481{ 484{
482 return cpus_intersects(a->cpus_allowed, b->cpus_allowed); 485 return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
@@ -518,26 +521,15 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
518} 521}
519 522
520/* 523/*
521 * rebuild_sched_domains() 524 * generate_sched_domains()
522 * 525 *
523 * This routine will be called to rebuild the scheduler's dynamic 526 * This function builds a partial partition of the systems CPUs
524 * sched domains: 527 * A 'partial partition' is a set of non-overlapping subsets whose
525 * - if the flag 'sched_load_balance' of any cpuset with non-empty 528 * union is a subset of that set.
526 * 'cpus' changes, 529 * The output of this function needs to be passed to kernel/sched.c
527 * - or if the 'cpus' allowed changes in any cpuset which has that 530 * partition_sched_domains() routine, which will rebuild the scheduler's
528 * flag enabled, 531 * load balancing domains (sched domains) as specified by that partial
529 * - or if the 'sched_relax_domain_level' of any cpuset which has 532 * partition.
530 * that flag enabled and with non-empty 'cpus' changes,
531 * - or if any cpuset with non-empty 'cpus' is removed,
532 * - or if a cpu gets offlined.
533 *
534 * This routine builds a partial partition of the systems CPUs
535 * (the set of non-overlappping cpumask_t's in the array 'part'
536 * below), and passes that partial partition to the kernel/sched.c
537 * partition_sched_domains() routine, which will rebuild the
538 * schedulers load balancing domains (sched domains) as specified
539 * by that partial partition. A 'partial partition' is a set of
540 * non-overlapping subsets whose union is a subset of that set.
541 * 533 *
542 * See "What is sched_load_balance" in Documentation/cpusets.txt 534 * See "What is sched_load_balance" in Documentation/cpusets.txt
543 * for a background explanation of this. 535 * for a background explanation of this.
@@ -547,13 +539,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
547 * domains when operating in the severe memory shortage situations 539 * domains when operating in the severe memory shortage situations
548 * that could cause allocation failures below. 540 * that could cause allocation failures below.
549 * 541 *
550 * Call with cgroup_mutex held. May take callback_mutex during 542 * Must be called with cgroup_lock held.
551 * call due to the kfifo_alloc() and kmalloc() calls. May nest
552 * a call to the get_online_cpus()/put_online_cpus() pair.
553 * Must not be called holding callback_mutex, because we must not
554 * call get_online_cpus() while holding callback_mutex. Elsewhere
555 * the kernel nests callback_mutex inside get_online_cpus() calls.
556 * So the reverse nesting would risk an ABBA deadlock.
557 * 543 *
558 * The three key local variables below are: 544 * The three key local variables below are:
559 * q - a linked-list queue of cpuset pointers, used to implement a 545 * q - a linked-list queue of cpuset pointers, used to implement a
@@ -588,10 +574,10 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
588 * element of the partition (one sched domain) to be passed to 574 * element of the partition (one sched domain) to be passed to
589 * partition_sched_domains(). 575 * partition_sched_domains().
590 */ 576 */
591 577static int generate_sched_domains(cpumask_t **domains,
592void rebuild_sched_domains(void) 578 struct sched_domain_attr **attributes)
593{ 579{
594 LIST_HEAD(q); /* queue of cpusets to be scanned*/ 580 LIST_HEAD(q); /* queue of cpusets to be scanned */
595 struct cpuset *cp; /* scans q */ 581 struct cpuset *cp; /* scans q */
596 struct cpuset **csa; /* array of all cpuset ptrs */ 582 struct cpuset **csa; /* array of all cpuset ptrs */
597 int csn; /* how many cpuset ptrs in csa so far */ 583 int csn; /* how many cpuset ptrs in csa so far */
@@ -601,23 +587,26 @@ void rebuild_sched_domains(void)
601 int ndoms; /* number of sched domains in result */ 587 int ndoms; /* number of sched domains in result */
602 int nslot; /* next empty doms[] cpumask_t slot */ 588 int nslot; /* next empty doms[] cpumask_t slot */
603 589
604 csa = NULL; 590 ndoms = 0;
605 doms = NULL; 591 doms = NULL;
606 dattr = NULL; 592 dattr = NULL;
593 csa = NULL;
607 594
608 /* Special case for the 99% of systems with one, full, sched domain */ 595 /* Special case for the 99% of systems with one, full, sched domain */
609 if (is_sched_load_balance(&top_cpuset)) { 596 if (is_sched_load_balance(&top_cpuset)) {
610 ndoms = 1;
611 doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 597 doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
612 if (!doms) 598 if (!doms)
613 goto rebuild; 599 goto done;
600
614 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); 601 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
615 if (dattr) { 602 if (dattr) {
616 *dattr = SD_ATTR_INIT; 603 *dattr = SD_ATTR_INIT;
617 update_domain_attr_tree(dattr, &top_cpuset); 604 update_domain_attr_tree(dattr, &top_cpuset);
618 } 605 }
619 *doms = top_cpuset.cpus_allowed; 606 *doms = top_cpuset.cpus_allowed;
620 goto rebuild; 607
608 ndoms = 1;
609 goto done;
621 } 610 }
622 611
623 csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); 612 csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
@@ -680,61 +669,141 @@ restart:
680 } 669 }
681 } 670 }
682 671
683 /* Convert <csn, csa> to <ndoms, doms> */ 672 /*
673 * Now we know how many domains to create.
674 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
675 */
684 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); 676 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
685 if (!doms) 677 if (!doms) {
686 goto rebuild; 678 ndoms = 0;
679 goto done;
680 }
681
682 /*
683 * The rest of the code, including the scheduler, can deal with
684 * dattr==NULL case. No need to abort if alloc fails.
685 */
687 dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); 686 dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
688 687
689 for (nslot = 0, i = 0; i < csn; i++) { 688 for (nslot = 0, i = 0; i < csn; i++) {
690 struct cpuset *a = csa[i]; 689 struct cpuset *a = csa[i];
690 cpumask_t *dp;
691 int apn = a->pn; 691 int apn = a->pn;
692 692
693 if (apn >= 0) { 693 if (apn < 0) {
694 cpumask_t *dp = doms + nslot; 694 /* Skip completed partitions */
695 695 continue;
696 if (nslot == ndoms) { 696 }
697 static int warnings = 10; 697
698 if (warnings) { 698 dp = doms + nslot;
699 printk(KERN_WARNING 699
700 "rebuild_sched_domains confused:" 700 if (nslot == ndoms) {
701 " nslot %d, ndoms %d, csn %d, i %d," 701 static int warnings = 10;
702 " apn %d\n", 702 if (warnings) {
703 nslot, ndoms, csn, i, apn); 703 printk(KERN_WARNING
704 warnings--; 704 "rebuild_sched_domains confused:"
705 } 705 " nslot %d, ndoms %d, csn %d, i %d,"
706 continue; 706 " apn %d\n",
707 nslot, ndoms, csn, i, apn);
708 warnings--;
707 } 709 }
710 continue;
711 }
708 712
709 cpus_clear(*dp); 713 cpus_clear(*dp);
710 if (dattr) 714 if (dattr)
711 *(dattr + nslot) = SD_ATTR_INIT; 715 *(dattr + nslot) = SD_ATTR_INIT;
712 for (j = i; j < csn; j++) { 716 for (j = i; j < csn; j++) {
713 struct cpuset *b = csa[j]; 717 struct cpuset *b = csa[j];
714 718
715 if (apn == b->pn) { 719 if (apn == b->pn) {
716 cpus_or(*dp, *dp, b->cpus_allowed); 720 cpus_or(*dp, *dp, b->cpus_allowed);
717 b->pn = -1; 721 if (dattr)
718 if (dattr) 722 update_domain_attr_tree(dattr + nslot, b);
719 update_domain_attr_tree(dattr 723
720 + nslot, b); 724 /* Done with this partition */
721 } 725 b->pn = -1;
722 } 726 }
723 nslot++;
724 } 727 }
728 nslot++;
725 } 729 }
726 BUG_ON(nslot != ndoms); 730 BUG_ON(nslot != ndoms);
727 731
728rebuild: 732done:
729 /* Have scheduler rebuild sched domains */ 733 kfree(csa);
734
735 *domains = doms;
736 *attributes = dattr;
737 return ndoms;
738}
739
740/*
741 * Rebuild scheduler domains.
742 *
743 * Call with neither cgroup_mutex held nor within get_online_cpus().
744 * Takes both cgroup_mutex and get_online_cpus().
745 *
746 * Cannot be directly called from cpuset code handling changes
747 * to the cpuset pseudo-filesystem, because it cannot be called
748 * from code that already holds cgroup_mutex.
749 */
750static void do_rebuild_sched_domains(struct work_struct *unused)
751{
752 struct sched_domain_attr *attr;
753 cpumask_t *doms;
754 int ndoms;
755
730 get_online_cpus(); 756 get_online_cpus();
731 partition_sched_domains(ndoms, doms, dattr); 757
758 /* Generate domain masks and attrs */
759 cgroup_lock();
760 ndoms = generate_sched_domains(&doms, &attr);
761 cgroup_unlock();
762
763 /* Have scheduler rebuild the domains */
764 partition_sched_domains(ndoms, doms, attr);
765
732 put_online_cpus(); 766 put_online_cpus();
767}
733 768
734done: 769static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
735 kfree(csa); 770
736 /* Don't kfree(doms) -- partition_sched_domains() does that. */ 771/*
737 /* Don't kfree(dattr) -- partition_sched_domains() does that. */ 772 * Rebuild scheduler domains, asynchronously via workqueue.
773 *
774 * If the flag 'sched_load_balance' of any cpuset with non-empty
775 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
776 * which has that flag enabled, or if any cpuset with a non-empty
777 * 'cpus' is removed, then call this routine to rebuild the
778 * scheduler's dynamic sched domains.
779 *
780 * The rebuild_sched_domains() and partition_sched_domains()
781 * routines must nest cgroup_lock() inside get_online_cpus(),
782 * but such cpuset changes as these must nest that locking the
783 * other way, holding cgroup_lock() for much of the code.
784 *
785 * So in order to avoid an ABBA deadlock, the cpuset code handling
786 * these user changes delegates the actual sched domain rebuilding
787 * to a separate workqueue thread, which ends up processing the
788 * above do_rebuild_sched_domains() function.
789 */
790static void async_rebuild_sched_domains(void)
791{
792 schedule_work(&rebuild_sched_domains_work);
793}
794
795/*
796 * Accomplishes the same scheduler domain rebuild as the above
797 * async_rebuild_sched_domains(), however it directly calls the
798 * rebuild routine synchronously rather than calling it via an
799 * asynchronous work thread.
800 *
801 * This can only be called from code that is not holding
802 * cgroup_mutex (not nested in a cgroup_lock() call.)
803 */
804void rebuild_sched_domains(void)
805{
806 do_rebuild_sched_domains(NULL);
738} 807}
739 808
740/** 809/**
@@ -863,7 +932,7 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
863 return retval; 932 return retval;
864 933
865 if (is_load_balanced) 934 if (is_load_balanced)
866 rebuild_sched_domains(); 935 async_rebuild_sched_domains();
867 return 0; 936 return 0;
868} 937}
869 938
@@ -1090,7 +1159,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1090 if (val != cs->relax_domain_level) { 1159 if (val != cs->relax_domain_level) {
1091 cs->relax_domain_level = val; 1160 cs->relax_domain_level = val;
1092 if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) 1161 if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
1093 rebuild_sched_domains(); 1162 async_rebuild_sched_domains();
1094 } 1163 }
1095 1164
1096 return 0; 1165 return 0;
@@ -1131,7 +1200,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1131 mutex_unlock(&callback_mutex); 1200 mutex_unlock(&callback_mutex);
1132 1201
1133 if (cpus_nonempty && balance_flag_changed) 1202 if (cpus_nonempty && balance_flag_changed)
1134 rebuild_sched_domains(); 1203 async_rebuild_sched_domains();
1135 1204
1136 return 0; 1205 return 0;
1137} 1206}
@@ -1492,6 +1561,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
1492 default: 1561 default:
1493 BUG(); 1562 BUG();
1494 } 1563 }
1564
1565 /* Unreachable but makes gcc happy */
1566 return 0;
1495} 1567}
1496 1568
1497static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) 1569static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
@@ -1504,6 +1576,9 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
1504 default: 1576 default:
1505 BUG(); 1577 BUG();
1506 } 1578 }
1579
1580 /* Unrechable but makes gcc happy */
1581 return 0;
1507} 1582}
1508 1583
1509 1584
@@ -1692,15 +1767,9 @@ static struct cgroup_subsys_state *cpuset_create(
1692} 1767}
1693 1768
1694/* 1769/*
1695 * Locking note on the strange update_flag() call below:
1696 *
1697 * If the cpuset being removed has its flag 'sched_load_balance' 1770 * If the cpuset being removed has its flag 'sched_load_balance'
1698 * enabled, then simulate turning sched_load_balance off, which 1771 * enabled, then simulate turning sched_load_balance off, which
1699 * will call rebuild_sched_domains(). The get_online_cpus() 1772 * will call async_rebuild_sched_domains().
1700 * call in rebuild_sched_domains() must not be made while holding
1701 * callback_mutex. Elsewhere the kernel nests callback_mutex inside
1702 * get_online_cpus() calls. So the reverse nesting would risk an
1703 * ABBA deadlock.
1704 */ 1773 */
1705 1774
1706static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) 1775static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
@@ -1719,7 +1788,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1719struct cgroup_subsys cpuset_subsys = { 1788struct cgroup_subsys cpuset_subsys = {
1720 .name = "cpuset", 1789 .name = "cpuset",
1721 .create = cpuset_create, 1790 .create = cpuset_create,
1722 .destroy = cpuset_destroy, 1791 .destroy = cpuset_destroy,
1723 .can_attach = cpuset_can_attach, 1792 .can_attach = cpuset_can_attach,
1724 .attach = cpuset_attach, 1793 .attach = cpuset_attach,
1725 .populate = cpuset_populate, 1794 .populate = cpuset_populate,
@@ -1811,7 +1880,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
1811} 1880}
1812 1881
1813/* 1882/*
1814 * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs 1883 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
1815 * or memory nodes, we need to walk over the cpuset hierarchy, 1884 * or memory nodes, we need to walk over the cpuset hierarchy,
1816 * removing that CPU or node from all cpusets. If this removes the 1885 * removing that CPU or node from all cpusets. If this removes the
1817 * last CPU or node from a cpuset, then move the tasks in the empty 1886 * last CPU or node from a cpuset, then move the tasks in the empty
@@ -1903,35 +1972,6 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
1903} 1972}
1904 1973
1905/* 1974/*
1906 * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
1907 * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to
1908 * track what's online after any CPU or memory node hotplug or unplug event.
1909 *
1910 * Since there are two callers of this routine, one for CPU hotplug
1911 * events and one for memory node hotplug events, we could have coded
1912 * two separate routines here. We code it as a single common routine
1913 * in order to minimize text size.
1914 */
1915
1916static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
1917{
1918 cgroup_lock();
1919
1920 top_cpuset.cpus_allowed = cpu_online_map;
1921 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
1922 scan_for_empty_cpusets(&top_cpuset);
1923
1924 /*
1925 * Scheduler destroys domains on hotplug events.
1926 * Rebuild them based on the current settings.
1927 */
1928 if (rebuild_sd)
1929 rebuild_sched_domains();
1930
1931 cgroup_unlock();
1932}
1933
1934/*
1935 * The top_cpuset tracks what CPUs and Memory Nodes are online, 1975 * The top_cpuset tracks what CPUs and Memory Nodes are online,
1936 * period. This is necessary in order to make cpusets transparent 1976 * period. This is necessary in order to make cpusets transparent
1937 * (of no affect) on systems that are actively using CPU hotplug 1977 * (of no affect) on systems that are actively using CPU hotplug
@@ -1939,40 +1979,52 @@ static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
1939 * 1979 *
1940 * This routine ensures that top_cpuset.cpus_allowed tracks 1980 * This routine ensures that top_cpuset.cpus_allowed tracks
1941 * cpu_online_map on each CPU hotplug (cpuhp) event. 1981 * cpu_online_map on each CPU hotplug (cpuhp) event.
1982 *
1983 * Called within get_online_cpus(). Needs to call cgroup_lock()
1984 * before calling generate_sched_domains().
1942 */ 1985 */
1943 1986static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
1944static int cpuset_handle_cpuhp(struct notifier_block *unused_nb,
1945 unsigned long phase, void *unused_cpu) 1987 unsigned long phase, void *unused_cpu)
1946{ 1988{
1989 struct sched_domain_attr *attr;
1990 cpumask_t *doms;
1991 int ndoms;
1992
1947 switch (phase) { 1993 switch (phase) {
1948 case CPU_UP_CANCELED:
1949 case CPU_UP_CANCELED_FROZEN:
1950 case CPU_DOWN_FAILED:
1951 case CPU_DOWN_FAILED_FROZEN:
1952 case CPU_ONLINE: 1994 case CPU_ONLINE:
1953 case CPU_ONLINE_FROZEN: 1995 case CPU_ONLINE_FROZEN:
1954 case CPU_DEAD: 1996 case CPU_DEAD:
1955 case CPU_DEAD_FROZEN: 1997 case CPU_DEAD_FROZEN:
1956 common_cpu_mem_hotplug_unplug(1);
1957 break; 1998 break;
1999
1958 default: 2000 default:
1959 return NOTIFY_DONE; 2001 return NOTIFY_DONE;
1960 } 2002 }
1961 2003
2004 cgroup_lock();
2005 top_cpuset.cpus_allowed = cpu_online_map;
2006 scan_for_empty_cpusets(&top_cpuset);
2007 ndoms = generate_sched_domains(&doms, &attr);
2008 cgroup_unlock();
2009
2010 /* Have scheduler rebuild the domains */
2011 partition_sched_domains(ndoms, doms, attr);
2012
1962 return NOTIFY_OK; 2013 return NOTIFY_OK;
1963} 2014}
1964 2015
1965#ifdef CONFIG_MEMORY_HOTPLUG 2016#ifdef CONFIG_MEMORY_HOTPLUG
1966/* 2017/*
1967 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. 2018 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
1968 * Call this routine anytime after you change 2019 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
1969 * node_states[N_HIGH_MEMORY]. 2020 * See also the previous routine cpuset_track_online_cpus().
1970 * See also the previous routine cpuset_handle_cpuhp().
1971 */ 2021 */
1972
1973void cpuset_track_online_nodes(void) 2022void cpuset_track_online_nodes(void)
1974{ 2023{
1975 common_cpu_mem_hotplug_unplug(0); 2024 cgroup_lock();
2025 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2026 scan_for_empty_cpusets(&top_cpuset);
2027 cgroup_unlock();
1976} 2028}
1977#endif 2029#endif
1978 2030
@@ -1987,7 +2039,7 @@ void __init cpuset_init_smp(void)
1987 top_cpuset.cpus_allowed = cpu_online_map; 2039 top_cpuset.cpus_allowed = cpu_online_map;
1988 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2040 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
1989 2041
1990 hotcpu_notifier(cpuset_handle_cpuhp, 0); 2042 hotcpu_notifier(cpuset_track_online_cpus, 0);
1991} 2043}
1992 2044
1993/** 2045/**