diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cpuset.c | 312 | ||||
-rw-r--r-- | kernel/sched.c | 19 |
2 files changed, 195 insertions, 136 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index d5ab79cf516d..f227bc172690 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -14,6 +14,8 @@ | |||
14 | * 2003-10-22 Updates by Stephen Hemminger. | 14 | * 2003-10-22 Updates by Stephen Hemminger. |
15 | * 2004 May-July Rework by Paul Jackson. | 15 | * 2004 May-July Rework by Paul Jackson. |
16 | * 2006 Rework by Paul Menage to use generic cgroups | 16 | * 2006 Rework by Paul Menage to use generic cgroups |
17 | * 2008 Rework of the scheduler domains and CPU hotplug handling | ||
18 | * by Max Krasnyansky | ||
17 | * | 19 | * |
18 | * This file is subject to the terms and conditions of the GNU General Public | 20 | * This file is subject to the terms and conditions of the GNU General Public |
19 | * License. See the file COPYING in the main directory of the Linux | 21 | * License. See the file COPYING in the main directory of the Linux |
@@ -236,9 +238,11 @@ static struct cpuset top_cpuset = { | |||
236 | 238 | ||
237 | static DEFINE_MUTEX(callback_mutex); | 239 | static DEFINE_MUTEX(callback_mutex); |
238 | 240 | ||
239 | /* This is ugly, but preserves the userspace API for existing cpuset | 241 | /* |
242 | * This is ugly, but preserves the userspace API for existing cpuset | ||
240 | * users. If someone tries to mount the "cpuset" filesystem, we | 243 | * users. If someone tries to mount the "cpuset" filesystem, we |
241 | * silently switch it to mount "cgroup" instead */ | 244 | * silently switch it to mount "cgroup" instead |
245 | */ | ||
242 | static int cpuset_get_sb(struct file_system_type *fs_type, | 246 | static int cpuset_get_sb(struct file_system_type *fs_type, |
243 | int flags, const char *unused_dev_name, | 247 | int flags, const char *unused_dev_name, |
244 | void *data, struct vfsmount *mnt) | 248 | void *data, struct vfsmount *mnt) |
@@ -473,10 +477,9 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
473 | } | 477 | } |
474 | 478 | ||
475 | /* | 479 | /* |
476 | * Helper routine for rebuild_sched_domains(). | 480 | * Helper routine for generate_sched_domains(). |
477 | * Do cpusets a, b have overlapping cpus_allowed masks? | 481 | * Do cpusets a, b have overlapping cpus_allowed masks? |
478 | */ | 482 | */ |
479 | |||
480 | static int cpusets_overlap(struct cpuset *a, struct cpuset *b) | 483 | static int cpusets_overlap(struct cpuset *a, struct cpuset *b) |
481 | { | 484 | { |
482 | return cpus_intersects(a->cpus_allowed, b->cpus_allowed); | 485 | return cpus_intersects(a->cpus_allowed, b->cpus_allowed); |
@@ -518,26 +521,15 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) | |||
518 | } | 521 | } |
519 | 522 | ||
520 | /* | 523 | /* |
521 | * rebuild_sched_domains() | 524 | * generate_sched_domains() |
522 | * | 525 | * |
523 | * This routine will be called to rebuild the scheduler's dynamic | 526 | * This function builds a partial partition of the systems CPUs |
524 | * sched domains: | 527 | * A 'partial partition' is a set of non-overlapping subsets whose |
525 | * - if the flag 'sched_load_balance' of any cpuset with non-empty | 528 | * union is a subset of that set. |
526 | * 'cpus' changes, | 529 | * The output of this function needs to be passed to kernel/sched.c |
527 | * - or if the 'cpus' allowed changes in any cpuset which has that | 530 | * partition_sched_domains() routine, which will rebuild the scheduler's |
528 | * flag enabled, | 531 | * load balancing domains (sched domains) as specified by that partial |
529 | * - or if the 'sched_relax_domain_level' of any cpuset which has | 532 | * partition. |
530 | * that flag enabled and with non-empty 'cpus' changes, | ||
531 | * - or if any cpuset with non-empty 'cpus' is removed, | ||
532 | * - or if a cpu gets offlined. | ||
533 | * | ||
534 | * This routine builds a partial partition of the systems CPUs | ||
535 | * (the set of non-overlappping cpumask_t's in the array 'part' | ||
536 | * below), and passes that partial partition to the kernel/sched.c | ||
537 | * partition_sched_domains() routine, which will rebuild the | ||
538 | * schedulers load balancing domains (sched domains) as specified | ||
539 | * by that partial partition. A 'partial partition' is a set of | ||
540 | * non-overlapping subsets whose union is a subset of that set. | ||
541 | * | 533 | * |
542 | * See "What is sched_load_balance" in Documentation/cpusets.txt | 534 | * See "What is sched_load_balance" in Documentation/cpusets.txt |
543 | * for a background explanation of this. | 535 | * for a background explanation of this. |
@@ -547,13 +539,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) | |||
547 | * domains when operating in the severe memory shortage situations | 539 | * domains when operating in the severe memory shortage situations |
548 | * that could cause allocation failures below. | 540 | * that could cause allocation failures below. |
549 | * | 541 | * |
550 | * Call with cgroup_mutex held. May take callback_mutex during | 542 | * Must be called with cgroup_lock held. |
551 | * call due to the kfifo_alloc() and kmalloc() calls. May nest | ||
552 | * a call to the get_online_cpus()/put_online_cpus() pair. | ||
553 | * Must not be called holding callback_mutex, because we must not | ||
554 | * call get_online_cpus() while holding callback_mutex. Elsewhere | ||
555 | * the kernel nests callback_mutex inside get_online_cpus() calls. | ||
556 | * So the reverse nesting would risk an ABBA deadlock. | ||
557 | * | 543 | * |
558 | * The three key local variables below are: | 544 | * The three key local variables below are: |
559 | * q - a linked-list queue of cpuset pointers, used to implement a | 545 | * q - a linked-list queue of cpuset pointers, used to implement a |
@@ -588,10 +574,10 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) | |||
588 | * element of the partition (one sched domain) to be passed to | 574 | * element of the partition (one sched domain) to be passed to |
589 | * partition_sched_domains(). | 575 | * partition_sched_domains(). |
590 | */ | 576 | */ |
591 | 577 | static int generate_sched_domains(cpumask_t **domains, | |
592 | void rebuild_sched_domains(void) | 578 | struct sched_domain_attr **attributes) |
593 | { | 579 | { |
594 | LIST_HEAD(q); /* queue of cpusets to be scanned*/ | 580 | LIST_HEAD(q); /* queue of cpusets to be scanned */ |
595 | struct cpuset *cp; /* scans q */ | 581 | struct cpuset *cp; /* scans q */ |
596 | struct cpuset **csa; /* array of all cpuset ptrs */ | 582 | struct cpuset **csa; /* array of all cpuset ptrs */ |
597 | int csn; /* how many cpuset ptrs in csa so far */ | 583 | int csn; /* how many cpuset ptrs in csa so far */ |
@@ -601,23 +587,26 @@ void rebuild_sched_domains(void) | |||
601 | int ndoms; /* number of sched domains in result */ | 587 | int ndoms; /* number of sched domains in result */ |
602 | int nslot; /* next empty doms[] cpumask_t slot */ | 588 | int nslot; /* next empty doms[] cpumask_t slot */ |
603 | 589 | ||
604 | csa = NULL; | 590 | ndoms = 0; |
605 | doms = NULL; | 591 | doms = NULL; |
606 | dattr = NULL; | 592 | dattr = NULL; |
593 | csa = NULL; | ||
607 | 594 | ||
608 | /* Special case for the 99% of systems with one, full, sched domain */ | 595 | /* Special case for the 99% of systems with one, full, sched domain */ |
609 | if (is_sched_load_balance(&top_cpuset)) { | 596 | if (is_sched_load_balance(&top_cpuset)) { |
610 | ndoms = 1; | ||
611 | doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); | 597 | doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); |
612 | if (!doms) | 598 | if (!doms) |
613 | goto rebuild; | 599 | goto done; |
600 | |||
614 | dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); | 601 | dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); |
615 | if (dattr) { | 602 | if (dattr) { |
616 | *dattr = SD_ATTR_INIT; | 603 | *dattr = SD_ATTR_INIT; |
617 | update_domain_attr_tree(dattr, &top_cpuset); | 604 | update_domain_attr_tree(dattr, &top_cpuset); |
618 | } | 605 | } |
619 | *doms = top_cpuset.cpus_allowed; | 606 | *doms = top_cpuset.cpus_allowed; |
620 | goto rebuild; | 607 | |
608 | ndoms = 1; | ||
609 | goto done; | ||
621 | } | 610 | } |
622 | 611 | ||
623 | csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); | 612 | csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); |
@@ -680,61 +669,141 @@ restart: | |||
680 | } | 669 | } |
681 | } | 670 | } |
682 | 671 | ||
683 | /* Convert <csn, csa> to <ndoms, doms> */ | 672 | /* |
673 | * Now we know how many domains to create. | ||
674 | * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. | ||
675 | */ | ||
684 | doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); | 676 | doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); |
685 | if (!doms) | 677 | if (!doms) { |
686 | goto rebuild; | 678 | ndoms = 0; |
679 | goto done; | ||
680 | } | ||
681 | |||
682 | /* | ||
683 | * The rest of the code, including the scheduler, can deal with | ||
684 | * dattr==NULL case. No need to abort if alloc fails. | ||
685 | */ | ||
687 | dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); | 686 | dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); |
688 | 687 | ||
689 | for (nslot = 0, i = 0; i < csn; i++) { | 688 | for (nslot = 0, i = 0; i < csn; i++) { |
690 | struct cpuset *a = csa[i]; | 689 | struct cpuset *a = csa[i]; |
690 | cpumask_t *dp; | ||
691 | int apn = a->pn; | 691 | int apn = a->pn; |
692 | 692 | ||
693 | if (apn >= 0) { | 693 | if (apn < 0) { |
694 | cpumask_t *dp = doms + nslot; | 694 | /* Skip completed partitions */ |
695 | 695 | continue; | |
696 | if (nslot == ndoms) { | 696 | } |
697 | static int warnings = 10; | 697 | |
698 | if (warnings) { | 698 | dp = doms + nslot; |
699 | printk(KERN_WARNING | 699 | |
700 | "rebuild_sched_domains confused:" | 700 | if (nslot == ndoms) { |
701 | " nslot %d, ndoms %d, csn %d, i %d," | 701 | static int warnings = 10; |
702 | " apn %d\n", | 702 | if (warnings) { |
703 | nslot, ndoms, csn, i, apn); | 703 | printk(KERN_WARNING |
704 | warnings--; | 704 | "rebuild_sched_domains confused:" |
705 | } | 705 | " nslot %d, ndoms %d, csn %d, i %d," |
706 | continue; | 706 | " apn %d\n", |
707 | nslot, ndoms, csn, i, apn); | ||
708 | warnings--; | ||
707 | } | 709 | } |
710 | continue; | ||
711 | } | ||
708 | 712 | ||
709 | cpus_clear(*dp); | 713 | cpus_clear(*dp); |
710 | if (dattr) | 714 | if (dattr) |
711 | *(dattr + nslot) = SD_ATTR_INIT; | 715 | *(dattr + nslot) = SD_ATTR_INIT; |
712 | for (j = i; j < csn; j++) { | 716 | for (j = i; j < csn; j++) { |
713 | struct cpuset *b = csa[j]; | 717 | struct cpuset *b = csa[j]; |
714 | 718 | ||
715 | if (apn == b->pn) { | 719 | if (apn == b->pn) { |
716 | cpus_or(*dp, *dp, b->cpus_allowed); | 720 | cpus_or(*dp, *dp, b->cpus_allowed); |
717 | b->pn = -1; | 721 | if (dattr) |
718 | if (dattr) | 722 | update_domain_attr_tree(dattr + nslot, b); |
719 | update_domain_attr_tree(dattr | 723 | |
720 | + nslot, b); | 724 | /* Done with this partition */ |
721 | } | 725 | b->pn = -1; |
722 | } | 726 | } |
723 | nslot++; | ||
724 | } | 727 | } |
728 | nslot++; | ||
725 | } | 729 | } |
726 | BUG_ON(nslot != ndoms); | 730 | BUG_ON(nslot != ndoms); |
727 | 731 | ||
728 | rebuild: | 732 | done: |
729 | /* Have scheduler rebuild sched domains */ | 733 | kfree(csa); |
734 | |||
735 | *domains = doms; | ||
736 | *attributes = dattr; | ||
737 | return ndoms; | ||
738 | } | ||
739 | |||
740 | /* | ||
741 | * Rebuild scheduler domains. | ||
742 | * | ||
743 | * Call with neither cgroup_mutex held nor within get_online_cpus(). | ||
744 | * Takes both cgroup_mutex and get_online_cpus(). | ||
745 | * | ||
746 | * Cannot be directly called from cpuset code handling changes | ||
747 | * to the cpuset pseudo-filesystem, because it cannot be called | ||
748 | * from code that already holds cgroup_mutex. | ||
749 | */ | ||
750 | static void do_rebuild_sched_domains(struct work_struct *unused) | ||
751 | { | ||
752 | struct sched_domain_attr *attr; | ||
753 | cpumask_t *doms; | ||
754 | int ndoms; | ||
755 | |||
730 | get_online_cpus(); | 756 | get_online_cpus(); |
731 | partition_sched_domains(ndoms, doms, dattr); | 757 | |
758 | /* Generate domain masks and attrs */ | ||
759 | cgroup_lock(); | ||
760 | ndoms = generate_sched_domains(&doms, &attr); | ||
761 | cgroup_unlock(); | ||
762 | |||
763 | /* Have scheduler rebuild the domains */ | ||
764 | partition_sched_domains(ndoms, doms, attr); | ||
765 | |||
732 | put_online_cpus(); | 766 | put_online_cpus(); |
767 | } | ||
733 | 768 | ||
734 | done: | 769 | static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); |
735 | kfree(csa); | 770 | |
736 | /* Don't kfree(doms) -- partition_sched_domains() does that. */ | 771 | /* |
737 | /* Don't kfree(dattr) -- partition_sched_domains() does that. */ | 772 | * Rebuild scheduler domains, asynchronously via workqueue. |
773 | * | ||
774 | * If the flag 'sched_load_balance' of any cpuset with non-empty | ||
775 | * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset | ||
776 | * which has that flag enabled, or if any cpuset with a non-empty | ||
777 | * 'cpus' is removed, then call this routine to rebuild the | ||
778 | * scheduler's dynamic sched domains. | ||
779 | * | ||
780 | * The rebuild_sched_domains() and partition_sched_domains() | ||
781 | * routines must nest cgroup_lock() inside get_online_cpus(), | ||
782 | * but such cpuset changes as these must nest that locking the | ||
783 | * other way, holding cgroup_lock() for much of the code. | ||
784 | * | ||
785 | * So in order to avoid an ABBA deadlock, the cpuset code handling | ||
786 | * these user changes delegates the actual sched domain rebuilding | ||
787 | * to a separate workqueue thread, which ends up processing the | ||
788 | * above do_rebuild_sched_domains() function. | ||
789 | */ | ||
790 | static void async_rebuild_sched_domains(void) | ||
791 | { | ||
792 | schedule_work(&rebuild_sched_domains_work); | ||
793 | } | ||
794 | |||
795 | /* | ||
796 | * Accomplishes the same scheduler domain rebuild as the above | ||
797 | * async_rebuild_sched_domains(), however it directly calls the | ||
798 | * rebuild routine synchronously rather than calling it via an | ||
799 | * asynchronous work thread. | ||
800 | * | ||
801 | * This can only be called from code that is not holding | ||
802 | * cgroup_mutex (not nested in a cgroup_lock() call.) | ||
803 | */ | ||
804 | void rebuild_sched_domains(void) | ||
805 | { | ||
806 | do_rebuild_sched_domains(NULL); | ||
738 | } | 807 | } |
739 | 808 | ||
740 | /** | 809 | /** |
@@ -863,7 +932,7 @@ static int update_cpumask(struct cpuset *cs, const char *buf) | |||
863 | return retval; | 932 | return retval; |
864 | 933 | ||
865 | if (is_load_balanced) | 934 | if (is_load_balanced) |
866 | rebuild_sched_domains(); | 935 | async_rebuild_sched_domains(); |
867 | return 0; | 936 | return 0; |
868 | } | 937 | } |
869 | 938 | ||
@@ -1090,7 +1159,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) | |||
1090 | if (val != cs->relax_domain_level) { | 1159 | if (val != cs->relax_domain_level) { |
1091 | cs->relax_domain_level = val; | 1160 | cs->relax_domain_level = val; |
1092 | if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) | 1161 | if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) |
1093 | rebuild_sched_domains(); | 1162 | async_rebuild_sched_domains(); |
1094 | } | 1163 | } |
1095 | 1164 | ||
1096 | return 0; | 1165 | return 0; |
@@ -1131,7 +1200,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, | |||
1131 | mutex_unlock(&callback_mutex); | 1200 | mutex_unlock(&callback_mutex); |
1132 | 1201 | ||
1133 | if (cpus_nonempty && balance_flag_changed) | 1202 | if (cpus_nonempty && balance_flag_changed) |
1134 | rebuild_sched_domains(); | 1203 | async_rebuild_sched_domains(); |
1135 | 1204 | ||
1136 | return 0; | 1205 | return 0; |
1137 | } | 1206 | } |
@@ -1492,6 +1561,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft) | |||
1492 | default: | 1561 | default: |
1493 | BUG(); | 1562 | BUG(); |
1494 | } | 1563 | } |
1564 | |||
1565 | /* Unreachable but makes gcc happy */ | ||
1566 | return 0; | ||
1495 | } | 1567 | } |
1496 | 1568 | ||
1497 | static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) | 1569 | static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) |
@@ -1504,6 +1576,9 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) | |||
1504 | default: | 1576 | default: |
1505 | BUG(); | 1577 | BUG(); |
1506 | } | 1578 | } |
1579 | |||
1580 | /* Unrechable but makes gcc happy */ | ||
1581 | return 0; | ||
1507 | } | 1582 | } |
1508 | 1583 | ||
1509 | 1584 | ||
@@ -1692,15 +1767,9 @@ static struct cgroup_subsys_state *cpuset_create( | |||
1692 | } | 1767 | } |
1693 | 1768 | ||
1694 | /* | 1769 | /* |
1695 | * Locking note on the strange update_flag() call below: | ||
1696 | * | ||
1697 | * If the cpuset being removed has its flag 'sched_load_balance' | 1770 | * If the cpuset being removed has its flag 'sched_load_balance' |
1698 | * enabled, then simulate turning sched_load_balance off, which | 1771 | * enabled, then simulate turning sched_load_balance off, which |
1699 | * will call rebuild_sched_domains(). The get_online_cpus() | 1772 | * will call async_rebuild_sched_domains(). |
1700 | * call in rebuild_sched_domains() must not be made while holding | ||
1701 | * callback_mutex. Elsewhere the kernel nests callback_mutex inside | ||
1702 | * get_online_cpus() calls. So the reverse nesting would risk an | ||
1703 | * ABBA deadlock. | ||
1704 | */ | 1773 | */ |
1705 | 1774 | ||
1706 | static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) | 1775 | static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) |
@@ -1719,7 +1788,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) | |||
1719 | struct cgroup_subsys cpuset_subsys = { | 1788 | struct cgroup_subsys cpuset_subsys = { |
1720 | .name = "cpuset", | 1789 | .name = "cpuset", |
1721 | .create = cpuset_create, | 1790 | .create = cpuset_create, |
1722 | .destroy = cpuset_destroy, | 1791 | .destroy = cpuset_destroy, |
1723 | .can_attach = cpuset_can_attach, | 1792 | .can_attach = cpuset_can_attach, |
1724 | .attach = cpuset_attach, | 1793 | .attach = cpuset_attach, |
1725 | .populate = cpuset_populate, | 1794 | .populate = cpuset_populate, |
@@ -1811,7 +1880,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) | |||
1811 | } | 1880 | } |
1812 | 1881 | ||
1813 | /* | 1882 | /* |
1814 | * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs | 1883 | * If CPU and/or memory hotplug handlers, below, unplug any CPUs |
1815 | * or memory nodes, we need to walk over the cpuset hierarchy, | 1884 | * or memory nodes, we need to walk over the cpuset hierarchy, |
1816 | * removing that CPU or node from all cpusets. If this removes the | 1885 | * removing that CPU or node from all cpusets. If this removes the |
1817 | * last CPU or node from a cpuset, then move the tasks in the empty | 1886 | * last CPU or node from a cpuset, then move the tasks in the empty |
@@ -1903,35 +1972,6 @@ static void scan_for_empty_cpusets(const struct cpuset *root) | |||
1903 | } | 1972 | } |
1904 | 1973 | ||
1905 | /* | 1974 | /* |
1906 | * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track | ||
1907 | * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to | ||
1908 | * track what's online after any CPU or memory node hotplug or unplug event. | ||
1909 | * | ||
1910 | * Since there are two callers of this routine, one for CPU hotplug | ||
1911 | * events and one for memory node hotplug events, we could have coded | ||
1912 | * two separate routines here. We code it as a single common routine | ||
1913 | * in order to minimize text size. | ||
1914 | */ | ||
1915 | |||
1916 | static void common_cpu_mem_hotplug_unplug(int rebuild_sd) | ||
1917 | { | ||
1918 | cgroup_lock(); | ||
1919 | |||
1920 | top_cpuset.cpus_allowed = cpu_online_map; | ||
1921 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | ||
1922 | scan_for_empty_cpusets(&top_cpuset); | ||
1923 | |||
1924 | /* | ||
1925 | * Scheduler destroys domains on hotplug events. | ||
1926 | * Rebuild them based on the current settings. | ||
1927 | */ | ||
1928 | if (rebuild_sd) | ||
1929 | rebuild_sched_domains(); | ||
1930 | |||
1931 | cgroup_unlock(); | ||
1932 | } | ||
1933 | |||
1934 | /* | ||
1935 | * The top_cpuset tracks what CPUs and Memory Nodes are online, | 1975 | * The top_cpuset tracks what CPUs and Memory Nodes are online, |
1936 | * period. This is necessary in order to make cpusets transparent | 1976 | * period. This is necessary in order to make cpusets transparent |
1937 | * (of no affect) on systems that are actively using CPU hotplug | 1977 | * (of no affect) on systems that are actively using CPU hotplug |
@@ -1939,40 +1979,52 @@ static void common_cpu_mem_hotplug_unplug(int rebuild_sd) | |||
1939 | * | 1979 | * |
1940 | * This routine ensures that top_cpuset.cpus_allowed tracks | 1980 | * This routine ensures that top_cpuset.cpus_allowed tracks |
1941 | * cpu_online_map on each CPU hotplug (cpuhp) event. | 1981 | * cpu_online_map on each CPU hotplug (cpuhp) event. |
1982 | * | ||
1983 | * Called within get_online_cpus(). Needs to call cgroup_lock() | ||
1984 | * before calling generate_sched_domains(). | ||
1942 | */ | 1985 | */ |
1943 | 1986 | static int cpuset_track_online_cpus(struct notifier_block *unused_nb, | |
1944 | static int cpuset_handle_cpuhp(struct notifier_block *unused_nb, | ||
1945 | unsigned long phase, void *unused_cpu) | 1987 | unsigned long phase, void *unused_cpu) |
1946 | { | 1988 | { |
1989 | struct sched_domain_attr *attr; | ||
1990 | cpumask_t *doms; | ||
1991 | int ndoms; | ||
1992 | |||
1947 | switch (phase) { | 1993 | switch (phase) { |
1948 | case CPU_UP_CANCELED: | ||
1949 | case CPU_UP_CANCELED_FROZEN: | ||
1950 | case CPU_DOWN_FAILED: | ||
1951 | case CPU_DOWN_FAILED_FROZEN: | ||
1952 | case CPU_ONLINE: | 1994 | case CPU_ONLINE: |
1953 | case CPU_ONLINE_FROZEN: | 1995 | case CPU_ONLINE_FROZEN: |
1954 | case CPU_DEAD: | 1996 | case CPU_DEAD: |
1955 | case CPU_DEAD_FROZEN: | 1997 | case CPU_DEAD_FROZEN: |
1956 | common_cpu_mem_hotplug_unplug(1); | ||
1957 | break; | 1998 | break; |
1999 | |||
1958 | default: | 2000 | default: |
1959 | return NOTIFY_DONE; | 2001 | return NOTIFY_DONE; |
1960 | } | 2002 | } |
1961 | 2003 | ||
2004 | cgroup_lock(); | ||
2005 | top_cpuset.cpus_allowed = cpu_online_map; | ||
2006 | scan_for_empty_cpusets(&top_cpuset); | ||
2007 | ndoms = generate_sched_domains(&doms, &attr); | ||
2008 | cgroup_unlock(); | ||
2009 | |||
2010 | /* Have scheduler rebuild the domains */ | ||
2011 | partition_sched_domains(ndoms, doms, attr); | ||
2012 | |||
1962 | return NOTIFY_OK; | 2013 | return NOTIFY_OK; |
1963 | } | 2014 | } |
1964 | 2015 | ||
1965 | #ifdef CONFIG_MEMORY_HOTPLUG | 2016 | #ifdef CONFIG_MEMORY_HOTPLUG |
1966 | /* | 2017 | /* |
1967 | * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. | 2018 | * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. |
1968 | * Call this routine anytime after you change | 2019 | * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. |
1969 | * node_states[N_HIGH_MEMORY]. | 2020 | * See also the previous routine cpuset_track_online_cpus(). |
1970 | * See also the previous routine cpuset_handle_cpuhp(). | ||
1971 | */ | 2021 | */ |
1972 | |||
1973 | void cpuset_track_online_nodes(void) | 2022 | void cpuset_track_online_nodes(void) |
1974 | { | 2023 | { |
1975 | common_cpu_mem_hotplug_unplug(0); | 2024 | cgroup_lock(); |
2025 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | ||
2026 | scan_for_empty_cpusets(&top_cpuset); | ||
2027 | cgroup_unlock(); | ||
1976 | } | 2028 | } |
1977 | #endif | 2029 | #endif |
1978 | 2030 | ||
@@ -1987,7 +2039,7 @@ void __init cpuset_init_smp(void) | |||
1987 | top_cpuset.cpus_allowed = cpu_online_map; | 2039 | top_cpuset.cpus_allowed = cpu_online_map; |
1988 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | 2040 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; |
1989 | 2041 | ||
1990 | hotcpu_notifier(cpuset_handle_cpuhp, 0); | 2042 | hotcpu_notifier(cpuset_track_online_cpus, 0); |
1991 | } | 2043 | } |
1992 | 2044 | ||
1993 | /** | 2045 | /** |
diff --git a/kernel/sched.c b/kernel/sched.c index 1a5f73c1fcdc..cc1f81b50b82 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -7696,24 +7696,27 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, | |||
7696 | * and partition_sched_domains() will fallback to the single partition | 7696 | * and partition_sched_domains() will fallback to the single partition |
7697 | * 'fallback_doms', it also forces the domains to be rebuilt. | 7697 | * 'fallback_doms', it also forces the domains to be rebuilt. |
7698 | * | 7698 | * |
7699 | * If doms_new==NULL it will be replaced with cpu_online_map. | ||
7700 | * ndoms_new==0 is a special case for destroying existing domains. | ||
7701 | * It will not create the default domain. | ||
7702 | * | ||
7699 | * Call with hotplug lock held | 7703 | * Call with hotplug lock held |
7700 | */ | 7704 | */ |
7701 | void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, | 7705 | void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, |
7702 | struct sched_domain_attr *dattr_new) | 7706 | struct sched_domain_attr *dattr_new) |
7703 | { | 7707 | { |
7704 | int i, j; | 7708 | int i, j, n; |
7705 | 7709 | ||
7706 | mutex_lock(&sched_domains_mutex); | 7710 | mutex_lock(&sched_domains_mutex); |
7707 | 7711 | ||
7708 | /* always unregister in case we don't destroy any domains */ | 7712 | /* always unregister in case we don't destroy any domains */ |
7709 | unregister_sched_domain_sysctl(); | 7713 | unregister_sched_domain_sysctl(); |
7710 | 7714 | ||
7711 | if (doms_new == NULL) | 7715 | n = doms_new ? ndoms_new : 0; |
7712 | ndoms_new = 0; | ||
7713 | 7716 | ||
7714 | /* Destroy deleted domains */ | 7717 | /* Destroy deleted domains */ |
7715 | for (i = 0; i < ndoms_cur; i++) { | 7718 | for (i = 0; i < ndoms_cur; i++) { |
7716 | for (j = 0; j < ndoms_new; j++) { | 7719 | for (j = 0; j < n; j++) { |
7717 | if (cpus_equal(doms_cur[i], doms_new[j]) | 7720 | if (cpus_equal(doms_cur[i], doms_new[j]) |
7718 | && dattrs_equal(dattr_cur, i, dattr_new, j)) | 7721 | && dattrs_equal(dattr_cur, i, dattr_new, j)) |
7719 | goto match1; | 7722 | goto match1; |
@@ -7726,7 +7729,6 @@ match1: | |||
7726 | 7729 | ||
7727 | if (doms_new == NULL) { | 7730 | if (doms_new == NULL) { |
7728 | ndoms_cur = 0; | 7731 | ndoms_cur = 0; |
7729 | ndoms_new = 1; | ||
7730 | doms_new = &fallback_doms; | 7732 | doms_new = &fallback_doms; |
7731 | cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); | 7733 | cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); |
7732 | dattr_new = NULL; | 7734 | dattr_new = NULL; |
@@ -7763,8 +7765,13 @@ match2: | |||
7763 | int arch_reinit_sched_domains(void) | 7765 | int arch_reinit_sched_domains(void) |
7764 | { | 7766 | { |
7765 | get_online_cpus(); | 7767 | get_online_cpus(); |
7768 | |||
7769 | /* Destroy domains first to force the rebuild */ | ||
7770 | partition_sched_domains(0, NULL, NULL); | ||
7771 | |||
7766 | rebuild_sched_domains(); | 7772 | rebuild_sched_domains(); |
7767 | put_online_cpus(); | 7773 | put_online_cpus(); |
7774 | |||
7768 | return 0; | 7775 | return 0; |
7769 | } | 7776 | } |
7770 | 7777 | ||
@@ -7848,7 +7855,7 @@ static int update_sched_domains(struct notifier_block *nfb, | |||
7848 | case CPU_ONLINE_FROZEN: | 7855 | case CPU_ONLINE_FROZEN: |
7849 | case CPU_DEAD: | 7856 | case CPU_DEAD: |
7850 | case CPU_DEAD_FROZEN: | 7857 | case CPU_DEAD_FROZEN: |
7851 | partition_sched_domains(0, NULL, NULL); | 7858 | partition_sched_domains(1, NULL, NULL); |
7852 | return NOTIFY_OK; | 7859 | return NOTIFY_OK; |
7853 | 7860 | ||
7854 | default: | 7861 | default: |