diff options
Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r-- | kernel/cpuset.c | 351 |
1 files changed, 198 insertions, 153 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index d5ab79cf516d..eab7bd6628e0 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -14,6 +14,8 @@ | |||
14 | * 2003-10-22 Updates by Stephen Hemminger. | 14 | * 2003-10-22 Updates by Stephen Hemminger. |
15 | * 2004 May-July Rework by Paul Jackson. | 15 | * 2004 May-July Rework by Paul Jackson. |
16 | * 2006 Rework by Paul Menage to use generic cgroups | 16 | * 2006 Rework by Paul Menage to use generic cgroups |
17 | * 2008 Rework of the scheduler domains and CPU hotplug handling | ||
18 | * by Max Krasnyansky | ||
17 | * | 19 | * |
18 | * This file is subject to the terms and conditions of the GNU General Public | 20 | * This file is subject to the terms and conditions of the GNU General Public |
19 | * License. See the file COPYING in the main directory of the Linux | 21 | * License. See the file COPYING in the main directory of the Linux |
@@ -236,9 +238,11 @@ static struct cpuset top_cpuset = { | |||
236 | 238 | ||
237 | static DEFINE_MUTEX(callback_mutex); | 239 | static DEFINE_MUTEX(callback_mutex); |
238 | 240 | ||
239 | /* This is ugly, but preserves the userspace API for existing cpuset | 241 | /* |
242 | * This is ugly, but preserves the userspace API for existing cpuset | ||
240 | * users. If someone tries to mount the "cpuset" filesystem, we | 243 | * users. If someone tries to mount the "cpuset" filesystem, we |
241 | * silently switch it to mount "cgroup" instead */ | 244 | * silently switch it to mount "cgroup" instead |
245 | */ | ||
242 | static int cpuset_get_sb(struct file_system_type *fs_type, | 246 | static int cpuset_get_sb(struct file_system_type *fs_type, |
243 | int flags, const char *unused_dev_name, | 247 | int flags, const char *unused_dev_name, |
244 | void *data, struct vfsmount *mnt) | 248 | void *data, struct vfsmount *mnt) |
@@ -473,10 +477,9 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
473 | } | 477 | } |
474 | 478 | ||
475 | /* | 479 | /* |
476 | * Helper routine for rebuild_sched_domains(). | 480 | * Helper routine for generate_sched_domains(). |
477 | * Do cpusets a, b have overlapping cpus_allowed masks? | 481 | * Do cpusets a, b have overlapping cpus_allowed masks? |
478 | */ | 482 | */ |
479 | |||
480 | static int cpusets_overlap(struct cpuset *a, struct cpuset *b) | 483 | static int cpusets_overlap(struct cpuset *a, struct cpuset *b) |
481 | { | 484 | { |
482 | return cpus_intersects(a->cpus_allowed, b->cpus_allowed); | 485 | return cpus_intersects(a->cpus_allowed, b->cpus_allowed); |
@@ -518,26 +521,15 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) | |||
518 | } | 521 | } |
519 | 522 | ||
520 | /* | 523 | /* |
521 | * rebuild_sched_domains() | 524 | * generate_sched_domains() |
522 | * | 525 | * |
523 | * This routine will be called to rebuild the scheduler's dynamic | 526 | * This function builds a partial partition of the systems CPUs |
524 | * sched domains: | 527 | * A 'partial partition' is a set of non-overlapping subsets whose |
525 | * - if the flag 'sched_load_balance' of any cpuset with non-empty | 528 | * union is a subset of that set. |
526 | * 'cpus' changes, | 529 | * The output of this function needs to be passed to kernel/sched.c |
527 | * - or if the 'cpus' allowed changes in any cpuset which has that | 530 | * partition_sched_domains() routine, which will rebuild the scheduler's |
528 | * flag enabled, | 531 | * load balancing domains (sched domains) as specified by that partial |
529 | * - or if the 'sched_relax_domain_level' of any cpuset which has | 532 | * partition. |
530 | * that flag enabled and with non-empty 'cpus' changes, | ||
531 | * - or if any cpuset with non-empty 'cpus' is removed, | ||
532 | * - or if a cpu gets offlined. | ||
533 | * | ||
534 | * This routine builds a partial partition of the systems CPUs | ||
535 | * (the set of non-overlappping cpumask_t's in the array 'part' | ||
536 | * below), and passes that partial partition to the kernel/sched.c | ||
537 | * partition_sched_domains() routine, which will rebuild the | ||
538 | * schedulers load balancing domains (sched domains) as specified | ||
539 | * by that partial partition. A 'partial partition' is a set of | ||
540 | * non-overlapping subsets whose union is a subset of that set. | ||
541 | * | 533 | * |
542 | * See "What is sched_load_balance" in Documentation/cpusets.txt | 534 | * See "What is sched_load_balance" in Documentation/cpusets.txt |
543 | * for a background explanation of this. | 535 | * for a background explanation of this. |
@@ -547,13 +539,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) | |||
547 | * domains when operating in the severe memory shortage situations | 539 | * domains when operating in the severe memory shortage situations |
548 | * that could cause allocation failures below. | 540 | * that could cause allocation failures below. |
549 | * | 541 | * |
550 | * Call with cgroup_mutex held. May take callback_mutex during | 542 | * Must be called with cgroup_lock held. |
551 | * call due to the kfifo_alloc() and kmalloc() calls. May nest | ||
552 | * a call to the get_online_cpus()/put_online_cpus() pair. | ||
553 | * Must not be called holding callback_mutex, because we must not | ||
554 | * call get_online_cpus() while holding callback_mutex. Elsewhere | ||
555 | * the kernel nests callback_mutex inside get_online_cpus() calls. | ||
556 | * So the reverse nesting would risk an ABBA deadlock. | ||
557 | * | 543 | * |
558 | * The three key local variables below are: | 544 | * The three key local variables below are: |
559 | * q - a linked-list queue of cpuset pointers, used to implement a | 545 | * q - a linked-list queue of cpuset pointers, used to implement a |
@@ -588,10 +574,10 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) | |||
588 | * element of the partition (one sched domain) to be passed to | 574 | * element of the partition (one sched domain) to be passed to |
589 | * partition_sched_domains(). | 575 | * partition_sched_domains(). |
590 | */ | 576 | */ |
591 | 577 | static int generate_sched_domains(cpumask_t **domains, | |
592 | void rebuild_sched_domains(void) | 578 | struct sched_domain_attr **attributes) |
593 | { | 579 | { |
594 | LIST_HEAD(q); /* queue of cpusets to be scanned*/ | 580 | LIST_HEAD(q); /* queue of cpusets to be scanned */ |
595 | struct cpuset *cp; /* scans q */ | 581 | struct cpuset *cp; /* scans q */ |
596 | struct cpuset **csa; /* array of all cpuset ptrs */ | 582 | struct cpuset **csa; /* array of all cpuset ptrs */ |
597 | int csn; /* how many cpuset ptrs in csa so far */ | 583 | int csn; /* how many cpuset ptrs in csa so far */ |
@@ -601,23 +587,26 @@ void rebuild_sched_domains(void) | |||
601 | int ndoms; /* number of sched domains in result */ | 587 | int ndoms; /* number of sched domains in result */ |
602 | int nslot; /* next empty doms[] cpumask_t slot */ | 588 | int nslot; /* next empty doms[] cpumask_t slot */ |
603 | 589 | ||
604 | csa = NULL; | 590 | ndoms = 0; |
605 | doms = NULL; | 591 | doms = NULL; |
606 | dattr = NULL; | 592 | dattr = NULL; |
593 | csa = NULL; | ||
607 | 594 | ||
608 | /* Special case for the 99% of systems with one, full, sched domain */ | 595 | /* Special case for the 99% of systems with one, full, sched domain */ |
609 | if (is_sched_load_balance(&top_cpuset)) { | 596 | if (is_sched_load_balance(&top_cpuset)) { |
610 | ndoms = 1; | ||
611 | doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); | 597 | doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); |
612 | if (!doms) | 598 | if (!doms) |
613 | goto rebuild; | 599 | goto done; |
600 | |||
614 | dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); | 601 | dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); |
615 | if (dattr) { | 602 | if (dattr) { |
616 | *dattr = SD_ATTR_INIT; | 603 | *dattr = SD_ATTR_INIT; |
617 | update_domain_attr_tree(dattr, &top_cpuset); | 604 | update_domain_attr_tree(dattr, &top_cpuset); |
618 | } | 605 | } |
619 | *doms = top_cpuset.cpus_allowed; | 606 | *doms = top_cpuset.cpus_allowed; |
620 | goto rebuild; | 607 | |
608 | ndoms = 1; | ||
609 | goto done; | ||
621 | } | 610 | } |
622 | 611 | ||
623 | csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); | 612 | csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); |
@@ -680,61 +669,141 @@ restart: | |||
680 | } | 669 | } |
681 | } | 670 | } |
682 | 671 | ||
683 | /* Convert <csn, csa> to <ndoms, doms> */ | 672 | /* |
673 | * Now we know how many domains to create. | ||
674 | * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. | ||
675 | */ | ||
684 | doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); | 676 | doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); |
685 | if (!doms) | 677 | if (!doms) { |
686 | goto rebuild; | 678 | ndoms = 0; |
679 | goto done; | ||
680 | } | ||
681 | |||
682 | /* | ||
683 | * The rest of the code, including the scheduler, can deal with | ||
684 | * dattr==NULL case. No need to abort if alloc fails. | ||
685 | */ | ||
687 | dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); | 686 | dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); |
688 | 687 | ||
689 | for (nslot = 0, i = 0; i < csn; i++) { | 688 | for (nslot = 0, i = 0; i < csn; i++) { |
690 | struct cpuset *a = csa[i]; | 689 | struct cpuset *a = csa[i]; |
690 | cpumask_t *dp; | ||
691 | int apn = a->pn; | 691 | int apn = a->pn; |
692 | 692 | ||
693 | if (apn >= 0) { | 693 | if (apn < 0) { |
694 | cpumask_t *dp = doms + nslot; | 694 | /* Skip completed partitions */ |
695 | 695 | continue; | |
696 | if (nslot == ndoms) { | 696 | } |
697 | static int warnings = 10; | 697 | |
698 | if (warnings) { | 698 | dp = doms + nslot; |
699 | printk(KERN_WARNING | 699 | |
700 | "rebuild_sched_domains confused:" | 700 | if (nslot == ndoms) { |
701 | " nslot %d, ndoms %d, csn %d, i %d," | 701 | static int warnings = 10; |
702 | " apn %d\n", | 702 | if (warnings) { |
703 | nslot, ndoms, csn, i, apn); | 703 | printk(KERN_WARNING |
704 | warnings--; | 704 | "rebuild_sched_domains confused:" |
705 | } | 705 | " nslot %d, ndoms %d, csn %d, i %d," |
706 | continue; | 706 | " apn %d\n", |
707 | nslot, ndoms, csn, i, apn); | ||
708 | warnings--; | ||
707 | } | 709 | } |
710 | continue; | ||
711 | } | ||
708 | 712 | ||
709 | cpus_clear(*dp); | 713 | cpus_clear(*dp); |
710 | if (dattr) | 714 | if (dattr) |
711 | *(dattr + nslot) = SD_ATTR_INIT; | 715 | *(dattr + nslot) = SD_ATTR_INIT; |
712 | for (j = i; j < csn; j++) { | 716 | for (j = i; j < csn; j++) { |
713 | struct cpuset *b = csa[j]; | 717 | struct cpuset *b = csa[j]; |
714 | 718 | ||
715 | if (apn == b->pn) { | 719 | if (apn == b->pn) { |
716 | cpus_or(*dp, *dp, b->cpus_allowed); | 720 | cpus_or(*dp, *dp, b->cpus_allowed); |
717 | b->pn = -1; | 721 | if (dattr) |
718 | if (dattr) | 722 | update_domain_attr_tree(dattr + nslot, b); |
719 | update_domain_attr_tree(dattr | 723 | |
720 | + nslot, b); | 724 | /* Done with this partition */ |
721 | } | 725 | b->pn = -1; |
722 | } | 726 | } |
723 | nslot++; | ||
724 | } | 727 | } |
728 | nslot++; | ||
725 | } | 729 | } |
726 | BUG_ON(nslot != ndoms); | 730 | BUG_ON(nslot != ndoms); |
727 | 731 | ||
728 | rebuild: | 732 | done: |
729 | /* Have scheduler rebuild sched domains */ | 733 | kfree(csa); |
734 | |||
735 | *domains = doms; | ||
736 | *attributes = dattr; | ||
737 | return ndoms; | ||
738 | } | ||
739 | |||
740 | /* | ||
741 | * Rebuild scheduler domains. | ||
742 | * | ||
743 | * Call with neither cgroup_mutex held nor within get_online_cpus(). | ||
744 | * Takes both cgroup_mutex and get_online_cpus(). | ||
745 | * | ||
746 | * Cannot be directly called from cpuset code handling changes | ||
747 | * to the cpuset pseudo-filesystem, because it cannot be called | ||
748 | * from code that already holds cgroup_mutex. | ||
749 | */ | ||
750 | static void do_rebuild_sched_domains(struct work_struct *unused) | ||
751 | { | ||
752 | struct sched_domain_attr *attr; | ||
753 | cpumask_t *doms; | ||
754 | int ndoms; | ||
755 | |||
730 | get_online_cpus(); | 756 | get_online_cpus(); |
731 | partition_sched_domains(ndoms, doms, dattr); | 757 | |
758 | /* Generate domain masks and attrs */ | ||
759 | cgroup_lock(); | ||
760 | ndoms = generate_sched_domains(&doms, &attr); | ||
761 | cgroup_unlock(); | ||
762 | |||
763 | /* Have scheduler rebuild the domains */ | ||
764 | partition_sched_domains(ndoms, doms, attr); | ||
765 | |||
732 | put_online_cpus(); | 766 | put_online_cpus(); |
767 | } | ||
733 | 768 | ||
734 | done: | 769 | static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); |
735 | kfree(csa); | 770 | |
736 | /* Don't kfree(doms) -- partition_sched_domains() does that. */ | 771 | /* |
737 | /* Don't kfree(dattr) -- partition_sched_domains() does that. */ | 772 | * Rebuild scheduler domains, asynchronously via workqueue. |
773 | * | ||
774 | * If the flag 'sched_load_balance' of any cpuset with non-empty | ||
775 | * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset | ||
776 | * which has that flag enabled, or if any cpuset with a non-empty | ||
777 | * 'cpus' is removed, then call this routine to rebuild the | ||
778 | * scheduler's dynamic sched domains. | ||
779 | * | ||
780 | * The rebuild_sched_domains() and partition_sched_domains() | ||
781 | * routines must nest cgroup_lock() inside get_online_cpus(), | ||
782 | * but such cpuset changes as these must nest that locking the | ||
783 | * other way, holding cgroup_lock() for much of the code. | ||
784 | * | ||
785 | * So in order to avoid an ABBA deadlock, the cpuset code handling | ||
786 | * these user changes delegates the actual sched domain rebuilding | ||
787 | * to a separate workqueue thread, which ends up processing the | ||
788 | * above do_rebuild_sched_domains() function. | ||
789 | */ | ||
790 | static void async_rebuild_sched_domains(void) | ||
791 | { | ||
792 | schedule_work(&rebuild_sched_domains_work); | ||
793 | } | ||
794 | |||
795 | /* | ||
796 | * Accomplishes the same scheduler domain rebuild as the above | ||
797 | * async_rebuild_sched_domains(), however it directly calls the | ||
798 | * rebuild routine synchronously rather than calling it via an | ||
799 | * asynchronous work thread. | ||
800 | * | ||
801 | * This can only be called from code that is not holding | ||
802 | * cgroup_mutex (not nested in a cgroup_lock() call.) | ||
803 | */ | ||
804 | void rebuild_sched_domains(void) | ||
805 | { | ||
806 | do_rebuild_sched_domains(NULL); | ||
738 | } | 807 | } |
739 | 808 | ||
740 | /** | 809 | /** |
@@ -774,37 +843,25 @@ static void cpuset_change_cpumask(struct task_struct *tsk, | |||
774 | /** | 843 | /** |
775 | * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. | 844 | * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. |
776 | * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed | 845 | * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed |
846 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() | ||
777 | * | 847 | * |
778 | * Called with cgroup_mutex held | 848 | * Called with cgroup_mutex held |
779 | * | 849 | * |
780 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, | 850 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, |
781 | * calling callback functions for each. | 851 | * calling callback functions for each. |
782 | * | 852 | * |
783 | * Return 0 if successful, -errno if not. | 853 | * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 |
854 | * if @heap != NULL. | ||
784 | */ | 855 | */ |
785 | static int update_tasks_cpumask(struct cpuset *cs) | 856 | static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) |
786 | { | 857 | { |
787 | struct cgroup_scanner scan; | 858 | struct cgroup_scanner scan; |
788 | struct ptr_heap heap; | ||
789 | int retval; | ||
790 | |||
791 | /* | ||
792 | * cgroup_scan_tasks() will initialize heap->gt for us. | ||
793 | * heap_init() is still needed here for we should not change | ||
794 | * cs->cpus_allowed when heap_init() fails. | ||
795 | */ | ||
796 | retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); | ||
797 | if (retval) | ||
798 | return retval; | ||
799 | 859 | ||
800 | scan.cg = cs->css.cgroup; | 860 | scan.cg = cs->css.cgroup; |
801 | scan.test_task = cpuset_test_cpumask; | 861 | scan.test_task = cpuset_test_cpumask; |
802 | scan.process_task = cpuset_change_cpumask; | 862 | scan.process_task = cpuset_change_cpumask; |
803 | scan.heap = &heap; | 863 | scan.heap = heap; |
804 | retval = cgroup_scan_tasks(&scan); | 864 | cgroup_scan_tasks(&scan); |
805 | |||
806 | heap_free(&heap); | ||
807 | return retval; | ||
808 | } | 865 | } |
809 | 866 | ||
810 | /** | 867 | /** |
@@ -814,6 +871,7 @@ static int update_tasks_cpumask(struct cpuset *cs) | |||
814 | */ | 871 | */ |
815 | static int update_cpumask(struct cpuset *cs, const char *buf) | 872 | static int update_cpumask(struct cpuset *cs, const char *buf) |
816 | { | 873 | { |
874 | struct ptr_heap heap; | ||
817 | struct cpuset trialcs; | 875 | struct cpuset trialcs; |
818 | int retval; | 876 | int retval; |
819 | int is_load_balanced; | 877 | int is_load_balanced; |
@@ -848,6 +906,10 @@ static int update_cpumask(struct cpuset *cs, const char *buf) | |||
848 | if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) | 906 | if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) |
849 | return 0; | 907 | return 0; |
850 | 908 | ||
909 | retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); | ||
910 | if (retval) | ||
911 | return retval; | ||
912 | |||
851 | is_load_balanced = is_sched_load_balance(&trialcs); | 913 | is_load_balanced = is_sched_load_balance(&trialcs); |
852 | 914 | ||
853 | mutex_lock(&callback_mutex); | 915 | mutex_lock(&callback_mutex); |
@@ -858,12 +920,12 @@ static int update_cpumask(struct cpuset *cs, const char *buf) | |||
858 | * Scan tasks in the cpuset, and update the cpumasks of any | 920 | * Scan tasks in the cpuset, and update the cpumasks of any |
859 | * that need an update. | 921 | * that need an update. |
860 | */ | 922 | */ |
861 | retval = update_tasks_cpumask(cs); | 923 | update_tasks_cpumask(cs, &heap); |
862 | if (retval < 0) | 924 | |
863 | return retval; | 925 | heap_free(&heap); |
864 | 926 | ||
865 | if (is_load_balanced) | 927 | if (is_load_balanced) |
866 | rebuild_sched_domains(); | 928 | async_rebuild_sched_domains(); |
867 | return 0; | 929 | return 0; |
868 | } | 930 | } |
869 | 931 | ||
@@ -1090,7 +1152,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) | |||
1090 | if (val != cs->relax_domain_level) { | 1152 | if (val != cs->relax_domain_level) { |
1091 | cs->relax_domain_level = val; | 1153 | cs->relax_domain_level = val; |
1092 | if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) | 1154 | if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) |
1093 | rebuild_sched_domains(); | 1155 | async_rebuild_sched_domains(); |
1094 | } | 1156 | } |
1095 | 1157 | ||
1096 | return 0; | 1158 | return 0; |
@@ -1131,7 +1193,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, | |||
1131 | mutex_unlock(&callback_mutex); | 1193 | mutex_unlock(&callback_mutex); |
1132 | 1194 | ||
1133 | if (cpus_nonempty && balance_flag_changed) | 1195 | if (cpus_nonempty && balance_flag_changed) |
1134 | rebuild_sched_domains(); | 1196 | async_rebuild_sched_domains(); |
1135 | 1197 | ||
1136 | return 0; | 1198 | return 0; |
1137 | } | 1199 | } |
@@ -1492,6 +1554,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft) | |||
1492 | default: | 1554 | default: |
1493 | BUG(); | 1555 | BUG(); |
1494 | } | 1556 | } |
1557 | |||
1558 | /* Unreachable but makes gcc happy */ | ||
1559 | return 0; | ||
1495 | } | 1560 | } |
1496 | 1561 | ||
1497 | static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) | 1562 | static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) |
@@ -1504,6 +1569,9 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) | |||
1504 | default: | 1569 | default: |
1505 | BUG(); | 1570 | BUG(); |
1506 | } | 1571 | } |
1572 | |||
1573 | /* Unrechable but makes gcc happy */ | ||
1574 | return 0; | ||
1507 | } | 1575 | } |
1508 | 1576 | ||
1509 | 1577 | ||
@@ -1692,15 +1760,9 @@ static struct cgroup_subsys_state *cpuset_create( | |||
1692 | } | 1760 | } |
1693 | 1761 | ||
1694 | /* | 1762 | /* |
1695 | * Locking note on the strange update_flag() call below: | ||
1696 | * | ||
1697 | * If the cpuset being removed has its flag 'sched_load_balance' | 1763 | * If the cpuset being removed has its flag 'sched_load_balance' |
1698 | * enabled, then simulate turning sched_load_balance off, which | 1764 | * enabled, then simulate turning sched_load_balance off, which |
1699 | * will call rebuild_sched_domains(). The get_online_cpus() | 1765 | * will call async_rebuild_sched_domains(). |
1700 | * call in rebuild_sched_domains() must not be made while holding | ||
1701 | * callback_mutex. Elsewhere the kernel nests callback_mutex inside | ||
1702 | * get_online_cpus() calls. So the reverse nesting would risk an | ||
1703 | * ABBA deadlock. | ||
1704 | */ | 1766 | */ |
1705 | 1767 | ||
1706 | static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) | 1768 | static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) |
@@ -1719,7 +1781,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) | |||
1719 | struct cgroup_subsys cpuset_subsys = { | 1781 | struct cgroup_subsys cpuset_subsys = { |
1720 | .name = "cpuset", | 1782 | .name = "cpuset", |
1721 | .create = cpuset_create, | 1783 | .create = cpuset_create, |
1722 | .destroy = cpuset_destroy, | 1784 | .destroy = cpuset_destroy, |
1723 | .can_attach = cpuset_can_attach, | 1785 | .can_attach = cpuset_can_attach, |
1724 | .attach = cpuset_attach, | 1786 | .attach = cpuset_attach, |
1725 | .populate = cpuset_populate, | 1787 | .populate = cpuset_populate, |
@@ -1811,7 +1873,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) | |||
1811 | } | 1873 | } |
1812 | 1874 | ||
1813 | /* | 1875 | /* |
1814 | * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs | 1876 | * If CPU and/or memory hotplug handlers, below, unplug any CPUs |
1815 | * or memory nodes, we need to walk over the cpuset hierarchy, | 1877 | * or memory nodes, we need to walk over the cpuset hierarchy, |
1816 | * removing that CPU or node from all cpusets. If this removes the | 1878 | * removing that CPU or node from all cpusets. If this removes the |
1817 | * last CPU or node from a cpuset, then move the tasks in the empty | 1879 | * last CPU or node from a cpuset, then move the tasks in the empty |
@@ -1859,7 +1921,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) | |||
1859 | * that has tasks along with an empty 'mems'. But if we did see such | 1921 | * that has tasks along with an empty 'mems'. But if we did see such |
1860 | * a cpuset, we'd handle it just like we do if its 'cpus' was empty. | 1922 | * a cpuset, we'd handle it just like we do if its 'cpus' was empty. |
1861 | */ | 1923 | */ |
1862 | static void scan_for_empty_cpusets(const struct cpuset *root) | 1924 | static void scan_for_empty_cpusets(struct cpuset *root) |
1863 | { | 1925 | { |
1864 | LIST_HEAD(queue); | 1926 | LIST_HEAD(queue); |
1865 | struct cpuset *cp; /* scans cpusets being updated */ | 1927 | struct cpuset *cp; /* scans cpusets being updated */ |
@@ -1896,42 +1958,13 @@ static void scan_for_empty_cpusets(const struct cpuset *root) | |||
1896 | nodes_empty(cp->mems_allowed)) | 1958 | nodes_empty(cp->mems_allowed)) |
1897 | remove_tasks_in_empty_cpuset(cp); | 1959 | remove_tasks_in_empty_cpuset(cp); |
1898 | else { | 1960 | else { |
1899 | update_tasks_cpumask(cp); | 1961 | update_tasks_cpumask(cp, NULL); |
1900 | update_tasks_nodemask(cp, &oldmems); | 1962 | update_tasks_nodemask(cp, &oldmems); |
1901 | } | 1963 | } |
1902 | } | 1964 | } |
1903 | } | 1965 | } |
1904 | 1966 | ||
1905 | /* | 1967 | /* |
1906 | * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track | ||
1907 | * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to | ||
1908 | * track what's online after any CPU or memory node hotplug or unplug event. | ||
1909 | * | ||
1910 | * Since there are two callers of this routine, one for CPU hotplug | ||
1911 | * events and one for memory node hotplug events, we could have coded | ||
1912 | * two separate routines here. We code it as a single common routine | ||
1913 | * in order to minimize text size. | ||
1914 | */ | ||
1915 | |||
1916 | static void common_cpu_mem_hotplug_unplug(int rebuild_sd) | ||
1917 | { | ||
1918 | cgroup_lock(); | ||
1919 | |||
1920 | top_cpuset.cpus_allowed = cpu_online_map; | ||
1921 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | ||
1922 | scan_for_empty_cpusets(&top_cpuset); | ||
1923 | |||
1924 | /* | ||
1925 | * Scheduler destroys domains on hotplug events. | ||
1926 | * Rebuild them based on the current settings. | ||
1927 | */ | ||
1928 | if (rebuild_sd) | ||
1929 | rebuild_sched_domains(); | ||
1930 | |||
1931 | cgroup_unlock(); | ||
1932 | } | ||
1933 | |||
1934 | /* | ||
1935 | * The top_cpuset tracks what CPUs and Memory Nodes are online, | 1968 | * The top_cpuset tracks what CPUs and Memory Nodes are online, |
1936 | * period. This is necessary in order to make cpusets transparent | 1969 | * period. This is necessary in order to make cpusets transparent |
1937 | * (of no affect) on systems that are actively using CPU hotplug | 1970 | * (of no affect) on systems that are actively using CPU hotplug |
@@ -1939,40 +1972,52 @@ static void common_cpu_mem_hotplug_unplug(int rebuild_sd) | |||
1939 | * | 1972 | * |
1940 | * This routine ensures that top_cpuset.cpus_allowed tracks | 1973 | * This routine ensures that top_cpuset.cpus_allowed tracks |
1941 | * cpu_online_map on each CPU hotplug (cpuhp) event. | 1974 | * cpu_online_map on each CPU hotplug (cpuhp) event. |
1975 | * | ||
1976 | * Called within get_online_cpus(). Needs to call cgroup_lock() | ||
1977 | * before calling generate_sched_domains(). | ||
1942 | */ | 1978 | */ |
1943 | 1979 | static int cpuset_track_online_cpus(struct notifier_block *unused_nb, | |
1944 | static int cpuset_handle_cpuhp(struct notifier_block *unused_nb, | ||
1945 | unsigned long phase, void *unused_cpu) | 1980 | unsigned long phase, void *unused_cpu) |
1946 | { | 1981 | { |
1982 | struct sched_domain_attr *attr; | ||
1983 | cpumask_t *doms; | ||
1984 | int ndoms; | ||
1985 | |||
1947 | switch (phase) { | 1986 | switch (phase) { |
1948 | case CPU_UP_CANCELED: | ||
1949 | case CPU_UP_CANCELED_FROZEN: | ||
1950 | case CPU_DOWN_FAILED: | ||
1951 | case CPU_DOWN_FAILED_FROZEN: | ||
1952 | case CPU_ONLINE: | 1987 | case CPU_ONLINE: |
1953 | case CPU_ONLINE_FROZEN: | 1988 | case CPU_ONLINE_FROZEN: |
1954 | case CPU_DEAD: | 1989 | case CPU_DEAD: |
1955 | case CPU_DEAD_FROZEN: | 1990 | case CPU_DEAD_FROZEN: |
1956 | common_cpu_mem_hotplug_unplug(1); | ||
1957 | break; | 1991 | break; |
1992 | |||
1958 | default: | 1993 | default: |
1959 | return NOTIFY_DONE; | 1994 | return NOTIFY_DONE; |
1960 | } | 1995 | } |
1961 | 1996 | ||
1997 | cgroup_lock(); | ||
1998 | top_cpuset.cpus_allowed = cpu_online_map; | ||
1999 | scan_for_empty_cpusets(&top_cpuset); | ||
2000 | ndoms = generate_sched_domains(&doms, &attr); | ||
2001 | cgroup_unlock(); | ||
2002 | |||
2003 | /* Have scheduler rebuild the domains */ | ||
2004 | partition_sched_domains(ndoms, doms, attr); | ||
2005 | |||
1962 | return NOTIFY_OK; | 2006 | return NOTIFY_OK; |
1963 | } | 2007 | } |
1964 | 2008 | ||
1965 | #ifdef CONFIG_MEMORY_HOTPLUG | 2009 | #ifdef CONFIG_MEMORY_HOTPLUG |
1966 | /* | 2010 | /* |
1967 | * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. | 2011 | * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. |
1968 | * Call this routine anytime after you change | 2012 | * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. |
1969 | * node_states[N_HIGH_MEMORY]. | 2013 | * See also the previous routine cpuset_track_online_cpus(). |
1970 | * See also the previous routine cpuset_handle_cpuhp(). | ||
1971 | */ | 2014 | */ |
1972 | |||
1973 | void cpuset_track_online_nodes(void) | 2015 | void cpuset_track_online_nodes(void) |
1974 | { | 2016 | { |
1975 | common_cpu_mem_hotplug_unplug(0); | 2017 | cgroup_lock(); |
2018 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | ||
2019 | scan_for_empty_cpusets(&top_cpuset); | ||
2020 | cgroup_unlock(); | ||
1976 | } | 2021 | } |
1977 | #endif | 2022 | #endif |
1978 | 2023 | ||
@@ -1987,7 +2032,7 @@ void __init cpuset_init_smp(void) | |||
1987 | top_cpuset.cpus_allowed = cpu_online_map; | 2032 | top_cpuset.cpus_allowed = cpu_online_map; |
1988 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | 2033 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; |
1989 | 2034 | ||
1990 | hotcpu_notifier(cpuset_handle_cpuhp, 0); | 2035 | hotcpu_notifier(cpuset_track_online_cpus, 0); |
1991 | } | 2036 | } |
1992 | 2037 | ||
1993 | /** | 2038 | /** |