diff options
Diffstat (limited to 'kernel/cpuset.c')
| -rw-r--r-- | kernel/cpuset.c | 349 |
1 files changed, 197 insertions, 152 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index d5ab79cf516d..827cd9adccb2 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
| @@ -14,6 +14,8 @@ | |||
| 14 | * 2003-10-22 Updates by Stephen Hemminger. | 14 | * 2003-10-22 Updates by Stephen Hemminger. |
| 15 | * 2004 May-July Rework by Paul Jackson. | 15 | * 2004 May-July Rework by Paul Jackson. |
| 16 | * 2006 Rework by Paul Menage to use generic cgroups | 16 | * 2006 Rework by Paul Menage to use generic cgroups |
| 17 | * 2008 Rework of the scheduler domains and CPU hotplug handling | ||
| 18 | * by Max Krasnyansky | ||
| 17 | * | 19 | * |
| 18 | * This file is subject to the terms and conditions of the GNU General Public | 20 | * This file is subject to the terms and conditions of the GNU General Public |
| 19 | * License. See the file COPYING in the main directory of the Linux | 21 | * License. See the file COPYING in the main directory of the Linux |
| @@ -236,9 +238,11 @@ static struct cpuset top_cpuset = { | |||
| 236 | 238 | ||
| 237 | static DEFINE_MUTEX(callback_mutex); | 239 | static DEFINE_MUTEX(callback_mutex); |
| 238 | 240 | ||
| 239 | /* This is ugly, but preserves the userspace API for existing cpuset | 241 | /* |
| 242 | * This is ugly, but preserves the userspace API for existing cpuset | ||
| 240 | * users. If someone tries to mount the "cpuset" filesystem, we | 243 | * users. If someone tries to mount the "cpuset" filesystem, we |
| 241 | * silently switch it to mount "cgroup" instead */ | 244 | * silently switch it to mount "cgroup" instead |
| 245 | */ | ||
| 242 | static int cpuset_get_sb(struct file_system_type *fs_type, | 246 | static int cpuset_get_sb(struct file_system_type *fs_type, |
| 243 | int flags, const char *unused_dev_name, | 247 | int flags, const char *unused_dev_name, |
| 244 | void *data, struct vfsmount *mnt) | 248 | void *data, struct vfsmount *mnt) |
| @@ -473,10 +477,9 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
| 473 | } | 477 | } |
| 474 | 478 | ||
| 475 | /* | 479 | /* |
| 476 | * Helper routine for rebuild_sched_domains(). | 480 | * Helper routine for generate_sched_domains(). |
| 477 | * Do cpusets a, b have overlapping cpus_allowed masks? | 481 | * Do cpusets a, b have overlapping cpus_allowed masks? |
| 478 | */ | 482 | */ |
| 479 | |||
| 480 | static int cpusets_overlap(struct cpuset *a, struct cpuset *b) | 483 | static int cpusets_overlap(struct cpuset *a, struct cpuset *b) |
| 481 | { | 484 | { |
| 482 | return cpus_intersects(a->cpus_allowed, b->cpus_allowed); | 485 | return cpus_intersects(a->cpus_allowed, b->cpus_allowed); |
| @@ -518,26 +521,15 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) | |||
| 518 | } | 521 | } |
| 519 | 522 | ||
| 520 | /* | 523 | /* |
| 521 | * rebuild_sched_domains() | 524 | * generate_sched_domains() |
| 522 | * | 525 | * |
| 523 | * This routine will be called to rebuild the scheduler's dynamic | 526 | * This function builds a partial partition of the system's CPUs. |
| 524 | * sched domains: | 527 | * A 'partial partition' is a set of non-overlapping subsets whose |
| 525 | * - if the flag 'sched_load_balance' of any cpuset with non-empty | 528 | * union is a subset of that set. |
| 526 | * 'cpus' changes, | 529 | * The output of this function needs to be passed to kernel/sched.c |
| 527 | * - or if the 'cpus' allowed changes in any cpuset which has that | 530 | * partition_sched_domains() routine, which will rebuild the scheduler's |
| 528 | * flag enabled, | 531 | * load balancing domains (sched domains) as specified by that partial |
| 529 | * - or if the 'sched_relax_domain_level' of any cpuset which has | 532 | * partition. |
| 530 | * that flag enabled and with non-empty 'cpus' changes, | ||
| 531 | * - or if any cpuset with non-empty 'cpus' is removed, | ||
| 532 | * - or if a cpu gets offlined. | ||
| 533 | * | ||
| 534 | * This routine builds a partial partition of the systems CPUs | ||
| 535 | * (the set of non-overlappping cpumask_t's in the array 'part' | ||
| 536 | * below), and passes that partial partition to the kernel/sched.c | ||
| 537 | * partition_sched_domains() routine, which will rebuild the | ||
| 538 | * schedulers load balancing domains (sched domains) as specified | ||
| 539 | * by that partial partition. A 'partial partition' is a set of | ||
| 540 | * non-overlapping subsets whose union is a subset of that set. | ||
| 541 | * | 533 | * |
| 542 | * See "What is sched_load_balance" in Documentation/cpusets.txt | 534 | * See "What is sched_load_balance" in Documentation/cpusets.txt |
| 543 | * for a background explanation of this. | 535 | * for a background explanation of this. |
| @@ -547,13 +539,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) | |||
| 547 | * domains when operating in the severe memory shortage situations | 539 | * domains when operating in the severe memory shortage situations |
| 548 | * that could cause allocation failures below. | 540 | * that could cause allocation failures below. |
| 549 | * | 541 | * |
| 550 | * Call with cgroup_mutex held. May take callback_mutex during | 542 | * Must be called with cgroup_lock held. |
| 551 | * call due to the kfifo_alloc() and kmalloc() calls. May nest | ||
| 552 | * a call to the get_online_cpus()/put_online_cpus() pair. | ||
| 553 | * Must not be called holding callback_mutex, because we must not | ||
| 554 | * call get_online_cpus() while holding callback_mutex. Elsewhere | ||
| 555 | * the kernel nests callback_mutex inside get_online_cpus() calls. | ||
| 556 | * So the reverse nesting would risk an ABBA deadlock. | ||
| 557 | * | 543 | * |
| 558 | * The three key local variables below are: | 544 | * The three key local variables below are: |
| 559 | * q - a linked-list queue of cpuset pointers, used to implement a | 545 | * q - a linked-list queue of cpuset pointers, used to implement a |
| @@ -588,10 +574,10 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) | |||
| 588 | * element of the partition (one sched domain) to be passed to | 574 | * element of the partition (one sched domain) to be passed to |
| 589 | * partition_sched_domains(). | 575 | * partition_sched_domains(). |
| 590 | */ | 576 | */ |
| 591 | 577 | static int generate_sched_domains(cpumask_t **domains, | |
| 592 | void rebuild_sched_domains(void) | 578 | struct sched_domain_attr **attributes) |
| 593 | { | 579 | { |
| 594 | LIST_HEAD(q); /* queue of cpusets to be scanned*/ | 580 | LIST_HEAD(q); /* queue of cpusets to be scanned */ |
| 595 | struct cpuset *cp; /* scans q */ | 581 | struct cpuset *cp; /* scans q */ |
| 596 | struct cpuset **csa; /* array of all cpuset ptrs */ | 582 | struct cpuset **csa; /* array of all cpuset ptrs */ |
| 597 | int csn; /* how many cpuset ptrs in csa so far */ | 583 | int csn; /* how many cpuset ptrs in csa so far */ |
| @@ -601,23 +587,26 @@ void rebuild_sched_domains(void) | |||
| 601 | int ndoms; /* number of sched domains in result */ | 587 | int ndoms; /* number of sched domains in result */ |
| 602 | int nslot; /* next empty doms[] cpumask_t slot */ | 588 | int nslot; /* next empty doms[] cpumask_t slot */ |
| 603 | 589 | ||
| 604 | csa = NULL; | 590 | ndoms = 0; |
| 605 | doms = NULL; | 591 | doms = NULL; |
| 606 | dattr = NULL; | 592 | dattr = NULL; |
| 593 | csa = NULL; | ||
| 607 | 594 | ||
| 608 | /* Special case for the 99% of systems with one, full, sched domain */ | 595 | /* Special case for the 99% of systems with one, full, sched domain */ |
| 609 | if (is_sched_load_balance(&top_cpuset)) { | 596 | if (is_sched_load_balance(&top_cpuset)) { |
| 610 | ndoms = 1; | ||
| 611 | doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); | 597 | doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); |
| 612 | if (!doms) | 598 | if (!doms) |
| 613 | goto rebuild; | 599 | goto done; |
| 600 | |||
| 614 | dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); | 601 | dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); |
| 615 | if (dattr) { | 602 | if (dattr) { |
| 616 | *dattr = SD_ATTR_INIT; | 603 | *dattr = SD_ATTR_INIT; |
| 617 | update_domain_attr_tree(dattr, &top_cpuset); | 604 | update_domain_attr_tree(dattr, &top_cpuset); |
| 618 | } | 605 | } |
| 619 | *doms = top_cpuset.cpus_allowed; | 606 | *doms = top_cpuset.cpus_allowed; |
| 620 | goto rebuild; | 607 | |
| 608 | ndoms = 1; | ||
| 609 | goto done; | ||
| 621 | } | 610 | } |
| 622 | 611 | ||
| 623 | csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); | 612 | csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); |
| @@ -680,61 +669,141 @@ restart: | |||
| 680 | } | 669 | } |
| 681 | } | 670 | } |
| 682 | 671 | ||
| 683 | /* Convert <csn, csa> to <ndoms, doms> */ | 672 | /* |
| 673 | * Now we know how many domains to create. | ||
| 674 | * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. | ||
| 675 | */ | ||
| 684 | doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); | 676 | doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); |
| 685 | if (!doms) | 677 | if (!doms) { |
| 686 | goto rebuild; | 678 | ndoms = 0; |
| 679 | goto done; | ||
| 680 | } | ||
| 681 | |||
| 682 | /* | ||
| 683 | * The rest of the code, including the scheduler, can deal with | ||
| 684 | * dattr==NULL case. No need to abort if alloc fails. | ||
| 685 | */ | ||
| 687 | dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); | 686 | dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); |
| 688 | 687 | ||
| 689 | for (nslot = 0, i = 0; i < csn; i++) { | 688 | for (nslot = 0, i = 0; i < csn; i++) { |
| 690 | struct cpuset *a = csa[i]; | 689 | struct cpuset *a = csa[i]; |
| 690 | cpumask_t *dp; | ||
| 691 | int apn = a->pn; | 691 | int apn = a->pn; |
| 692 | 692 | ||
| 693 | if (apn >= 0) { | 693 | if (apn < 0) { |
| 694 | cpumask_t *dp = doms + nslot; | 694 | /* Skip completed partitions */ |
| 695 | 695 | continue; | |
| 696 | if (nslot == ndoms) { | 696 | } |
| 697 | static int warnings = 10; | 697 | |
| 698 | if (warnings) { | 698 | dp = doms + nslot; |
| 699 | printk(KERN_WARNING | 699 | |
| 700 | "rebuild_sched_domains confused:" | 700 | if (nslot == ndoms) { |
| 701 | " nslot %d, ndoms %d, csn %d, i %d," | 701 | static int warnings = 10; |
| 702 | " apn %d\n", | 702 | if (warnings) { |
| 703 | nslot, ndoms, csn, i, apn); | 703 | printk(KERN_WARNING |
| 704 | warnings--; | 704 | "rebuild_sched_domains confused:" |
| 705 | } | 705 | " nslot %d, ndoms %d, csn %d, i %d," |
| 706 | continue; | 706 | " apn %d\n", |
| 707 | nslot, ndoms, csn, i, apn); | ||
| 708 | warnings--; | ||
| 707 | } | 709 | } |
| 710 | continue; | ||
| 711 | } | ||
| 708 | 712 | ||
| 709 | cpus_clear(*dp); | 713 | cpus_clear(*dp); |
| 710 | if (dattr) | 714 | if (dattr) |
| 711 | *(dattr + nslot) = SD_ATTR_INIT; | 715 | *(dattr + nslot) = SD_ATTR_INIT; |
| 712 | for (j = i; j < csn; j++) { | 716 | for (j = i; j < csn; j++) { |
| 713 | struct cpuset *b = csa[j]; | 717 | struct cpuset *b = csa[j]; |
| 714 | 718 | ||
| 715 | if (apn == b->pn) { | 719 | if (apn == b->pn) { |
| 716 | cpus_or(*dp, *dp, b->cpus_allowed); | 720 | cpus_or(*dp, *dp, b->cpus_allowed); |
| 717 | b->pn = -1; | 721 | if (dattr) |
| 718 | if (dattr) | 722 | update_domain_attr_tree(dattr + nslot, b); |
| 719 | update_domain_attr_tree(dattr | 723 | |
| 720 | + nslot, b); | 724 | /* Done with this partition */ |
| 721 | } | 725 | b->pn = -1; |
| 722 | } | 726 | } |
| 723 | nslot++; | ||
| 724 | } | 727 | } |
| 728 | nslot++; | ||
| 725 | } | 729 | } |
| 726 | BUG_ON(nslot != ndoms); | 730 | BUG_ON(nslot != ndoms); |
| 727 | 731 | ||
| 728 | rebuild: | 732 | done: |
| 729 | /* Have scheduler rebuild sched domains */ | 733 | kfree(csa); |
| 734 | |||
| 735 | *domains = doms; | ||
| 736 | *attributes = dattr; | ||
| 737 | return ndoms; | ||
| 738 | } | ||
| 739 | |||
| 740 | /* | ||
| 741 | * Rebuild scheduler domains. | ||
| 742 | * | ||
| 743 | * Call with neither cgroup_mutex held nor within get_online_cpus(). | ||
| 744 | * Takes both cgroup_mutex and get_online_cpus(). | ||
| 745 | * | ||
| 746 | * Cannot be directly called from cpuset code handling changes | ||
| 747 | * to the cpuset pseudo-filesystem, because it cannot be called | ||
| 748 | * from code that already holds cgroup_mutex. | ||
| 749 | */ | ||
| 750 | static void do_rebuild_sched_domains(struct work_struct *unused) | ||
| 751 | { | ||
| 752 | struct sched_domain_attr *attr; | ||
| 753 | cpumask_t *doms; | ||
| 754 | int ndoms; | ||
| 755 | |||
| 730 | get_online_cpus(); | 756 | get_online_cpus(); |
| 731 | partition_sched_domains(ndoms, doms, dattr); | 757 | |
| 758 | /* Generate domain masks and attrs */ | ||
| 759 | cgroup_lock(); | ||
| 760 | ndoms = generate_sched_domains(&doms, &attr); | ||
| 761 | cgroup_unlock(); | ||
| 762 | |||
| 763 | /* Have scheduler rebuild the domains */ | ||
| 764 | partition_sched_domains(ndoms, doms, attr); | ||
| 765 | |||
| 732 | put_online_cpus(); | 766 | put_online_cpus(); |
| 767 | } | ||
| 733 | 768 | ||
| 734 | done: | 769 | static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); |
| 735 | kfree(csa); | 770 | |
| 736 | /* Don't kfree(doms) -- partition_sched_domains() does that. */ | 771 | /* |
| 737 | /* Don't kfree(dattr) -- partition_sched_domains() does that. */ | 772 | * Rebuild scheduler domains, asynchronously via workqueue. |
| 773 | * | ||
| 774 | * If the flag 'sched_load_balance' of any cpuset with non-empty | ||
| 775 | * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset | ||
| 776 | * which has that flag enabled, or if any cpuset with a non-empty | ||
| 777 | * 'cpus' is removed, then call this routine to rebuild the | ||
| 778 | * scheduler's dynamic sched domains. | ||
| 779 | * | ||
| 780 | * The rebuild_sched_domains() and partition_sched_domains() | ||
| 781 | * routines must nest cgroup_lock() inside get_online_cpus(), | ||
| 782 | * but such cpuset changes as these must nest that locking the | ||
| 783 | * other way, holding cgroup_lock() for much of the code. | ||
| 784 | * | ||
| 785 | * So in order to avoid an ABBA deadlock, the cpuset code handling | ||
| 786 | * these user changes delegates the actual sched domain rebuilding | ||
| 787 | * to a separate workqueue thread, which ends up processing the | ||
| 788 | * above do_rebuild_sched_domains() function. | ||
| 789 | */ | ||
| 790 | static void async_rebuild_sched_domains(void) | ||
| 791 | { | ||
| 792 | schedule_work(&rebuild_sched_domains_work); | ||
| 793 | } | ||
| 794 | |||
| 795 | /* | ||
| 796 | * Accomplishes the same scheduler domain rebuild as the above | ||
| 797 | * async_rebuild_sched_domains(), however it directly calls the | ||
| 798 | * rebuild routine synchronously rather than calling it via an | ||
| 799 | * asynchronous work thread. | ||
| 800 | * | ||
| 801 | * This can only be called from code that is not holding | ||
| 802 | * cgroup_mutex (not nested in a cgroup_lock() call.) | ||
| 803 | */ | ||
| 804 | void rebuild_sched_domains(void) | ||
| 805 | { | ||
| 806 | do_rebuild_sched_domains(NULL); | ||
| 738 | } | 807 | } |
| 739 | 808 | ||
| 740 | /** | 809 | /** |
| @@ -774,37 +843,25 @@ static void cpuset_change_cpumask(struct task_struct *tsk, | |||
| 774 | /** | 843 | /** |
| 775 | * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. | 844 | * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. |
| 776 | * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed | 845 | * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed |
| 846 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() | ||
| 777 | * | 847 | * |
| 778 | * Called with cgroup_mutex held | 848 | * Called with cgroup_mutex held |
| 779 | * | 849 | * |
| 780 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, | 850 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, |
| 781 | * calling callback functions for each. | 851 | * calling callback functions for each. |
| 782 | * | 852 | * |
| 783 | * Return 0 if successful, -errno if not. | 853 | * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 |
| 854 | * if @heap != NULL. | ||
| 784 | */ | 855 | */ |
| 785 | static int update_tasks_cpumask(struct cpuset *cs) | 856 | static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) |
| 786 | { | 857 | { |
| 787 | struct cgroup_scanner scan; | 858 | struct cgroup_scanner scan; |
| 788 | struct ptr_heap heap; | ||
| 789 | int retval; | ||
| 790 | |||
| 791 | /* | ||
| 792 | * cgroup_scan_tasks() will initialize heap->gt for us. | ||
| 793 | * heap_init() is still needed here for we should not change | ||
| 794 | * cs->cpus_allowed when heap_init() fails. | ||
| 795 | */ | ||
| 796 | retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); | ||
| 797 | if (retval) | ||
| 798 | return retval; | ||
| 799 | 859 | ||
| 800 | scan.cg = cs->css.cgroup; | 860 | scan.cg = cs->css.cgroup; |
| 801 | scan.test_task = cpuset_test_cpumask; | 861 | scan.test_task = cpuset_test_cpumask; |
| 802 | scan.process_task = cpuset_change_cpumask; | 862 | scan.process_task = cpuset_change_cpumask; |
| 803 | scan.heap = &heap; | 863 | scan.heap = heap; |
| 804 | retval = cgroup_scan_tasks(&scan); | 864 | cgroup_scan_tasks(&scan); |
| 805 | |||
| 806 | heap_free(&heap); | ||
| 807 | return retval; | ||
| 808 | } | 865 | } |
| 809 | 866 | ||
| 810 | /** | 867 | /** |
| @@ -814,6 +871,7 @@ static int update_tasks_cpumask(struct cpuset *cs) | |||
| 814 | */ | 871 | */ |
| 815 | static int update_cpumask(struct cpuset *cs, const char *buf) | 872 | static int update_cpumask(struct cpuset *cs, const char *buf) |
| 816 | { | 873 | { |
| 874 | struct ptr_heap heap; | ||
| 817 | struct cpuset trialcs; | 875 | struct cpuset trialcs; |
| 818 | int retval; | 876 | int retval; |
| 819 | int is_load_balanced; | 877 | int is_load_balanced; |
| @@ -848,6 +906,10 @@ static int update_cpumask(struct cpuset *cs, const char *buf) | |||
| 848 | if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) | 906 | if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) |
| 849 | return 0; | 907 | return 0; |
| 850 | 908 | ||
| 909 | retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); | ||
| 910 | if (retval) | ||
| 911 | return retval; | ||
| 912 | |||
| 851 | is_load_balanced = is_sched_load_balance(&trialcs); | 913 | is_load_balanced = is_sched_load_balance(&trialcs); |
| 852 | 914 | ||
| 853 | mutex_lock(&callback_mutex); | 915 | mutex_lock(&callback_mutex); |
| @@ -858,12 +920,12 @@ static int update_cpumask(struct cpuset *cs, const char *buf) | |||
| 858 | * Scan tasks in the cpuset, and update the cpumasks of any | 920 | * Scan tasks in the cpuset, and update the cpumasks of any |
| 859 | * that need an update. | 921 | * that need an update. |
| 860 | */ | 922 | */ |
| 861 | retval = update_tasks_cpumask(cs); | 923 | update_tasks_cpumask(cs, &heap); |
| 862 | if (retval < 0) | 924 | |
| 863 | return retval; | 925 | heap_free(&heap); |
| 864 | 926 | ||
| 865 | if (is_load_balanced) | 927 | if (is_load_balanced) |
| 866 | rebuild_sched_domains(); | 928 | async_rebuild_sched_domains(); |
| 867 | return 0; | 929 | return 0; |
| 868 | } | 930 | } |
| 869 | 931 | ||
| @@ -1090,7 +1152,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) | |||
| 1090 | if (val != cs->relax_domain_level) { | 1152 | if (val != cs->relax_domain_level) { |
| 1091 | cs->relax_domain_level = val; | 1153 | cs->relax_domain_level = val; |
| 1092 | if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) | 1154 | if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) |
| 1093 | rebuild_sched_domains(); | 1155 | async_rebuild_sched_domains(); |
| 1094 | } | 1156 | } |
| 1095 | 1157 | ||
| 1096 | return 0; | 1158 | return 0; |
| @@ -1131,7 +1193,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, | |||
| 1131 | mutex_unlock(&callback_mutex); | 1193 | mutex_unlock(&callback_mutex); |
| 1132 | 1194 | ||
| 1133 | if (cpus_nonempty && balance_flag_changed) | 1195 | if (cpus_nonempty && balance_flag_changed) |
| 1134 | rebuild_sched_domains(); | 1196 | async_rebuild_sched_domains(); |
| 1135 | 1197 | ||
| 1136 | return 0; | 1198 | return 0; |
| 1137 | } | 1199 | } |
| @@ -1492,6 +1554,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft) | |||
| 1492 | default: | 1554 | default: |
| 1493 | BUG(); | 1555 | BUG(); |
| 1494 | } | 1556 | } |
| 1557 | |||
| 1558 | /* Unreachable but makes gcc happy */ | ||
| 1559 | return 0; | ||
| 1495 | } | 1560 | } |
| 1496 | 1561 | ||
| 1497 | static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) | 1562 | static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) |
| @@ -1504,6 +1569,9 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) | |||
| 1504 | default: | 1569 | default: |
| 1505 | BUG(); | 1570 | BUG(); |
| 1506 | } | 1571 | } |
| 1572 | |||
| 1572 | /* Unreachable but makes gcc happy */ | ||
| 1574 | return 0; | ||
| 1507 | } | 1575 | } |
| 1508 | 1576 | ||
| 1509 | 1577 | ||
| @@ -1692,15 +1760,9 @@ static struct cgroup_subsys_state *cpuset_create( | |||
| 1692 | } | 1760 | } |
| 1693 | 1761 | ||
| 1694 | /* | 1762 | /* |
| 1695 | * Locking note on the strange update_flag() call below: | ||
| 1696 | * | ||
| 1697 | * If the cpuset being removed has its flag 'sched_load_balance' | 1763 | * If the cpuset being removed has its flag 'sched_load_balance' |
| 1698 | * enabled, then simulate turning sched_load_balance off, which | 1764 | * enabled, then simulate turning sched_load_balance off, which |
| 1699 | * will call rebuild_sched_domains(). The get_online_cpus() | 1765 | * will call async_rebuild_sched_domains(). |
| 1700 | * call in rebuild_sched_domains() must not be made while holding | ||
| 1701 | * callback_mutex. Elsewhere the kernel nests callback_mutex inside | ||
| 1702 | * get_online_cpus() calls. So the reverse nesting would risk an | ||
| 1703 | * ABBA deadlock. | ||
| 1704 | */ | 1766 | */ |
| 1705 | 1767 | ||
| 1706 | static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) | 1768 | static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) |
| @@ -1719,7 +1781,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) | |||
| 1719 | struct cgroup_subsys cpuset_subsys = { | 1781 | struct cgroup_subsys cpuset_subsys = { |
| 1720 | .name = "cpuset", | 1782 | .name = "cpuset", |
| 1721 | .create = cpuset_create, | 1783 | .create = cpuset_create, |
| 1722 | .destroy = cpuset_destroy, | 1784 | .destroy = cpuset_destroy, |
| 1723 | .can_attach = cpuset_can_attach, | 1785 | .can_attach = cpuset_can_attach, |
| 1724 | .attach = cpuset_attach, | 1786 | .attach = cpuset_attach, |
| 1725 | .populate = cpuset_populate, | 1787 | .populate = cpuset_populate, |
| @@ -1811,7 +1873,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) | |||
| 1811 | } | 1873 | } |
| 1812 | 1874 | ||
| 1813 | /* | 1875 | /* |
| 1814 | * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs | 1876 | * If CPU and/or memory hotplug handlers, below, unplug any CPUs |
| 1815 | * or memory nodes, we need to walk over the cpuset hierarchy, | 1877 | * or memory nodes, we need to walk over the cpuset hierarchy, |
| 1816 | * removing that CPU or node from all cpusets. If this removes the | 1878 | * removing that CPU or node from all cpusets. If this removes the |
| 1817 | * last CPU or node from a cpuset, then move the tasks in the empty | 1879 | * last CPU or node from a cpuset, then move the tasks in the empty |
| @@ -1896,42 +1958,13 @@ static void scan_for_empty_cpusets(const struct cpuset *root) | |||
| 1896 | nodes_empty(cp->mems_allowed)) | 1958 | nodes_empty(cp->mems_allowed)) |
| 1897 | remove_tasks_in_empty_cpuset(cp); | 1959 | remove_tasks_in_empty_cpuset(cp); |
| 1898 | else { | 1960 | else { |
| 1899 | update_tasks_cpumask(cp); | 1961 | update_tasks_cpumask(cp, NULL); |
| 1900 | update_tasks_nodemask(cp, &oldmems); | 1962 | update_tasks_nodemask(cp, &oldmems); |
| 1901 | } | 1963 | } |
| 1902 | } | 1964 | } |
| 1903 | } | 1965 | } |
| 1904 | 1966 | ||
| 1905 | /* | 1967 | /* |
| 1906 | * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track | ||
| 1907 | * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to | ||
| 1908 | * track what's online after any CPU or memory node hotplug or unplug event. | ||
| 1909 | * | ||
| 1910 | * Since there are two callers of this routine, one for CPU hotplug | ||
| 1911 | * events and one for memory node hotplug events, we could have coded | ||
| 1912 | * two separate routines here. We code it as a single common routine | ||
| 1913 | * in order to minimize text size. | ||
| 1914 | */ | ||
| 1915 | |||
| 1916 | static void common_cpu_mem_hotplug_unplug(int rebuild_sd) | ||
| 1917 | { | ||
| 1918 | cgroup_lock(); | ||
| 1919 | |||
| 1920 | top_cpuset.cpus_allowed = cpu_online_map; | ||
| 1921 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | ||
| 1922 | scan_for_empty_cpusets(&top_cpuset); | ||
| 1923 | |||
| 1924 | /* | ||
| 1925 | * Scheduler destroys domains on hotplug events. | ||
| 1926 | * Rebuild them based on the current settings. | ||
| 1927 | */ | ||
| 1928 | if (rebuild_sd) | ||
| 1929 | rebuild_sched_domains(); | ||
| 1930 | |||
| 1931 | cgroup_unlock(); | ||
| 1932 | } | ||
| 1933 | |||
| 1934 | /* | ||
| 1935 | * The top_cpuset tracks what CPUs and Memory Nodes are online, | 1968 | * The top_cpuset tracks what CPUs and Memory Nodes are online, |
| 1936 | * period. This is necessary in order to make cpusets transparent | 1969 | * period. This is necessary in order to make cpusets transparent |
| 1937 | * (of no effect) on systems that are actively using CPU hotplug | 1970 | * (of no effect) on systems that are actively using CPU hotplug |
| @@ -1939,40 +1972,52 @@ static void common_cpu_mem_hotplug_unplug(int rebuild_sd) | |||
| 1939 | * | 1972 | * |
| 1940 | * This routine ensures that top_cpuset.cpus_allowed tracks | 1973 | * This routine ensures that top_cpuset.cpus_allowed tracks |
| 1941 | * cpu_online_map on each CPU hotplug (cpuhp) event. | 1974 | * cpu_online_map on each CPU hotplug (cpuhp) event. |
| 1975 | * | ||
| 1976 | * Called within get_online_cpus(). Needs to call cgroup_lock() | ||
| 1977 | * before calling generate_sched_domains(). | ||
| 1942 | */ | 1978 | */ |
| 1943 | 1979 | static int cpuset_track_online_cpus(struct notifier_block *unused_nb, | |
| 1944 | static int cpuset_handle_cpuhp(struct notifier_block *unused_nb, | ||
| 1945 | unsigned long phase, void *unused_cpu) | 1980 | unsigned long phase, void *unused_cpu) |
| 1946 | { | 1981 | { |
| 1982 | struct sched_domain_attr *attr; | ||
| 1983 | cpumask_t *doms; | ||
| 1984 | int ndoms; | ||
| 1985 | |||
| 1947 | switch (phase) { | 1986 | switch (phase) { |
| 1948 | case CPU_UP_CANCELED: | ||
| 1949 | case CPU_UP_CANCELED_FROZEN: | ||
| 1950 | case CPU_DOWN_FAILED: | ||
| 1951 | case CPU_DOWN_FAILED_FROZEN: | ||
| 1952 | case CPU_ONLINE: | 1987 | case CPU_ONLINE: |
| 1953 | case CPU_ONLINE_FROZEN: | 1988 | case CPU_ONLINE_FROZEN: |
| 1954 | case CPU_DEAD: | 1989 | case CPU_DEAD: |
| 1955 | case CPU_DEAD_FROZEN: | 1990 | case CPU_DEAD_FROZEN: |
| 1956 | common_cpu_mem_hotplug_unplug(1); | ||
| 1957 | break; | 1991 | break; |
| 1992 | |||
| 1958 | default: | 1993 | default: |
| 1959 | return NOTIFY_DONE; | 1994 | return NOTIFY_DONE; |
| 1960 | } | 1995 | } |
| 1961 | 1996 | ||
| 1997 | cgroup_lock(); | ||
| 1998 | top_cpuset.cpus_allowed = cpu_online_map; | ||
| 1999 | scan_for_empty_cpusets(&top_cpuset); | ||
| 2000 | ndoms = generate_sched_domains(&doms, &attr); | ||
| 2001 | cgroup_unlock(); | ||
| 2002 | |||
| 2003 | /* Have scheduler rebuild the domains */ | ||
| 2004 | partition_sched_domains(ndoms, doms, attr); | ||
| 2005 | |||
| 1962 | return NOTIFY_OK; | 2006 | return NOTIFY_OK; |
| 1963 | } | 2007 | } |
| 1964 | 2008 | ||
| 1965 | #ifdef CONFIG_MEMORY_HOTPLUG | 2009 | #ifdef CONFIG_MEMORY_HOTPLUG |
| 1966 | /* | 2010 | /* |
| 1967 | * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. | 2011 | * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. |
| 1968 | * Call this routine anytime after you change | 2012 | * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. |
| 1969 | * node_states[N_HIGH_MEMORY]. | 2013 | * See also the previous routine cpuset_track_online_cpus(). |
| 1970 | * See also the previous routine cpuset_handle_cpuhp(). | ||
| 1971 | */ | 2014 | */ |
| 1972 | |||
| 1973 | void cpuset_track_online_nodes(void) | 2015 | void cpuset_track_online_nodes(void) |
| 1974 | { | 2016 | { |
| 1975 | common_cpu_mem_hotplug_unplug(0); | 2017 | cgroup_lock(); |
| 2018 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | ||
| 2019 | scan_for_empty_cpusets(&top_cpuset); | ||
| 2020 | cgroup_unlock(); | ||
| 1976 | } | 2021 | } |
| 1977 | #endif | 2022 | #endif |
| 1978 | 2023 | ||
| @@ -1987,7 +2032,7 @@ void __init cpuset_init_smp(void) | |||
| 1987 | top_cpuset.cpus_allowed = cpu_online_map; | 2032 | top_cpuset.cpus_allowed = cpu_online_map; |
| 1988 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | 2033 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; |
| 1989 | 2034 | ||
| 1990 | hotcpu_notifier(cpuset_handle_cpuhp, 0); | 2035 | hotcpu_notifier(cpuset_track_online_cpus, 0); |
| 1991 | } | 2036 | } |
| 1992 | 2037 | ||
| 1993 | /** | 2038 | /** |
