-rw-r--r--   include/linux/cpuset.h |   2
-rw-r--r--   kernel/cpuset.c        | 312
-rw-r--r--   kernel/sched.c         |  19
3 files changed, 196 insertions(+), 137 deletions(-)
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index e8f450c499b0..2691926fb506 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
| @@ -160,7 +160,7 @@ static inline int current_cpuset_is_being_rebound(void) | |||
| 160 | 160 | ||
| 161 | static inline void rebuild_sched_domains(void) | 161 | static inline void rebuild_sched_domains(void) |
| 162 | { | 162 | { |
| 163 | partition_sched_domains(0, NULL, NULL); | 163 | partition_sched_domains(1, NULL, NULL); |
| 164 | } | 164 | } |
| 165 | 165 | ||
| 166 | #endif /* !CONFIG_CPUSETS */ | 166 | #endif /* !CONFIG_CPUSETS */ |
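Why the stub's argument changes from 0 to 1: with this patch, partition_sched_domains() treats ndoms_new == 0 as "destroy all sched domains and do not build the default one", while a NULL doms_new with ndoms_new >= 1 falls back to the single default domain (see the comment the patch adds in kernel/sched.c below). The !CONFIG_CPUSETS stub wants that default domain, hence the 1. A minimal sketch of the resulting call shapes; the wrapper function and its parameters are illustrative only and not part of the patch:

        /* Callers hold the hotplug lock (get_online_cpus()) around these. */
        static void example_calls(int ndoms, cpumask_t *doms,
                                  struct sched_domain_attr *attrs)
        {
                /* Tear down every sched domain and build nothing in its place. */
                partition_sched_domains(0, NULL, NULL);

                /* Fall back to one default domain spanning cpu_online_map minus
                 * cpu_isolated_map -- what the !CONFIG_CPUSETS stub above asks for. */
                partition_sched_domains(1, NULL, NULL);

                /* Install an explicit partial partition, e.g. the <ndoms, doms,
                 * attrs> triple produced by generate_sched_domains() in
                 * kernel/cpuset.c. */
                partition_sched_domains(ndoms, doms, attrs);
        }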
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d5ab79cf516d..f227bc172690 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
| @@ -14,6 +14,8 @@ | |||
| 14 | * 2003-10-22 Updates by Stephen Hemminger. | 14 | * 2003-10-22 Updates by Stephen Hemminger. |
| 15 | * 2004 May-July Rework by Paul Jackson. | 15 | * 2004 May-July Rework by Paul Jackson. |
| 16 | * 2006 Rework by Paul Menage to use generic cgroups | 16 | * 2006 Rework by Paul Menage to use generic cgroups |
| 17 | * 2008 Rework of the scheduler domains and CPU hotplug handling | ||
| 18 | * by Max Krasnyansky | ||
| 17 | * | 19 | * |
| 18 | * This file is subject to the terms and conditions of the GNU General Public | 20 | * This file is subject to the terms and conditions of the GNU General Public |
| 19 | * License. See the file COPYING in the main directory of the Linux | 21 | * License. See the file COPYING in the main directory of the Linux |
| @@ -236,9 +238,11 @@ static struct cpuset top_cpuset = { | |||
| 236 | 238 | ||
| 237 | static DEFINE_MUTEX(callback_mutex); | 239 | static DEFINE_MUTEX(callback_mutex); |
| 238 | 240 | ||
| 239 | /* This is ugly, but preserves the userspace API for existing cpuset | 241 | /* |
| 242 | * This is ugly, but preserves the userspace API for existing cpuset | ||
| 240 | * users. If someone tries to mount the "cpuset" filesystem, we | 243 | * users. If someone tries to mount the "cpuset" filesystem, we |
| 241 | * silently switch it to mount "cgroup" instead */ | 244 | * silently switch it to mount "cgroup" instead |
| 245 | */ | ||
| 242 | static int cpuset_get_sb(struct file_system_type *fs_type, | 246 | static int cpuset_get_sb(struct file_system_type *fs_type, |
| 243 | int flags, const char *unused_dev_name, | 247 | int flags, const char *unused_dev_name, |
| 244 | void *data, struct vfsmount *mnt) | 248 | void *data, struct vfsmount *mnt) |
| @@ -473,10 +477,9 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
| 473 | } | 477 | } |
| 474 | 478 | ||
| 475 | /* | 479 | /* |
| 476 | * Helper routine for rebuild_sched_domains(). | 480 | * Helper routine for generate_sched_domains(). |
| 477 | * Do cpusets a, b have overlapping cpus_allowed masks? | 481 | * Do cpusets a, b have overlapping cpus_allowed masks? |
| 478 | */ | 482 | */ |
| 479 | |||
| 480 | static int cpusets_overlap(struct cpuset *a, struct cpuset *b) | 483 | static int cpusets_overlap(struct cpuset *a, struct cpuset *b) |
| 481 | { | 484 | { |
| 482 | return cpus_intersects(a->cpus_allowed, b->cpus_allowed); | 485 | return cpus_intersects(a->cpus_allowed, b->cpus_allowed); |
| @@ -518,26 +521,15 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) | |||
| 518 | } | 521 | } |
| 519 | 522 | ||
| 520 | /* | 523 | /* |
| 521 | * rebuild_sched_domains() | 524 | * generate_sched_domains() |
| 522 | * | 525 | * |
| 523 | * This routine will be called to rebuild the scheduler's dynamic | 526 | * This function builds a partial partition of the system's CPUs. |
| 524 | * sched domains: | 527 | * A 'partial partition' is a set of non-overlapping subsets whose |
| 525 | * - if the flag 'sched_load_balance' of any cpuset with non-empty | 528 | * union is a subset of that set. |
| 526 | * 'cpus' changes, | 529 | * The output of this function needs to be passed to kernel/sched.c |
| 527 | * - or if the 'cpus' allowed changes in any cpuset which has that | 530 | * partition_sched_domains() routine, which will rebuild the scheduler's |
| 528 | * flag enabled, | 531 | * load balancing domains (sched domains) as specified by that partial |
| 529 | * - or if the 'sched_relax_domain_level' of any cpuset which has | 532 | * partition. |
| 530 | * that flag enabled and with non-empty 'cpus' changes, | ||
| 531 | * - or if any cpuset with non-empty 'cpus' is removed, | ||
| 532 | * - or if a cpu gets offlined. | ||
| 533 | * | ||
| 534 | * This routine builds a partial partition of the systems CPUs | ||
| 535 | * (the set of non-overlappping cpumask_t's in the array 'part' | ||
| 536 | * below), and passes that partial partition to the kernel/sched.c | ||
| 537 | * partition_sched_domains() routine, which will rebuild the | ||
| 538 | * schedulers load balancing domains (sched domains) as specified | ||
| 539 | * by that partial partition. A 'partial partition' is a set of | ||
| 540 | * non-overlapping subsets whose union is a subset of that set. | ||
| 541 | * | 533 | * |
| 542 | * See "What is sched_load_balance" in Documentation/cpusets.txt | 534 | * See "What is sched_load_balance" in Documentation/cpusets.txt |
| 543 | * for a background explanation of this. | 535 | * for a background explanation of this. |
| @@ -547,13 +539,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) | |||
| 547 | * domains when operating in the severe memory shortage situations | 539 | * domains when operating in the severe memory shortage situations |
| 548 | * that could cause allocation failures below. | 540 | * that could cause allocation failures below. |
| 549 | * | 541 | * |
| 550 | * Call with cgroup_mutex held. May take callback_mutex during | 542 | * Must be called with cgroup_lock held. |
| 551 | * call due to the kfifo_alloc() and kmalloc() calls. May nest | ||
| 552 | * a call to the get_online_cpus()/put_online_cpus() pair. | ||
| 553 | * Must not be called holding callback_mutex, because we must not | ||
| 554 | * call get_online_cpus() while holding callback_mutex. Elsewhere | ||
| 555 | * the kernel nests callback_mutex inside get_online_cpus() calls. | ||
| 556 | * So the reverse nesting would risk an ABBA deadlock. | ||
| 557 | * | 543 | * |
| 558 | * The three key local variables below are: | 544 | * The three key local variables below are: |
| 559 | * q - a linked-list queue of cpuset pointers, used to implement a | 545 | * q - a linked-list queue of cpuset pointers, used to implement a |
| @@ -588,10 +574,10 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) | |||
| 588 | * element of the partition (one sched domain) to be passed to | 574 | * element of the partition (one sched domain) to be passed to |
| 589 | * partition_sched_domains(). | 575 | * partition_sched_domains(). |
| 590 | */ | 576 | */ |
| 591 | 577 | static int generate_sched_domains(cpumask_t **domains, | |
| 592 | void rebuild_sched_domains(void) | 578 | struct sched_domain_attr **attributes) |
| 593 | { | 579 | { |
| 594 | LIST_HEAD(q); /* queue of cpusets to be scanned*/ | 580 | LIST_HEAD(q); /* queue of cpusets to be scanned */ |
| 595 | struct cpuset *cp; /* scans q */ | 581 | struct cpuset *cp; /* scans q */ |
| 596 | struct cpuset **csa; /* array of all cpuset ptrs */ | 582 | struct cpuset **csa; /* array of all cpuset ptrs */ |
| 597 | int csn; /* how many cpuset ptrs in csa so far */ | 583 | int csn; /* how many cpuset ptrs in csa so far */ |
| @@ -601,23 +587,26 @@ void rebuild_sched_domains(void) | |||
| 601 | int ndoms; /* number of sched domains in result */ | 587 | int ndoms; /* number of sched domains in result */ |
| 602 | int nslot; /* next empty doms[] cpumask_t slot */ | 588 | int nslot; /* next empty doms[] cpumask_t slot */ |
| 603 | 589 | ||
| 604 | csa = NULL; | 590 | ndoms = 0; |
| 605 | doms = NULL; | 591 | doms = NULL; |
| 606 | dattr = NULL; | 592 | dattr = NULL; |
| 593 | csa = NULL; | ||
| 607 | 594 | ||
| 608 | /* Special case for the 99% of systems with one, full, sched domain */ | 595 | /* Special case for the 99% of systems with one, full, sched domain */ |
| 609 | if (is_sched_load_balance(&top_cpuset)) { | 596 | if (is_sched_load_balance(&top_cpuset)) { |
| 610 | ndoms = 1; | ||
| 611 | doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); | 597 | doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); |
| 612 | if (!doms) | 598 | if (!doms) |
| 613 | goto rebuild; | 599 | goto done; |
| 600 | |||
| 614 | dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); | 601 | dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); |
| 615 | if (dattr) { | 602 | if (dattr) { |
| 616 | *dattr = SD_ATTR_INIT; | 603 | *dattr = SD_ATTR_INIT; |
| 617 | update_domain_attr_tree(dattr, &top_cpuset); | 604 | update_domain_attr_tree(dattr, &top_cpuset); |
| 618 | } | 605 | } |
| 619 | *doms = top_cpuset.cpus_allowed; | 606 | *doms = top_cpuset.cpus_allowed; |
| 620 | goto rebuild; | 607 | |
| 608 | ndoms = 1; | ||
| 609 | goto done; | ||
| 621 | } | 610 | } |
| 622 | 611 | ||
| 623 | csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); | 612 | csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); |
| @@ -680,61 +669,141 @@ restart: | |||
| 680 | } | 669 | } |
| 681 | } | 670 | } |
| 682 | 671 | ||
| 683 | /* Convert <csn, csa> to <ndoms, doms> */ | 672 | /* |
| 673 | * Now we know how many domains to create. | ||
| 674 | * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. | ||
| 675 | */ | ||
| 684 | doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); | 676 | doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); |
| 685 | if (!doms) | 677 | if (!doms) { |
| 686 | goto rebuild; | 678 | ndoms = 0; |
| 679 | goto done; | ||
| 680 | } | ||
| 681 | |||
| 682 | /* | ||
| 683 | * The rest of the code, including the scheduler, can deal with | ||
| 684 | * dattr==NULL case. No need to abort if alloc fails. | ||
| 685 | */ | ||
| 687 | dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); | 686 | dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); |
| 688 | 687 | ||
| 689 | for (nslot = 0, i = 0; i < csn; i++) { | 688 | for (nslot = 0, i = 0; i < csn; i++) { |
| 690 | struct cpuset *a = csa[i]; | 689 | struct cpuset *a = csa[i]; |
| 690 | cpumask_t *dp; | ||
| 691 | int apn = a->pn; | 691 | int apn = a->pn; |
| 692 | 692 | ||
| 693 | if (apn >= 0) { | 693 | if (apn < 0) { |
| 694 | cpumask_t *dp = doms + nslot; | 694 | /* Skip completed partitions */ |
| 695 | 695 | continue; | |
| 696 | if (nslot == ndoms) { | 696 | } |
| 697 | static int warnings = 10; | 697 | |
| 698 | if (warnings) { | 698 | dp = doms + nslot; |
| 699 | printk(KERN_WARNING | 699 | |
| 700 | "rebuild_sched_domains confused:" | 700 | if (nslot == ndoms) { |
| 701 | " nslot %d, ndoms %d, csn %d, i %d," | 701 | static int warnings = 10; |
| 702 | " apn %d\n", | 702 | if (warnings) { |
| 703 | nslot, ndoms, csn, i, apn); | 703 | printk(KERN_WARNING |
| 704 | warnings--; | 704 | "rebuild_sched_domains confused:" |
| 705 | } | 705 | " nslot %d, ndoms %d, csn %d, i %d," |
| 706 | continue; | 706 | " apn %d\n", |
| 707 | nslot, ndoms, csn, i, apn); | ||
| 708 | warnings--; | ||
| 707 | } | 709 | } |
| 710 | continue; | ||
| 711 | } | ||
| 708 | 712 | ||
| 709 | cpus_clear(*dp); | 713 | cpus_clear(*dp); |
| 710 | if (dattr) | 714 | if (dattr) |
| 711 | *(dattr + nslot) = SD_ATTR_INIT; | 715 | *(dattr + nslot) = SD_ATTR_INIT; |
| 712 | for (j = i; j < csn; j++) { | 716 | for (j = i; j < csn; j++) { |
| 713 | struct cpuset *b = csa[j]; | 717 | struct cpuset *b = csa[j]; |
| 714 | 718 | ||
| 715 | if (apn == b->pn) { | 719 | if (apn == b->pn) { |
| 716 | cpus_or(*dp, *dp, b->cpus_allowed); | 720 | cpus_or(*dp, *dp, b->cpus_allowed); |
| 717 | b->pn = -1; | 721 | if (dattr) |
| 718 | if (dattr) | 722 | update_domain_attr_tree(dattr + nslot, b); |
| 719 | update_domain_attr_tree(dattr | 723 | |
| 720 | + nslot, b); | 724 | /* Done with this partition */ |
| 721 | } | 725 | b->pn = -1; |
| 722 | } | 726 | } |
| 723 | nslot++; | ||
| 724 | } | 727 | } |
| 728 | nslot++; | ||
| 725 | } | 729 | } |
| 726 | BUG_ON(nslot != ndoms); | 730 | BUG_ON(nslot != ndoms); |
| 727 | 731 | ||
| 728 | rebuild: | 732 | done: |
| 729 | /* Have scheduler rebuild sched domains */ | 733 | kfree(csa); |
| 734 | |||
| 735 | *domains = doms; | ||
| 736 | *attributes = dattr; | ||
| 737 | return ndoms; | ||
| 738 | } | ||
| 739 | |||
| 740 | /* | ||
| 741 | * Rebuild scheduler domains. | ||
| 742 | * | ||
| 743 | * Call with neither cgroup_mutex held nor within get_online_cpus(). | ||
| 744 | * Takes both cgroup_mutex and get_online_cpus(). | ||
| 745 | * | ||
| 746 | * Cannot be directly called from cpuset code handling changes | ||
| 747 | * to the cpuset pseudo-filesystem, because it cannot be called | ||
| 748 | * from code that already holds cgroup_mutex. | ||
| 749 | */ | ||
| 750 | static void do_rebuild_sched_domains(struct work_struct *unused) | ||
| 751 | { | ||
| 752 | struct sched_domain_attr *attr; | ||
| 753 | cpumask_t *doms; | ||
| 754 | int ndoms; | ||
| 755 | |||
| 730 | get_online_cpus(); | 756 | get_online_cpus(); |
| 731 | partition_sched_domains(ndoms, doms, dattr); | 757 | |
| 758 | /* Generate domain masks and attrs */ | ||
| 759 | cgroup_lock(); | ||
| 760 | ndoms = generate_sched_domains(&doms, &attr); | ||
| 761 | cgroup_unlock(); | ||
| 762 | |||
| 763 | /* Have scheduler rebuild the domains */ | ||
| 764 | partition_sched_domains(ndoms, doms, attr); | ||
| 765 | |||
| 732 | put_online_cpus(); | 766 | put_online_cpus(); |
| 767 | } | ||
| 733 | 768 | ||
| 734 | done: | 769 | static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); |
| 735 | kfree(csa); | 770 | |
| 736 | /* Don't kfree(doms) -- partition_sched_domains() does that. */ | 771 | /* |
| 737 | /* Don't kfree(dattr) -- partition_sched_domains() does that. */ | 772 | * Rebuild scheduler domains, asynchronously via workqueue. |
| 773 | * | ||
| 774 | * If the flag 'sched_load_balance' of any cpuset with non-empty | ||
| 775 | * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset | ||
| 776 | * which has that flag enabled, or if any cpuset with a non-empty | ||
| 777 | * 'cpus' is removed, then call this routine to rebuild the | ||
| 778 | * scheduler's dynamic sched domains. | ||
| 779 | * | ||
| 780 | * The rebuild_sched_domains() and partition_sched_domains() | ||
| 781 | * routines must nest cgroup_lock() inside get_online_cpus(), | ||
| 782 | * but such cpuset changes as these must nest that locking the | ||
| 783 | * other way, holding cgroup_lock() for much of the code. | ||
| 784 | * | ||
| 785 | * So in order to avoid an ABBA deadlock, the cpuset code handling | ||
| 786 | * these user changes delegates the actual sched domain rebuilding | ||
| 787 | * to a separate workqueue thread, which ends up processing the | ||
| 788 | * above do_rebuild_sched_domains() function. | ||
| 789 | */ | ||
| 790 | static void async_rebuild_sched_domains(void) | ||
| 791 | { | ||
| 792 | schedule_work(&rebuild_sched_domains_work); | ||
| 793 | } | ||
| 794 | |||
| 795 | /* | ||
| 796 | * Accomplishes the same scheduler domain rebuild as the above | ||
| 797 | * async_rebuild_sched_domains(), however it directly calls the | ||
| 798 | * rebuild routine synchronously rather than calling it via an | ||
| 799 | * asynchronous work thread. | ||
| 800 | * | ||
| 801 | * This can only be called from code that is not holding | ||
| 802 | * cgroup_mutex (not nested in a cgroup_lock() call.) | ||
| 803 | */ | ||
| 804 | void rebuild_sched_domains(void) | ||
| 805 | { | ||
| 806 | do_rebuild_sched_domains(NULL); | ||
| 738 | } | 807 | } |
| 739 | 808 | ||
| 740 | /** | 809 | /** |
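To make the "partial partition" wording in the generate_sched_domains() comment concrete, here is a worked example; the cpuset names and CPU numbers are invented for illustration and do not appear in the patch:

        /*
         * Hypothetical 8-CPU machine:
         *
         *   top_cpuset        cpus 0-7   sched_load_balance == 0
         *     /rt             cpus 0-1   sched_load_balance == 1
         *     /gp             cpus 2-5   sched_load_balance == 1
         *     /idlepool       cpus 6-7   sched_load_balance == 0
         *
         * generate_sched_domains() returns ndoms == 2 with
         *
         *   doms[0] = { 0, 1 }          one sched domain for /rt
         *   doms[1] = { 2, 3, 4, 5 }    one sched domain for /gp
         *
         * CPUs 6 and 7 end up in no domain and are not load balanced.  The
         * two subsets do not overlap and their union need not cover all
         * online CPUs -- which is exactly what "partial partition" means.
         * partition_sched_domains() then installs the two domains and, as
         * the old comment noted, takes ownership of (and later frees) the
         * doms and dattr arrays.
         */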
| @@ -863,7 +932,7 @@ static int update_cpumask(struct cpuset *cs, const char *buf) | |||
| 863 | return retval; | 932 | return retval; |
| 864 | 933 | ||
| 865 | if (is_load_balanced) | 934 | if (is_load_balanced) |
| 866 | rebuild_sched_domains(); | 935 | async_rebuild_sched_domains(); |
| 867 | return 0; | 936 | return 0; |
| 868 | } | 937 | } |
| 869 | 938 | ||
| @@ -1090,7 +1159,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) | |||
| 1090 | if (val != cs->relax_domain_level) { | 1159 | if (val != cs->relax_domain_level) { |
| 1091 | cs->relax_domain_level = val; | 1160 | cs->relax_domain_level = val; |
| 1092 | if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) | 1161 | if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) |
| 1093 | rebuild_sched_domains(); | 1162 | async_rebuild_sched_domains(); |
| 1094 | } | 1163 | } |
| 1095 | 1164 | ||
| 1096 | return 0; | 1165 | return 0; |
| @@ -1131,7 +1200,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, | |||
| 1131 | mutex_unlock(&callback_mutex); | 1200 | mutex_unlock(&callback_mutex); |
| 1132 | 1201 | ||
| 1133 | if (cpus_nonempty && balance_flag_changed) | 1202 | if (cpus_nonempty && balance_flag_changed) |
| 1134 | rebuild_sched_domains(); | 1203 | async_rebuild_sched_domains(); |
| 1135 | 1204 | ||
| 1136 | return 0; | 1205 | return 0; |
| 1137 | } | 1206 | } |
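All three write-side paths above (update_cpumask(), update_relax_domain_level(), update_flag()) run with cgroup_mutex already held by the cgroup core, which is why they now queue work instead of rebuilding inline. The scenario being avoided, sketched as a comment; this is illustrative, not code from the patch:

        /*
         * Old code, two tasks racing:
         *
         *   task A: write to a cpuset file        task B: cpu_down()
         *   -------------------------------       ------------------------------
         *   cgroup_lock()          (A holds it)   cpu_hotplug lock   (B holds it)
         *   rebuild_sched_domains()               hotplug notifier runs
         *     get_online_cpus()  <- waits for B     cgroup_lock()  <- waits for A
         *
         * Each side holds the lock the other needs: an ABBA deadlock.  With
         * this patch the write path only calls schedule_work(), which takes
         * neither lock, and do_rebuild_sched_domains() later acquires them
         * in the single allowed order: get_online_cpus() first, then
         * cgroup_lock() nested inside it.
         */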
| @@ -1492,6 +1561,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft) | |||
| 1492 | default: | 1561 | default: |
| 1493 | BUG(); | 1562 | BUG(); |
| 1494 | } | 1563 | } |
| 1564 | |||
| 1565 | /* Unreachable but makes gcc happy */ | ||
| 1566 | return 0; | ||
| 1495 | } | 1567 | } |
| 1496 | 1568 | ||
| 1497 | static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) | 1569 | static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) |
| @@ -1504,6 +1576,9 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) | |||
| 1504 | default: | 1576 | default: |
| 1505 | BUG(); | 1577 | BUG(); |
| 1506 | } | 1578 | } |
| 1579 | |||
| 1580 | /* Unreachable but makes gcc happy */ ||
| 1581 | return 0; | ||
| 1507 | } | 1582 | } |
| 1508 | 1583 | ||
| 1509 | 1584 | ||
| @@ -1692,15 +1767,9 @@ static struct cgroup_subsys_state *cpuset_create( | |||
| 1692 | } | 1767 | } |
| 1693 | 1768 | ||
| 1694 | /* | 1769 | /* |
| 1695 | * Locking note on the strange update_flag() call below: | ||
| 1696 | * | ||
| 1697 | * If the cpuset being removed has its flag 'sched_load_balance' | 1770 | * If the cpuset being removed has its flag 'sched_load_balance' |
| 1698 | * enabled, then simulate turning sched_load_balance off, which | 1771 | * enabled, then simulate turning sched_load_balance off, which |
| 1699 | * will call rebuild_sched_domains(). The get_online_cpus() | 1772 | * will call async_rebuild_sched_domains(). |
| 1700 | * call in rebuild_sched_domains() must not be made while holding | ||
| 1701 | * callback_mutex. Elsewhere the kernel nests callback_mutex inside | ||
| 1702 | * get_online_cpus() calls. So the reverse nesting would risk an | ||
| 1703 | * ABBA deadlock. | ||
| 1704 | */ | 1773 | */ |
| 1705 | 1774 | ||
| 1706 | static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) | 1775 | static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) |
| @@ -1719,7 +1788,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) | |||
| 1719 | struct cgroup_subsys cpuset_subsys = { | 1788 | struct cgroup_subsys cpuset_subsys = { |
| 1720 | .name = "cpuset", | 1789 | .name = "cpuset", |
| 1721 | .create = cpuset_create, | 1790 | .create = cpuset_create, |
| 1722 | .destroy = cpuset_destroy, | 1791 | .destroy = cpuset_destroy, |
| 1723 | .can_attach = cpuset_can_attach, | 1792 | .can_attach = cpuset_can_attach, |
| 1724 | .attach = cpuset_attach, | 1793 | .attach = cpuset_attach, |
| 1725 | .populate = cpuset_populate, | 1794 | .populate = cpuset_populate, |
| @@ -1811,7 +1880,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) | |||
| 1811 | } | 1880 | } |
| 1812 | 1881 | ||
| 1813 | /* | 1882 | /* |
| 1814 | * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs | 1883 | * If CPU and/or memory hotplug handlers, below, unplug any CPUs |
| 1815 | * or memory nodes, we need to walk over the cpuset hierarchy, | 1884 | * or memory nodes, we need to walk over the cpuset hierarchy, |
| 1816 | * removing that CPU or node from all cpusets. If this removes the | 1885 | * removing that CPU or node from all cpusets. If this removes the |
| 1817 | * last CPU or node from a cpuset, then move the tasks in the empty | 1886 | * last CPU or node from a cpuset, then move the tasks in the empty |
| @@ -1903,35 +1972,6 @@ static void scan_for_empty_cpusets(const struct cpuset *root) | |||
| 1903 | } | 1972 | } |
| 1904 | 1973 | ||
| 1905 | /* | 1974 | /* |
| 1906 | * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track | ||
| 1907 | * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to | ||
| 1908 | * track what's online after any CPU or memory node hotplug or unplug event. | ||
| 1909 | * | ||
| 1910 | * Since there are two callers of this routine, one for CPU hotplug | ||
| 1911 | * events and one for memory node hotplug events, we could have coded | ||
| 1912 | * two separate routines here. We code it as a single common routine | ||
| 1913 | * in order to minimize text size. | ||
| 1914 | */ | ||
| 1915 | |||
| 1916 | static void common_cpu_mem_hotplug_unplug(int rebuild_sd) | ||
| 1917 | { | ||
| 1918 | cgroup_lock(); | ||
| 1919 | |||
| 1920 | top_cpuset.cpus_allowed = cpu_online_map; | ||
| 1921 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | ||
| 1922 | scan_for_empty_cpusets(&top_cpuset); | ||
| 1923 | |||
| 1924 | /* | ||
| 1925 | * Scheduler destroys domains on hotplug events. | ||
| 1926 | * Rebuild them based on the current settings. | ||
| 1927 | */ | ||
| 1928 | if (rebuild_sd) | ||
| 1929 | rebuild_sched_domains(); | ||
| 1930 | |||
| 1931 | cgroup_unlock(); | ||
| 1932 | } | ||
| 1933 | |||
| 1934 | /* | ||
| 1935 | * The top_cpuset tracks what CPUs and Memory Nodes are online, | 1975 | * The top_cpuset tracks what CPUs and Memory Nodes are online, |
| 1936 | * period. This is necessary in order to make cpusets transparent | 1976 | * period. This is necessary in order to make cpusets transparent |
| 1937 | * (of no effect) on systems that are actively using CPU hotplug | 1977 | * (of no effect) on systems that are actively using CPU hotplug |
| @@ -1939,40 +1979,52 @@ static void common_cpu_mem_hotplug_unplug(int rebuild_sd) | |||
| 1939 | * | 1979 | * |
| 1940 | * This routine ensures that top_cpuset.cpus_allowed tracks | 1980 | * This routine ensures that top_cpuset.cpus_allowed tracks |
| 1941 | * cpu_online_map on each CPU hotplug (cpuhp) event. | 1981 | * cpu_online_map on each CPU hotplug (cpuhp) event. |
| 1982 | * | ||
| 1983 | * Called within get_online_cpus(). Needs to call cgroup_lock() | ||
| 1984 | * before calling generate_sched_domains(). | ||
| 1942 | */ | 1985 | */ |
| 1943 | 1986 | static int cpuset_track_online_cpus(struct notifier_block *unused_nb, | |
| 1944 | static int cpuset_handle_cpuhp(struct notifier_block *unused_nb, | ||
| 1945 | unsigned long phase, void *unused_cpu) | 1987 | unsigned long phase, void *unused_cpu) |
| 1946 | { | 1988 | { |
| 1989 | struct sched_domain_attr *attr; | ||
| 1990 | cpumask_t *doms; | ||
| 1991 | int ndoms; | ||
| 1992 | |||
| 1947 | switch (phase) { | 1993 | switch (phase) { |
| 1948 | case CPU_UP_CANCELED: | ||
| 1949 | case CPU_UP_CANCELED_FROZEN: | ||
| 1950 | case CPU_DOWN_FAILED: | ||
| 1951 | case CPU_DOWN_FAILED_FROZEN: | ||
| 1952 | case CPU_ONLINE: | 1994 | case CPU_ONLINE: |
| 1953 | case CPU_ONLINE_FROZEN: | 1995 | case CPU_ONLINE_FROZEN: |
| 1954 | case CPU_DEAD: | 1996 | case CPU_DEAD: |
| 1955 | case CPU_DEAD_FROZEN: | 1997 | case CPU_DEAD_FROZEN: |
| 1956 | common_cpu_mem_hotplug_unplug(1); | ||
| 1957 | break; | 1998 | break; |
| 1999 | |||
| 1958 | default: | 2000 | default: |
| 1959 | return NOTIFY_DONE; | 2001 | return NOTIFY_DONE; |
| 1960 | } | 2002 | } |
| 1961 | 2003 | ||
| 2004 | cgroup_lock(); | ||
| 2005 | top_cpuset.cpus_allowed = cpu_online_map; | ||
| 2006 | scan_for_empty_cpusets(&top_cpuset); | ||
| 2007 | ndoms = generate_sched_domains(&doms, &attr); | ||
| 2008 | cgroup_unlock(); | ||
| 2009 | |||
| 2010 | /* Have scheduler rebuild the domains */ | ||
| 2011 | partition_sched_domains(ndoms, doms, attr); | ||
| 2012 | |||
| 1962 | return NOTIFY_OK; | 2013 | return NOTIFY_OK; |
| 1963 | } | 2014 | } |
| 1964 | 2015 | ||
| 1965 | #ifdef CONFIG_MEMORY_HOTPLUG | 2016 | #ifdef CONFIG_MEMORY_HOTPLUG |
| 1966 | /* | 2017 | /* |
| 1967 | * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. | 2018 | * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. |
| 1968 | * Call this routine anytime after you change | 2019 | * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. |
| 1969 | * node_states[N_HIGH_MEMORY]. | 2020 | * See also the previous routine cpuset_track_online_cpus(). |
| 1970 | * See also the previous routine cpuset_handle_cpuhp(). | ||
| 1971 | */ | 2021 | */ |
| 1972 | |||
| 1973 | void cpuset_track_online_nodes(void) | 2022 | void cpuset_track_online_nodes(void) |
| 1974 | { | 2023 | { |
| 1975 | common_cpu_mem_hotplug_unplug(0); | 2024 | cgroup_lock(); |
| 2025 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | ||
| 2026 | scan_for_empty_cpusets(&top_cpuset); | ||
| 2027 | cgroup_unlock(); | ||
| 1976 | } | 2028 | } |
| 1977 | #endif | 2029 | #endif |
| 1978 | 2030 | ||
| @@ -1987,7 +2039,7 @@ void __init cpuset_init_smp(void) | |||
| 1987 | top_cpuset.cpus_allowed = cpu_online_map; | 2039 | top_cpuset.cpus_allowed = cpu_online_map; |
| 1988 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | 2040 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; |
| 1989 | 2041 | ||
| 1990 | hotcpu_notifier(cpuset_handle_cpuhp, 0); | 2042 | hotcpu_notifier(cpuset_track_online_cpus, 0); |
| 1991 | } | 2043 | } |
| 1992 | 2044 | ||
| 1993 | /** | 2045 | /** |
diff --git a/kernel/sched.c b/kernel/sched.c
index 1a5f73c1fcdc..cc1f81b50b82 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
| @@ -7696,24 +7696,27 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, | |||
| 7696 | * and partition_sched_domains() will fallback to the single partition | 7696 | * and partition_sched_domains() will fallback to the single partition |
| 7697 | * 'fallback_doms', it also forces the domains to be rebuilt. | 7697 | * 'fallback_doms', it also forces the domains to be rebuilt. |
| 7698 | * | 7698 | * |
| 7699 | * If doms_new==NULL it will be replaced with cpu_online_map. | ||
| 7700 | * ndoms_new==0 is a special case for destroying existing domains. | ||
| 7701 | * It will not create the default domain. | ||
| 7702 | * | ||
| 7699 | * Call with hotplug lock held | 7703 | * Call with hotplug lock held |
| 7700 | */ | 7704 | */ |
| 7701 | void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, | 7705 | void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, |
| 7702 | struct sched_domain_attr *dattr_new) | 7706 | struct sched_domain_attr *dattr_new) |
| 7703 | { | 7707 | { |
| 7704 | int i, j; | 7708 | int i, j, n; |
| 7705 | 7709 | ||
| 7706 | mutex_lock(&sched_domains_mutex); | 7710 | mutex_lock(&sched_domains_mutex); |
| 7707 | 7711 | ||
| 7708 | /* always unregister in case we don't destroy any domains */ | 7712 | /* always unregister in case we don't destroy any domains */ |
| 7709 | unregister_sched_domain_sysctl(); | 7713 | unregister_sched_domain_sysctl(); |
| 7710 | 7714 | ||
| 7711 | if (doms_new == NULL) | 7715 | n = doms_new ? ndoms_new : 0; |
| 7712 | ndoms_new = 0; | ||
| 7713 | 7716 | ||
| 7714 | /* Destroy deleted domains */ | 7717 | /* Destroy deleted domains */ |
| 7715 | for (i = 0; i < ndoms_cur; i++) { | 7718 | for (i = 0; i < ndoms_cur; i++) { |
| 7716 | for (j = 0; j < ndoms_new; j++) { | 7719 | for (j = 0; j < n; j++) { |
| 7717 | if (cpus_equal(doms_cur[i], doms_new[j]) | 7720 | if (cpus_equal(doms_cur[i], doms_new[j]) |
| 7718 | && dattrs_equal(dattr_cur, i, dattr_new, j)) | 7721 | && dattrs_equal(dattr_cur, i, dattr_new, j)) |
| 7719 | goto match1; | 7722 | goto match1; |
| @@ -7726,7 +7729,6 @@ match1: | |||
| 7726 | 7729 | ||
| 7727 | if (doms_new == NULL) { | 7730 | if (doms_new == NULL) { |
| 7728 | ndoms_cur = 0; | 7731 | ndoms_cur = 0; |
| 7729 | ndoms_new = 1; | ||
| 7730 | doms_new = &fallback_doms; | 7732 | doms_new = &fallback_doms; |
| 7731 | cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); | 7733 | cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); |
| 7732 | dattr_new = NULL; | 7734 | dattr_new = NULL; |
| @@ -7763,8 +7765,13 @@ match2: | |||
| 7763 | int arch_reinit_sched_domains(void) | 7765 | int arch_reinit_sched_domains(void) |
| 7764 | { | 7766 | { |
| 7765 | get_online_cpus(); | 7767 | get_online_cpus(); |
| 7768 | |||
| 7769 | /* Destroy domains first to force the rebuild */ | ||
| 7770 | partition_sched_domains(0, NULL, NULL); | ||
| 7771 | |||
| 7766 | rebuild_sched_domains(); | 7772 | rebuild_sched_domains(); |
| 7767 | put_online_cpus(); | 7773 | put_online_cpus(); |
| 7774 | |||
| 7768 | return 0; | 7775 | return 0; |
| 7769 | } | 7776 | } |
| 7770 | 7777 | ||
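partition_sched_domains() keeps any existing domain whose cpumask and attributes are unchanged (the match1/match2 loops above), so when arch_reinit_sched_domains() is invoked for a change that does not alter the domain masks, a plain rebuild would be a no-op. Destroying everything first is what forces fresh domains to be built, as the hunk's comment says. A condensed sketch of the resulting sequence; the wrapper name is illustrative:

        static int example_arch_reinit(void)
        {
                get_online_cpus();

                /* Drop every current domain; ndoms_new == 0 builds no default. */
                partition_sched_domains(0, NULL, NULL);

                /* Regenerate: from the cpuset configuration with CONFIG_CPUSETS,
                 * or via the partition_sched_domains(1, NULL, NULL) stub without. */
                rebuild_sched_domains();

                put_online_cpus();
                return 0;
        }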
| @@ -7848,7 +7855,7 @@ static int update_sched_domains(struct notifier_block *nfb, | |||
| 7848 | case CPU_ONLINE_FROZEN: | 7855 | case CPU_ONLINE_FROZEN: |
| 7849 | case CPU_DEAD: | 7856 | case CPU_DEAD: |
| 7850 | case CPU_DEAD_FROZEN: | 7857 | case CPU_DEAD_FROZEN: |
| 7851 | partition_sched_domains(0, NULL, NULL); | 7858 | partition_sched_domains(1, NULL, NULL); |
| 7852 | return NOTIFY_OK; | 7859 | return NOTIFY_OK; |
| 7853 | 7860 | ||
| 7854 | default: | 7861 | default: |
