path: root/kernel/cpuset.c
author	Ingo Molnar <mingo@elte.hu>	2008-10-28 11:26:12 -0400
committer	Ingo Molnar <mingo@elte.hu>	2008-10-28 11:26:12 -0400
commit	7a9787e1eba95a166265e6a260cf30af04ef0a99 (patch)
tree	e730a4565e0318140d2fbd2f0415d18a339d7336 /kernel/cpuset.c
parent	41b9eb264c8407655db57b60b4457fe1b2ec9977 (diff)
parent	0173a3265b228da319ceb9c1ec6a5682fd1b2d92 (diff)
Merge commit 'v2.6.28-rc2' into x86/pci-ioapic-boot-irq-quirks
Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r--	kernel/cpuset.c	702
1 file changed, 380 insertions, 322 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 459d601947a8..3e00526f52ec 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -14,6 +14,8 @@
14 * 2003-10-22 Updates by Stephen Hemminger. 14 * 2003-10-22 Updates by Stephen Hemminger.
15 * 2004 May-July Rework by Paul Jackson. 15 * 2004 May-July Rework by Paul Jackson.
16 * 2006 Rework by Paul Menage to use generic cgroups 16 * 2006 Rework by Paul Menage to use generic cgroups
17 * 2008 Rework of the scheduler domains and CPU hotplug handling
18 * by Max Krasnyansky
17 * 19 *
18 * This file is subject to the terms and conditions of the GNU General Public 20 * This file is subject to the terms and conditions of the GNU General Public
19 * License. See the file COPYING in the main directory of the Linux 21 * License. See the file COPYING in the main directory of the Linux
@@ -54,7 +56,6 @@
54#include <asm/uaccess.h> 56#include <asm/uaccess.h>
55#include <asm/atomic.h> 57#include <asm/atomic.h>
56#include <linux/mutex.h> 58#include <linux/mutex.h>
57#include <linux/kfifo.h>
58#include <linux/workqueue.h> 59#include <linux/workqueue.h>
59#include <linux/cgroup.h> 60#include <linux/cgroup.h>
60 61
@@ -227,10 +228,6 @@ static struct cpuset top_cpuset = {
227 * The task_struct fields mems_allowed and mems_generation may only 228 * The task_struct fields mems_allowed and mems_generation may only
228 * be accessed in the context of that task, so require no locks. 229 * be accessed in the context of that task, so require no locks.
229 * 230 *
230 * The cpuset_common_file_write handler for operations that modify
231 * the cpuset hierarchy holds cgroup_mutex across the entire operation,
232 * single threading all such cpuset modifications across the system.
233 *
234 * The cpuset_common_file_read() handlers only hold callback_mutex across 231 * The cpuset_common_file_read() handlers only hold callback_mutex across
235 * small pieces of code, such as when reading out possibly multi-word 232 * small pieces of code, such as when reading out possibly multi-word
236 * cpumasks and nodemasks. 233 * cpumasks and nodemasks.
@@ -241,9 +238,11 @@ static struct cpuset top_cpuset = {
241 238
242static DEFINE_MUTEX(callback_mutex); 239static DEFINE_MUTEX(callback_mutex);
243 240
244/* This is ugly, but preserves the userspace API for existing cpuset 241/*
242 * This is ugly, but preserves the userspace API for existing cpuset
245 * users. If someone tries to mount the "cpuset" filesystem, we 243 * users. If someone tries to mount the "cpuset" filesystem, we
246 * silently switch it to mount "cgroup" instead */ 244 * silently switch it to mount "cgroup" instead
245 */
247static int cpuset_get_sb(struct file_system_type *fs_type, 246static int cpuset_get_sb(struct file_system_type *fs_type,
248 int flags, const char *unused_dev_name, 247 int flags, const char *unused_dev_name,
249 void *data, struct vfsmount *mnt) 248 void *data, struct vfsmount *mnt)
@@ -369,7 +368,7 @@ void cpuset_update_task_memory_state(void)
369 my_cpusets_mem_gen = top_cpuset.mems_generation; 368 my_cpusets_mem_gen = top_cpuset.mems_generation;
370 } else { 369 } else {
371 rcu_read_lock(); 370 rcu_read_lock();
372 my_cpusets_mem_gen = task_cs(current)->mems_generation; 371 my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
373 rcu_read_unlock(); 372 rcu_read_unlock();
374 } 373 }
375 374
@@ -478,10 +477,9 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
478} 477}
479 478
480/* 479/*
481 * Helper routine for rebuild_sched_domains(). 480 * Helper routine for generate_sched_domains().
482 * Do cpusets a, b have overlapping cpus_allowed masks? 481 * Do cpusets a, b have overlapping cpus_allowed masks?
483 */ 482 */
484
485static int cpusets_overlap(struct cpuset *a, struct cpuset *b) 483static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
486{ 484{
487 return cpus_intersects(a->cpus_allowed, b->cpus_allowed); 485 return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
@@ -490,29 +488,48 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
490static void 488static void
491update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) 489update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
492{ 490{
493 if (!dattr)
494 return;
495 if (dattr->relax_domain_level < c->relax_domain_level) 491 if (dattr->relax_domain_level < c->relax_domain_level)
496 dattr->relax_domain_level = c->relax_domain_level; 492 dattr->relax_domain_level = c->relax_domain_level;
497 return; 493 return;
498} 494}
499 495
496static void
497update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
498{
499 LIST_HEAD(q);
500
501 list_add(&c->stack_list, &q);
502 while (!list_empty(&q)) {
503 struct cpuset *cp;
504 struct cgroup *cont;
505 struct cpuset *child;
506
507 cp = list_first_entry(&q, struct cpuset, stack_list);
508 list_del(q.next);
509
510 if (cpus_empty(cp->cpus_allowed))
511 continue;
512
513 if (is_sched_load_balance(cp))
514 update_domain_attr(dattr, cp);
515
516 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
517 child = cgroup_cs(cont);
518 list_add_tail(&child->stack_list, &q);
519 }
520 }
521}
522
500/* 523/*
501 * rebuild_sched_domains() 524 * generate_sched_domains()
502 * 525 *
503 * If the flag 'sched_load_balance' of any cpuset with non-empty 526 * This function builds a partial partition of the systems CPUs
504 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset 527 * A 'partial partition' is a set of non-overlapping subsets whose
505 * which has that flag enabled, or if any cpuset with a non-empty 528 * union is a subset of that set.
506 * 'cpus' is removed, then call this routine to rebuild the 529 * The output of this function needs to be passed to kernel/sched.c
507 * scheduler's dynamic sched domains. 530 * partition_sched_domains() routine, which will rebuild the scheduler's
508 * 531 * load balancing domains (sched domains) as specified by that partial
509 * This routine builds a partial partition of the systems CPUs 532 * partition.
510 * (the set of non-overlappping cpumask_t's in the array 'part'
511 * below), and passes that partial partition to the kernel/sched.c
512 * partition_sched_domains() routine, which will rebuild the
513 * schedulers load balancing domains (sched domains) as specified
514 * by that partial partition. A 'partial partition' is a set of
515 * non-overlapping subsets whose union is a subset of that set.
516 * 533 *
517 * See "What is sched_load_balance" in Documentation/cpusets.txt 534 * See "What is sched_load_balance" in Documentation/cpusets.txt
518 * for a background explanation of this. 535 * for a background explanation of this.
@@ -522,16 +539,10 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
522 * domains when operating in the severe memory shortage situations 539 * domains when operating in the severe memory shortage situations
523 * that could cause allocation failures below. 540 * that could cause allocation failures below.
524 * 541 *
525 * Call with cgroup_mutex held. May take callback_mutex during 542 * Must be called with cgroup_lock held.
526 * call due to the kfifo_alloc() and kmalloc() calls. May nest
527 * a call to the get_online_cpus()/put_online_cpus() pair.
528 * Must not be called holding callback_mutex, because we must not
529 * call get_online_cpus() while holding callback_mutex. Elsewhere
530 * the kernel nests callback_mutex inside get_online_cpus() calls.
531 * So the reverse nesting would risk an ABBA deadlock.
532 * 543 *
533 * The three key local variables below are: 544 * The three key local variables below are:
534 * q - a kfifo queue of cpuset pointers, used to implement a 545 * q - a linked-list queue of cpuset pointers, used to implement a
535 * top-down scan of all cpusets. This scan loads a pointer 546 * top-down scan of all cpusets. This scan loads a pointer
536 * to each cpuset marked is_sched_load_balance into the 547 * to each cpuset marked is_sched_load_balance into the
537 * array 'csa'. For our purposes, rebuilding the schedulers 548 * array 'csa'. For our purposes, rebuilding the schedulers
@@ -563,10 +574,10 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
563 * element of the partition (one sched domain) to be passed to 574 * element of the partition (one sched domain) to be passed to
564 * partition_sched_domains(). 575 * partition_sched_domains().
565 */ 576 */
566 577static int generate_sched_domains(cpumask_t **domains,
567static void rebuild_sched_domains(void) 578 struct sched_domain_attr **attributes)
568{ 579{
569 struct kfifo *q; /* queue of cpusets to be scanned */ 580 LIST_HEAD(q); /* queue of cpusets to be scanned */
570 struct cpuset *cp; /* scans q */ 581 struct cpuset *cp; /* scans q */
571 struct cpuset **csa; /* array of all cpuset ptrs */ 582 struct cpuset **csa; /* array of all cpuset ptrs */
572 int csn; /* how many cpuset ptrs in csa so far */ 583 int csn; /* how many cpuset ptrs in csa so far */
@@ -576,44 +587,58 @@ static void rebuild_sched_domains(void)
576 int ndoms; /* number of sched domains in result */ 587 int ndoms; /* number of sched domains in result */
577 int nslot; /* next empty doms[] cpumask_t slot */ 588 int nslot; /* next empty doms[] cpumask_t slot */
578 589
579 q = NULL; 590 ndoms = 0;
580 csa = NULL;
581 doms = NULL; 591 doms = NULL;
582 dattr = NULL; 592 dattr = NULL;
593 csa = NULL;
583 594
584 /* Special case for the 99% of systems with one, full, sched domain */ 595 /* Special case for the 99% of systems with one, full, sched domain */
585 if (is_sched_load_balance(&top_cpuset)) { 596 if (is_sched_load_balance(&top_cpuset)) {
586 ndoms = 1;
587 doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 597 doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
588 if (!doms) 598 if (!doms)
589 goto rebuild; 599 goto done;
600
590 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); 601 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
591 if (dattr) { 602 if (dattr) {
592 *dattr = SD_ATTR_INIT; 603 *dattr = SD_ATTR_INIT;
593 update_domain_attr(dattr, &top_cpuset); 604 update_domain_attr_tree(dattr, &top_cpuset);
594 } 605 }
595 *doms = top_cpuset.cpus_allowed; 606 *doms = top_cpuset.cpus_allowed;
596 goto rebuild;
597 }
598 607
599 q = kfifo_alloc(number_of_cpusets * sizeof(cp), GFP_KERNEL, NULL); 608 ndoms = 1;
600 if (IS_ERR(q))
601 goto done; 609 goto done;
610 }
611
602 csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); 612 csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
603 if (!csa) 613 if (!csa)
604 goto done; 614 goto done;
605 csn = 0; 615 csn = 0;
606 616
607 cp = &top_cpuset; 617 list_add(&top_cpuset.stack_list, &q);
608 __kfifo_put(q, (void *)&cp, sizeof(cp)); 618 while (!list_empty(&q)) {
609 while (__kfifo_get(q, (void *)&cp, sizeof(cp))) {
610 struct cgroup *cont; 619 struct cgroup *cont;
611 struct cpuset *child; /* scans child cpusets of cp */ 620 struct cpuset *child; /* scans child cpusets of cp */
612 if (is_sched_load_balance(cp)) 621
622 cp = list_first_entry(&q, struct cpuset, stack_list);
623 list_del(q.next);
624
625 if (cpus_empty(cp->cpus_allowed))
626 continue;
627
628 /*
629 * All child cpusets contain a subset of the parent's cpus, so
630 * just skip them, and then we call update_domain_attr_tree()
631 * to calc relax_domain_level of the corresponding sched
632 * domain.
633 */
634 if (is_sched_load_balance(cp)) {
613 csa[csn++] = cp; 635 csa[csn++] = cp;
636 continue;
637 }
638
614 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { 639 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
615 child = cgroup_cs(cont); 640 child = cgroup_cs(cont);
616 __kfifo_put(q, (void *)&child, sizeof(cp)); 641 list_add_tail(&child->stack_list, &q);
617 } 642 }
618 } 643 }
619 644
@@ -644,91 +669,141 @@ restart:
644 } 669 }
645 } 670 }
646 671
647 /* Convert <csn, csa> to <ndoms, doms> */ 672 /*
673 * Now we know how many domains to create.
674 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
675 */
648 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); 676 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
649 if (!doms) 677 if (!doms) {
650 goto rebuild; 678 ndoms = 0;
679 goto done;
680 }
681
682 /*
683 * The rest of the code, including the scheduler, can deal with
684 * dattr==NULL case. No need to abort if alloc fails.
685 */
651 dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); 686 dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
652 687
653 for (nslot = 0, i = 0; i < csn; i++) { 688 for (nslot = 0, i = 0; i < csn; i++) {
654 struct cpuset *a = csa[i]; 689 struct cpuset *a = csa[i];
690 cpumask_t *dp;
655 int apn = a->pn; 691 int apn = a->pn;
656 692
657 if (apn >= 0) { 693 if (apn < 0) {
658 cpumask_t *dp = doms + nslot; 694 /* Skip completed partitions */
659 695 continue;
660 if (nslot == ndoms) { 696 }
661 static int warnings = 10; 697
662 if (warnings) { 698 dp = doms + nslot;
663 printk(KERN_WARNING 699
664 "rebuild_sched_domains confused:" 700 if (nslot == ndoms) {
665 " nslot %d, ndoms %d, csn %d, i %d," 701 static int warnings = 10;
666 " apn %d\n", 702 if (warnings) {
667 nslot, ndoms, csn, i, apn); 703 printk(KERN_WARNING
668 warnings--; 704 "rebuild_sched_domains confused:"
669 } 705 " nslot %d, ndoms %d, csn %d, i %d,"
670 continue; 706 " apn %d\n",
707 nslot, ndoms, csn, i, apn);
708 warnings--;
671 } 709 }
710 continue;
711 }
672 712
673 cpus_clear(*dp); 713 cpus_clear(*dp);
674 if (dattr) 714 if (dattr)
675 *(dattr + nslot) = SD_ATTR_INIT; 715 *(dattr + nslot) = SD_ATTR_INIT;
676 for (j = i; j < csn; j++) { 716 for (j = i; j < csn; j++) {
677 struct cpuset *b = csa[j]; 717 struct cpuset *b = csa[j];
678 718
679 if (apn == b->pn) { 719 if (apn == b->pn) {
680 cpus_or(*dp, *dp, b->cpus_allowed); 720 cpus_or(*dp, *dp, b->cpus_allowed);
681 b->pn = -1; 721 if (dattr)
682 update_domain_attr(dattr, b); 722 update_domain_attr_tree(dattr + nslot, b);
683 } 723
724 /* Done with this partition */
725 b->pn = -1;
684 } 726 }
685 nslot++;
686 } 727 }
728 nslot++;
687 } 729 }
688 BUG_ON(nslot != ndoms); 730 BUG_ON(nslot != ndoms);
689 731
690rebuild:
691 /* Have scheduler rebuild sched domains */
692 get_online_cpus();
693 partition_sched_domains(ndoms, doms, dattr);
694 put_online_cpus();
695
696done: 732done:
697 if (q && !IS_ERR(q))
698 kfifo_free(q);
699 kfree(csa); 733 kfree(csa);
700 /* Don't kfree(doms) -- partition_sched_domains() does that. */ 734
701 /* Don't kfree(dattr) -- partition_sched_domains() does that. */ 735 *domains = doms;
736 *attributes = dattr;
737 return ndoms;
702} 738}
703 739
704static inline int started_after_time(struct task_struct *t1, 740/*
705 struct timespec *time, 741 * Rebuild scheduler domains.
706 struct task_struct *t2) 742 *
743 * Call with neither cgroup_mutex held nor within get_online_cpus().
744 * Takes both cgroup_mutex and get_online_cpus().
745 *
746 * Cannot be directly called from cpuset code handling changes
747 * to the cpuset pseudo-filesystem, because it cannot be called
748 * from code that already holds cgroup_mutex.
749 */
750static void do_rebuild_sched_domains(struct work_struct *unused)
707{ 751{
708 int start_diff = timespec_compare(&t1->start_time, time); 752 struct sched_domain_attr *attr;
709 if (start_diff > 0) { 753 cpumask_t *doms;
710 return 1; 754 int ndoms;
711 } else if (start_diff < 0) { 755
712 return 0; 756 get_online_cpus();
713 } else { 757
714 /* 758 /* Generate domain masks and attrs */
715 * Arbitrarily, if two processes started at the same 759 cgroup_lock();
716 * time, we'll say that the lower pointer value 760 ndoms = generate_sched_domains(&doms, &attr);
717 * started first. Note that t2 may have exited by now 761 cgroup_unlock();
718 * so this may not be a valid pointer any longer, but 762
719 * that's fine - it still serves to distinguish 763 /* Have scheduler rebuild the domains */
720 * between two tasks started (effectively) 764 partition_sched_domains(ndoms, doms, attr);
721 * simultaneously. 765
722 */ 766 put_online_cpus();
723 return t1 > t2; 767}
724 } 768
769static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
770
771/*
772 * Rebuild scheduler domains, asynchronously via workqueue.
773 *
774 * If the flag 'sched_load_balance' of any cpuset with non-empty
775 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
776 * which has that flag enabled, or if any cpuset with a non-empty
777 * 'cpus' is removed, then call this routine to rebuild the
778 * scheduler's dynamic sched domains.
779 *
780 * The rebuild_sched_domains() and partition_sched_domains()
781 * routines must nest cgroup_lock() inside get_online_cpus(),
782 * but such cpuset changes as these must nest that locking the
783 * other way, holding cgroup_lock() for much of the code.
784 *
785 * So in order to avoid an ABBA deadlock, the cpuset code handling
786 * these user changes delegates the actual sched domain rebuilding
787 * to a separate workqueue thread, which ends up processing the
788 * above do_rebuild_sched_domains() function.
789 */
790static void async_rebuild_sched_domains(void)
791{
792 schedule_work(&rebuild_sched_domains_work);
725} 793}
726 794
727static inline int started_after(void *p1, void *p2) 795/*
796 * Accomplishes the same scheduler domain rebuild as the above
797 * async_rebuild_sched_domains(), however it directly calls the
798 * rebuild routine synchronously rather than calling it via an
799 * asynchronous work thread.
800 *
801 * This can only be called from code that is not holding
802 * cgroup_mutex (not nested in a cgroup_lock() call.)
803 */
804void rebuild_sched_domains(void)
728{ 805{
729 struct task_struct *t1 = p1; 806 do_rebuild_sched_domains(NULL);
730 struct task_struct *t2 = p2;
731 return started_after_time(t1, &t2->start_time, t2);
732} 807}
733 808
734/** 809/**
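The comment block above spells out why this patch pushes the rebuild onto a workqueue: partition_sched_domains() wants get_online_cpus() taken before cgroup_lock(), while the cpuset write handlers already hold cgroup_mutex, so calling the rebuild inline would invert the lock order and risk an ABBA deadlock. As a rough, hedged illustration of that deferral pattern only — not kernel code, and every name here (lock_a, lock_b, pending, worker) is invented for the sketch — the standalone userspace program below hands the "rebuild" to a thread that starts with no locks held and can therefore take them in the canonical order:

/*
 * Userspace sketch of the deferral trick described above.  lock_a plays
 * the role of get_online_cpus()/the hotplug lock, lock_b plays cgroup_mutex,
 * and the worker thread plays the workqueue that runs the rebuild.
 * Build with:  cc sketch.c -lpthread
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER; /* "hotplug" lock */
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER; /* "cgroup" lock  */
static pthread_cond_t  kick   = PTHREAD_COND_INITIALIZER;
static int pending;                     /* protected by lock_b */

static void rebuild(void)               /* canonical order: a, then b */
{
	pthread_mutex_lock(&lock_a);
	pthread_mutex_lock(&lock_b);
	printf("domains rebuilt\n");
	pthread_mutex_unlock(&lock_b);
	pthread_mutex_unlock(&lock_a);
}

static void *worker(void *unused)       /* the "workqueue" thread */
{
	pthread_mutex_lock(&lock_b);
	while (!pending)
		pthread_cond_wait(&kick, &lock_b);
	pending = 0;
	pthread_mutex_unlock(&lock_b);      /* drop b before taking a */
	rebuild();
	return NULL;
}

static void change_handler(void)        /* the real handlers already hold b */
{
	pthread_mutex_lock(&lock_b);
	/* ... update the configuration ... */
	pending = 1;                        /* defer instead of rebuilding inline */
	pthread_cond_signal(&kick);
	pthread_mutex_unlock(&lock_b);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);
	change_handler();
	pthread_join(t, NULL);
	return 0;
}

The deferral costs a little latency (the rebuild happens after the write returns), which is why the patch keeps a synchronous rebuild_sched_domains() entry point for callers that are not holding cgroup_mutex.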
@@ -766,15 +841,38 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
766} 841}
767 842
768/** 843/**
844 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
845 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
846 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
847 *
848 * Called with cgroup_mutex held
849 *
850 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
851 * calling callback functions for each.
852 *
853 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
854 * if @heap != NULL.
855 */
856static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
857{
858 struct cgroup_scanner scan;
859
860 scan.cg = cs->css.cgroup;
861 scan.test_task = cpuset_test_cpumask;
862 scan.process_task = cpuset_change_cpumask;
863 scan.heap = heap;
864 cgroup_scan_tasks(&scan);
865}
866
867/**
769 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it 868 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
770 * @cs: the cpuset to consider 869 * @cs: the cpuset to consider
771 * @buf: buffer of cpu numbers written to this cpuset 870 * @buf: buffer of cpu numbers written to this cpuset
772 */ 871 */
773static int update_cpumask(struct cpuset *cs, char *buf) 872static int update_cpumask(struct cpuset *cs, const char *buf)
774{ 873{
775 struct cpuset trialcs;
776 struct cgroup_scanner scan;
777 struct ptr_heap heap; 874 struct ptr_heap heap;
875 struct cpuset trialcs;
778 int retval; 876 int retval;
779 int is_load_balanced; 877 int is_load_balanced;
780 878
@@ -790,7 +888,6 @@ static int update_cpumask(struct cpuset *cs, char *buf)
790 * that parsing. The validate_change() call ensures that cpusets 888 * that parsing. The validate_change() call ensures that cpusets
791 * with tasks have cpus. 889 * with tasks have cpus.
792 */ 890 */
793 buf = strstrip(buf);
794 if (!*buf) { 891 if (!*buf) {
795 cpus_clear(trialcs.cpus_allowed); 892 cpus_clear(trialcs.cpus_allowed);
796 } else { 893 } else {
@@ -809,7 +906,7 @@ static int update_cpumask(struct cpuset *cs, char *buf)
809 if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) 906 if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
810 return 0; 907 return 0;
811 908
812 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after); 909 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
813 if (retval) 910 if (retval)
814 return retval; 911 return retval;
815 912
@@ -823,15 +920,12 @@ static int update_cpumask(struct cpuset *cs, char *buf)
823 * Scan tasks in the cpuset, and update the cpumasks of any 920 * Scan tasks in the cpuset, and update the cpumasks of any
824 * that need an update. 921 * that need an update.
825 */ 922 */
826 scan.cg = cs->css.cgroup; 923 update_tasks_cpumask(cs, &heap);
827 scan.test_task = cpuset_test_cpumask; 924
828 scan.process_task = cpuset_change_cpumask;
829 scan.heap = &heap;
830 cgroup_scan_tasks(&scan);
831 heap_free(&heap); 925 heap_free(&heap);
832 926
833 if (is_load_balanced) 927 if (is_load_balanced)
834 rebuild_sched_domains(); 928 async_rebuild_sched_domains();
835 return 0; 929 return 0;
836} 930}
837 931
@@ -884,74 +978,25 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
884 mutex_unlock(&callback_mutex); 978 mutex_unlock(&callback_mutex);
885} 979}
886 980
887/*
888 * Handle user request to change the 'mems' memory placement
889 * of a cpuset. Needs to validate the request, update the
890 * cpusets mems_allowed and mems_generation, and for each
891 * task in the cpuset, rebind any vma mempolicies and if
892 * the cpuset is marked 'memory_migrate', migrate the tasks
893 * pages to the new memory.
894 *
895 * Call with cgroup_mutex held. May take callback_mutex during call.
896 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
897 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
898 * their mempolicies to the cpusets new mems_allowed.
899 */
900
901static void *cpuset_being_rebound; 981static void *cpuset_being_rebound;
902 982
903static int update_nodemask(struct cpuset *cs, char *buf) 983/**
984 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
985 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
986 * @oldmem: old mems_allowed of cpuset cs
987 *
988 * Called with cgroup_mutex held
989 * Return 0 if successful, -errno if not.
990 */
991static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
904{ 992{
905 struct cpuset trialcs;
906 nodemask_t oldmem;
907 struct task_struct *p; 993 struct task_struct *p;
908 struct mm_struct **mmarray; 994 struct mm_struct **mmarray;
909 int i, n, ntasks; 995 int i, n, ntasks;
910 int migrate; 996 int migrate;
911 int fudge; 997 int fudge;
912 int retval;
913 struct cgroup_iter it; 998 struct cgroup_iter it;
914 999 int retval;
915 /*
916 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
917 * it's read-only
918 */
919 if (cs == &top_cpuset)
920 return -EACCES;
921
922 trialcs = *cs;
923
924 /*
925 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
926 * Since nodelist_parse() fails on an empty mask, we special case
927 * that parsing. The validate_change() call ensures that cpusets
928 * with tasks have memory.
929 */
930 buf = strstrip(buf);
931 if (!*buf) {
932 nodes_clear(trialcs.mems_allowed);
933 } else {
934 retval = nodelist_parse(buf, trialcs.mems_allowed);
935 if (retval < 0)
936 goto done;
937
938 if (!nodes_subset(trialcs.mems_allowed,
939 node_states[N_HIGH_MEMORY]))
940 return -EINVAL;
941 }
942 oldmem = cs->mems_allowed;
943 if (nodes_equal(oldmem, trialcs.mems_allowed)) {
944 retval = 0; /* Too easy - nothing to do */
945 goto done;
946 }
947 retval = validate_change(cs, &trialcs);
948 if (retval < 0)
949 goto done;
950
951 mutex_lock(&callback_mutex);
952 cs->mems_allowed = trialcs.mems_allowed;
953 cs->mems_generation = cpuset_mems_generation++;
954 mutex_unlock(&callback_mutex);
955 1000
956 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1001 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
957 1002
@@ -1018,7 +1063,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
1018 1063
1019 mpol_rebind_mm(mm, &cs->mems_allowed); 1064 mpol_rebind_mm(mm, &cs->mems_allowed);
1020 if (migrate) 1065 if (migrate)
1021 cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed); 1066 cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
1022 mmput(mm); 1067 mmput(mm);
1023 } 1068 }
1024 1069
@@ -1030,6 +1075,70 @@ done:
1030 return retval; 1075 return retval;
1031} 1076}
1032 1077
1078/*
1079 * Handle user request to change the 'mems' memory placement
1080 * of a cpuset. Needs to validate the request, update the
1081 * cpusets mems_allowed and mems_generation, and for each
1082 * task in the cpuset, rebind any vma mempolicies and if
1083 * the cpuset is marked 'memory_migrate', migrate the tasks
1084 * pages to the new memory.
1085 *
1086 * Call with cgroup_mutex held. May take callback_mutex during call.
1087 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
1088 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
1089 * their mempolicies to the cpusets new mems_allowed.
1090 */
1091static int update_nodemask(struct cpuset *cs, const char *buf)
1092{
1093 struct cpuset trialcs;
1094 nodemask_t oldmem;
1095 int retval;
1096
1097 /*
1098 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
1099 * it's read-only
1100 */
1101 if (cs == &top_cpuset)
1102 return -EACCES;
1103
1104 trialcs = *cs;
1105
1106 /*
1107 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
1108 * Since nodelist_parse() fails on an empty mask, we special case
1109 * that parsing. The validate_change() call ensures that cpusets
1110 * with tasks have memory.
1111 */
1112 if (!*buf) {
1113 nodes_clear(trialcs.mems_allowed);
1114 } else {
1115 retval = nodelist_parse(buf, trialcs.mems_allowed);
1116 if (retval < 0)
1117 goto done;
1118
1119 if (!nodes_subset(trialcs.mems_allowed,
1120 node_states[N_HIGH_MEMORY]))
1121 return -EINVAL;
1122 }
1123 oldmem = cs->mems_allowed;
1124 if (nodes_equal(oldmem, trialcs.mems_allowed)) {
1125 retval = 0; /* Too easy - nothing to do */
1126 goto done;
1127 }
1128 retval = validate_change(cs, &trialcs);
1129 if (retval < 0)
1130 goto done;
1131
1132 mutex_lock(&callback_mutex);
1133 cs->mems_allowed = trialcs.mems_allowed;
1134 cs->mems_generation = cpuset_mems_generation++;
1135 mutex_unlock(&callback_mutex);
1136
1137 retval = update_tasks_nodemask(cs, &oldmem);
1138done:
1139 return retval;
1140}
1141
1033int current_cpuset_is_being_rebound(void) 1142int current_cpuset_is_being_rebound(void)
1034{ 1143{
1035 return task_cs(current) == cpuset_being_rebound; 1144 return task_cs(current) == cpuset_being_rebound;
@@ -1042,7 +1151,8 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1042 1151
1043 if (val != cs->relax_domain_level) { 1152 if (val != cs->relax_domain_level) {
1044 cs->relax_domain_level = val; 1153 cs->relax_domain_level = val;
1045 rebuild_sched_domains(); 1154 if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
1155 async_rebuild_sched_domains();
1046 } 1156 }
1047 1157
1048 return 0; 1158 return 0;
@@ -1062,7 +1172,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1062{ 1172{
1063 struct cpuset trialcs; 1173 struct cpuset trialcs;
1064 int err; 1174 int err;
1065 int cpus_nonempty, balance_flag_changed; 1175 int balance_flag_changed;
1066 1176
1067 trialcs = *cs; 1177 trialcs = *cs;
1068 if (turning_on) 1178 if (turning_on)
@@ -1074,7 +1184,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1074 if (err < 0) 1184 if (err < 0)
1075 return err; 1185 return err;
1076 1186
1077 cpus_nonempty = !cpus_empty(trialcs.cpus_allowed);
1078 balance_flag_changed = (is_sched_load_balance(cs) != 1187 balance_flag_changed = (is_sched_load_balance(cs) !=
1079 is_sched_load_balance(&trialcs)); 1188 is_sched_load_balance(&trialcs));
1080 1189
@@ -1082,8 +1191,8 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1082 cs->flags = trialcs.flags; 1191 cs->flags = trialcs.flags;
1083 mutex_unlock(&callback_mutex); 1192 mutex_unlock(&callback_mutex);
1084 1193
1085 if (cpus_nonempty && balance_flag_changed) 1194 if (!cpus_empty(trialcs.cpus_allowed) && balance_flag_changed)
1086 rebuild_sched_domains(); 1195 async_rebuild_sched_domains();
1087 1196
1088 return 0; 1197 return 0;
1089} 1198}
@@ -1254,72 +1363,14 @@ typedef enum {
1254 FILE_SPREAD_SLAB, 1363 FILE_SPREAD_SLAB,
1255} cpuset_filetype_t; 1364} cpuset_filetype_t;
1256 1365
1257static ssize_t cpuset_common_file_write(struct cgroup *cont,
1258 struct cftype *cft,
1259 struct file *file,
1260 const char __user *userbuf,
1261 size_t nbytes, loff_t *unused_ppos)
1262{
1263 struct cpuset *cs = cgroup_cs(cont);
1264 cpuset_filetype_t type = cft->private;
1265 char *buffer;
1266 int retval = 0;
1267
1268 /* Crude upper limit on largest legitimate cpulist user might write. */
1269 if (nbytes > 100U + 6 * max(NR_CPUS, MAX_NUMNODES))
1270 return -E2BIG;
1271
1272 /* +1 for nul-terminator */
1273 buffer = kmalloc(nbytes + 1, GFP_KERNEL);
1274 if (!buffer)
1275 return -ENOMEM;
1276
1277 if (copy_from_user(buffer, userbuf, nbytes)) {
1278 retval = -EFAULT;
1279 goto out1;
1280 }
1281 buffer[nbytes] = 0; /* nul-terminate */
1282
1283 cgroup_lock();
1284
1285 if (cgroup_is_removed(cont)) {
1286 retval = -ENODEV;
1287 goto out2;
1288 }
1289
1290 switch (type) {
1291 case FILE_CPULIST:
1292 retval = update_cpumask(cs, buffer);
1293 break;
1294 case FILE_MEMLIST:
1295 retval = update_nodemask(cs, buffer);
1296 break;
1297 default:
1298 retval = -EINVAL;
1299 goto out2;
1300 }
1301
1302 if (retval == 0)
1303 retval = nbytes;
1304out2:
1305 cgroup_unlock();
1306out1:
1307 kfree(buffer);
1308 return retval;
1309}
1310
1311static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) 1366static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1312{ 1367{
1313 int retval = 0; 1368 int retval = 0;
1314 struct cpuset *cs = cgroup_cs(cgrp); 1369 struct cpuset *cs = cgroup_cs(cgrp);
1315 cpuset_filetype_t type = cft->private; 1370 cpuset_filetype_t type = cft->private;
1316 1371
1317 cgroup_lock(); 1372 if (!cgroup_lock_live_group(cgrp))
1318
1319 if (cgroup_is_removed(cgrp)) {
1320 cgroup_unlock();
1321 return -ENODEV; 1373 return -ENODEV;
1322 }
1323 1374
1324 switch (type) { 1375 switch (type) {
1325 case FILE_CPU_EXCLUSIVE: 1376 case FILE_CPU_EXCLUSIVE:
@@ -1365,12 +1416,9 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1365 struct cpuset *cs = cgroup_cs(cgrp); 1416 struct cpuset *cs = cgroup_cs(cgrp);
1366 cpuset_filetype_t type = cft->private; 1417 cpuset_filetype_t type = cft->private;
1367 1418
1368 cgroup_lock(); 1419 if (!cgroup_lock_live_group(cgrp))
1369
1370 if (cgroup_is_removed(cgrp)) {
1371 cgroup_unlock();
1372 return -ENODEV; 1420 return -ENODEV;
1373 } 1421
1374 switch (type) { 1422 switch (type) {
1375 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 1423 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1376 retval = update_relax_domain_level(cs, val); 1424 retval = update_relax_domain_level(cs, val);
@@ -1384,6 +1432,32 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1384} 1432}
1385 1433
1386/* 1434/*
1435 * Common handling for a write to a "cpus" or "mems" file.
1436 */
1437static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1438 const char *buf)
1439{
1440 int retval = 0;
1441
1442 if (!cgroup_lock_live_group(cgrp))
1443 return -ENODEV;
1444
1445 switch (cft->private) {
1446 case FILE_CPULIST:
1447 retval = update_cpumask(cgroup_cs(cgrp), buf);
1448 break;
1449 case FILE_MEMLIST:
1450 retval = update_nodemask(cgroup_cs(cgrp), buf);
1451 break;
1452 default:
1453 retval = -EINVAL;
1454 break;
1455 }
1456 cgroup_unlock();
1457 return retval;
1458}
1459
1460/*
1387 * These ascii lists should be read in a single call, by using a user 1461 * These ascii lists should be read in a single call, by using a user
1388 * buffer large enough to hold the entire map. If read in smaller 1462 * buffer large enough to hold the entire map. If read in smaller
1389 * chunks, there is no guarantee of atomicity. Since the display format 1463 * chunks, there is no guarantee of atomicity. Since the display format
@@ -1479,6 +1553,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
1479 default: 1553 default:
1480 BUG(); 1554 BUG();
1481 } 1555 }
1556
1557 /* Unreachable but makes gcc happy */
1558 return 0;
1482} 1559}
1483 1560
1484static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) 1561static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
@@ -1491,6 +1568,9 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
1491 default: 1568 default:
1492 BUG(); 1569 BUG();
1493 } 1570 }
1571
1572 /* Unreachable but makes gcc happy */
1573 return 0;
1494} 1574}
1495 1575
1496 1576
@@ -1502,14 +1582,16 @@ static struct cftype files[] = {
1502 { 1582 {
1503 .name = "cpus", 1583 .name = "cpus",
1504 .read = cpuset_common_file_read, 1584 .read = cpuset_common_file_read,
1505 .write = cpuset_common_file_write, 1585 .write_string = cpuset_write_resmask,
1586 .max_write_len = (100U + 6 * NR_CPUS),
1506 .private = FILE_CPULIST, 1587 .private = FILE_CPULIST,
1507 }, 1588 },
1508 1589
1509 { 1590 {
1510 .name = "mems", 1591 .name = "mems",
1511 .read = cpuset_common_file_read, 1592 .read = cpuset_common_file_read,
1512 .write = cpuset_common_file_write, 1593 .write_string = cpuset_write_resmask,
1594 .max_write_len = (100U + 6 * MAX_NUMNODES),
1513 .private = FILE_MEMLIST, 1595 .private = FILE_MEMLIST,
1514 }, 1596 },
1515 1597
@@ -1677,15 +1759,9 @@ static struct cgroup_subsys_state *cpuset_create(
1677} 1759}
1678 1760
1679/* 1761/*
1680 * Locking note on the strange update_flag() call below:
1681 *
1682 * If the cpuset being removed has its flag 'sched_load_balance' 1762 * If the cpuset being removed has its flag 'sched_load_balance'
1683 * enabled, then simulate turning sched_load_balance off, which 1763 * enabled, then simulate turning sched_load_balance off, which
1684 * will call rebuild_sched_domains(). The get_online_cpus() 1764 * will call async_rebuild_sched_domains().
1685 * call in rebuild_sched_domains() must not be made while holding
1686 * callback_mutex. Elsewhere the kernel nests callback_mutex inside
1687 * get_online_cpus() calls. So the reverse nesting would risk an
1688 * ABBA deadlock.
1689 */ 1765 */
1690 1766
1691static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) 1767static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
@@ -1704,7 +1780,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1704struct cgroup_subsys cpuset_subsys = { 1780struct cgroup_subsys cpuset_subsys = {
1705 .name = "cpuset", 1781 .name = "cpuset",
1706 .create = cpuset_create, 1782 .create = cpuset_create,
1707 .destroy = cpuset_destroy, 1783 .destroy = cpuset_destroy,
1708 .can_attach = cpuset_can_attach, 1784 .can_attach = cpuset_can_attach,
1709 .attach = cpuset_attach, 1785 .attach = cpuset_attach,
1710 .populate = cpuset_populate, 1786 .populate = cpuset_populate,
@@ -1790,13 +1866,13 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
1790 scan.scan.heap = NULL; 1866 scan.scan.heap = NULL;
1791 scan.to = to->css.cgroup; 1867 scan.to = to->css.cgroup;
1792 1868
1793 if (cgroup_scan_tasks((struct cgroup_scanner *)&scan)) 1869 if (cgroup_scan_tasks(&scan.scan))
1794 printk(KERN_ERR "move_member_tasks_to_cpuset: " 1870 printk(KERN_ERR "move_member_tasks_to_cpuset: "
1795 "cgroup_scan_tasks failed\n"); 1871 "cgroup_scan_tasks failed\n");
1796} 1872}
1797 1873
1798/* 1874/*
1799 * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs 1875 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
1800 * or memory nodes, we need to walk over the cpuset hierarchy, 1876 * or memory nodes, we need to walk over the cpuset hierarchy,
1801 * removing that CPU or node from all cpusets. If this removes the 1877 * removing that CPU or node from all cpusets. If this removes the
1802 * last CPU or node from a cpuset, then move the tasks in the empty 1878 * last CPU or node from a cpuset, then move the tasks in the empty
@@ -1844,31 +1920,31 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
1844 * that has tasks along with an empty 'mems'. But if we did see such 1920 * that has tasks along with an empty 'mems'. But if we did see such
1845 * a cpuset, we'd handle it just like we do if its 'cpus' was empty. 1921 * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
1846 */ 1922 */
1847static void scan_for_empty_cpusets(const struct cpuset *root) 1923static void scan_for_empty_cpusets(struct cpuset *root)
1848{ 1924{
1925 LIST_HEAD(queue);
1849 struct cpuset *cp; /* scans cpusets being updated */ 1926 struct cpuset *cp; /* scans cpusets being updated */
1850 struct cpuset *child; /* scans child cpusets of cp */ 1927 struct cpuset *child; /* scans child cpusets of cp */
1851 struct list_head queue;
1852 struct cgroup *cont; 1928 struct cgroup *cont;
1853 1929 nodemask_t oldmems;
1854 INIT_LIST_HEAD(&queue);
1855 1930
1856 list_add_tail((struct list_head *)&root->stack_list, &queue); 1931 list_add_tail((struct list_head *)&root->stack_list, &queue);
1857 1932
1858 while (!list_empty(&queue)) { 1933 while (!list_empty(&queue)) {
1859 cp = container_of(queue.next, struct cpuset, stack_list); 1934 cp = list_first_entry(&queue, struct cpuset, stack_list);
1860 list_del(queue.next); 1935 list_del(queue.next);
1861 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { 1936 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
1862 child = cgroup_cs(cont); 1937 child = cgroup_cs(cont);
1863 list_add_tail(&child->stack_list, &queue); 1938 list_add_tail(&child->stack_list, &queue);
1864 } 1939 }
1865 cont = cp->css.cgroup;
1866 1940
1867 /* Continue past cpusets with all cpus, mems online */ 1941 /* Continue past cpusets with all cpus, mems online */
1868 if (cpus_subset(cp->cpus_allowed, cpu_online_map) && 1942 if (cpus_subset(cp->cpus_allowed, cpu_online_map) &&
1869 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) 1943 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
1870 continue; 1944 continue;
1871 1945
1946 oldmems = cp->mems_allowed;
1947
1872 /* Remove offline cpus and mems from this cpuset. */ 1948 /* Remove offline cpus and mems from this cpuset. */
1873 mutex_lock(&callback_mutex); 1949 mutex_lock(&callback_mutex);
1874 cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map); 1950 cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
@@ -1880,39 +1956,14 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
1880 if (cpus_empty(cp->cpus_allowed) || 1956 if (cpus_empty(cp->cpus_allowed) ||
1881 nodes_empty(cp->mems_allowed)) 1957 nodes_empty(cp->mems_allowed))
1882 remove_tasks_in_empty_cpuset(cp); 1958 remove_tasks_in_empty_cpuset(cp);
1959 else {
1960 update_tasks_cpumask(cp, NULL);
1961 update_tasks_nodemask(cp, &oldmems);
1962 }
1883 } 1963 }
1884} 1964}
1885 1965
1886/* 1966/*
1887 * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
1888 * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to
1889 * track what's online after any CPU or memory node hotplug or unplug event.
1890 *
1891 * Since there are two callers of this routine, one for CPU hotplug
1892 * events and one for memory node hotplug events, we could have coded
1893 * two separate routines here. We code it as a single common routine
1894 * in order to minimize text size.
1895 */
1896
1897static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
1898{
1899 cgroup_lock();
1900
1901 top_cpuset.cpus_allowed = cpu_online_map;
1902 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
1903 scan_for_empty_cpusets(&top_cpuset);
1904
1905 /*
1906 * Scheduler destroys domains on hotplug events.
1907 * Rebuild them based on the current settings.
1908 */
1909 if (rebuild_sd)
1910 rebuild_sched_domains();
1911
1912 cgroup_unlock();
1913}
1914
1915/*
1916 * The top_cpuset tracks what CPUs and Memory Nodes are online, 1967 * The top_cpuset tracks what CPUs and Memory Nodes are online,
1917 * period. This is necessary in order to make cpusets transparent 1968 * period. This is necessary in order to make cpusets transparent
1918 * (of no affect) on systems that are actively using CPU hotplug 1969 * (of no affect) on systems that are actively using CPU hotplug
@@ -1920,40 +1971,52 @@ static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
1920 * 1971 *
1921 * This routine ensures that top_cpuset.cpus_allowed tracks 1972 * This routine ensures that top_cpuset.cpus_allowed tracks
1922 * cpu_online_map on each CPU hotplug (cpuhp) event. 1973 * cpu_online_map on each CPU hotplug (cpuhp) event.
1974 *
1975 * Called within get_online_cpus(). Needs to call cgroup_lock()
1976 * before calling generate_sched_domains().
1923 */ 1977 */
1924 1978static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
1925static int cpuset_handle_cpuhp(struct notifier_block *unused_nb,
1926 unsigned long phase, void *unused_cpu) 1979 unsigned long phase, void *unused_cpu)
1927{ 1980{
1981 struct sched_domain_attr *attr;
1982 cpumask_t *doms;
1983 int ndoms;
1984
1928 switch (phase) { 1985 switch (phase) {
1929 case CPU_UP_CANCELED:
1930 case CPU_UP_CANCELED_FROZEN:
1931 case CPU_DOWN_FAILED:
1932 case CPU_DOWN_FAILED_FROZEN:
1933 case CPU_ONLINE: 1986 case CPU_ONLINE:
1934 case CPU_ONLINE_FROZEN: 1987 case CPU_ONLINE_FROZEN:
1935 case CPU_DEAD: 1988 case CPU_DEAD:
1936 case CPU_DEAD_FROZEN: 1989 case CPU_DEAD_FROZEN:
1937 common_cpu_mem_hotplug_unplug(1);
1938 break; 1990 break;
1991
1939 default: 1992 default:
1940 return NOTIFY_DONE; 1993 return NOTIFY_DONE;
1941 } 1994 }
1942 1995
1996 cgroup_lock();
1997 top_cpuset.cpus_allowed = cpu_online_map;
1998 scan_for_empty_cpusets(&top_cpuset);
1999 ndoms = generate_sched_domains(&doms, &attr);
2000 cgroup_unlock();
2001
2002 /* Have scheduler rebuild the domains */
2003 partition_sched_domains(ndoms, doms, attr);
2004
1943 return NOTIFY_OK; 2005 return NOTIFY_OK;
1944} 2006}
1945 2007
1946#ifdef CONFIG_MEMORY_HOTPLUG 2008#ifdef CONFIG_MEMORY_HOTPLUG
1947/* 2009/*
1948 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. 2010 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
1949 * Call this routine anytime after you change 2011 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
1950 * node_states[N_HIGH_MEMORY]. 2012 * See also the previous routine cpuset_track_online_cpus().
1951 * See also the previous routine cpuset_handle_cpuhp().
1952 */ 2013 */
1953
1954void cpuset_track_online_nodes(void) 2014void cpuset_track_online_nodes(void)
1955{ 2015{
1956 common_cpu_mem_hotplug_unplug(0); 2016 cgroup_lock();
2017 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2018 scan_for_empty_cpusets(&top_cpuset);
2019 cgroup_unlock();
1957} 2020}
1958#endif 2021#endif
1959 2022
@@ -1968,11 +2031,10 @@ void __init cpuset_init_smp(void)
1968 top_cpuset.cpus_allowed = cpu_online_map; 2031 top_cpuset.cpus_allowed = cpu_online_map;
1969 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2032 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
1970 2033
1971 hotcpu_notifier(cpuset_handle_cpuhp, 0); 2034 hotcpu_notifier(cpuset_track_online_cpus, 0);
1972} 2035}
1973 2036
1974/** 2037/**
1975
1976 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. 2038 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
1977 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. 2039 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
1978 * @pmask: pointer to cpumask_t variable to receive cpus_allowed set. 2040 * @pmask: pointer to cpumask_t variable to receive cpus_allowed set.
@@ -2374,19 +2436,15 @@ const struct file_operations proc_cpuset_operations = {
2374void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) 2436void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
2375{ 2437{
2376 seq_printf(m, "Cpus_allowed:\t"); 2438 seq_printf(m, "Cpus_allowed:\t");
2377 m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count, 2439 seq_cpumask(m, &task->cpus_allowed);
2378 task->cpus_allowed);
2379 seq_printf(m, "\n"); 2440 seq_printf(m, "\n");
2380 seq_printf(m, "Cpus_allowed_list:\t"); 2441 seq_printf(m, "Cpus_allowed_list:\t");
2381 m->count += cpulist_scnprintf(m->buf + m->count, m->size - m->count, 2442 seq_cpumask_list(m, &task->cpus_allowed);
2382 task->cpus_allowed);
2383 seq_printf(m, "\n"); 2443 seq_printf(m, "\n");
2384 seq_printf(m, "Mems_allowed:\t"); 2444 seq_printf(m, "Mems_allowed:\t");
2385 m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count, 2445 seq_nodemask(m, &task->mems_allowed);
2386 task->mems_allowed);
2387 seq_printf(m, "\n"); 2446 seq_printf(m, "\n");
2388 seq_printf(m, "Mems_allowed_list:\t"); 2447 seq_printf(m, "Mems_allowed_list:\t");
2389 m->count += nodelist_scnprintf(m->buf + m->count, m->size - m->count, 2448 seq_nodemask_list(m, &task->mems_allowed);
2390 task->mems_allowed);
2391 seq_printf(m, "\n"); 2449 seq_printf(m, "\n");
2392} 2450}
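For readers skimming the generate_sched_domains() changes earlier in this diff, the <csn, csa> to <ndoms, doms> conversion is easier to follow in miniature. The program below is a hedged, standalone toy model — plain unsigned-long bitmasks instead of cpumask_t, made-up example masks, no relax_domain_level or hierarchy handling — of the same partial-partition idea: load-balanced cpusets whose CPU masks overlap collapse into one sched domain, and each emitted domain is the union of its members' masks.

/*
 * Toy model of the 'partial partition' built by generate_sched_domains().
 * Not the kernel routine: it only shows that overlapping masks merge into
 * one domain and that the emitted domains are mutually non-overlapping.
 */
#include <stdio.h>

#define NSETS 4

int main(void)
{
	/* cpus_allowed of four hypothetical load-balanced cpusets */
	unsigned long cpus[NSETS] = { 0x03, 0x06, 0x30, 0xc0 };
	int pn[NSETS];                  /* partition number, as in the kernel code */
	int i, j, ndoms = NSETS;

	for (i = 0; i < NSETS; i++)
		pn[i] = i;

	/* merge any two cpusets with intersecting masks into one partition */
	int changed = 1;
	while (changed) {
		changed = 0;
		for (i = 0; i < NSETS; i++)
			for (j = 0; j < NSETS; j++)
				if (pn[i] != pn[j] && (cpus[i] & cpus[j])) {
					int victim = pn[j];
					for (int k = 0; k < NSETS; k++)
						if (pn[k] == victim)
							pn[k] = pn[i];
					ndoms--;
					changed = 1;
				}
	}

	/* convert to <ndoms, doms>: one mask per partition, union of members */
	printf("%d sched domain(s)\n", ndoms);
	for (i = 0; i < NSETS; i++) {
		if (pn[i] < 0)
			continue;               /* skip completed partitions */
		unsigned long dom = 0;
		int apn = pn[i];
		for (j = i; j < NSETS; j++)
			if (pn[j] == apn) {
				dom |= cpus[j];
				pn[j] = -1;         /* done with this partition */
			}
		printf("  domain mask 0x%lx\n", dom);
	}
	return 0;
}

With the example masks above it prints three domains (0x7, 0x30, 0xc0): the first two cpusets overlap and merge, the other two stay separate — the same shape of result that the kernel passes on to partition_sched_domains().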