Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r-- | kernel/cpuset.c | 705 |
1 file changed, 396 insertions, 309 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 9fceb97e989c..eab7bd6628e0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -14,6 +14,8 @@ | |||
14 | * 2003-10-22 Updates by Stephen Hemminger. | 14 | * 2003-10-22 Updates by Stephen Hemminger. |
15 | * 2004 May-July Rework by Paul Jackson. | 15 | * 2004 May-July Rework by Paul Jackson. |
16 | * 2006 Rework by Paul Menage to use generic cgroups | 16 | * 2006 Rework by Paul Menage to use generic cgroups |
17 | * 2008 Rework of the scheduler domains and CPU hotplug handling | ||
18 | * by Max Krasnyansky | ||
17 | * | 19 | * |
18 | * This file is subject to the terms and conditions of the GNU General Public | 20 | * This file is subject to the terms and conditions of the GNU General Public |
19 | * License. See the file COPYING in the main directory of the Linux | 21 | * License. See the file COPYING in the main directory of the Linux |
@@ -54,7 +56,6 @@ | |||
54 | #include <asm/uaccess.h> | 56 | #include <asm/uaccess.h> |
55 | #include <asm/atomic.h> | 57 | #include <asm/atomic.h> |
56 | #include <linux/mutex.h> | 58 | #include <linux/mutex.h> |
57 | #include <linux/kfifo.h> | ||
58 | #include <linux/workqueue.h> | 59 | #include <linux/workqueue.h> |
59 | #include <linux/cgroup.h> | 60 | #include <linux/cgroup.h> |
60 | 61 | ||
@@ -227,10 +228,6 @@ static struct cpuset top_cpuset = { | |||
227 | * The task_struct fields mems_allowed and mems_generation may only | 228 | * The task_struct fields mems_allowed and mems_generation may only |
228 | * be accessed in the context of that task, so require no locks. | 229 | * be accessed in the context of that task, so require no locks. |
229 | * | 230 | * |
230 | * The cpuset_common_file_write handler for operations that modify | ||
231 | * the cpuset hierarchy holds cgroup_mutex across the entire operation, | ||
232 | * single threading all such cpuset modifications across the system. | ||
233 | * | ||
234 | * The cpuset_common_file_read() handlers only hold callback_mutex across | 231 | * The cpuset_common_file_read() handlers only hold callback_mutex across |
235 | * small pieces of code, such as when reading out possibly multi-word | 232 | * small pieces of code, such as when reading out possibly multi-word |
236 | * cpumasks and nodemasks. | 233 | * cpumasks and nodemasks. |
@@ -241,9 +238,11 @@ static struct cpuset top_cpuset = { | |||
241 | 238 | ||
242 | static DEFINE_MUTEX(callback_mutex); | 239 | static DEFINE_MUTEX(callback_mutex); |
243 | 240 | ||
244 | /* This is ugly, but preserves the userspace API for existing cpuset | 241 | /* |
242 | * This is ugly, but preserves the userspace API for existing cpuset | ||
245 | * users. If someone tries to mount the "cpuset" filesystem, we | 243 | * users. If someone tries to mount the "cpuset" filesystem, we |
246 | * silently switch it to mount "cgroup" instead */ | 244 | * silently switch it to mount "cgroup" instead |
245 | */ | ||
247 | static int cpuset_get_sb(struct file_system_type *fs_type, | 246 | static int cpuset_get_sb(struct file_system_type *fs_type, |
248 | int flags, const char *unused_dev_name, | 247 | int flags, const char *unused_dev_name, |
249 | void *data, struct vfsmount *mnt) | 248 | void *data, struct vfsmount *mnt) |
@@ -369,7 +368,7 @@ void cpuset_update_task_memory_state(void) | |||
369 | my_cpusets_mem_gen = top_cpuset.mems_generation; | 368 | my_cpusets_mem_gen = top_cpuset.mems_generation; |
370 | } else { | 369 | } else { |
371 | rcu_read_lock(); | 370 | rcu_read_lock(); |
372 | my_cpusets_mem_gen = task_cs(current)->mems_generation; | 371 | my_cpusets_mem_gen = task_cs(tsk)->mems_generation; |
373 | rcu_read_unlock(); | 372 | rcu_read_unlock(); |
374 | } | 373 | } |
375 | 374 | ||
@@ -478,10 +477,9 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
478 | } | 477 | } |
479 | 478 | ||
480 | /* | 479 | /* |
481 | * Helper routine for rebuild_sched_domains(). | 480 | * Helper routine for generate_sched_domains(). |
482 | * Do cpusets a, b have overlapping cpus_allowed masks? | 481 | * Do cpusets a, b have overlapping cpus_allowed masks? |
483 | */ | 482 | */ |
484 | |||
485 | static int cpusets_overlap(struct cpuset *a, struct cpuset *b) | 483 | static int cpusets_overlap(struct cpuset *a, struct cpuset *b) |
486 | { | 484 | { |
487 | return cpus_intersects(a->cpus_allowed, b->cpus_allowed); | 485 | return cpus_intersects(a->cpus_allowed, b->cpus_allowed); |
@@ -490,29 +488,48 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) | |||
490 | static void | 488 | static void |
491 | update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) | 489 | update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) |
492 | { | 490 | { |
493 | if (!dattr) | ||
494 | return; | ||
495 | if (dattr->relax_domain_level < c->relax_domain_level) | 491 | if (dattr->relax_domain_level < c->relax_domain_level) |
496 | dattr->relax_domain_level = c->relax_domain_level; | 492 | dattr->relax_domain_level = c->relax_domain_level; |
497 | return; | 493 | return; |
498 | } | 494 | } |
499 | 495 | ||
496 | static void | ||
497 | update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) | ||
498 | { | ||
499 | LIST_HEAD(q); | ||
500 | |||
501 | list_add(&c->stack_list, &q); | ||
502 | while (!list_empty(&q)) { | ||
503 | struct cpuset *cp; | ||
504 | struct cgroup *cont; | ||
505 | struct cpuset *child; | ||
506 | |||
507 | cp = list_first_entry(&q, struct cpuset, stack_list); | ||
508 | list_del(q.next); | ||
509 | |||
510 | if (cpus_empty(cp->cpus_allowed)) | ||
511 | continue; | ||
512 | |||
513 | if (is_sched_load_balance(cp)) | ||
514 | update_domain_attr(dattr, cp); | ||
515 | |||
516 | list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { | ||
517 | child = cgroup_cs(cont); | ||
518 | list_add_tail(&child->stack_list, &q); | ||
519 | } | ||
520 | } | ||
521 | } | ||
522 | |||
500 | /* | 523 | /* |
501 | * rebuild_sched_domains() | 524 | * generate_sched_domains() |
502 | * | 525 | * |
503 | * If the flag 'sched_load_balance' of any cpuset with non-empty | 526 | * This function builds a partial partition of the systems CPUs |
504 | * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset | 527 | * A 'partial partition' is a set of non-overlapping subsets whose |
505 | * which has that flag enabled, or if any cpuset with a non-empty | 528 | * union is a subset of that set. |
506 | * 'cpus' is removed, then call this routine to rebuild the | 529 | * The output of this function needs to be passed to kernel/sched.c |
507 | * scheduler's dynamic sched domains. | 530 | * partition_sched_domains() routine, which will rebuild the scheduler's |
508 | * | 531 | * load balancing domains (sched domains) as specified by that partial |
509 | * This routine builds a partial partition of the systems CPUs | 532 | * partition. |
510 | * (the set of non-overlappping cpumask_t's in the array 'part' | ||
511 | * below), and passes that partial partition to the kernel/sched.c | ||
512 | * partition_sched_domains() routine, which will rebuild the | ||
513 | * schedulers load balancing domains (sched domains) as specified | ||
514 | * by that partial partition. A 'partial partition' is a set of | ||
515 | * non-overlapping subsets whose union is a subset of that set. | ||
516 | * | 533 | * |
517 | * See "What is sched_load_balance" in Documentation/cpusets.txt | 534 | * See "What is sched_load_balance" in Documentation/cpusets.txt |
518 | * for a background explanation of this. | 535 | * for a background explanation of this. |
@@ -522,16 +539,10 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) | |||
522 | * domains when operating in the severe memory shortage situations | 539 | * domains when operating in the severe memory shortage situations |
523 | * that could cause allocation failures below. | 540 | * that could cause allocation failures below. |
524 | * | 541 | * |
525 | * Call with cgroup_mutex held. May take callback_mutex during | 542 | * Must be called with cgroup_lock held. |
526 | * call due to the kfifo_alloc() and kmalloc() calls. May nest | ||
527 | * a call to the get_online_cpus()/put_online_cpus() pair. | ||
528 | * Must not be called holding callback_mutex, because we must not | ||
529 | * call get_online_cpus() while holding callback_mutex. Elsewhere | ||
530 | * the kernel nests callback_mutex inside get_online_cpus() calls. | ||
531 | * So the reverse nesting would risk an ABBA deadlock. | ||
532 | * | 543 | * |
533 | * The three key local variables below are: | 544 | * The three key local variables below are: |
534 | * q - a kfifo queue of cpuset pointers, used to implement a | 545 | * q - a linked-list queue of cpuset pointers, used to implement a |
535 | * top-down scan of all cpusets. This scan loads a pointer | 546 | * top-down scan of all cpusets. This scan loads a pointer |
536 | * to each cpuset marked is_sched_load_balance into the | 547 | * to each cpuset marked is_sched_load_balance into the |
537 | * array 'csa'. For our purposes, rebuilding the schedulers | 548 | * array 'csa'. For our purposes, rebuilding the schedulers |
@@ -563,10 +574,10 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) | |||
563 | * element of the partition (one sched domain) to be passed to | 574 | * element of the partition (one sched domain) to be passed to |
564 | * partition_sched_domains(). | 575 | * partition_sched_domains(). |
565 | */ | 576 | */ |
566 | 577 | static int generate_sched_domains(cpumask_t **domains, | |
567 | static void rebuild_sched_domains(void) | 578 | struct sched_domain_attr **attributes) |
568 | { | 579 | { |
569 | struct kfifo *q; /* queue of cpusets to be scanned */ | 580 | LIST_HEAD(q); /* queue of cpusets to be scanned */ |
570 | struct cpuset *cp; /* scans q */ | 581 | struct cpuset *cp; /* scans q */ |
571 | struct cpuset **csa; /* array of all cpuset ptrs */ | 582 | struct cpuset **csa; /* array of all cpuset ptrs */ |
572 | int csn; /* how many cpuset ptrs in csa so far */ | 583 | int csn; /* how many cpuset ptrs in csa so far */ |
@@ -576,44 +587,58 @@ static void rebuild_sched_domains(void) | |||
576 | int ndoms; /* number of sched domains in result */ | 587 | int ndoms; /* number of sched domains in result */ |
577 | int nslot; /* next empty doms[] cpumask_t slot */ | 588 | int nslot; /* next empty doms[] cpumask_t slot */ |
578 | 589 | ||
579 | q = NULL; | 590 | ndoms = 0; |
580 | csa = NULL; | ||
581 | doms = NULL; | 591 | doms = NULL; |
582 | dattr = NULL; | 592 | dattr = NULL; |
593 | csa = NULL; | ||
583 | 594 | ||
584 | /* Special case for the 99% of systems with one, full, sched domain */ | 595 | /* Special case for the 99% of systems with one, full, sched domain */ |
585 | if (is_sched_load_balance(&top_cpuset)) { | 596 | if (is_sched_load_balance(&top_cpuset)) { |
586 | ndoms = 1; | ||
587 | doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); | 597 | doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); |
588 | if (!doms) | 598 | if (!doms) |
589 | goto rebuild; | 599 | goto done; |
600 | |||
590 | dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); | 601 | dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); |
591 | if (dattr) { | 602 | if (dattr) { |
592 | *dattr = SD_ATTR_INIT; | 603 | *dattr = SD_ATTR_INIT; |
593 | update_domain_attr(dattr, &top_cpuset); | 604 | update_domain_attr_tree(dattr, &top_cpuset); |
594 | } | 605 | } |
595 | *doms = top_cpuset.cpus_allowed; | 606 | *doms = top_cpuset.cpus_allowed; |
596 | goto rebuild; | ||
597 | } | ||
598 | 607 | ||
599 | q = kfifo_alloc(number_of_cpusets * sizeof(cp), GFP_KERNEL, NULL); | 608 | ndoms = 1; |
600 | if (IS_ERR(q)) | ||
601 | goto done; | 609 | goto done; |
610 | } | ||
611 | |||
602 | csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); | 612 | csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); |
603 | if (!csa) | 613 | if (!csa) |
604 | goto done; | 614 | goto done; |
605 | csn = 0; | 615 | csn = 0; |
606 | 616 | ||
607 | cp = &top_cpuset; | 617 | list_add(&top_cpuset.stack_list, &q); |
608 | __kfifo_put(q, (void *)&cp, sizeof(cp)); | 618 | while (!list_empty(&q)) { |
609 | while (__kfifo_get(q, (void *)&cp, sizeof(cp))) { | ||
610 | struct cgroup *cont; | 619 | struct cgroup *cont; |
611 | struct cpuset *child; /* scans child cpusets of cp */ | 620 | struct cpuset *child; /* scans child cpusets of cp */ |
612 | if (is_sched_load_balance(cp)) | 621 | |
622 | cp = list_first_entry(&q, struct cpuset, stack_list); | ||
623 | list_del(q.next); | ||
624 | |||
625 | if (cpus_empty(cp->cpus_allowed)) | ||
626 | continue; | ||
627 | |||
628 | /* | ||
629 | * All child cpusets contain a subset of the parent's cpus, so | ||
630 | * just skip them, and then we call update_domain_attr_tree() | ||
631 | * to calc relax_domain_level of the corresponding sched | ||
632 | * domain. | ||
633 | */ | ||
634 | if (is_sched_load_balance(cp)) { | ||
613 | csa[csn++] = cp; | 635 | csa[csn++] = cp; |
636 | continue; | ||
637 | } | ||
638 | |||
614 | list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { | 639 | list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { |
615 | child = cgroup_cs(cont); | 640 | child = cgroup_cs(cont); |
616 | __kfifo_put(q, (void *)&child, sizeof(cp)); | 641 | list_add_tail(&child->stack_list, &q); |
617 | } | 642 | } |
618 | } | 643 | } |
619 | 644 | ||
@@ -644,91 +669,141 @@ restart: | |||
644 | } | 669 | } |
645 | } | 670 | } |
646 | 671 | ||
647 | /* Convert <csn, csa> to <ndoms, doms> */ | 672 | /* |
673 | * Now we know how many domains to create. | ||
674 | * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. | ||
675 | */ | ||
648 | doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); | 676 | doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); |
649 | if (!doms) | 677 | if (!doms) { |
650 | goto rebuild; | 678 | ndoms = 0; |
679 | goto done; | ||
680 | } | ||
681 | |||
682 | /* | ||
683 | * The rest of the code, including the scheduler, can deal with | ||
684 | * dattr==NULL case. No need to abort if alloc fails. | ||
685 | */ | ||
651 | dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); | 686 | dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); |
652 | 687 | ||
653 | for (nslot = 0, i = 0; i < csn; i++) { | 688 | for (nslot = 0, i = 0; i < csn; i++) { |
654 | struct cpuset *a = csa[i]; | 689 | struct cpuset *a = csa[i]; |
690 | cpumask_t *dp; | ||
655 | int apn = a->pn; | 691 | int apn = a->pn; |
656 | 692 | ||
657 | if (apn >= 0) { | 693 | if (apn < 0) { |
658 | cpumask_t *dp = doms + nslot; | 694 | /* Skip completed partitions */ |
659 | 695 | continue; | |
660 | if (nslot == ndoms) { | 696 | } |
661 | static int warnings = 10; | 697 | |
662 | if (warnings) { | 698 | dp = doms + nslot; |
663 | printk(KERN_WARNING | 699 | |
664 | "rebuild_sched_domains confused:" | 700 | if (nslot == ndoms) { |
665 | " nslot %d, ndoms %d, csn %d, i %d," | 701 | static int warnings = 10; |
666 | " apn %d\n", | 702 | if (warnings) { |
667 | nslot, ndoms, csn, i, apn); | 703 | printk(KERN_WARNING |
668 | warnings--; | 704 | "rebuild_sched_domains confused:" |
669 | } | 705 | " nslot %d, ndoms %d, csn %d, i %d," |
670 | continue; | 706 | " apn %d\n", |
707 | nslot, ndoms, csn, i, apn); | ||
708 | warnings--; | ||
671 | } | 709 | } |
710 | continue; | ||
711 | } | ||
672 | 712 | ||
673 | cpus_clear(*dp); | 713 | cpus_clear(*dp); |
674 | if (dattr) | 714 | if (dattr) |
675 | *(dattr + nslot) = SD_ATTR_INIT; | 715 | *(dattr + nslot) = SD_ATTR_INIT; |
676 | for (j = i; j < csn; j++) { | 716 | for (j = i; j < csn; j++) { |
677 | struct cpuset *b = csa[j]; | 717 | struct cpuset *b = csa[j]; |
678 | 718 | ||
679 | if (apn == b->pn) { | 719 | if (apn == b->pn) { |
680 | cpus_or(*dp, *dp, b->cpus_allowed); | 720 | cpus_or(*dp, *dp, b->cpus_allowed); |
681 | b->pn = -1; | 721 | if (dattr) |
682 | update_domain_attr(dattr, b); | 722 | update_domain_attr_tree(dattr + nslot, b); |
683 | } | 723 | |
724 | /* Done with this partition */ | ||
725 | b->pn = -1; | ||
684 | } | 726 | } |
685 | nslot++; | ||
686 | } | 727 | } |
728 | nslot++; | ||
687 | } | 729 | } |
688 | BUG_ON(nslot != ndoms); | 730 | BUG_ON(nslot != ndoms); |
689 | 731 | ||
690 | rebuild: | ||
691 | /* Have scheduler rebuild sched domains */ | ||
692 | get_online_cpus(); | ||
693 | partition_sched_domains(ndoms, doms, dattr); | ||
694 | put_online_cpus(); | ||
695 | |||
696 | done: | 732 | done: |
697 | if (q && !IS_ERR(q)) | ||
698 | kfifo_free(q); | ||
699 | kfree(csa); | 733 | kfree(csa); |
700 | /* Don't kfree(doms) -- partition_sched_domains() does that. */ | 734 | |
701 | /* Don't kfree(dattr) -- partition_sched_domains() does that. */ | 735 | *domains = doms; |
736 | *attributes = dattr; | ||
737 | return ndoms; | ||
702 | } | 738 | } |
703 | 739 | ||
704 | static inline int started_after_time(struct task_struct *t1, | 740 | /* |
705 | struct timespec *time, | 741 | * Rebuild scheduler domains. |
706 | struct task_struct *t2) | 742 | * |
743 | * Call with neither cgroup_mutex held nor within get_online_cpus(). | ||
744 | * Takes both cgroup_mutex and get_online_cpus(). | ||
745 | * | ||
746 | * Cannot be directly called from cpuset code handling changes | ||
747 | * to the cpuset pseudo-filesystem, because it cannot be called | ||
748 | * from code that already holds cgroup_mutex. | ||
749 | */ | ||
750 | static void do_rebuild_sched_domains(struct work_struct *unused) | ||
707 | { | 751 | { |
708 | int start_diff = timespec_compare(&t1->start_time, time); | 752 | struct sched_domain_attr *attr; |
709 | if (start_diff > 0) { | 753 | cpumask_t *doms; |
710 | return 1; | 754 | int ndoms; |
711 | } else if (start_diff < 0) { | 755 | |
712 | return 0; | 756 | get_online_cpus(); |
713 | } else { | 757 | |
714 | /* | 758 | /* Generate domain masks and attrs */ |
715 | * Arbitrarily, if two processes started at the same | 759 | cgroup_lock(); |
716 | * time, we'll say that the lower pointer value | 760 | ndoms = generate_sched_domains(&doms, &attr); |
717 | * started first. Note that t2 may have exited by now | 761 | cgroup_unlock(); |
718 | * so this may not be a valid pointer any longer, but | 762 | |
719 | * that's fine - it still serves to distinguish | 763 | /* Have scheduler rebuild the domains */ |
720 | * between two tasks started (effectively) | 764 | partition_sched_domains(ndoms, doms, attr); |
721 | * simultaneously. | 765 | |
722 | */ | 766 | put_online_cpus(); |
723 | return t1 > t2; | 767 | } |
724 | } | 768 | |
769 | static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); | ||
770 | |||
771 | /* | ||
772 | * Rebuild scheduler domains, asynchronously via workqueue. | ||
773 | * | ||
774 | * If the flag 'sched_load_balance' of any cpuset with non-empty | ||
775 | * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset | ||
776 | * which has that flag enabled, or if any cpuset with a non-empty | ||
777 | * 'cpus' is removed, then call this routine to rebuild the | ||
778 | * scheduler's dynamic sched domains. | ||
779 | * | ||
780 | * The rebuild_sched_domains() and partition_sched_domains() | ||
781 | * routines must nest cgroup_lock() inside get_online_cpus(), | ||
782 | * but such cpuset changes as these must nest that locking the | ||
783 | * other way, holding cgroup_lock() for much of the code. | ||
784 | * | ||
785 | * So in order to avoid an ABBA deadlock, the cpuset code handling | ||
786 | * these user changes delegates the actual sched domain rebuilding | ||
787 | * to a separate workqueue thread, which ends up processing the | ||
788 | * above do_rebuild_sched_domains() function. | ||
789 | */ | ||
790 | static void async_rebuild_sched_domains(void) | ||
791 | { | ||
792 | schedule_work(&rebuild_sched_domains_work); | ||
725 | } | 793 | } |
726 | 794 | ||
727 | static inline int started_after(void *p1, void *p2) | 795 | /* |
796 | * Accomplishes the same scheduler domain rebuild as the above | ||
797 | * async_rebuild_sched_domains(), however it directly calls the | ||
798 | * rebuild routine synchronously rather than calling it via an | ||
799 | * asynchronous work thread. | ||
800 | * | ||
801 | * This can only be called from code that is not holding | ||
802 | * cgroup_mutex (not nested in a cgroup_lock() call.) | ||
803 | */ | ||
804 | void rebuild_sched_domains(void) | ||
728 | { | 805 | { |
729 | struct task_struct *t1 = p1; | 806 | do_rebuild_sched_domains(NULL); |
730 | struct task_struct *t2 = p2; | ||
731 | return started_after_time(t1, &t2->start_time, t2); | ||
732 | } | 807 | } |
733 | 808 | ||
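Editor's note: the comment block above describes the lock-ordering problem this patch works around: the rebuild path must take get_online_cpus() before cgroup_lock(), while the cpuset write handlers already hold cgroup_lock(). The sketch below is a userspace pthread analogy of that deferral pattern, not kernel code; the lock names and functions are invented, and it deliberately ignores the detail that the real do_rebuild_sched_domains() drops cgroup_lock before calling partition_sched_domains(). The point it shows: the thread holding "cgroup_lock" only queues a request, and only the worker ever takes both locks, always in the one canonical order, so no ABBA cycle can form.

/*
 * Userspace analogy of async_rebuild_sched_domains().  "cpu_lock" stands
 * in for get_online_cpus(), "cgroup_lock" for cgroup_mutex.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t cpu_lock    = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t cgroup_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t req_lock    = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  req_cond    = PTHREAD_COND_INITIALIZER;
static int rebuild_requested;

static void *rebuild_worker(void *arg)   /* ~ do_rebuild_sched_domains() */
{
	(void)arg;
	pthread_mutex_lock(&req_lock);
	while (!rebuild_requested)
		pthread_cond_wait(&req_cond, &req_lock);
	rebuild_requested = 0;
	pthread_mutex_unlock(&req_lock);

	pthread_mutex_lock(&cpu_lock);       /* canonical order: cpu_lock ... */
	pthread_mutex_lock(&cgroup_lock);    /* ... then cgroup_lock */
	printf("generate + partition sched domains\n");
	pthread_mutex_unlock(&cgroup_lock);
	pthread_mutex_unlock(&cpu_lock);
	return NULL;
}

static void async_rebuild(void)          /* ~ async_rebuild_sched_domains() */
{
	pthread_mutex_lock(&req_lock);
	rebuild_requested = 1;               /* just queue the work ... */
	pthread_cond_signal(&req_cond);      /* ... never touch cpu_lock here */
	pthread_mutex_unlock(&req_lock);
}

int main(void)
{
	pthread_t worker;

	pthread_create(&worker, NULL, rebuild_worker, NULL);

	pthread_mutex_lock(&cgroup_lock);    /* writer path holds cgroup_lock */
	async_rebuild();
	pthread_mutex_unlock(&cgroup_lock);

	pthread_join(worker, NULL);
	return 0;
}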
734 | /** | 809 | /** |
@@ -766,15 +841,38 @@ static void cpuset_change_cpumask(struct task_struct *tsk, | |||
766 | } | 841 | } |
767 | 842 | ||
768 | /** | 843 | /** |
844 | * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. | ||
845 | * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed | ||
846 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() | ||
847 | * | ||
848 | * Called with cgroup_mutex held | ||
849 | * | ||
850 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, | ||
851 | * calling callback functions for each. | ||
852 | * | ||
853 | * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 | ||
854 | * if @heap != NULL. | ||
855 | */ | ||
856 | static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) | ||
857 | { | ||
858 | struct cgroup_scanner scan; | ||
859 | |||
860 | scan.cg = cs->css.cgroup; | ||
861 | scan.test_task = cpuset_test_cpumask; | ||
862 | scan.process_task = cpuset_change_cpumask; | ||
863 | scan.heap = heap; | ||
864 | cgroup_scan_tasks(&scan); | ||
865 | } | ||
866 | |||
867 | /** | ||
769 | * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it | 868 | * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it |
770 | * @cs: the cpuset to consider | 869 | * @cs: the cpuset to consider |
771 | * @buf: buffer of cpu numbers written to this cpuset | 870 | * @buf: buffer of cpu numbers written to this cpuset |
772 | */ | 871 | */ |
773 | static int update_cpumask(struct cpuset *cs, char *buf) | 872 | static int update_cpumask(struct cpuset *cs, const char *buf) |
774 | { | 873 | { |
775 | struct cpuset trialcs; | ||
776 | struct cgroup_scanner scan; | ||
777 | struct ptr_heap heap; | 874 | struct ptr_heap heap; |
875 | struct cpuset trialcs; | ||
778 | int retval; | 876 | int retval; |
779 | int is_load_balanced; | 877 | int is_load_balanced; |
780 | 878 | ||
@@ -790,7 +888,6 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
790 | * that parsing. The validate_change() call ensures that cpusets | 888 | * that parsing. The validate_change() call ensures that cpusets |
791 | * with tasks have cpus. | 889 | * with tasks have cpus. |
792 | */ | 890 | */ |
793 | buf = strstrip(buf); | ||
794 | if (!*buf) { | 891 | if (!*buf) { |
795 | cpus_clear(trialcs.cpus_allowed); | 892 | cpus_clear(trialcs.cpus_allowed); |
796 | } else { | 893 | } else { |
@@ -809,7 +906,7 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
809 | if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) | 906 | if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) |
810 | return 0; | 907 | return 0; |
811 | 908 | ||
812 | retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after); | 909 | retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); |
813 | if (retval) | 910 | if (retval) |
814 | return retval; | 911 | return retval; |
815 | 912 | ||
@@ -823,15 +920,12 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
823 | * Scan tasks in the cpuset, and update the cpumasks of any | 920 | * Scan tasks in the cpuset, and update the cpumasks of any |
824 | * that need an update. | 921 | * that need an update. |
825 | */ | 922 | */ |
826 | scan.cg = cs->css.cgroup; | 923 | update_tasks_cpumask(cs, &heap); |
827 | scan.test_task = cpuset_test_cpumask; | 924 | |
828 | scan.process_task = cpuset_change_cpumask; | ||
829 | scan.heap = &heap; | ||
830 | cgroup_scan_tasks(&scan); | ||
831 | heap_free(&heap); | 925 | heap_free(&heap); |
832 | 926 | ||
833 | if (is_load_balanced) | 927 | if (is_load_balanced) |
834 | rebuild_sched_domains(); | 928 | async_rebuild_sched_domains(); |
835 | return 0; | 929 | return 0; |
836 | } | 930 | } |
837 | 931 | ||
@@ -884,74 +978,25 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, | |||
884 | mutex_unlock(&callback_mutex); | 978 | mutex_unlock(&callback_mutex); |
885 | } | 979 | } |
886 | 980 | ||
887 | /* | ||
888 | * Handle user request to change the 'mems' memory placement | ||
889 | * of a cpuset. Needs to validate the request, update the | ||
890 | * cpusets mems_allowed and mems_generation, and for each | ||
891 | * task in the cpuset, rebind any vma mempolicies and if | ||
892 | * the cpuset is marked 'memory_migrate', migrate the tasks | ||
893 | * pages to the new memory. | ||
894 | * | ||
895 | * Call with cgroup_mutex held. May take callback_mutex during call. | ||
896 | * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, | ||
897 | * lock each such tasks mm->mmap_sem, scan its vma's and rebind | ||
898 | * their mempolicies to the cpusets new mems_allowed. | ||
899 | */ | ||
900 | |||
901 | static void *cpuset_being_rebound; | 981 | static void *cpuset_being_rebound; |
902 | 982 | ||
903 | static int update_nodemask(struct cpuset *cs, char *buf) | 983 | /** |
984 | * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. | ||
985 | * @cs: the cpuset in which each task's mems_allowed mask needs to be changed | ||
986 | * @oldmem: old mems_allowed of cpuset cs | ||
987 | * | ||
988 | * Called with cgroup_mutex held | ||
989 | * Return 0 if successful, -errno if not. | ||
990 | */ | ||
991 | static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem) | ||
904 | { | 992 | { |
905 | struct cpuset trialcs; | ||
906 | nodemask_t oldmem; | ||
907 | struct task_struct *p; | 993 | struct task_struct *p; |
908 | struct mm_struct **mmarray; | 994 | struct mm_struct **mmarray; |
909 | int i, n, ntasks; | 995 | int i, n, ntasks; |
910 | int migrate; | 996 | int migrate; |
911 | int fudge; | 997 | int fudge; |
912 | int retval; | ||
913 | struct cgroup_iter it; | 998 | struct cgroup_iter it; |
914 | 999 | int retval; | |
915 | /* | ||
916 | * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; | ||
917 | * it's read-only | ||
918 | */ | ||
919 | if (cs == &top_cpuset) | ||
920 | return -EACCES; | ||
921 | |||
922 | trialcs = *cs; | ||
923 | |||
924 | /* | ||
925 | * An empty mems_allowed is ok iff there are no tasks in the cpuset. | ||
926 | * Since nodelist_parse() fails on an empty mask, we special case | ||
927 | * that parsing. The validate_change() call ensures that cpusets | ||
928 | * with tasks have memory. | ||
929 | */ | ||
930 | buf = strstrip(buf); | ||
931 | if (!*buf) { | ||
932 | nodes_clear(trialcs.mems_allowed); | ||
933 | } else { | ||
934 | retval = nodelist_parse(buf, trialcs.mems_allowed); | ||
935 | if (retval < 0) | ||
936 | goto done; | ||
937 | |||
938 | if (!nodes_subset(trialcs.mems_allowed, | ||
939 | node_states[N_HIGH_MEMORY])) | ||
940 | return -EINVAL; | ||
941 | } | ||
942 | oldmem = cs->mems_allowed; | ||
943 | if (nodes_equal(oldmem, trialcs.mems_allowed)) { | ||
944 | retval = 0; /* Too easy - nothing to do */ | ||
945 | goto done; | ||
946 | } | ||
947 | retval = validate_change(cs, &trialcs); | ||
948 | if (retval < 0) | ||
949 | goto done; | ||
950 | |||
951 | mutex_lock(&callback_mutex); | ||
952 | cs->mems_allowed = trialcs.mems_allowed; | ||
953 | cs->mems_generation = cpuset_mems_generation++; | ||
954 | mutex_unlock(&callback_mutex); | ||
955 | 1000 | ||
956 | cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ | 1001 | cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ |
957 | 1002 | ||
@@ -1018,7 +1063,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
1018 | 1063 | ||
1019 | mpol_rebind_mm(mm, &cs->mems_allowed); | 1064 | mpol_rebind_mm(mm, &cs->mems_allowed); |
1020 | if (migrate) | 1065 | if (migrate) |
1021 | cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed); | 1066 | cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed); |
1022 | mmput(mm); | 1067 | mmput(mm); |
1023 | } | 1068 | } |
1024 | 1069 | ||
@@ -1030,6 +1075,70 @@ done: | |||
1030 | return retval; | 1075 | return retval; |
1031 | } | 1076 | } |
1032 | 1077 | ||
1078 | /* | ||
1079 | * Handle user request to change the 'mems' memory placement | ||
1080 | * of a cpuset. Needs to validate the request, update the | ||
1081 | * cpusets mems_allowed and mems_generation, and for each | ||
1082 | * task in the cpuset, rebind any vma mempolicies and if | ||
1083 | * the cpuset is marked 'memory_migrate', migrate the tasks | ||
1084 | * pages to the new memory. | ||
1085 | * | ||
1086 | * Call with cgroup_mutex held. May take callback_mutex during call. | ||
1087 | * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, | ||
1088 | * lock each such tasks mm->mmap_sem, scan its vma's and rebind | ||
1089 | * their mempolicies to the cpusets new mems_allowed. | ||
1090 | */ | ||
1091 | static int update_nodemask(struct cpuset *cs, const char *buf) | ||
1092 | { | ||
1093 | struct cpuset trialcs; | ||
1094 | nodemask_t oldmem; | ||
1095 | int retval; | ||
1096 | |||
1097 | /* | ||
1098 | * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; | ||
1099 | * it's read-only | ||
1100 | */ | ||
1101 | if (cs == &top_cpuset) | ||
1102 | return -EACCES; | ||
1103 | |||
1104 | trialcs = *cs; | ||
1105 | |||
1106 | /* | ||
1107 | * An empty mems_allowed is ok iff there are no tasks in the cpuset. | ||
1108 | * Since nodelist_parse() fails on an empty mask, we special case | ||
1109 | * that parsing. The validate_change() call ensures that cpusets | ||
1110 | * with tasks have memory. | ||
1111 | */ | ||
1112 | if (!*buf) { | ||
1113 | nodes_clear(trialcs.mems_allowed); | ||
1114 | } else { | ||
1115 | retval = nodelist_parse(buf, trialcs.mems_allowed); | ||
1116 | if (retval < 0) | ||
1117 | goto done; | ||
1118 | |||
1119 | if (!nodes_subset(trialcs.mems_allowed, | ||
1120 | node_states[N_HIGH_MEMORY])) | ||
1121 | return -EINVAL; | ||
1122 | } | ||
1123 | oldmem = cs->mems_allowed; | ||
1124 | if (nodes_equal(oldmem, trialcs.mems_allowed)) { | ||
1125 | retval = 0; /* Too easy - nothing to do */ | ||
1126 | goto done; | ||
1127 | } | ||
1128 | retval = validate_change(cs, &trialcs); | ||
1129 | if (retval < 0) | ||
1130 | goto done; | ||
1131 | |||
1132 | mutex_lock(&callback_mutex); | ||
1133 | cs->mems_allowed = trialcs.mems_allowed; | ||
1134 | cs->mems_generation = cpuset_mems_generation++; | ||
1135 | mutex_unlock(&callback_mutex); | ||
1136 | |||
1137 | retval = update_tasks_nodemask(cs, &oldmem); | ||
1138 | done: | ||
1139 | return retval; | ||
1140 | } | ||
1141 | |||
1033 | int current_cpuset_is_being_rebound(void) | 1142 | int current_cpuset_is_being_rebound(void) |
1034 | { | 1143 | { |
1035 | return task_cs(current) == cpuset_being_rebound; | 1144 | return task_cs(current) == cpuset_being_rebound; |
@@ -1042,7 +1151,8 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) | |||
1042 | 1151 | ||
1043 | if (val != cs->relax_domain_level) { | 1152 | if (val != cs->relax_domain_level) { |
1044 | cs->relax_domain_level = val; | 1153 | cs->relax_domain_level = val; |
1045 | rebuild_sched_domains(); | 1154 | if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) |
1155 | async_rebuild_sched_domains(); | ||
1046 | } | 1156 | } |
1047 | 1157 | ||
1048 | return 0; | 1158 | return 0; |
@@ -1083,7 +1193,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, | |||
1083 | mutex_unlock(&callback_mutex); | 1193 | mutex_unlock(&callback_mutex); |
1084 | 1194 | ||
1085 | if (cpus_nonempty && balance_flag_changed) | 1195 | if (cpus_nonempty && balance_flag_changed) |
1086 | rebuild_sched_domains(); | 1196 | async_rebuild_sched_domains(); |
1087 | 1197 | ||
1088 | return 0; | 1198 | return 0; |
1089 | } | 1199 | } |
@@ -1194,6 +1304,15 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, | |||
1194 | 1304 | ||
1195 | if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) | 1305 | if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) |
1196 | return -ENOSPC; | 1306 | return -ENOSPC; |
1307 | if (tsk->flags & PF_THREAD_BOUND) { | ||
1308 | cpumask_t mask; | ||
1309 | |||
1310 | mutex_lock(&callback_mutex); | ||
1311 | mask = cs->cpus_allowed; | ||
1312 | mutex_unlock(&callback_mutex); | ||
1313 | if (!cpus_equal(tsk->cpus_allowed, mask)) | ||
1314 | return -EINVAL; | ||
1315 | } | ||
1197 | 1316 | ||
1198 | return security_task_setscheduler(tsk, 0, NULL); | 1317 | return security_task_setscheduler(tsk, 0, NULL); |
1199 | } | 1318 | } |
@@ -1207,11 +1326,14 @@ static void cpuset_attach(struct cgroup_subsys *ss, | |||
1207 | struct mm_struct *mm; | 1326 | struct mm_struct *mm; |
1208 | struct cpuset *cs = cgroup_cs(cont); | 1327 | struct cpuset *cs = cgroup_cs(cont); |
1209 | struct cpuset *oldcs = cgroup_cs(oldcont); | 1328 | struct cpuset *oldcs = cgroup_cs(oldcont); |
1329 | int err; | ||
1210 | 1330 | ||
1211 | mutex_lock(&callback_mutex); | 1331 | mutex_lock(&callback_mutex); |
1212 | guarantee_online_cpus(cs, &cpus); | 1332 | guarantee_online_cpus(cs, &cpus); |
1213 | set_cpus_allowed_ptr(tsk, &cpus); | 1333 | err = set_cpus_allowed_ptr(tsk, &cpus); |
1214 | mutex_unlock(&callback_mutex); | 1334 | mutex_unlock(&callback_mutex); |
1335 | if (err) | ||
1336 | return; | ||
1215 | 1337 | ||
1216 | from = oldcs->mems_allowed; | 1338 | from = oldcs->mems_allowed; |
1217 | to = cs->mems_allowed; | 1339 | to = cs->mems_allowed; |
@@ -1242,72 +1364,14 @@ typedef enum { | |||
1242 | FILE_SPREAD_SLAB, | 1364 | FILE_SPREAD_SLAB, |
1243 | } cpuset_filetype_t; | 1365 | } cpuset_filetype_t; |
1244 | 1366 | ||
1245 | static ssize_t cpuset_common_file_write(struct cgroup *cont, | ||
1246 | struct cftype *cft, | ||
1247 | struct file *file, | ||
1248 | const char __user *userbuf, | ||
1249 | size_t nbytes, loff_t *unused_ppos) | ||
1250 | { | ||
1251 | struct cpuset *cs = cgroup_cs(cont); | ||
1252 | cpuset_filetype_t type = cft->private; | ||
1253 | char *buffer; | ||
1254 | int retval = 0; | ||
1255 | |||
1256 | /* Crude upper limit on largest legitimate cpulist user might write. */ | ||
1257 | if (nbytes > 100U + 6 * max(NR_CPUS, MAX_NUMNODES)) | ||
1258 | return -E2BIG; | ||
1259 | |||
1260 | /* +1 for nul-terminator */ | ||
1261 | buffer = kmalloc(nbytes + 1, GFP_KERNEL); | ||
1262 | if (!buffer) | ||
1263 | return -ENOMEM; | ||
1264 | |||
1265 | if (copy_from_user(buffer, userbuf, nbytes)) { | ||
1266 | retval = -EFAULT; | ||
1267 | goto out1; | ||
1268 | } | ||
1269 | buffer[nbytes] = 0; /* nul-terminate */ | ||
1270 | |||
1271 | cgroup_lock(); | ||
1272 | |||
1273 | if (cgroup_is_removed(cont)) { | ||
1274 | retval = -ENODEV; | ||
1275 | goto out2; | ||
1276 | } | ||
1277 | |||
1278 | switch (type) { | ||
1279 | case FILE_CPULIST: | ||
1280 | retval = update_cpumask(cs, buffer); | ||
1281 | break; | ||
1282 | case FILE_MEMLIST: | ||
1283 | retval = update_nodemask(cs, buffer); | ||
1284 | break; | ||
1285 | default: | ||
1286 | retval = -EINVAL; | ||
1287 | goto out2; | ||
1288 | } | ||
1289 | |||
1290 | if (retval == 0) | ||
1291 | retval = nbytes; | ||
1292 | out2: | ||
1293 | cgroup_unlock(); | ||
1294 | out1: | ||
1295 | kfree(buffer); | ||
1296 | return retval; | ||
1297 | } | ||
1298 | |||
1299 | static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) | 1367 | static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) |
1300 | { | 1368 | { |
1301 | int retval = 0; | 1369 | int retval = 0; |
1302 | struct cpuset *cs = cgroup_cs(cgrp); | 1370 | struct cpuset *cs = cgroup_cs(cgrp); |
1303 | cpuset_filetype_t type = cft->private; | 1371 | cpuset_filetype_t type = cft->private; |
1304 | 1372 | ||
1305 | cgroup_lock(); | 1373 | if (!cgroup_lock_live_group(cgrp)) |
1306 | |||
1307 | if (cgroup_is_removed(cgrp)) { | ||
1308 | cgroup_unlock(); | ||
1309 | return -ENODEV; | 1374 | return -ENODEV; |
1310 | } | ||
1311 | 1375 | ||
1312 | switch (type) { | 1376 | switch (type) { |
1313 | case FILE_CPU_EXCLUSIVE: | 1377 | case FILE_CPU_EXCLUSIVE: |
@@ -1353,12 +1417,9 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) | |||
1353 | struct cpuset *cs = cgroup_cs(cgrp); | 1417 | struct cpuset *cs = cgroup_cs(cgrp); |
1354 | cpuset_filetype_t type = cft->private; | 1418 | cpuset_filetype_t type = cft->private; |
1355 | 1419 | ||
1356 | cgroup_lock(); | 1420 | if (!cgroup_lock_live_group(cgrp)) |
1357 | |||
1358 | if (cgroup_is_removed(cgrp)) { | ||
1359 | cgroup_unlock(); | ||
1360 | return -ENODEV; | 1421 | return -ENODEV; |
1361 | } | 1422 | |
1362 | switch (type) { | 1423 | switch (type) { |
1363 | case FILE_SCHED_RELAX_DOMAIN_LEVEL: | 1424 | case FILE_SCHED_RELAX_DOMAIN_LEVEL: |
1364 | retval = update_relax_domain_level(cs, val); | 1425 | retval = update_relax_domain_level(cs, val); |
@@ -1372,6 +1433,32 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) | |||
1372 | } | 1433 | } |
1373 | 1434 | ||
1374 | /* | 1435 | /* |
1436 | * Common handling for a write to a "cpus" or "mems" file. | ||
1437 | */ | ||
1438 | static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, | ||
1439 | const char *buf) | ||
1440 | { | ||
1441 | int retval = 0; | ||
1442 | |||
1443 | if (!cgroup_lock_live_group(cgrp)) | ||
1444 | return -ENODEV; | ||
1445 | |||
1446 | switch (cft->private) { | ||
1447 | case FILE_CPULIST: | ||
1448 | retval = update_cpumask(cgroup_cs(cgrp), buf); | ||
1449 | break; | ||
1450 | case FILE_MEMLIST: | ||
1451 | retval = update_nodemask(cgroup_cs(cgrp), buf); | ||
1452 | break; | ||
1453 | default: | ||
1454 | retval = -EINVAL; | ||
1455 | break; | ||
1456 | } | ||
1457 | cgroup_unlock(); | ||
1458 | return retval; | ||
1459 | } | ||
1460 | |||
1461 | /* | ||
1375 | * These ascii lists should be read in a single call, by using a user | 1462 | * These ascii lists should be read in a single call, by using a user |
1376 | * buffer large enough to hold the entire map. If read in smaller | 1463 | * buffer large enough to hold the entire map. If read in smaller |
1377 | * chunks, there is no guarantee of atomicity. Since the display format | 1464 | * chunks, there is no guarantee of atomicity. Since the display format |
@@ -1467,6 +1554,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft) | |||
1467 | default: | 1554 | default: |
1468 | BUG(); | 1555 | BUG(); |
1469 | } | 1556 | } |
1557 | |||
1558 | /* Unreachable but makes gcc happy */ | ||
1559 | return 0; | ||
1470 | } | 1560 | } |
1471 | 1561 | ||
1472 | static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) | 1562 | static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) |
@@ -1479,6 +1569,9 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) | |||
1479 | default: | 1569 | default: |
1480 | BUG(); | 1570 | BUG(); |
1481 | } | 1571 | } |
1572 | |||
1573 | /* Unrechable but makes gcc happy */ | ||
1574 | return 0; | ||
1482 | } | 1575 | } |
1483 | 1576 | ||
1484 | 1577 | ||
@@ -1490,14 +1583,16 @@ static struct cftype files[] = { | |||
1490 | { | 1583 | { |
1491 | .name = "cpus", | 1584 | .name = "cpus", |
1492 | .read = cpuset_common_file_read, | 1585 | .read = cpuset_common_file_read, |
1493 | .write = cpuset_common_file_write, | 1586 | .write_string = cpuset_write_resmask, |
1587 | .max_write_len = (100U + 6 * NR_CPUS), | ||
1494 | .private = FILE_CPULIST, | 1588 | .private = FILE_CPULIST, |
1495 | }, | 1589 | }, |
1496 | 1590 | ||
1497 | { | 1591 | { |
1498 | .name = "mems", | 1592 | .name = "mems", |
1499 | .read = cpuset_common_file_read, | 1593 | .read = cpuset_common_file_read, |
1500 | .write = cpuset_common_file_write, | 1594 | .write_string = cpuset_write_resmask, |
1595 | .max_write_len = (100U + 6 * MAX_NUMNODES), | ||
1501 | .private = FILE_MEMLIST, | 1596 | .private = FILE_MEMLIST, |
1502 | }, | 1597 | }, |
1503 | 1598 | ||
@@ -1665,15 +1760,9 @@ static struct cgroup_subsys_state *cpuset_create( | |||
1665 | } | 1760 | } |
1666 | 1761 | ||
1667 | /* | 1762 | /* |
1668 | * Locking note on the strange update_flag() call below: | ||
1669 | * | ||
1670 | * If the cpuset being removed has its flag 'sched_load_balance' | 1763 | * If the cpuset being removed has its flag 'sched_load_balance' |
1671 | * enabled, then simulate turning sched_load_balance off, which | 1764 | * enabled, then simulate turning sched_load_balance off, which |
1672 | * will call rebuild_sched_domains(). The get_online_cpus() | 1765 | * will call async_rebuild_sched_domains(). |
1673 | * call in rebuild_sched_domains() must not be made while holding | ||
1674 | * callback_mutex. Elsewhere the kernel nests callback_mutex inside | ||
1675 | * get_online_cpus() calls. So the reverse nesting would risk an | ||
1676 | * ABBA deadlock. | ||
1677 | */ | 1766 | */ |
1678 | 1767 | ||
1679 | static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) | 1768 | static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) |
@@ -1692,7 +1781,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) | |||
1692 | struct cgroup_subsys cpuset_subsys = { | 1781 | struct cgroup_subsys cpuset_subsys = { |
1693 | .name = "cpuset", | 1782 | .name = "cpuset", |
1694 | .create = cpuset_create, | 1783 | .create = cpuset_create, |
1695 | .destroy = cpuset_destroy, | 1784 | .destroy = cpuset_destroy, |
1696 | .can_attach = cpuset_can_attach, | 1785 | .can_attach = cpuset_can_attach, |
1697 | .attach = cpuset_attach, | 1786 | .attach = cpuset_attach, |
1698 | .populate = cpuset_populate, | 1787 | .populate = cpuset_populate, |
@@ -1778,13 +1867,13 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) | |||
1778 | scan.scan.heap = NULL; | 1867 | scan.scan.heap = NULL; |
1779 | scan.to = to->css.cgroup; | 1868 | scan.to = to->css.cgroup; |
1780 | 1869 | ||
1781 | if (cgroup_scan_tasks((struct cgroup_scanner *)&scan)) | 1870 | if (cgroup_scan_tasks(&scan.scan)) |
1782 | printk(KERN_ERR "move_member_tasks_to_cpuset: " | 1871 | printk(KERN_ERR "move_member_tasks_to_cpuset: " |
1783 | "cgroup_scan_tasks failed\n"); | 1872 | "cgroup_scan_tasks failed\n"); |
1784 | } | 1873 | } |
1785 | 1874 | ||
1786 | /* | 1875 | /* |
1787 | * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs | 1876 | * If CPU and/or memory hotplug handlers, below, unplug any CPUs |
1788 | * or memory nodes, we need to walk over the cpuset hierarchy, | 1877 | * or memory nodes, we need to walk over the cpuset hierarchy, |
1789 | * removing that CPU or node from all cpusets. If this removes the | 1878 | * removing that CPU or node from all cpusets. If this removes the |
1790 | * last CPU or node from a cpuset, then move the tasks in the empty | 1879 | * last CPU or node from a cpuset, then move the tasks in the empty |
@@ -1832,31 +1921,31 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) | |||
1832 | * that has tasks along with an empty 'mems'. But if we did see such | 1921 | * that has tasks along with an empty 'mems'. But if we did see such |
1833 | * a cpuset, we'd handle it just like we do if its 'cpus' was empty. | 1922 | * a cpuset, we'd handle it just like we do if its 'cpus' was empty. |
1834 | */ | 1923 | */ |
1835 | static void scan_for_empty_cpusets(const struct cpuset *root) | 1924 | static void scan_for_empty_cpusets(struct cpuset *root) |
1836 | { | 1925 | { |
1926 | LIST_HEAD(queue); | ||
1837 | struct cpuset *cp; /* scans cpusets being updated */ | 1927 | struct cpuset *cp; /* scans cpusets being updated */ |
1838 | struct cpuset *child; /* scans child cpusets of cp */ | 1928 | struct cpuset *child; /* scans child cpusets of cp */ |
1839 | struct list_head queue; | ||
1840 | struct cgroup *cont; | 1929 | struct cgroup *cont; |
1841 | 1930 | nodemask_t oldmems; | |
1842 | INIT_LIST_HEAD(&queue); | ||
1843 | 1931 | ||
1844 | list_add_tail((struct list_head *)&root->stack_list, &queue); | 1932 | list_add_tail((struct list_head *)&root->stack_list, &queue); |
1845 | 1933 | ||
1846 | while (!list_empty(&queue)) { | 1934 | while (!list_empty(&queue)) { |
1847 | cp = container_of(queue.next, struct cpuset, stack_list); | 1935 | cp = list_first_entry(&queue, struct cpuset, stack_list); |
1848 | list_del(queue.next); | 1936 | list_del(queue.next); |
1849 | list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { | 1937 | list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { |
1850 | child = cgroup_cs(cont); | 1938 | child = cgroup_cs(cont); |
1851 | list_add_tail(&child->stack_list, &queue); | 1939 | list_add_tail(&child->stack_list, &queue); |
1852 | } | 1940 | } |
1853 | cont = cp->css.cgroup; | ||
1854 | 1941 | ||
1855 | /* Continue past cpusets with all cpus, mems online */ | 1942 | /* Continue past cpusets with all cpus, mems online */ |
1856 | if (cpus_subset(cp->cpus_allowed, cpu_online_map) && | 1943 | if (cpus_subset(cp->cpus_allowed, cpu_online_map) && |
1857 | nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) | 1944 | nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) |
1858 | continue; | 1945 | continue; |
1859 | 1946 | ||
1947 | oldmems = cp->mems_allowed; | ||
1948 | |||
1860 | /* Remove offline cpus and mems from this cpuset. */ | 1949 | /* Remove offline cpus and mems from this cpuset. */ |
1861 | mutex_lock(&callback_mutex); | 1950 | mutex_lock(&callback_mutex); |
1862 | cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map); | 1951 | cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map); |
@@ -1868,38 +1957,14 @@ static void scan_for_empty_cpusets(const struct cpuset *root) | |||
1868 | if (cpus_empty(cp->cpus_allowed) || | 1957 | if (cpus_empty(cp->cpus_allowed) || |
1869 | nodes_empty(cp->mems_allowed)) | 1958 | nodes_empty(cp->mems_allowed)) |
1870 | remove_tasks_in_empty_cpuset(cp); | 1959 | remove_tasks_in_empty_cpuset(cp); |
1960 | else { | ||
1961 | update_tasks_cpumask(cp, NULL); | ||
1962 | update_tasks_nodemask(cp, &oldmems); | ||
1963 | } | ||
1871 | } | 1964 | } |
1872 | } | 1965 | } |
1873 | 1966 | ||
1874 | /* | 1967 | /* |
1875 | * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track | ||
1876 | * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to | ||
1877 | * track what's online after any CPU or memory node hotplug or unplug event. | ||
1878 | * | ||
1879 | * Since there are two callers of this routine, one for CPU hotplug | ||
1880 | * events and one for memory node hotplug events, we could have coded | ||
1881 | * two separate routines here. We code it as a single common routine | ||
1882 | * in order to minimize text size. | ||
1883 | */ | ||
1884 | |||
1885 | static void common_cpu_mem_hotplug_unplug(void) | ||
1886 | { | ||
1887 | cgroup_lock(); | ||
1888 | |||
1889 | top_cpuset.cpus_allowed = cpu_online_map; | ||
1890 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | ||
1891 | scan_for_empty_cpusets(&top_cpuset); | ||
1892 | |||
1893 | /* | ||
1894 | * Scheduler destroys domains on hotplug events. | ||
1895 | * Rebuild them based on the current settings. | ||
1896 | */ | ||
1897 | rebuild_sched_domains(); | ||
1898 | |||
1899 | cgroup_unlock(); | ||
1900 | } | ||
1901 | |||
1902 | /* | ||
1903 | * The top_cpuset tracks what CPUs and Memory Nodes are online, | 1968 | * The top_cpuset tracks what CPUs and Memory Nodes are online, |
1904 | * period. This is necessary in order to make cpusets transparent | 1969 | * period. This is necessary in order to make cpusets transparent |
1905 | * (of no affect) on systems that are actively using CPU hotplug | 1970 | * (of no affect) on systems that are actively using CPU hotplug |
@@ -1907,29 +1972,52 @@ static void common_cpu_mem_hotplug_unplug(void) | |||
1907 | * | 1972 | * |
1908 | * This routine ensures that top_cpuset.cpus_allowed tracks | 1973 | * This routine ensures that top_cpuset.cpus_allowed tracks |
1909 | * cpu_online_map on each CPU hotplug (cpuhp) event. | 1974 | * cpu_online_map on each CPU hotplug (cpuhp) event. |
1975 | * | ||
1976 | * Called within get_online_cpus(). Needs to call cgroup_lock() | ||
1977 | * before calling generate_sched_domains(). | ||
1910 | */ | 1978 | */ |
1911 | 1979 | static int cpuset_track_online_cpus(struct notifier_block *unused_nb, | |
1912 | static int cpuset_handle_cpuhp(struct notifier_block *unused_nb, | ||
1913 | unsigned long phase, void *unused_cpu) | 1980 | unsigned long phase, void *unused_cpu) |
1914 | { | 1981 | { |
1915 | if (phase == CPU_DYING || phase == CPU_DYING_FROZEN) | 1982 | struct sched_domain_attr *attr; |
1983 | cpumask_t *doms; | ||
1984 | int ndoms; | ||
1985 | |||
1986 | switch (phase) { | ||
1987 | case CPU_ONLINE: | ||
1988 | case CPU_ONLINE_FROZEN: | ||
1989 | case CPU_DEAD: | ||
1990 | case CPU_DEAD_FROZEN: | ||
1991 | break; | ||
1992 | |||
1993 | default: | ||
1916 | return NOTIFY_DONE; | 1994 | return NOTIFY_DONE; |
1995 | } | ||
1917 | 1996 | ||
1918 | common_cpu_mem_hotplug_unplug(); | 1997 | cgroup_lock(); |
1919 | return 0; | 1998 | top_cpuset.cpus_allowed = cpu_online_map; |
1999 | scan_for_empty_cpusets(&top_cpuset); | ||
2000 | ndoms = generate_sched_domains(&doms, &attr); | ||
2001 | cgroup_unlock(); | ||
2002 | |||
2003 | /* Have scheduler rebuild the domains */ | ||
2004 | partition_sched_domains(ndoms, doms, attr); | ||
2005 | |||
2006 | return NOTIFY_OK; | ||
1920 | } | 2007 | } |
1921 | 2008 | ||
1922 | #ifdef CONFIG_MEMORY_HOTPLUG | 2009 | #ifdef CONFIG_MEMORY_HOTPLUG |
1923 | /* | 2010 | /* |
1924 | * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. | 2011 | * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. |
1925 | * Call this routine anytime after you change | 2012 | * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. |
1926 | * node_states[N_HIGH_MEMORY]. | 2013 | * See also the previous routine cpuset_track_online_cpus(). |
1927 | * See also the previous routine cpuset_handle_cpuhp(). | ||
1928 | */ | 2014 | */ |
1929 | |||
1930 | void cpuset_track_online_nodes(void) | 2015 | void cpuset_track_online_nodes(void) |
1931 | { | 2016 | { |
1932 | common_cpu_mem_hotplug_unplug(); | 2017 | cgroup_lock(); |
2018 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | ||
2019 | scan_for_empty_cpusets(&top_cpuset); | ||
2020 | cgroup_unlock(); | ||
1933 | } | 2021 | } |
1934 | #endif | 2022 | #endif |
1935 | 2023 | ||
@@ -1944,11 +2032,10 @@ void __init cpuset_init_smp(void) | |||
1944 | top_cpuset.cpus_allowed = cpu_online_map; | 2032 | top_cpuset.cpus_allowed = cpu_online_map; |
1945 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | 2033 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; |
1946 | 2034 | ||
1947 | hotcpu_notifier(cpuset_handle_cpuhp, 0); | 2035 | hotcpu_notifier(cpuset_track_online_cpus, 0); |
1948 | } | 2036 | } |
1949 | 2037 | ||
1950 | /** | 2038 | /** |
1951 | |||
1952 | * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. | 2039 | * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. |
1953 | * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. | 2040 | * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. |
1954 | * @pmask: pointer to cpumask_t variable to receive cpus_allowed set. | 2041 | * @pmask: pointer to cpumask_t variable to receive cpus_allowed set. |