Diffstat (limited to 'kernel/cpuset.c')
 -rw-r--r--  kernel/cpuset.c  432
 1 file changed, 225 insertions(+), 207 deletions(-)
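
Note for orientation (not part of the commit): the bulk of this diff replaces the kfifo-based queue in rebuild_sched_domains() with a list-based breadth-first walk of the cpuset hierarchy (see update_domain_attr_tree() and scan_for_empty_cpusets() below), pruning whole subtrees whose 'cpus' are empty or that sit below a load-balanced cpuset. The following is only a rough userspace sketch of that traversal pattern; the struct and field names are invented for illustration, and the kernel code itself uses struct cpuset, <linux/list.h>, list_first_entry() and list_del() instead.

#include <stdio.h>

/* Invented stand-in for struct cpuset; 'next' plays the role of stack_list. */
struct node {
	const char *name;
	int cpus_empty;		/* analogue of cpus_empty(cp->cpus_allowed) */
	int load_balance;	/* analogue of is_sched_load_balance(cp) */
	struct node **child;
	int nr_children;
	struct node *next;	/* FIFO linkage */
};

int main(void)
{
	struct node c1 = { "c1", 0, 1, NULL, 0, NULL };
	struct node c2 = { "c2", 1, 0, NULL, 0, NULL };
	struct node *kids[] = { &c1, &c2 };
	struct node root = { "root", 0, 0, kids, 2, NULL };
	struct node *head = &root, *tail = &root;

	/* Top-down scan: dequeue, prune where possible, else enqueue children. */
	while (head) {
		struct node *cp = head;

		head = cp->next;
		if (!head)
			tail = NULL;

		if (cp->cpus_empty)
			continue;	/* skip the whole subtree */

		if (cp->load_balance) {
			printf("would build a sched domain for %s\n", cp->name);
			continue;	/* children are covered by this domain */
		}

		for (int i = 0; i < cp->nr_children; i++) {
			struct node *child = cp->child[i];

			child->next = NULL;
			if (tail)
				tail->next = child;
			else
				head = child;
			tail = child;
		}
	}
	return 0;
}
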
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 459d601947a8..d5ab79cf516d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -54,7 +54,6 @@
 #include <asm/uaccess.h>
 #include <asm/atomic.h>
 #include <linux/mutex.h>
-#include <linux/kfifo.h>
 #include <linux/workqueue.h>
 #include <linux/cgroup.h>
 
@@ -227,10 +226,6 @@ static struct cpuset top_cpuset = {
  * The task_struct fields mems_allowed and mems_generation may only
  * be accessed in the context of that task, so require no locks.
  *
- * The cpuset_common_file_write handler for operations that modify
- * the cpuset hierarchy holds cgroup_mutex across the entire operation,
- * single threading all such cpuset modifications across the system.
- *
  * The cpuset_common_file_read() handlers only hold callback_mutex across
  * small pieces of code, such as when reading out possibly multi-word
  * cpumasks and nodemasks.
@@ -369,7 +364,7 @@ void cpuset_update_task_memory_state(void)
 		my_cpusets_mem_gen = top_cpuset.mems_generation;
 	} else {
 		rcu_read_lock();
-		my_cpusets_mem_gen = task_cs(current)->mems_generation;
+		my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
 		rcu_read_unlock();
 	}
 
@@ -490,21 +485,51 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
 static void
 update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
 {
-	if (!dattr)
-		return;
 	if (dattr->relax_domain_level < c->relax_domain_level)
 		dattr->relax_domain_level = c->relax_domain_level;
 	return;
 }
 
+static void
+update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
+{
+	LIST_HEAD(q);
+
+	list_add(&c->stack_list, &q);
+	while (!list_empty(&q)) {
+		struct cpuset *cp;
+		struct cgroup *cont;
+		struct cpuset *child;
+
+		cp = list_first_entry(&q, struct cpuset, stack_list);
+		list_del(q.next);
+
+		if (cpus_empty(cp->cpus_allowed))
+			continue;
+
+		if (is_sched_load_balance(cp))
+			update_domain_attr(dattr, cp);
+
+		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
+			child = cgroup_cs(cont);
+			list_add_tail(&child->stack_list, &q);
+		}
+	}
+}
+
 /*
  * rebuild_sched_domains()
  *
- * If the flag 'sched_load_balance' of any cpuset with non-empty
- * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
- * which has that flag enabled, or if any cpuset with a non-empty
- * 'cpus' is removed, then call this routine to rebuild the
- * scheduler's dynamic sched domains.
+ * This routine will be called to rebuild the scheduler's dynamic
+ * sched domains:
+ * - if the flag 'sched_load_balance' of any cpuset with non-empty
+ *   'cpus' changes,
+ * - or if the 'cpus' allowed changes in any cpuset which has that
+ *   flag enabled,
+ * - or if the 'sched_relax_domain_level' of any cpuset which has
+ *   that flag enabled and with non-empty 'cpus' changes,
+ * - or if any cpuset with non-empty 'cpus' is removed,
+ * - or if a cpu gets offlined.
 *
 * This routine builds a partial partition of the systems CPUs
 * (the set of non-overlappping cpumask_t's in the array 'part'
@@ -531,7 +556,7 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
  * So the reverse nesting would risk an ABBA deadlock.
  *
  * The three key local variables below are:
- *    q  - a kfifo queue of cpuset pointers, used to implement a
+ *    q  - a linked-list queue of cpuset pointers, used to implement a
  *	   top-down scan of all cpusets. This scan loads a pointer
  *	   to each cpuset marked is_sched_load_balance into the
  *	   array 'csa'. For our purposes, rebuilding the schedulers
@@ -564,9 +589,9 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
  *	partition_sched_domains().
  */
 
-static void rebuild_sched_domains(void)
+void rebuild_sched_domains(void)
 {
-	struct kfifo *q;	/* queue of cpusets to be scanned */
+	LIST_HEAD(q);		/* queue of cpusets to be scanned*/
 	struct cpuset *cp;	/* scans q */
 	struct cpuset **csa;	/* array of all cpuset ptrs */
 	int csn;		/* how many cpuset ptrs in csa so far */
@@ -576,7 +601,6 @@ static void rebuild_sched_domains(void)
 	int ndoms;		/* number of sched domains in result */
 	int nslot;		/* next empty doms[] cpumask_t slot */
 
-	q = NULL;
 	csa = NULL;
 	doms = NULL;
 	dattr = NULL;
@@ -590,30 +614,42 @@ static void rebuild_sched_domains(void)
 		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
 		if (dattr) {
 			*dattr = SD_ATTR_INIT;
-			update_domain_attr(dattr, &top_cpuset);
+			update_domain_attr_tree(dattr, &top_cpuset);
 		}
 		*doms = top_cpuset.cpus_allowed;
 		goto rebuild;
 	}
 
-	q = kfifo_alloc(number_of_cpusets * sizeof(cp), GFP_KERNEL, NULL);
-	if (IS_ERR(q))
-		goto done;
 	csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
 	if (!csa)
 		goto done;
 	csn = 0;
 
-	cp = &top_cpuset;
-	__kfifo_put(q, (void *)&cp, sizeof(cp));
-	while (__kfifo_get(q, (void *)&cp, sizeof(cp))) {
+	list_add(&top_cpuset.stack_list, &q);
+	while (!list_empty(&q)) {
 		struct cgroup *cont;
 		struct cpuset *child;   /* scans child cpusets of cp */
-		if (is_sched_load_balance(cp))
+
+		cp = list_first_entry(&q, struct cpuset, stack_list);
+		list_del(q.next);
+
+		if (cpus_empty(cp->cpus_allowed))
+			continue;
+
+		/*
+		 * All child cpusets contain a subset of the parent's cpus, so
+		 * just skip them, and then we call update_domain_attr_tree()
+		 * to calc relax_domain_level of the corresponding sched
+		 * domain.
+		 */
+		if (is_sched_load_balance(cp)) {
 			csa[csn++] = cp;
+			continue;
+		}
+
 		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
 			child = cgroup_cs(cont);
-			__kfifo_put(q, (void *)&child, sizeof(cp));
+			list_add_tail(&child->stack_list, &q);
 		}
 	}
 
@@ -679,7 +715,9 @@ restart:
 			if (apn == b->pn) {
 				cpus_or(*dp, *dp, b->cpus_allowed);
 				b->pn = -1;
-				update_domain_attr(dattr, b);
+				if (dattr)
+					update_domain_attr_tree(dattr
+								   + nslot, b);
 			}
 		}
 		nslot++;
@@ -694,43 +732,11 @@ rebuild:
 	put_online_cpus();
 
 done:
-	if (q && !IS_ERR(q))
-		kfifo_free(q);
 	kfree(csa);
 	/* Don't kfree(doms) -- partition_sched_domains() does that. */
 	/* Don't kfree(dattr) -- partition_sched_domains() does that. */
 }
 
-static inline int started_after_time(struct task_struct *t1,
-				     struct timespec *time,
-				     struct task_struct *t2)
-{
-	int start_diff = timespec_compare(&t1->start_time, time);
-	if (start_diff > 0) {
-		return 1;
-	} else if (start_diff < 0) {
-		return 0;
-	} else {
-		/*
-		 * Arbitrarily, if two processes started at the same
-		 * time, we'll say that the lower pointer value
-		 * started first. Note that t2 may have exited by now
-		 * so this may not be a valid pointer any longer, but
-		 * that's fine - it still serves to distinguish
-		 * between two tasks started (effectively)
-		 * simultaneously.
-		 */
-		return t1 > t2;
-	}
-}
-
-static inline int started_after(void *p1, void *p2)
-{
-	struct task_struct *t1 = p1;
-	struct task_struct *t2 = p2;
-	return started_after_time(t1, &t2->start_time, t2);
-}
-
 /**
  * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's
  * @tsk: task to test
@@ -766,15 +772,49 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
 }
 
 /**
+ * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
+ * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
+ *
+ * Called with cgroup_mutex held
+ *
+ * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
+ * calling callback functions for each.
+ *
+ * Return 0 if successful, -errno if not.
+ */
+static int update_tasks_cpumask(struct cpuset *cs)
+{
+	struct cgroup_scanner scan;
+	struct ptr_heap heap;
+	int retval;
+
+	/*
+	 * cgroup_scan_tasks() will initialize heap->gt for us.
+	 * heap_init() is still needed here for we should not change
+	 * cs->cpus_allowed when heap_init() fails.
+	 */
+	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
+	if (retval)
+		return retval;
+
+	scan.cg = cs->css.cgroup;
+	scan.test_task = cpuset_test_cpumask;
+	scan.process_task = cpuset_change_cpumask;
+	scan.heap = &heap;
+	retval = cgroup_scan_tasks(&scan);
+
+	heap_free(&heap);
+	return retval;
+}
+
+/**
  * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
  * @cs: the cpuset to consider
  * @buf: buffer of cpu numbers written to this cpuset
  */
-static int update_cpumask(struct cpuset *cs, char *buf)
+static int update_cpumask(struct cpuset *cs, const char *buf)
 {
 	struct cpuset trialcs;
-	struct cgroup_scanner scan;
-	struct ptr_heap heap;
 	int retval;
 	int is_load_balanced;
 
@@ -790,7 +830,6 @@ static int update_cpumask(struct cpuset *cs, char *buf)
 	 * that parsing. The validate_change() call ensures that cpusets
 	 * with tasks have cpus.
 	 */
-	buf = strstrip(buf);
 	if (!*buf) {
 		cpus_clear(trialcs.cpus_allowed);
 	} else {
@@ -809,10 +848,6 @@ static int update_cpumask(struct cpuset *cs, char *buf)
 	if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
 		return 0;
 
-	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after);
-	if (retval)
-		return retval;
-
 	is_load_balanced = is_sched_load_balance(&trialcs);
 
 	mutex_lock(&callback_mutex);
@@ -823,12 +858,9 @@ static int update_cpumask(struct cpuset *cs, char *buf)
 	 * Scan tasks in the cpuset, and update the cpumasks of any
 	 * that need an update.
 	 */
-	scan.cg = cs->css.cgroup;
-	scan.test_task = cpuset_test_cpumask;
-	scan.process_task = cpuset_change_cpumask;
-	scan.heap = &heap;
-	cgroup_scan_tasks(&scan);
-	heap_free(&heap);
+	retval = update_tasks_cpumask(cs);
+	if (retval < 0)
+		return retval;
 
 	if (is_load_balanced)
 		rebuild_sched_domains();
@@ -884,74 +916,25 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
 	mutex_unlock(&callback_mutex);
 }
 
-/*
- * Handle user request to change the 'mems' memory placement
- * of a cpuset. Needs to validate the request, update the
- * cpusets mems_allowed and mems_generation, and for each
- * task in the cpuset, rebind any vma mempolicies and if
- * the cpuset is marked 'memory_migrate', migrate the tasks
- * pages to the new memory.
- *
- * Call with cgroup_mutex held. May take callback_mutex during call.
- * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
- * lock each such tasks mm->mmap_sem, scan its vma's and rebind
- * their mempolicies to the cpusets new mems_allowed.
- */
-
 static void *cpuset_being_rebound;
 
-static int update_nodemask(struct cpuset *cs, char *buf)
+/**
+ * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
+ * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
+ * @oldmem: old mems_allowed of cpuset cs
+ *
+ * Called with cgroup_mutex held
+ * Return 0 if successful, -errno if not.
+ */
+static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
 {
-	struct cpuset trialcs;
-	nodemask_t oldmem;
 	struct task_struct *p;
 	struct mm_struct **mmarray;
 	int i, n, ntasks;
 	int migrate;
 	int fudge;
-	int retval;
 	struct cgroup_iter it;
-
-	/*
-	 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
-	 * it's read-only
-	 */
-	if (cs == &top_cpuset)
-		return -EACCES;
-
-	trialcs = *cs;
-
-	/*
-	 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
-	 * Since nodelist_parse() fails on an empty mask, we special case
-	 * that parsing. The validate_change() call ensures that cpusets
-	 * with tasks have memory.
-	 */
-	buf = strstrip(buf);
-	if (!*buf) {
-		nodes_clear(trialcs.mems_allowed);
-	} else {
-		retval = nodelist_parse(buf, trialcs.mems_allowed);
-		if (retval < 0)
-			goto done;
-
-		if (!nodes_subset(trialcs.mems_allowed,
-				node_states[N_HIGH_MEMORY]))
-			return -EINVAL;
-	}
-	oldmem = cs->mems_allowed;
-	if (nodes_equal(oldmem, trialcs.mems_allowed)) {
-		retval = 0;		/* Too easy - nothing to do */
-		goto done;
-	}
-	retval = validate_change(cs, &trialcs);
-	if (retval < 0)
-		goto done;
-
-	mutex_lock(&callback_mutex);
-	cs->mems_allowed = trialcs.mems_allowed;
-	cs->mems_generation = cpuset_mems_generation++;
-	mutex_unlock(&callback_mutex);
+	int retval;
 
 	cpuset_being_rebound = cs;		/* causes mpol_dup() rebind */
 
@@ -1018,7 +1001,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 
 		mpol_rebind_mm(mm, &cs->mems_allowed);
 		if (migrate)
-			cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed);
+			cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
 		mmput(mm);
 	}
 
@@ -1030,6 +1013,70 @@ done:
 	return retval;
 }
 
+/*
+ * Handle user request to change the 'mems' memory placement
+ * of a cpuset. Needs to validate the request, update the
+ * cpusets mems_allowed and mems_generation, and for each
+ * task in the cpuset, rebind any vma mempolicies and if
+ * the cpuset is marked 'memory_migrate', migrate the tasks
+ * pages to the new memory.
+ *
+ * Call with cgroup_mutex held. May take callback_mutex during call.
+ * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
+ * lock each such tasks mm->mmap_sem, scan its vma's and rebind
+ * their mempolicies to the cpusets new mems_allowed.
+ */
+static int update_nodemask(struct cpuset *cs, const char *buf)
+{
+	struct cpuset trialcs;
+	nodemask_t oldmem;
+	int retval;
+
+	/*
+	 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
+	 * it's read-only
+	 */
+	if (cs == &top_cpuset)
+		return -EACCES;
+
+	trialcs = *cs;
+
+	/*
+	 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
+	 * Since nodelist_parse() fails on an empty mask, we special case
+	 * that parsing. The validate_change() call ensures that cpusets
+	 * with tasks have memory.
+	 */
+	if (!*buf) {
+		nodes_clear(trialcs.mems_allowed);
+	} else {
+		retval = nodelist_parse(buf, trialcs.mems_allowed);
+		if (retval < 0)
+			goto done;
+
+		if (!nodes_subset(trialcs.mems_allowed,
+				node_states[N_HIGH_MEMORY]))
+			return -EINVAL;
+	}
+	oldmem = cs->mems_allowed;
+	if (nodes_equal(oldmem, trialcs.mems_allowed)) {
+		retval = 0;		/* Too easy - nothing to do */
+		goto done;
+	}
+	retval = validate_change(cs, &trialcs);
+	if (retval < 0)
+		goto done;
+
+	mutex_lock(&callback_mutex);
+	cs->mems_allowed = trialcs.mems_allowed;
+	cs->mems_generation = cpuset_mems_generation++;
+	mutex_unlock(&callback_mutex);
+
+	retval = update_tasks_nodemask(cs, &oldmem);
+done:
+	return retval;
+}
+
 int current_cpuset_is_being_rebound(void)
 {
 	return task_cs(current) == cpuset_being_rebound;
@@ -1042,7 +1089,8 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
 
 	if (val != cs->relax_domain_level) {
 		cs->relax_domain_level = val;
-		rebuild_sched_domains();
+		if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
+			rebuild_sched_domains();
 	}
 
 	return 0;
@@ -1254,72 +1302,14 @@ typedef enum {
 	FILE_SPREAD_SLAB,
 } cpuset_filetype_t;
 
-static ssize_t cpuset_common_file_write(struct cgroup *cont,
-					struct cftype *cft,
-					struct file *file,
-					const char __user *userbuf,
-					size_t nbytes, loff_t *unused_ppos)
-{
-	struct cpuset *cs = cgroup_cs(cont);
-	cpuset_filetype_t type = cft->private;
-	char *buffer;
-	int retval = 0;
-
-	/* Crude upper limit on largest legitimate cpulist user might write. */
-	if (nbytes > 100U + 6 * max(NR_CPUS, MAX_NUMNODES))
-		return -E2BIG;
-
-	/* +1 for nul-terminator */
-	buffer = kmalloc(nbytes + 1, GFP_KERNEL);
-	if (!buffer)
-		return -ENOMEM;
-
-	if (copy_from_user(buffer, userbuf, nbytes)) {
-		retval = -EFAULT;
-		goto out1;
-	}
-	buffer[nbytes] = 0;	/* nul-terminate */
-
-	cgroup_lock();
-
-	if (cgroup_is_removed(cont)) {
-		retval = -ENODEV;
-		goto out2;
-	}
-
-	switch (type) {
-	case FILE_CPULIST:
-		retval = update_cpumask(cs, buffer);
-		break;
-	case FILE_MEMLIST:
-		retval = update_nodemask(cs, buffer);
-		break;
-	default:
-		retval = -EINVAL;
-		goto out2;
-	}
-
-	if (retval == 0)
-		retval = nbytes;
-out2:
-	cgroup_unlock();
-out1:
-	kfree(buffer);
-	return retval;
-}
-
 static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
 {
 	int retval = 0;
 	struct cpuset *cs = cgroup_cs(cgrp);
 	cpuset_filetype_t type = cft->private;
 
-	cgroup_lock();
-
-	if (cgroup_is_removed(cgrp)) {
-		cgroup_unlock();
+	if (!cgroup_lock_live_group(cgrp))
 		return -ENODEV;
-	}
 
 	switch (type) {
 	case FILE_CPU_EXCLUSIVE:
@@ -1365,12 +1355,9 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
 	struct cpuset *cs = cgroup_cs(cgrp);
 	cpuset_filetype_t type = cft->private;
 
-	cgroup_lock();
-
-	if (cgroup_is_removed(cgrp)) {
-		cgroup_unlock();
+	if (!cgroup_lock_live_group(cgrp))
 		return -ENODEV;
-	}
+
 	switch (type) {
 	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
 		retval = update_relax_domain_level(cs, val);
@@ -1384,6 +1371,32 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
 }
 
 /*
+ * Common handling for a write to a "cpus" or "mems" file.
+ */
+static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
+				const char *buf)
+{
+	int retval = 0;
+
+	if (!cgroup_lock_live_group(cgrp))
+		return -ENODEV;
+
+	switch (cft->private) {
+	case FILE_CPULIST:
+		retval = update_cpumask(cgroup_cs(cgrp), buf);
+		break;
+	case FILE_MEMLIST:
+		retval = update_nodemask(cgroup_cs(cgrp), buf);
+		break;
+	default:
+		retval = -EINVAL;
+		break;
+	}
+	cgroup_unlock();
+	return retval;
+}
+
+/*
  * These ascii lists should be read in a single call, by using a user
  * buffer large enough to hold the entire map. If read in smaller
  * chunks, there is no guarantee of atomicity. Since the display format
@@ -1502,14 +1515,16 @@ static struct cftype files[] = {
 	{
 		.name = "cpus",
 		.read = cpuset_common_file_read,
-		.write = cpuset_common_file_write,
+		.write_string = cpuset_write_resmask,
+		.max_write_len = (100U + 6 * NR_CPUS),
 		.private = FILE_CPULIST,
 	},
 
 	{
 		.name = "mems",
 		.read = cpuset_common_file_read,
-		.write = cpuset_common_file_write,
+		.write_string = cpuset_write_resmask,
+		.max_write_len = (100U + 6 * MAX_NUMNODES),
 		.private = FILE_MEMLIST,
 	},
 
@@ -1790,7 +1805,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
 	scan.scan.heap = NULL;
 	scan.to = to->css.cgroup;
 
-	if (cgroup_scan_tasks((struct cgroup_scanner *)&scan))
+	if (cgroup_scan_tasks(&scan.scan))
 		printk(KERN_ERR "move_member_tasks_to_cpuset: "
 				"cgroup_scan_tasks failed\n");
 }
@@ -1846,29 +1861,29 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
  */
 static void scan_for_empty_cpusets(const struct cpuset *root)
 {
+	LIST_HEAD(queue);
 	struct cpuset *cp;	/* scans cpusets being updated */
 	struct cpuset *child;	/* scans child cpusets of cp */
-	struct list_head queue;
 	struct cgroup *cont;
-
-	INIT_LIST_HEAD(&queue);
+	nodemask_t oldmems;
 
 	list_add_tail((struct list_head *)&root->stack_list, &queue);
 
 	while (!list_empty(&queue)) {
-		cp = container_of(queue.next, struct cpuset, stack_list);
+		cp = list_first_entry(&queue, struct cpuset, stack_list);
 		list_del(queue.next);
 		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
 			child = cgroup_cs(cont);
 			list_add_tail(&child->stack_list, &queue);
 		}
-		cont = cp->css.cgroup;
 
 		/* Continue past cpusets with all cpus, mems online */
 		if (cpus_subset(cp->cpus_allowed, cpu_online_map) &&
 		    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
 			continue;
 
+		oldmems = cp->mems_allowed;
+
 		/* Remove offline cpus and mems from this cpuset. */
 		mutex_lock(&callback_mutex);
 		cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
@@ -1880,6 +1895,10 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
 		if (cpus_empty(cp->cpus_allowed) ||
 		    nodes_empty(cp->mems_allowed))
 			remove_tasks_in_empty_cpuset(cp);
+		else {
+			update_tasks_cpumask(cp);
+			update_tasks_nodemask(cp, &oldmems);
+		}
 	}
 }
 
@@ -1972,7 +1991,6 @@ void __init cpuset_init_smp(void)
 }
 
 /**
-
  * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
 * @pmask: pointer to cpumask_t variable to receive cpus_allowed set.